avro_from_hackernews.py
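"""Scrape comments from the HackerNews front page and serialize them to Avro.

Requires the `requests`, `beautifulsoup4`, and `avro` packages, plus a
`corpus_utterance.avsc` schema file in the working directory.
"""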
import argparse

import requests
from bs4 import BeautifulSoup
import avro.schema
from avro.datafile import DataFileWriter
from avro.io import DatumWriter
parser = argparse.ArgumentParser(description="download articles from HackerNews")
parser.add_argument("-output_file", type=str, help="the avro serialized file", default="hackernews_utterances.avro")
args = parser.parse_args()
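# Example invocation:
#   python avro_from_hackernews.py -output_file hackernews_utterances.avro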
# Load the Avro schema for a corpus utterance and open the output writer.
schema = avro.schema.Parse(open("corpus_utterance.avsc", encoding="utf-8").read())
writer = DataFileWriter(open(args.output_file, "wb"), DatumWriter(), schema)
# Fetch the HackerNews front page and parse it.
response = requests.get("https://news.ycombinator.com/")
soup = BeautifulSoup(response.text, 'html.parser')
# Collect links to article comment pages (anchors containing "item?id=").
# Guard against anchors with no href, which would otherwise raise a TypeError.
article_comment_links = set()
for link in soup.find_all('a'):
    href = link.get('href')
    if href and 'item?id=' in href:
        article_comment_links.add("https://news.ycombinator.com/" + href)
print('found {0} articles from the HackerNews homepage'.format(len(article_comment_links)))
comment_count = 0
for article_link in article_comment_links:
    print('examining ' + article_link)
    response = requests.get(article_link)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Each comment lives in a div with class "comment"; flatten newlines and
    # drop the trailing "reply" link text before serializing.
    for comment in soup.find_all("div", class_="comment"):
        clean_comment = comment.get_text().replace('\n', ' ').rsplit('reply', maxsplit=1)[0].strip()
        print(clean_comment)
        writer.append({"text": clean_comment, "timestamp": "", "source": article_link, "tags": []})
        print('-----')
        comment_count += 1
writer.close()
print('Done! Processed {0} comments from {1} article links'.format(comment_count, len(article_comment_links)))
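# The corpus_utterance.avsc schema itself is not included in this file. A
# minimal sketch consistent with the record fields appended above (the
# namespace and record name here are assumptions, not the repository's actual
# schema) might look like:
#
# {
#   "namespace": "corpus",
#   "type": "record",
#   "name": "utterance",
#   "fields": [
#     {"name": "text", "type": "string"},
#     {"name": "timestamp", "type": "string"},
#     {"name": "source", "type": "string"},
#     {"name": "tags", "type": {"type": "array", "items": "string"}}
#   ]
# }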