import json
import configparser

import requests
import google.generativeai as genai
from newspaper import Article  # required by scrape_article below

# Load API keys and configure the Gemini client before creating the model.
config = configparser.ConfigParser()
config.read('config.ini')
genai.configure(api_key=config['gemini']['api_key'])

# Initialize the generative model
model = genai.GenerativeModel("gemini-1.5-pro-latest")

output_file_path = "files/benchmarking_data/real_articles_temp.json"
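
# Expected config.ini layout (a sketch inferred from the config reads in this
# module; the section and key names match the lookups, the values are
# placeholders):
#
#   [gemini]
#   api_key = YOUR_GEMINI_API_KEY
#
#   [nytimes]
#   api_key = YOUR_NYTIMES_API_KEY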

def scrape_article(url):
    """Download a page with newspaper3k and return its article text, or None on failure."""
    try:
        # Initialize the article object
        article = Article(url)
        # Download and parse the article
        article.download()
        article.parse()
        return article.text
    except Exception as e:
        print(f"An error occurred while scraping {url}: {e}")
        return None

def fetch_news(search_term, max_articles=10):
    """
    Fetch news articles for a given search term from the NYT Article Search API.

    Args:
        search_term (str): The search term for querying the API.
        max_articles (int): Maximum number of articles to fetch.

    Returns:
        list: A list of dictionaries, each containing details of a news article.
    """
    search_term = search_term.replace(" ", "+")
    # Restrict results to sources whose pages scrape reliably.
    bugfree_sources = "\"The New York Times\", \"International New York Times\", \"International Herald Tribune\""
    bugfree_sources = bugfree_sources.replace(" ", "+")
    articles = []
    response = requests.get(
        f"https://api.nytimes.com/svc/search/v2/articlesearch.json"
        f"?q={search_term}&fq=source:({bugfree_sources})&sort=relevance"
        f"&api-key={config['nytimes']['api_key']}")
    data = response.json()['response']['docs']
    if data:
        print(f"Found {len(data)} articles.")
        for news_item in data[:max_articles]:  # Limit the number of articles
            print(f"Processing article: {news_item['snippet']}")
            full_text = scrape_article(news_item['web_url'])
            if full_text is None:
                # Skip articles whose body could not be scraped.
                continue
            summary = preprocess_news(full_text)
            article = {
                "text": summary,
                "source": news_item['source'],
                "date": news_item['pub_date'],
                "benchmarking": {
                    "model update triples": {
                        "unchanged": [],
                        "added": [],
                        "deleted": []
                    },
                    "correct update": None,
                    "wikidata structure": None
                }
            }
            articles.append(article)
    else:
        print(f"No news found for the term: {search_term}")
    return articles

def preprocess_news(full_text):
    """Uses the LLM to condense a news article into a single sentence."""
    prompt = f"""
    You are a summarization assistant. Summarize the following article in a single
    sentence, keeping the main event and relevant company details.

    Article:
    {full_text}
    """
    result = model.generate_content(prompt, generation_config={"temperature": 0.2})
    return result.text.strip()

def generate_real_articles(companies):
    """
    Build a JSON structure for real articles.

    Args:
        companies (list): List of company names for which to fetch articles.

    Returns:
        dict: A dictionary structured as real articles JSON.
    """
    real_articles = {}
    for company in companies:
        print(f"\nFetching articles for company: {company}")
        articles = fetch_news(company)  # Fetch news articles for the company
        company_data = {company: {}}
        for idx, article in enumerate(articles, 1):
            article_key = f"article_{idx}"
            print(f"Adding article {idx} to {company}'s data.")
            company_data[company][article_key] = article  # Directly assign the article data
        real_articles.update(company_data)
        print(f"Completed processing for company: {company}")
    print("All companies processed. Returning real articles JSON.")
    return real_articles

def save_to_json(data, filename=output_file_path):
    """Saves the generated data to a JSON file."""
    print(f"Saving real articles to {filename}")
    with open(filename, "w") as file:
        json.dump(data, file, indent=4)
    print("Data successfully saved.")