aziz.py
# The goal of this script is to create a JSON file for each link in the input.
# The JSON file will contain the following information:
#   1 - The link
#   2 - The name of the website
#   3 - Each line (sentence) from the article
#   4 - The date of the article
#   5 - The author of the article
#   6 - The code for the line:
#        0: The line does not match any code
#        1: Economy/Money/Finance/Trade/GDP
#        2: Trust/Stability/Security/Peace
#        3: Violence/Conflict/Protest
#        4: Unity/Cooperation/Alliance
#        5: Change/Reform/Revolution
#        6: Democracy/Politics/Parliament
#        7: Human Rights/Law/Justice
#        8: Religion/Religious
#        9: Social Issues/Social Justice
#       10: Health/Environment
#       11: Education/Science/Technology
#      plus Explicit or Implicit, and Positive or Negative.
#      Example: 3, Explicit, Negative
# These codes are produced with the OpenAI GPT API.
# The script first has to download the article from the link passed on the command line.
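# Example invocation (assumes the `extrablatt` command-line tool is installed and an
# OpenAI API key is configured; the article URL is the script's only argument):
#   python aziz.py "https://example.com/some-article"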
import requests
from bs4 import BeautifulSoup
import json
import os
import shutil
import openai
import csv
import time
import sys
def get_article_info(url):
    # This function downloads the article at the given link and returns a
    # dictionary with its metadata and text.
    # It does this by running the extrablatt program from the command line,
    # which writes a JSON file; that JSON file is then read back in.
    info = {}
    info["link"] = url
    hurl = '"' + url + '"'  # Quote the URL so the shell does not mangle it
    nj = "article.json"  # The name of the JSON file extrablatt writes to
    # Use the extrablatt program from the command line to fetch the article
    try:
        print(os.system("extrablatt article {} -o {}".format(hurl, nj)))
    except Exception:
        print("Error running extrablatt")
    # Read the JSON file
    with open(nj, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)
    # Pull the individual fields out of the JSON, falling back to "" when missing
    try:
        info["website"] = data[0]["url"].split("/")[2]
    except Exception:
        info["website"] = ""
    try:
        info["title"] = data[0]["content"]["title"]
    except Exception:
        info["title"] = ""
    try:
        info["date"] = data[0]["content"]["publishing_date"]["published"]["DateTime"]
    except Exception:
        info["date"] = ""
    try:
        info["author"] = data[0]["content"]["authors"][0]
    except Exception:
        info["author"] = ""
    try:
        info["text"] = data[0]["content"]["text"]
    except Exception:
        info["text"] = ""
    try:
        keywords = data[0]["content"]["keywords"]
        info["keywords"] = ", ".join(keywords)
    except Exception:
        info["keywords"] = ""
    # Keep a copy of the JSON file named after the article title
    shutil.copyfile(nj, "{}.json".format(info["title"]))
    return info
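# Assumed shape of the extrablatt output that get_article_info indexes into
# (illustrative, inferred only from the lookups above, not from extrablatt's docs):
#   [{"url": "...", "content": {"title": "...", "text": "...",
#     "authors": ["..."], "keywords": ["..."],
#     "publishing_date": {"published": {"DateTime": "..."}}}]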
def get_sentences(info):
    # Get the sentences from the text, split on periods
    sentences = info["text"].split(".")
    return sentences
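# Note on get_sentences: str.split(".") is a naive sentence splitter; for example,
# "GDP fell. Protests followed." -> ["GDP fell", " Protests followed", ""],
# so abbreviations and decimal numbers are also split and a trailing empty
# entry can appear.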
def get_paragraphs(info):
    # Get the paragraphs from the text, split on newlines
    paragraphs = info["text"].split("\n")
    return paragraphs
def get_codes(sentences):
    # Use the OpenAI GPT API to get the codes for the sentences.
    # Each code is stored in a dictionary keyed by its sentence,
    # and the dictionary is returned.
    # The API key is read from the environment rather than hard-coded.
    openai.api_key = os.environ.get("OPENAI_API_KEY")
    codes = {}
    for sentence in sentences:
        # Print the sentence being coded
        print("Sentence: {}".format(sentence))
        # Build the prompt; the {} placeholder is replaced with the sentence
        prompt = (
            "(The Code for the line:\n"
            "Example: Result: 3, Explicit, Negative, KeyWords: Economic Impact, Violence\n"
            "Example: Result: 1, Implicit, Positive, Keywords: Shut Down, Critical Access\n"
            " 1:Economy/Money/Finance/Trade/GDP\n"
            " 2:Trust/Stability/Security/Peace\n"
            " 3:Violence/Conflict/Protest\n"
            " 4:Unity/Cooperation/Alliance\n"
            " 5:Change/Reform/Revolution\n"
            " Explicit or Implicit\n"
            "Positive or Negative)\n\n"
            "{}\n"
            "The keywords should also be extracted\n"
            "Result:"
        )
        prompt = prompt.format(sentence)
        try:
            response = openai.Completion.create(
                model="text-davinci-003",
                prompt=prompt,
                temperature=0.1,
                max_tokens=256,
                top_p=0.5,
                frequency_penalty=0,
                presence_penalty=0
            )
            codes.update({sentence: response["choices"][0]["text"]})
            print(response["choices"][0]["text"])
            # Pause between requests to avoid hitting the API rate limit
            time.sleep(2)
        except Exception:
            codes.update({sentence: "error"})
    return codes
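# A typical completion, following the prompt's examples, looks like
# "3, Explicit, Negative, Keywords: Economic Impact, Violence"; create_csv
# below splits on commas to recover the numeric code, the explicit/implicit
# flag, the positive/negative intent, and the keywords. (Illustrative format
# only -- real model output may deviate from it.)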
def create_csv(info, sentences, codes):
    # Create a CSV file with the information.
    # The CSV file is named after the article title and stored in the same
    # directory as this script. It has the following columns:
    # Website, Title, Date, Author, Sentence, Code, Implicit/Explicit, Intent, Keywords
    filename = info["title"] + ".csv"
    with open(filename, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile, delimiter=",")
        writer.writerow(["Website", "Title", "Date", "Author", "Sentence", "Code",
                         "Implicit/Explicit", "Intent", "Keywords"])
        for sentence in sentences:
            # Get the raw code string returned by the model for this sentence
            code = codes[sentence]
            # Split the code into its parts: numeric code, implicit/explicit,
            # positive/negative, keywords. Fall back to "" if a part is missing.
            parts = [p.strip() for p in code.split(",")]
            numeric = parts[0] if len(parts) > 0 else ""
            ie = parts[1] if len(parts) > 1 else ""
            pn = parts[2] if len(parts) > 2 else ""
            kw = ", ".join(parts[3:]) if len(parts) > 3 else ""
            # Fix any encoding issues in the sentence text
            clean_sentence = sentence.encode("utf-8", "ignore").decode()
            writer.writerow([info["website"], info["title"], info["date"],
                             info["author"], clean_sentence, numeric, ie, pn, kw])
def main():
    # The article link is the first argument passed to the program
    link = sys.argv[1]
    result = get_article_info(link)
    sentences = get_sentences(result)
    paragraphs = get_paragraphs(result)  # currently unused
    # Code every sentence and write the results to a CSV file
    output = get_codes(sentences)
    print(output)
    create_csv(result, sentences, output)
    # print(result)


if __name__ == "__main__":
    main()