_FilmMain.py
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # silence TensorFlow info/warning logs (must be set before Keras imports)

import numpy as np
import pandas as pd
import string
import pickle

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, GRU, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
def predict_sentiment(model, tokenizer, max_tokens, text):
    # Apply the same lowercasing, punctuation stripping and digit removal
    # used on the training data, so the tokenizer sees consistent input.
    text = remove_numeric(remove_punctuation(text.lower()))
    text_tokens = tokenizer.texts_to_sequences([text])
    text_pad = pad_sequences(text_tokens, maxlen=max_tokens)
    prediction = model.predict(text_pad)[0][0]
    if prediction >= 0.5:
        sentiment = 'Olumlu 😀'   # positive
        probability = prediction * 100
    else:
        sentiment = 'Olumsuz 😞'  # negative
        probability = (1 - prediction) * 100
    return sentiment, probability
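
# Example (sketch, assuming a trained `model` and fitted `tokenizer` are in
# scope, as they are inside main() below):
#   sentiment, prob = predict_sentiment(model, tokenizer, 100, "film harikaydı")
#   # -> e.g. ('Olumlu 😀', 93.4)   (values illustrative)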
def remove_punctuation(text):
    no_punc = [char for char in text if char not in string.punctuation]
    return "".join(no_punc)

def remove_numeric(corpus):
    return "".join(char for char in corpus if not char.isdigit())
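
# Quick illustration of the two helpers above (outputs shown for clarity):
#   remove_punctuation("harika, bir film!")  -> "harika bir film"
#   remove_numeric("10 üzerinden 9")         -> " üzerinden "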
def main(train_model=True):
    if train_model:
        df = pd.read_csv("C:/Users/dogab/OneDrive/Desktop/Proje_DuyguAnalizi/turkish_movie_sentiment_dataset.csv")

        # Each raw comment is wrapped in fixed-width padding; slice it off.
        comments = lambda x: x[23:-24]
        df["comment"] = df["comment"].apply(comments)

        # Scores are stored as strings such as "5,0"; drop the last two
        # characters and cast to float.
        floatize = lambda x: float(x[0:-2])
        df["point"] = df["point"].apply(floatize)

        # Drop neutral (3-point) reviews, then binarize the labels:
        # 1-2 -> 0 (negative), 4-5 -> 1 (positive).
        df.drop(df[df["point"] == 3].index, inplace=True)
        df["point"] = df["point"].replace(1, 0)
        df["point"] = df["point"].replace(2, 0)
        df["point"] = df["point"].replace(4, 1)
        df["point"] = df["point"].replace(5, 1)
        df.reset_index(drop=True, inplace=True)

        # Normalize text: lowercase, strip punctuation, line breaks and digits.
        df["comment"] = df["comment"].apply(lambda x: x.lower())
        df["comment"] = df["comment"].apply(remove_punctuation)
        df["comment"] = df["comment"].apply(lambda x: x.replace("\r", " "))
        df["comment"] = df["comment"].apply(lambda x: x.replace("\n", " "))
        df["comment"] = df["comment"].apply(remove_numeric)

        target = df["point"].values.tolist()
        data = df["comment"].values.tolist()

        # 80/20 train/test split, taken in file order (no shuffling here).
        cutoff = int(len(data) * 0.80)
        X_train, X_test = data[:cutoff], data[cutoff:]
        y_train, y_test = target[:cutoff], target[cutoff:]
        # Keep only the 10,000 most frequent words; everything else is dropped.
        num_words = 10000
        tokenizer = Tokenizer(num_words=num_words)
        tokenizer.fit_on_texts(data)  # note: fit on all data, train and test alike

        X_train_tokens = tokenizer.texts_to_sequences(X_train)
        X_test_tokens = tokenizer.texts_to_sequences(X_test)

        # Pad/truncate to mean + 2*std of the sequence lengths, which keeps
        # the vast majority of reviews intact while bounding sequence length.
        num_tokens = np.array([len(tokens) for tokens in X_train_tokens + X_test_tokens])
        max_tokens = int(np.mean(num_tokens) + 2 * np.std(num_tokens))
        X_train_pad = pad_sequences(X_train_tokens, maxlen=max_tokens)
        X_test_pad = pad_sequences(X_test_tokens, maxlen=max_tokens)

        # Map token ids back to words (useful for inspecting sequences).
        idx = tokenizer.word_index
        inverse_map = dict(zip(idx.values(), idx.keys()))
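
        # Example (sketch): reconstruct the text of the first training review
        # from its padded token ids, skipping the 0 padding value:
        #   " ".join(inverse_map[t] for t in X_train_pad[0] if t != 0)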
        # Embedding -> three stacked GRU layers -> sigmoid output for
        # binary sentiment classification.
        embedding_size = 50
        model = Sequential()
        model.add(Embedding(input_dim=num_words, output_dim=embedding_size,
                            input_length=max_tokens, name="embedding_layer"))
        model.add(GRU(units=16, return_sequences=True))
        model.add(GRU(units=8, return_sequences=True))
        model.add(GRU(units=4))
        model.add(Dense(1, activation="sigmoid"))

        optimizer = Adam(learning_rate=1e-3)
        model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

        X_train_pad = np.array(X_train_pad)
        y_train = np.array(y_train)
        model.fit(X_train_pad, y_train, epochs=5, batch_size=256)

        # Persist the model and tokenizer so inference runs don't retrain.
        model.save("model.h5")
        with open("tokenizer.pickle", "wb") as handle:
            pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
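
        # Sketch (not part of the original flow): the held-out split built
        # above could be used to gauge generalization before saving:
        #   model.evaluate(np.array(X_test_pad), np.array(y_test))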
    else:
        # Load the previously trained model and tokenizer from disk.
        model = load_model("model.h5")
        with open("tokenizer.pickle", "rb") as handle:
            tokenizer = pickle.load(handle)
        # max_tokens is not persisted; 100 is assumed to match the value
        # computed during training.
        max_tokens = 100

    print("--------------------------------------------------------")
    user_input = input("Lütfen bir metin girin: ")  # "Please enter a text: "
    predicted_sentiment, probability = predict_sentiment(model, tokenizer, max_tokens, user_input)
    # "Sentiment of the entered text: ... -> Probability: ..."
    print(f"Girilen metnin duygusu: {predicted_sentiment} -> Olasılık: % {probability:.2f}\n")
if __name__ == "__main__":
    main(train_model=False)  # False: predict with the saved model; set to True to retrain
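
# Usage (sketch): run once with train_model=True to produce model.h5 and
# tokenizer.pickle, then switch to train_model=False for interactive use:
#   $ python _FilmMain.py
#   Lütfen bir metin girin: bu film gerçekten çok güzeldi
#   Girilen metnin duygusu: Olumlu 😀 -> Olasılık: % 97.12   (values illustrative)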