# -*- coding: utf-8 -*-
"""Introvert or Extrovert.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1zDMXThpUMCjkPj36nSb633HnSp3_Gq1F
"""
import numpy as np
import pandas as pd

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
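# Assumed input: the Kaggle "(MBTI) Myers-Briggs Personality Type" dataset
# (mbti_1.csv), one row per user, with a 'type' column (four-letter MBTI type)
# and a 'posts' column (that user's forum posts).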
data = pd.read_csv(r"/content/mbti_1.csv")
data
data.info()
"""# Data Preprocessing"""
# Show the unique MBTI types in the dataset
data['type'].unique()
def preprocess_inputs(df):
    texts = df['posts'].copy()
    labels = df['type'].copy()

    # Process text data
    # Stop words are common English words that add little meaning to a sentence
    stop_words = stopwords.words('english')

    texts = [text.lower() for text in texts]
    texts = [text.split() for text in texts]
    texts = [[word.strip() for word in text] for text in texts]
    texts = [[word for word in text if word not in stop_words] for text in texts]

    vocab_length = 1000

    tokenizer = Tokenizer(num_words=vocab_length)
    tokenizer.fit_on_texts(texts)
    texts = tokenizer.texts_to_sequences(texts)

    # Creating a reverse dictionary (index -> word)
    reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

    # Takes a tokenized sequence and returns the corresponding words
    def sequence_to_text(list_of_indices):
        # Looking up each index in the reverse dictionary
        words = [reverse_word_map.get(index) for index in list_of_indices]
        return words

    # Recover the (stop-word-filtered) text behind each sequence
    my_texts = list(map(sequence_to_text, texts))

    max_seq_length = np.max([len(text) for text in texts])
    texts = pad_sequences(texts, maxlen=max_seq_length, padding='post')

    # Process label data: 1 if the type starts with 'E' (extrovert), else 0.
    # np.int was removed in recent NumPy versions, so plain int is used here.
    label_values = [
        'INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
        'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ'
    ]
    label_mapping = {label: int(label[0] == 'E') for label in label_values}
    labels = labels.replace(label_mapping)
    labels = np.array(labels)

    return texts, labels, max_seq_length, vocab_length, label_mapping, my_texts
texts, labels, max_seq_length, vocab_length, label_mapping, my_texts = preprocess_inputs(data)
print("Text sequences:\n", texts.shape)
print("\nLabels:\n", labels.shape)
print("\nMax sequence length:\n", max_seq_length)
print("\nVocab length:\n", vocab_length)
print("\nLabel mapping:\n", label_mapping)
texts_train, texts_test, labels_train, labels_test = train_test_split(texts, labels, train_size=0.7, random_state=123)
texts
"""# Training"""
embedding_dim = 512

inputs = tf.keras.Input(shape=(max_seq_length,))
embedding = tf.keras.layers.Embedding(
    input_dim=vocab_length,
    output_dim=embedding_dim,
    input_length=max_seq_length
)(inputs)
gru = tf.keras.layers.Bidirectional(
    tf.keras.layers.GRU(
        units=256,
        return_sequences=True
    )
)(embedding)
flatten = tf.keras.layers.Flatten()(gru)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(flatten)

model = tf.keras.Model(inputs, outputs)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.AUC(name='auc')
    ]
)
history = model.fit(
    texts_train,
    labels_train,
    validation_split=0.2,
    batch_size=32,
    epochs=5,
    callbacks=[
        tf.keras.callbacks.ModelCheckpoint(
            './model.h5',
            save_best_only=True,
            save_weights_only=True
        )
    ]
)
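# Reload the best checkpoint before evaluating; with save_best_only=True the
# callback keeps the weights from the epoch with the best validation loss
# (Keras's default monitored quantity).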
model.load_weights('./model.h5')
model.evaluate(texts_test, labels_test)

predicted_arr = model.predict(texts[:10])
for i in range(10):
    print(predicted_arr[i] * 100, "==>", my_texts[i])
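# A minimal follow-on sketch (not in the original notebook): threshold the
# sigmoid probabilities at 0.5 to get hard class labels, using the same
# convention as label_mapping (1 = extrovert, 0 = introvert).
predicted_labels = (predicted_arr > 0.5).astype(int).flatten()
for prob, label in zip(predicted_arr.flatten(), predicted_labels):
    print(f"{prob:.3f} -> {'Extrovert' if label == 1 else 'Introvert'}")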