# -*- coding: utf-8 -*-
"""Thesis

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1bcNzikBh8ohUOif-UQgysI-zljVx6XEq

# Thesis
"""

# Install Sastrawi in the Colab runtime before running this script:
# !pip install Sastrawi

"""##Import Library"""

import pandas as pd 
import nltk # Library nltk
import string # Library string
import re # Library regex
import sklearn
import numpy as np

import nltk
# Impor word_tokenize dari NLTK
from nltk.tokenize import word_tokenize 
nltk.download('punkt')

# Import Sastrawi's StopWordRemoverFactory and StemmerFactory classes
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models.fasttext import FastText
from gensim.models.doc2vec import TaggedDocument
import multiprocessing
from sklearn import utils

from gensim.models import KeyedVectors

# Build the Sastrawi stemmer and stopword remover
factory = StemmerFactory()
stemmer = factory.create_stemmer()

factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()

from keras.models import Model, Sequential
from keras.layers import Input, Dense, Dropout, Flatten, LSTM, concatenate, Activation
from keras.layers import Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers.embeddings import Embedding

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# from keras import optimizers

"""## Import File"""

from google.colab import drive
drive.mount('/content/drive')

df=pd.read_csv('/content/drive/MyDrive/Thesis/data_1.csv', sep=";")
df

"""##Preprocessing"""

df_copy = df.copy()

# Encode the Label column as integer codes (the evaluation below assumes 0 = Macet, 1 = Lancar)
df_copy.Label = pd.factorize(df_copy.Label)[0]

"""###Hapus Kolom

"""

# Remove duplicate tweets (same Text and Label), then drop metadata columns not used for classification
df_copy = df_copy.drop_duplicates(subset=['Text', 'Label'], keep='first')

df_copy.drop(['Created-At','From-User','From-User-Id','To-User','To-User-Id','Language','Source','Geo-Location-Latitude','Geo-Location-Longitude','Retweet-Count'], axis = 1, inplace = True)

df_copy

df_copy.dtypes

"""###Cleansing"""

def cleansing(text):
    # Remove URLs
    pattern = r'https?://\S+'
    text = re.sub(pattern, '', text)
    # Keep only alphanumeric characters and whitespace
    pattern = r'[^a-zA-Z0-9\s]'
    text = re.sub(pattern, '', text)
    return text

df_copy['Text']=df_copy['Text'].apply(cleansing)

df_copy

# def cleaner(tweet):
#     tweet = tweet.lower()
#     tweet = re.sub("@[A-Za-z0-9]+","",tweet) #Remove @ sign
#     tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet) #Remove http links
#     tweet = " ".join(tweet.split())
#     # tweet = ''.join(c for c in tweet if c not in emoji.UNICODE_EMOJI) #Remove Emojis
#     # tweet = tweet.replace("#", "").replace("_", " ") #Remove hashtag sign but keep the text
#     # tweet = " ".join(w for w in nltk.wordpunct_tokenize(tweet) \
#     #      if w.lower() in words or not w.isalpha())

#     # Remove digits
#     tweet = re.sub(r"\d+", "", str(tweet))
#     # Remove punctuation
#     tweet = str(tweet).translate(str.maketrans("","",string.punctuation))
#     # Collapse runs of whitespace into a single space
#     tweet = re.sub('\s+',' ',str(tweet))
#     # Strip leading/trailing whitespace
#     tweet = str(tweet).strip()
    
#     return tweet

# df_copy['text_filter'] = df_copy['Text'].map(lambda x: cleaner(x))

# df_copy = df_copy.drop_duplicates(subset=['text_filter', 'Label'], keep='first')
# df_copy.dropna(subset = ["text_filter"], inplace=True)

"""###Transform Case"""

def transform_case(text):
    text=text.lower()
    return text

df_copy['Text']=df_copy['Text'].apply(transform_case)

df_copy

# df_copy['tokenization'] = df_copy.apply(lambda row: nltk.word_tokenize(row['text_filter']), axis=1)

"""###Tokenization"""

df_copy['tokenization'] = df_copy.apply(lambda row: nltk.word_tokenize(row['Text']), axis=1)

df_copy

# def stopwordStemming(tweet):
#    tweet = stopword.remove(tweet)
#    tweet = stemmer.stem(tweet)

#    return tweet

# factory = StopWordRemoverFactory()
# stopwords = factory.get_stop_words()
# df_copy['tokenization'] = df_copy['tokenization'].map(lambda x: [stopwordStemming(i) for i in x] )

df_copy

"""###Stopword and Stemming


"""

# Load the Tala Indonesian stopword list and build the Sastrawi stemmer
stopword_list = open('/content/drive/MyDrive/Thesis/tala-stopword.txt', 'r').read().split()
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stopword_stemming(tokens):
    # Strip whitespace, drop stopwords and single-character tokens, then stem what remains
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_tokens = [token for token in filtered_tokens if len(token) > 1]
    filtered_text = ' '.join(stemmer.stem(token) for token in filtered_tokens)
    return filtered_text

df_copy['res_preprocessing'] = df_copy['tokenization'].apply(stopword_stemming)

df_copy

pd.set_option('display.max_colwidth', None)
df_copy.iloc[0]
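
# Quick sanity check of the full preprocessing chain on a single sample tweet
# (an illustrative example added here, not part of the original pipeline)
sample = cleansing('tmcpoldametro ini malah bikin macet jalan sekitar situ')
sample = transform_case(sample)
sample_tokens = nltk.word_tokenize(sample)
print(stopword_stemming(sample_tokens))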

"""```###Stemmer


"""

# factory = StemmerFactory()
# stemmer = factory.create_stemmer()

# def stemmer(tokens):  
#     tokens = [token.strip() for token in tokens]
#     filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
#     filtered_text = ' '.join(filtered_tokens)    
#     return filtered_text

# df_copy['stopword'] = df_copy['tokenization'].apply(remove_stopwords)

# #Stemming the text
# def simple_stemmer(text):
#     ps=nltk.porter.PorterStemmer()
#     text= ' '.join([ps.stem(word) for word in text.split()])
#     return text
# #Apply function on review column
# data_news['text']=data_news['text'].apply(simple_stemmer)
# data_news['title']=data_news['title'].apply(simple_stemmer)

"""###Export ke CSV"""

df_copy.to_csv("train-processed-sample.csv", header=None, index=None)

"""##Pembentukan Vector

###Word2Vec
"""

# def stitchToken(token):
#     tweet = ' '.join(token)  
    
#     return tweet

# df_copy['pre_tweet'] =  df_copy['tokenization'].map(lambda x: stitchToken(x) )
# df_copy.head(10)

# for index, row in df_copy.iterrows():
#   if 'jatiasih' in row['pre_tweet'] :
#     print(index)
#     print(row['pre_tweet'])

# df_copy.iloc[27]

# df_copy.iloc[[128]]

x = df_copy['res_preprocessing']
y = df_copy['Label']

from sklearn.model_selection import train_test_split
SEED = 2000
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(x, y, test_size=.4, random_state=SEED) # random_state=SEED
x_validation, x_test, y_validation, y_test = train_test_split(x_validation_and_test, y_validation_and_test, test_size=.5, random_state=SEED) # random_state=SEED
# x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(x, y, test_size=.4)
# x_validation, x_test, y_validation, y_test = train_test_split(x_validation_and_test, y_validation_and_test, test_size=.5)

print("Train set has total {0} entries with {1:.2f}% Macet, {2:.2f}% Lancar".format(len(x_train),
                                                                             (len(x_train[y_train == 0]) / (len(x_train)*1.))*100,
                                                                            (len(x_train[y_train == 1]) / (len(x_train)*1.))*100))
print("Validation set has total {0} entries with {1:.2f}% Macet, {2:.2f}% Lancar".format(len(x_validation),
                                                                             (len(x_validation[y_validation == 0]) / (len(x_validation)*1.))*100,
                                                                            (len(x_validation[y_validation == 1]) / (len(x_validation)*1.))*100))
print("Test set has total {0} entries with {1:.2f}% Macet, {2:.2f}% Lancar".format(len(x_test),
                                                                             (len(x_test[y_test == 0]) / (len(x_test)*1.))*100,
                                                                            (len(x_test[y_test == 1]) / (len(x_test)*1.))*100))

def labelize_tweets_ug(tweets,label):
    result = []
    prefix = label
    for i, t in zip(tweets.index, tweets):
        result.append(TaggedDocument(t.split(), [prefix + '_%s' % i]))
    return result

all_x = pd.concat([x_train,x_validation,x_test])
all_x_w2v = labelize_tweets_ug(all_x, 'all')

all_x

all_x_w2v

cores = multiprocessing.cpu_count()
model_ug_cbow = Word2Vec(sg=0, size=100, negative=5, window=5, min_count=1, workers=cores, alpha=0.065, min_alpha=0.065)
model_ug_cbow.build_vocab([x.words for x in tqdm(all_x_w2v)])

# Commented out IPython magic to ensure Python compatibility.
# %%time
# for epoch in range(50):
#     model_ug_cbow.train(utils.shuffle([x.words for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
#     model_ug_cbow.alpha -= 0.002
#     model_ug_cbow.min_alpha = model_ug_cbow.alpha

model_ug_sg = Word2Vec(sg=1, size=100, negative=5, window=5, min_count=1, workers=cores, alpha=0.065, min_alpha=0.065)
model_ug_sg.build_vocab([x.words for x in tqdm(all_x_w2v)])

# Commented out IPython magic to ensure Python compatibility.
# %%time
# for epoch in range(50):
#     model_ug_sg.train(utils.shuffle([x.words for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
#     model_ug_sg.alpha -= 0.002
#     model_ug_sg.min_alpha = model_ug_sg.alpha
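
# The two %%time cells above are exported as comments; below is a runnable equivalent
# of the same incremental training (50 passes with a manually decayed learning rate),
# kept as a sketch since the exact schedule is taken from those commented cells.
for epoch in range(50):
    model_ug_cbow.train(utils.shuffle([x.words for x in all_x_w2v]),
                        total_examples=len(all_x_w2v), epochs=1)
    model_ug_cbow.alpha -= 0.002
    model_ug_cbow.min_alpha = model_ug_cbow.alpha

for epoch in range(50):
    model_ug_sg.train(utils.shuffle([x.words for x in all_x_w2v]),
                      total_examples=len(all_x_w2v), epochs=1)
    model_ug_sg.alpha -= 0.002
    model_ug_sg.min_alpha = model_ug_sg.alpha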

model_ug_cbow.save('w2v_model_ug_cbow.word2vec')
model_ug_sg.save('w2v_model_ug_sg.word2vec')

# Reload the trained models from disk
model_ug_cbow = Word2Vec.load('w2v_model_ug_cbow.word2vec')
model_ug_sg = Word2Vec.load('w2v_model_ug_sg.word2vec')

model_ug_sg.wv['lancar']

# Concatenate the 100-d CBOW and 100-d skip-gram vectors into a single 200-d embedding per word
embeddings_index = {}
for w in model_ug_cbow.wv.vocab.keys():
    embeddings_index[w] = np.append(model_ug_cbow.wv[w], model_ug_sg.wv[w])
print('Found %s word vectors.' % len(embeddings_index))

embeddings_index['lancar']

tokenizer = Tokenizer(num_words=100000)
tokenizer.fit_on_texts(x_train)
sequences = tokenizer.texts_to_sequences(x_train)

len(tokenizer.word_index)

# x_train[:10]

# sequences[:10]

# Length (in tokens) of each preprocessed training tweet; the maximum informs maxlen below
length = []
for x in x_train:
    length.append(len(x.split()))

max(length)

x_train_seq = pad_sequences(sequences, maxlen=43)
print('Shape of data tensor:', x_train_seq.shape)

# print(x_train_seq[128:129])

# x_validation

sequences_val = tokenizer.texts_to_sequences(x_validation)
x_val_seq = pad_sequences(sequences_val, maxlen=43)

# Build the embedding matrix for the Keras Embedding layer:
# row i holds the 200-d concatenated vector of the word with tokenizer index i
num_words = 100000
embedding_matrix = np.zeros((num_words, 200))
for word, i in tokenizer.word_index.items():
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

embedding_matrix

print(embeddings_index.get("padat"))

"""###FastText"""

model_fasttext_cbow = FastText(sg=0, size=100, negative=5, window=5, min_count=1, workers=cores, alpha=0.065, min_alpha=0.065, word_ngrams=3)
model_fasttext_cbow.build_vocab([x.words for x in tqdm(all_x_w2v)])

a = utils.shuffle([x.words for x in tqdm(all_x_w2v)])

# Commented out IPython magic to ensure Python compatibility.
# %%time
# for epoch in range(50):
#     model_fasttext_cbow.build_vocab(utils.shuffle([x.words for x in tqdm(all_x_w2v)]), update=True) 
#     model_fasttext_cbow.train(utils.shuffle([x.words for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
#     model_fasttext_cbow.alpha -= 0.002
#     model_fasttext_cbow.min_alpha = model_fasttext_cbow.alpha

model_fasttext_sg = FastText(sg=1, size=100, negative=5, window=5, min_count=1, workers=cores, alpha=0.065, min_alpha=0.065, word_ngrams=3)
model_fasttext_sg.build_vocab([x.words for x in tqdm(all_x_w2v)])

# Commented out IPython magic to ensure Python compatibility.
# %%time
# for epoch in range(50):
#     model_fasttext_sg.build_vocab(utils.shuffle([x.words for x in tqdm(all_x_w2v)]), update=True) 
#     model_fasttext_sg.train(utils.shuffle([x.words for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
#     model_fasttext_sg.alpha -= 0.002
#     model_fasttext_sg.min_alpha = model_fasttext_sg.alpha
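
# As above, a runnable equivalent of the commented-out %%time cells for FastText
# (a sketch; the per-epoch build_vocab(update=True) from those cells is omitted here,
# since the corpus, and hence the vocabulary, does not change between passes).
for epoch in range(50):
    model_fasttext_cbow.train(utils.shuffle([x.words for x in all_x_w2v]),
                              total_examples=len(all_x_w2v), epochs=1)
    model_fasttext_cbow.alpha -= 0.002
    model_fasttext_cbow.min_alpha = model_fasttext_cbow.alpha

for epoch in range(50):
    model_fasttext_sg.train(utils.shuffle([x.words for x in all_x_w2v]),
                            total_examples=len(all_x_w2v), epochs=1)
    model_fasttext_sg.alpha -= 0.002
    model_fasttext_sg.min_alpha = model_fasttext_sg.alpha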

model_fasttext_cbow.save('w2v_model_fasttext_cbow.fasttext')
model_fasttext_sg.save('w2v_model_fasttext_sg.fasttext')

model_fasttext_sg.wv['lancar']

# Concatenate the 100-d CBOW and 100-d skip-gram FastText vectors into a 200-d embedding per word
embeddings_ft_index = {}
for w in model_fasttext_cbow.wv.vocab.keys():
    embeddings_ft_index[w] = np.append(model_fasttext_cbow.wv[w], model_fasttext_sg.wv[w])
print('Found %s word vectors.' % len(embeddings_ft_index))

# FastText embedding matrix, same layout as the Word2Vec matrix above
num_words = 100000
embedding_ft_matrix = np.zeros((num_words, 200))
for word, i in tokenizer.word_index.items():
    if i >= num_words:
        continue
    embedding_ft_vector = embeddings_ft_index.get(word)
    if embedding_ft_vector is not None:
        embedding_ft_matrix[i] = embedding_ft_vector

embeddings_ft_index.get("lancar")

"""##Klasifikasi"""

from tensorflow.keras.optimizers import SGD

# learning_rate = 0.3
# neurons = 64
# dropout_rate = 0.3

"""###CNN + Word2Vec"""

# Multi-branch CNN: parallel Conv1D branches with kernel sizes 2, 3 and 4
# (bi-, tri- and four-grams), each globally average-pooled, concatenated and
# fed into a dense sigmoid classifier.
tweet_input = Input(shape=(43,), dtype='int32')

tweet_encoder = Embedding(100000, 200, weights=[embedding_matrix], input_length=43, trainable=True)(tweet_input)
bigram_branch = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(tweet_encoder)
bigram_branch = GlobalAveragePooling1D()(bigram_branch)
trigram_branch = Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=1)(tweet_encoder)
trigram_branch = GlobalAveragePooling1D()(trigram_branch)
fourgram_branch = Conv1D(filters=100, kernel_size=4, padding='valid', activation='relu', strides=1)(tweet_encoder)
fourgram_branch = GlobalAveragePooling1D()(fourgram_branch)
merged = concatenate([bigram_branch, trigram_branch, fourgram_branch], axis=1)

# merged = Dense(256, activation='relu')(merged)
merged = Dense(128, activation='relu')(merged)
# merged = Dropout(0.2)(merged)
merged = Dropout(0.1)(merged)
merged = Dense(1)(merged)
output = Activation('sigmoid')(merged)
model = Model(inputs=[tweet_input], outputs=[output])
opt = SGD(lr=0.3)
model.compile(loss='binary_crossentropy',
                  optimizer = opt,
                  metrics=['accuracy'])
model.summary()

import keras.backend as K
from keras.callbacks import ReduceLROnPlateau, Callback
class MyCallback(Callback):
    def on_epoch_end(self, epoch, logs=None):
        lr = self.model.optimizer.lr
        decay = self.model.optimizer.decay
        iterations = self.model.optimizer.iterations
        lr_with_decay = lr / (1. + decay * K.cast(iterations, K.dtype(decay)))
        print("Learning Rate = ", K.eval(lr_with_decay))

from keras.callbacks import ModelCheckpoint


filepath="CNN_best_weights_16.{epoch:02d}-{val_accuracy:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')


rlrop = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=10)
print_rl = MyCallback() 

model.fit(x_train_seq, y_train, batch_size=32, epochs=50, validation_data=(x_val_seq, y_validation), callbacks = [checkpoint, rlrop, print_rl])

# text_coba = cleansing('tmcpoldametro ini malah bikin macet jalan sekitar situ')
# text_coba = transform_case(text_coba)
# tokenizer_coba = nltk.word_tokenize(text_coba)
# text_coba = stopword_stemming(tokenizer_coba)

# text_coba

# testing4 = np.array(['tmcpoldametro bikin macet jalan situ'])
# testing2 = tokenizer.texts_to_sequences(testing4)
# testing3 = pad_sequences(testing2, maxlen=43)
# testing1 = model.predict(testing3)
# print(testing1)

"""###CNN + FastText"""

# Same multi-branch CNN, but initialized with the FastText embedding matrix and strides of 2
tweet_input_ft = Input(shape=(43,), dtype='int32')

tweet_encoder_ft = Embedding(100000, 200, weights=[embedding_ft_matrix], input_length=43, trainable=True)(tweet_input_ft)
bigram_branch_ft = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=2)(tweet_encoder_ft)
bigram_branch_ft = GlobalAveragePooling1D()(bigram_branch_ft)
trigram_branch_ft = Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=2)(tweet_encoder_ft)
trigram_branch_ft = GlobalAveragePooling1D()(trigram_branch_ft)
fourgram_branch_ft = Conv1D(filters=100, kernel_size=4, padding='valid', activation='relu', strides=2)(tweet_encoder_ft)
fourgram_branch_ft = GlobalAveragePooling1D()(fourgram_branch_ft)
merged_ft = concatenate([bigram_branch_ft, trigram_branch_ft, fourgram_branch_ft], axis=1)

# merged_ft = Dense(256, activation='relu')(merged_ft)
merged_ft = Dense(128, activation='relu')(merged_ft)
# merged_ft = Dropout(0.2)(merged_ft)
merged_ft = Dropout(0.3)(merged_ft)
merged_ft = Dense(1)(merged_ft)
output_ft = Activation('sigmoid')(merged_ft)
model_ft = Model(inputs=[tweet_input_ft], outputs=[output_ft])
opt = SGD(lr=0.3)
model_ft.compile(loss='binary_crossentropy',
                  optimizer = opt,
                  metrics=['accuracy'])
model_ft.summary()

filepath_ft="CNN_FT_best_weights_16.{epoch:02d}-{val_accuracy:.4f}.hdf5"
checkpoint_ft = ModelCheckpoint(filepath_ft, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

model_ft.fit(x_train_seq, y_train, batch_size=32, epochs=50, validation_data=(x_val_seq, y_validation), callbacks = [checkpoint_ft])

"""###LSTM"""

# LSTM classifier on top of a trainable, randomly initialized embedding layer
model_lstm = Sequential()
model_lstm.add(Embedding(input_dim = num_words, output_dim = 256, input_length = 43))
model_lstm.add(Dropout(0.1))
model_lstm.add(LSTM(256, dropout = 0.2, recurrent_dropout = 0.2))
# model_lstm.add(Dense(256, activation = 'relu'))
model_lstm.add(Dense(256, activation = 'relu'))
# model_lstm.add(Dropout(0.2))
model_lstm.add(Dropout(0.2))
model_lstm.add(Dense(1, activation = 'sigmoid'))
opt = SGD(lr=0.1)
model_lstm.compile(
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=['accuracy']
)

filepath_lstm="LSTM_best_weights_16.{epoch:02d}-{val_accuracy:.4f}.hdf5"
checkpoint_lstm = ModelCheckpoint(filepath_lstm, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

model_lstm.fit(x_train_seq, y_train, batch_size=32, epochs=50, validation_data=(x_val_seq, y_validation), callbacks = [checkpoint_lstm])

"""###SVM"""

from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
# Classifier: linear-kernel SVM fitted directly on the padded index sequences
# of the validation split
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(x_val_seq, y_validation)
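
# Optional sketch (an assumption, not part of the original pipeline): fit a second SVM
# on the training split so that all classifiers see the same training data, then check it
# against the validation split. `SVM_train` is a new name introduced here for illustration.
SVM_train = svm.SVC(C=1.0, kernel='linear', gamma='auto')
SVM_train.fit(x_train_seq, y_train)
print("SVM (trained on train split) validation accuracy:",
      accuracy_score(y_validation, SVM_train.predict(x_val_seq)))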

"""###Load Model

####CNN + Word2Vec
"""

from keras.models import load_model
# Checkpoint filenames depend on the training run (epoch number and val_accuracy);
# point these at the best files produced by ModelCheckpoint above.
loaded_CNN_model = load_model('CNN_best_weights_16.17-0.9979.hdf5')
loaded_CNN_model.evaluate(x=x_val_seq, y=y_validation)

"""####CNN + FastText"""

loaded_CNN_FT_model = load_model('CNN_FT_best_weights_16.20-0.9979.hdf5')
loaded_CNN_FT_model.evaluate(x=x_val_seq, y=y_validation)

"""####LSTM"""

loaded_LSTM_model = load_model('LSTM_best_weights_16.10-0.9958.hdf5')
loaded_LSTM_model.evaluate(x=x_val_seq, y=y_validation)

"""###Uji dengan Test Set

####CNN + Word2Vec
"""

sequences_test = tokenizer.texts_to_sequences(x_test)
x_test_seq = pad_sequences(sequences_test, maxlen=43)
yhat_cnn = loaded_CNN_model.predict(x_test_seq)
loaded_CNN_model.evaluate(x=x_test_seq, y=y_test)

"""#####Confusion Matrix"""

# for i, pred in enumerate(yhat_cnn) :
#   if(pred > 0.5):
#     yhat_cnn[i] = 1
#   else :
#     yhat_cnn[i] = 0
yhat_cnn = yhat_cnn > 0.5

matrix = sklearn.metrics.confusion_matrix(y_test, yhat_cnn)

matrix

"""######Accuracy Score"""

sklearn.metrics.accuracy_score(y_test, yhat_cnn)

"""######Precision"""

sklearn.metrics.precision_score(y_test, yhat_cnn)

"""######Recall"""

sklearn.metrics.recall_score(y_test, yhat_cnn)

"""######f1-score"""

sklearn.metrics.f1_score(y_test, yhat_cnn)

"""####CNN + FastText"""

# sequences_test = tokenizer.texts_to_sequences(x_test)
# x_test_seq = pad_sequences(sequences_test, maxlen=43)
yhat_cnn_ft = loaded_CNN_FT_model.predict(x_test_seq)
loaded_CNN_FT_model.evaluate(x=x_test_seq, y=y_test)

"""#####Confusion Matrix"""

yhat_cnn_ft = yhat_cnn_ft > 0.5

matrix_ft = sklearn.metrics.confusion_matrix(y_test, yhat_cnn_ft)

matrix_ft

"""######Accuracy Score"""

sklearn.metrics.accuracy_score(y_test, yhat_cnn_ft)

"""######Precision"""

sklearn.metrics.precision_score(y_test, yhat_cnn_ft)

"""######Recall"""

sklearn.metrics.recall_score(y_test, yhat_cnn_ft)

"""######f1-score"""

sklearn.metrics.f1_score(y_test, yhat_cnn_ft)

"""####LSTM"""

# sequences_test = tokenizer.texts_to_sequences(x_test)
# x_test_seq = pad_sequences(sequences_test, maxlen=43)
yhat_lstm = loaded_LSTM_model.predict(x_test_seq)
loaded_LSTM_model.evaluate(x=x_test_seq, y=y_test)

"""#####Confusion Matrix"""

yhat_lstm = yhat_lstm > 0.5

matrix_lstm = sklearn.metrics.confusion_matrix(y_test, yhat_lstm)

matrix_lstm

"""######Accuracy Score"""

sklearn.metrics.accuracy_score(y_test, yhat_lstm)

"""######Precision"""

sklearn.metrics.precision_score(y_test, yhat_lstm)

"""######Recall"""

sklearn.metrics.recall_score(y_test, yhat_lstm)

"""######f1-score"""

sklearn.metrics.f1_score(y_test, yhat_lstm)

"""####SVM"""

# Predict labels for the test set
predictions_SVM = SVM.predict(x_test_seq)
# Accuracy of the SVM on the test set
print("SVM Accuracy Score -> ", accuracy_score(y_test, predictions_SVM)*100)

matrix = sklearn.metrics.confusion_matrix(y_test, predictions_SVM)
matrix

sklearn.metrics.accuracy_score(y_test, predictions_SVM)

sklearn.metrics.precision_score(y_test, predictions_SVM)

sklearn.metrics.recall_score(y_test, predictions_SVM)

sklearn.metrics.f1_score(y_test, predictions_SVM)
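
# Optional side-by-side summary of all four classifiers on the test set
# (an added convenience, not part of the original notebook); classification_report
# combines precision, recall and F1 per class.
from sklearn.metrics import classification_report
for name, preds in [('CNN + Word2Vec', yhat_cnn), ('CNN + FastText', yhat_cnn_ft),
                    ('LSTM', yhat_lstm), ('SVM', predictions_SVM)]:
    print(name)
    print(classification_report(y_test, np.ravel(preds).astype(int), target_names=['Macet', 'Lancar']))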

# # new instance where we do not know the answer
# Xnew = ([[  0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
#            0,    0,    0,    0,    0,  824,  734,   13,  202,    1,   33,
#           77,  148,   63,   34,   99,  734,   91,  408,   23,  373,  376,
#           44,  265,    9,   13,  408,   55,  589,   34,   99,  361]])
# # make a prediction
# ynew = model.predict(Xnew)
# # show the inputs and predicted outputs
# print("X=%s, Predicted=%s" % (Xnew[0], ynew[0]))

"""##Pengujian manual"""

df_test_manual=pd.read_csv('/content/drive/MyDrive/Thesis/Testing/New/30 - Lancar.csv', sep=";")
# df_test_manual

df_test_manual = df_test_manual.dropna(subset = ['Text'])

# Apply the same preprocessing chain used for the training data
df_test_manual['Text_process']=df_test_manual['Text'].apply(cleansing)
df_test_manual['Text_process'] = df_test_manual['Text_process'].apply(transform_case)
df_test_manual['Text_tokenizer'] = df_test_manual.apply(lambda row: nltk.word_tokenize(row['Text_process']), axis=1)
df_test_manual['Text_process'] = df_test_manual['Text_tokenizer'].apply(stopword_stemming)

"""###Dengan mengunakan Word2Vec+CNN"""

manual_wvcnn_1 = np.array(df_test_manual['Text_process'])
manual_wvcnn_2 = tokenizer.texts_to_sequences(manual_wvcnn_1)
manual_wvcnn_3 = pad_sequences(manual_wvcnn_2, maxlen=43)
manual_wvcnn_4 = loaded_CNN_model.predict(manual_wvcnn_3)

for i, pred in enumerate(manual_wvcnn_4) :
  if(pred > 0.5):
    manual_wvcnn_4[i] = 1
  else :
    manual_wvcnn_4[i] = 0
list_manual_wvcnn = list(manual_wvcnn_4)
if(list_manual_wvcnn.count(0) >= list_manual_wvcnn.count(1)):
  print("macet")
else :
  print("Lancar")

"""###Dengan mengunakan FastText+CNN"""

manual_ftcnn_1 = np.array(df_test_manual['Text_process'])
manual_ftcnn_2 = tokenizer.texts_to_sequences(manual_ftcnn_1)
manual_ftcnn_3 = pad_sequences(manual_ftcnn_2, maxlen=43)
manual_ftcnn_4 = loaded_CNN_FT_model.predict(manual_ftcnn_3)

for i, pred in enumerate(manual_ftcnn_4) :
  if(pred > 0.5):
    manual_ftcnn_4[i] = 1
  else :
    manual_ftcnn_4[i] = 0
list_manual_ftcnn = list(manual_ftcnn_4)
if(list_manual_ftcnn.count(0) >= list_manual_ftcnn.count(1)):
  print("macet")
else :
  print("Lancar")

"""###Dengan Menggunakan SVM"""

manualSVM1 = np.array(df_test_manual['Text_process'])
manualSVM2 = tokenizer.texts_to_sequences(manualSVM1)
manualSVM3 = pad_sequences(manualSVM2, maxlen=43)
manualSVM4 = SVM.predict(manualSVM3)

for i, pred in enumerate(manualSVM4) :
  if(pred > 0.5):
    manualSVM4[i] = 1
  else :
    manualSVM4[i] = 0
list_manual_svm = list(manualSVM4)
if(list_manual_svm.count(0) >= list_manual_svm.count(1)):
  print("macet")
else :
  print("Lancar")

"""###LSTM"""

manual_lstm_1 = np.array(df_test_manual['Text_process'])
manual_lstm_2 = tokenizer.texts_to_sequences(manual_lstm_1)
manual_lstm_3 = pad_sequences(manual_lstm_2, maxlen=43)
manual_lstm_4 = loaded_LSTM_model.predict(manual_lstm_3)

for i, pred in enumerate(manual_lstm_4) :
  if(pred > 0.5):
    manual_lstm_4[i] = 1
  else :
    manual_lstm_4[i] = 0
list_manual_lstm = list(manual_lstm_4)
if(list_manual_lstm.count(0) >= list_manual_lstm.count(1)):
  print("macet")
else :
  print("Lancar")

"""------------------------- Cobaaa"""

# from keras.layers import Conv1D, GlobalMaxPooling1D

# structure_test = Sequential()
# e = Embedding(100000, 200, input_length=45)
# structure_test.add(e)
# structure_test.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
# structure_test.summary()

# model = Sequential()
# e = Embedding(100000, 200, weights=[embedding_matrix], input_length=43, trainable=False)
# model.add(e)
# model.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1, input_shape=(None, 43, 200)))
# model.add(GlobalMaxPooling1D())
# model.add(Dense(256, activation='relu'))
# model.add(Dense(1, activation='sigmoid'))
# model.summary()

# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.fit(x_train_seq, y_train, validation_data=(x_val_seq, y_validation), epochs=5, batch_size=32, verbose=2)

# # new instance where we do not know the answer
# Xnew = (x_train_seq[[128]])
# # make a prediction
# ynew = model.predict_classes(Xnew)
# # show the inputs and predicted outputs
# print("X=%s, Predicted=%s" % (Xnew[0], ynew[0]))

# x_train_seq[[128]]

# layer_names = [layer.name for layer in model.layers]


# #Getting output of each layer
# from tensorflow.keras import backend as K

# inp = model.input                                           # input 
# outputs = [layer.output for layer in model.layers]          # all layer outputs
# functors = [K.function([inp], [out]) for out in outputs]    # evaluation functions

# # Testing
# test = test = x_train_seq[[128]]
# count = 0
# for func in functors:
#   print('\n')
#   print("Layer Name: ",layer_names[count])
#   print('\n')
#   print(func([test]))
#   count+=1

# test

# x_train_seq[[1]]

# x_train_seq[1]

"""<!-- ###VGG16 -->"""

# from keras.applications.vgg16 import VGG16
# # load the model

# initial_vgg = VGG16(include_top=True,weights='imagenet',input_tensor=Input(shape=(43,)))
# tweet_input = Input(shape=(43,), dtype='int32')

# tweet_encoder = Embedding(100000, 200, weights=[embedding_matrix], input_length=43, trainable=True)(initial_vgg(tweet_input))
# bigram_branch = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(tweet_encoder)
# bigram_branch = GlobalMaxPooling1D()(bigram_branch)
# trigram_branch = Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=1)(tweet_encoder)
# trigram_branch = GlobalMaxPooling1D()(trigram_branch)
# fourgram_branch = Conv1D(filters=100, kernel_size=4, padding='valid', activation='relu', strides=1)(tweet_encoder)
# fourgram_branch = GlobalMaxPooling1D()(fourgram_branch)
# merged = concatenate([bigram_branch, trigram_branch, fourgram_branch], axis=1)

# merged = Dense(256, activation='relu')(merged)
# merged = Dropout(0.2)(merged)
# merged = Dense(1)(merged)
# output = Activation('sigmoid')(merged)

# modelVGG = Model(inputs=[tweet_input], outputs=[output])
# modelVGG.compile(loss='binary_crossentropy',
#                   optimizer='adam',
#                   metrics=['accuracy'])
# modelVGG.summary()

# initial_vgg.summary()

# filepath="CNN_best_weights.{epoch:02d}-{val_accuracy:.4f}.hdf5"
# checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

# x_train_seq =  [a.reshape(Input(shape=(224,224,3,))) for a in x_train_seq]
# x_val_seq = x_val_seq.reshape((x_val_seq.shape[0], 224, 224, 3))

# modelVGG.fit(x_train_seq, y_train, batch_size=32, epochs=5, validation_data=(x_val_seq, y_validation), callbacks = [checkpoint])

# print(x_train_seq)

# ####################


# tweet_input = Input(shape=(43,), dtype='int32')
# tweet_encoder = Embedding(100000, 200, weights=[embedding_matrix], input_length=43, trainable=True)(tweet_input)

# bigram_branch = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(tweet_encoder)
# bigram_branch = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(bigram_branch)
# bigram_branch = GlobalMaxPooling1D()(bigram_branch)
# bigram_branch = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(bigram_branch)
# bigram_branch = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(bigram_branch)
# bigram_branch = GlobalMaxPooling1D()(bigram_branch)
# bigram_branch = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(bigram_branch)
# bigram_branch = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(bigram_branch)
# bigram_branch = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(bigram_branch)
# bigram_branch = GlobalMaxPooling1D()(bigram_branch)
# bigram_branch = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(bigram_branch)
# bigram_branch = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(bigram_branch)
# bigram_branch = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(bigram_branch)
# bigram_branch = GlobalMaxPooling1D()(bigram_branch)
# bigram_branch = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(bigram_branch)
# bigram_branch = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(bigram_branch)
# bigram_branch = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(bigram_branch)
# bigram_branch = GlobalMaxPooling1D()(bigram_branch)



# # trigram_branch = Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=1)(tweet_encoder)
# # trigram_branch = GlobalMaxPooling1D()(trigram_branch)
# # fourgram_branch = Conv1D(filters=100, kernel_size=4, padding='valid', activation='relu', strides=1)(tweet_encoder)
# # fourgram_branch = GlobalMaxPooling1D()(fourgram_branch)
# # merged = concatenate([bigram_branch, trigram_branch, fourgram_branch], axis=1)

# merged = Dense(256, activation='relu')(bigram_branch)
# merged = Dropout(0.2)(merged)
# merged = Dense(1)(merged)
# output = Activation('sigmoid')(merged)

# modelVGG = Model(inputs=[tweet_input], outputs=[output])
# modelVGG.compile(loss='binary_crossentropy',
#                   optimizer='adam',
#                   metrics=['accuracy'])
# modelVGG.summary()

from tensorflow.keras.optimizers import SGD
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

def create_model(neurons=256, dropout_rate=0.2, learn_rate=0.01, activation='relu', strides=1):
  # Model builder for KerasClassifier/GridSearchCV; the CNN variants below are kept
  # commented out, and the LSTM model is the one returned.
  #CNN+WORD2Vec
  # tweet_input = Input(shape=(43,), dtype='int32')

  # tweet_encoder = Embedding(100000, 200, weights=[embedding_matrix], input_length=43, trainable=True)(tweet_input)
  # bigram_branch = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=strides)(tweet_encoder)
  # bigram_branch = GlobalAveragePooling1D()(bigram_branch)
  # trigram_branch = Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=strides)(tweet_encoder)
  # trigram_branch = GlobalAveragePooling1D()(trigram_branch)
  # fourgram_branch = Conv1D(filters=100, kernel_size=4, padding='valid', activation='relu', strides=strides)(tweet_encoder)
  # fourgram_branch = GlobalAveragePooling1D()(fourgram_branch)
  # merged = concatenate([bigram_branch, trigram_branch, fourgram_branch], axis=1)

  # merged = Dense(neurons, activation=activation)(merged)
  # merged = Dropout(dropout_rate)(merged)
  # merged = Dense(1)(merged)
  # output = Activation('sigmoid')(merged)
  # model = Model(inputs=[tweet_input], outputs=[output])
  # optimizer = SGD(lr=learn_rate)
  # model.compile(loss='binary_crossentropy',
  #                   optimizer = optimizer,
  #                   metrics=['accuracy'])

  #CNN+FastText
  # tweet_input_ft = Input(shape=(43,), dtype='int32')

  # tweet_encoder_ft = Embedding(100000, 200, weights=[embedding_ft_matrix], input_length=43, trainable=True)(tweet_input_ft)
  # bigram_branch_ft = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=strides)(tweet_encoder_ft)
  # bigram_branch_ft = GlobalAveragePooling1D()(bigram_branch_ft)
  # trigram_branch_ft = Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=strides)(tweet_encoder_ft)
  # trigram_branch_ft = GlobalAveragePooling1D()(trigram_branch_ft)
  # fourgram_branch_ft = Conv1D(filters=100, kernel_size=4, padding='valid', activation='relu', strides=strides)(tweet_encoder_ft)
  # fourgram_branch_ft = GlobalAveragePooling1D()(fourgram_branch_ft)
  # merged_ft = concatenate([bigram_branch_ft, trigram_branch_ft, fourgram_branch_ft], axis=1)

  # merged_ft = Dense(neurons, activation=activation)(merged_ft)
  # merged_ft = Dropout(dropout_rate)(merged_ft)
  # merged_ft = Dense(1)(merged_ft)
  # output_ft = Activation('sigmoid')(merged_ft)
  # model_ft = Model(inputs=[tweet_input_ft], outputs=[output_ft])
  # optimizer = SGD(lr=learn_rate)
  # model_ft.compile(loss='binary_crossentropy',
  #                   optimizer = optimizer,
  #                   metrics=['accuracy'])

  model_lstm = Sequential()
  model_lstm.add(Embedding(input_dim = num_words, output_dim = 256, input_length = 43))
  model_lstm.add(Dropout(0.1))
  model_lstm.add(LSTM(256, dropout = 0.2, recurrent_dropout = 0.2))
  model_lstm.add(Dense(neurons, activation = 'relu'))
  model_lstm.add(Dropout(dropout_rate))
  model_lstm.add(Dense(1, activation = 'sigmoid'))
  opt = SGD(lr=learn_rate)
  model_lstm.compile(
      loss='binary_crossentropy',
      optimizer=opt,
      metrics=['accuracy']
  )

  return model_lstm

# model =  KerasClassifier(build_fn=create_model, epochs=50, batch_size=32, verbose=0)
# # batch_size = [16, 32]
# # epochs = [10, 30, 50]
# learn_rate = [0.1, 0.2, 0.3]
# dropout_rate = [0.1, 0.2, 0.3]
# neurons = [16, 32, 64, 128]
# activation = ['relu']
# strides = [1, 2, 3]
# param_grid = dict(learn_rate=learn_rate, dropout_rate=dropout_rate, neurons=neurons, activation=activation, strides=strides)
# grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
# grid_result = grid.fit(x_train_seq, y_train)
# # summarize results
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# means = grid_result.cv_results_['mean_test_score']
# stds = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))