import gensim
import numpy as np
import pandas as pd
import text_preprocessing
from collections import defaultdict
from gensim.test.utils import get_tmpfile
from export_jobs_resumes_csv import get_jobs_resumes_csv 
from clean_dataframe import clean_jobs_resume_csv
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
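# Note: punkt, stopwords and wordnet are the NLTK resources presumably used by
# text_preprocessing.preprocess_text (tokenisation, stop-word removal, lemmatisation).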


print("get jobs and resumes from database")

#get jobs and resumes csvs
try:
    get_jobs_resumes_csv()
except Exception as e:
    print("Details attached here. ", e)
    

print("clean jobs and resumes")

#clean jobs and resumes csvs
try:
    clean_jobs_resume_csv()
except Exception as e:
    print("Details attached here. ", e)


    
    
#read the clean jobs file
j_df = pd.read_csv("clean_jobs.csv")

print(j_df.head())
print(j_df.describe())


#check the len of unique values
print(len(j_df['id'].unique()))


print("make jobs tokens from jobs dataframe")

#make jobs tokens from jobs dataframe
jobs_tokens = []
for i in range(0, len(j_df)):
    raw = str(j_df.iloc[i]['details'])
    clean_tokens = text_preprocessing.preprocess_text(raw)
    jobs_tokens.append([j_df.iloc[i]['id'], clean_tokens])

#print(jobs_tokens)

print(len(jobs_tokens))


#get only the tokens
jobs_token_list = [row[1] for row in jobs_tokens]

print(len(jobs_token_list))

print("remove words that appear only once")

# remove words that appear only once
frequency = defaultdict(int)
for row in jobs_token_list:
    for token in row:
        frequency[token] += 1

jobs_token_list = [
    [token for token in row if frequency[token] > 1]
    for row in jobs_token_list
]
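# Dropping tokens that occur only once across the whole jobs corpus mirrors the
# standard gensim preprocessing recipe: such terms carry little topical signal
# and only inflate the dictionary.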



print("creating tagged documents needed for Doc2Vec")

# Create the tagged document needed for Doc2Vec
def create_train_tagged_document(list_of_list_of_words, tokens_only=False):
    for i, list_of_words in enumerate(list_of_list_of_words):
        if tokens_only:
            yield list_of_words
        else:
            # For training data, tag each job posting with its job id
            yield gensim.models.doc2vec.TaggedDocument(list_of_words, [str(jobs_tokens[i][0])])


train_data = list(create_train_tagged_document(jobs_token_list))

#print(train_data)
print(len(train_data))
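# Each TaggedDocument is tagged with the job id (as a string), so a corpus index
# returned later by a similarity query can be mapped back to the original job row.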




#read the existing clean resumes file
r_df = pd.read_csv("clean_resumes.csv")

#check the len of unique values
print(len(r_df['resume_id'].unique()))
print(len(r_df['person_id'].unique()))


print("make tokens from resume dataframe")
#make tokens from resume dataframe
resumes_tokens = []

for i in range(0, len(r_df)):    
    raw = str(r_df.iloc[i]['resume_details'])
    clean_tokens = text_preprocessing.preprocess_text(raw)
    resumes_tokens.append([r_df.iloc[i]['resume_id'], clean_tokens])
#print(resumes_tokens)


#only get tokens
resumes_token_list = [row[1] for row in resumes_tokens]

print("removing words that appear only once")

# remove words that appear only once
frequency = defaultdict(int)
for row in resumes_token_list:
    for token in row:
        frequency[token] += 1

resumes_token_list = [
    [token for token in row if frequency[token] > 1]
    for row in resumes_token_list
]

print("creating the tagged document needed for Doc2Vec")
# Create the tagged document needed for Doc2Vec
def create_test_tagged_document(list_of_list_of_words, tokens_only=False):
    for i, list_of_words in enumerate(list_of_list_of_words):
        if tokens_only:
            yield list_of_words
        else:
            # For the resume (query) data, tag each resume with its resume id
            yield gensim.models.doc2vec.TaggedDocument(list_of_words, [str(resumes_tokens[i][0])])


test_data = list(create_test_tagged_document(resumes_token_list))
print(len(test_data))


from gensim import corpora

#build dictionary and corpus

dictionary = corpora.Dictionary(jobs_token_list)
corpus = [dictionary.doc2bow(text) for text in jobs_token_list]


#save dictionary
dictionary.save_as_text("dic_match_job_posting_for_resume")

#load dictionary
dictionary = corpora.Dictionary.load_from_text("dic_match_job_posting_for_resume")

# save corpus
from gensim.corpora import MmCorpus
output_fname= "corpus_match_job_posting_for_resume.mm"

MmCorpus.serialize(output_fname, corpus)


#load corpus
corpus = MmCorpus(output_fname)
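# Saving and immediately reloading confirms the dictionary and corpus round-trip
# through disk; the same files could be reloaded later (e.g. by a separate scoring
# run) without re-tokenising the postings.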



from gensim import models, similarities
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=len(dictionary))
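# Note: num_topics is set to the dictionary size, i.e. one topic per vocabulary
# term. LDA is more commonly trained with a much smaller fixed topic count
# (e.g. 100-300), so this value may be worth tuning.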

# save model
lda.save("match_job_posting_for_resume_lda")



lda = models.LdaModel.load("match_job_posting_for_resume_lda")  # load model

index = similarities.MatrixSimilarity(lda[corpus])  # transform corpus to LDA space and index it
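# Note: MatrixSimilarity keeps the whole index in RAM; for a jobs corpus that does
# not fit in memory, gensim's disk-backed similarities.Similarity class is the
# usual alternative.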


#top k matching jobs to keep for each resume
k = 15

tuple_list = []


for i in range(0, len(test_data)):

    doc = test_data[i].words
    vec_bow = dictionary.doc2bow(doc)
    vec_lda = lda[vec_bow]  # convert the resume to LDA space

    sims = index[vec_lda]  # similarity query against the indexed job postings

    sims = list(enumerate(sims))  # (job corpus index, similarity score) pairs
    sims.sort(key=lambda x: x[1], reverse=True)

    sims = sims[:k]

    for j, sim in sims:

        resume_id = int(test_data[i].tags[0])
        resume_title = list(r_df.loc[r_df['resume_id'] == resume_id]['title'])[0]
        job_id = int(train_data[j].tags[0])
        job_title = list(j_df.loc[j_df['id'] == job_id]['title'])[0]
        similarity_score = float(sim)

        print(resume_title)
        print(job_id)
        print(job_title)
        print(similarity_score)

        tuple_list.append(tuple([resume_id, job_id, resume_title, job_title, similarity_score]))


#replace NaN values (e.g. missing titles) with None so MySQL stores them as NULL

import math

new_tuple_list = [
    tuple(None if isinstance(i, float) and math.isnan(i) else i for i in t)
    for t in tuple_list
]


#populate results in db using bulk query insertion

import csv
import pymysql


db = pymysql.connect (host="cldy-hub-db-dev-do-user-1524670-0.a.db.ondigitalocean.com",
                        user="appuser",
                        passwd="s7xnwc7aeyq9vbzk",
                        db="defaultdb",
                        port=25060,
                        charset='utf8mb4',
                        cursorclass=pymysql.cursors.DictCursor)

cursor = db.cursor()



sql_select_Query = "CREATE TABLE IF NOT EXISTS match_job_posting_for_resume_lda(resume_id BIGINT, " \
                   "job_id BIGINT, resume_title VARCHAR(255), job_title VARCHAR(255), similarity_score DOUBLE, "\
                    "PRIMARY KEY (resume_id, job_id) ) "

cursor.execute(sql_select_Query)

query = 'INSERT INTO match_job_posting_for_resume_lda(resume_id, job_id, resume_title, job_title, similarity_score) VALUES(%s, %s, %s, %s, %s)'
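# Re-running the script will collide with the (resume_id, job_id) primary key for
# pairs inserted earlier; if upserts are needed, an
# "INSERT ... ON DUPLICATE KEY UPDATE similarity_score = VALUES(similarity_score)"
# variant of this statement could be used instead.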


print(query)
cursor.executemany(query, new_tuple_list)
db.commit()

# disconnect from server
db.close()



