import pymysql
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import MinMaxScaler
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import joblib
import gensim
from gensim.models import phrases,word2vec

from geopy.geocoders import Nominatim
from geopy.distance import geodesic
from geopy.extra.rate_limiter import RateLimiter
import databaseconfig as cfg
geolocator = Nominatim(user_agent="user_agent")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)


resume_data = pd.read_pickle('resume_updated_data.pkl')
job_data = pd.read_pickle('job_updated_data.pkl')
stored_distance = pd.read_csv('stored_distance_data.csv')
skills_data = pd.read_csv("skills.csv")

bigrams = joblib.load('bigrams.pkl')
trigrams = joblib.load('trigrams.pkl')
w2v_model = joblib.load('res_score_w2v.pkl')

def extract(query):
    db = pymysql.connect (host=cfg.mysql["host"], 
                      user=cfg.mysql["user"], 
                      passwd=cfg.mysql["passwd"],
                      db=cfg.mysql["db"],
                      port=cfg.mysql["port"],
                      charset=cfg.mysql["charset"],
                      cursorclass=pymysql.cursors.DictCursor)
    cursor = db.cursor()
    cursor.execute(query)
    results = cursor.fetchall()
    db.close()
    
    if len(results)>0:
        df = pd.DataFrame(results)
        return df
    else:
        return results
    


def preprocess(source):
    stop_words=set(stopwords.words('english'))
    stop_words.remove('it')
    soup = BeautifulSoup(source,'lxml')
    text = soup.get_text()
    text = re.sub('@\S+|https?:\S+|http?:\S|[^A-Za-z+.#]',' ',text)
    text = text.lower()
    text=text.split()
    text = [re.sub("\A[.]|[.]\Z|[.]com|[']s|sr[.]|senior|sr","",word) for word in text]
    text=[word for word in text if not word in stop_words]
    tokens = []
    for word in text:
        if (len(word.strip()) > 1) or (word in ['c','r']):
            tokens.append(word)
    return tokens



def add_features(tokens):
    bigrams = list(nltk.bigrams(tokens))
    b=[]
    for i in bigrams:
        b.append(" ".join(i))
    
    trigrams = list(nltk.trigrams(tokens))
    t=[]
    for i in trigrams:
        t.append(" ".join(i))
    
    tokens.extend(b)
    tokens.extend(t)
    return tokens

def find_skills(tokens,all_skills):
    resume_skills = {}
    for word in tokens:
        if word in all_skills:
            if word in resume_skills:
                resume_skills[word] += 1
            else:
                resume_skills[word] = 1
    return resume_skills

def find_skill_matched(resume_data,job_data):
    com_skill_matched = []
    for i in range(len(resume_data)):
        res_skills = resume_data['skills'][i]
        for j in range(len(job_data)):
            skill_matched = 0
            job_skills = job_data['skills'][j]
            for skill in job_skills:
                if skill in res_skills:
                    skill_matched += 1
            if len(job_skills)!=0:
                percentage = (skill_matched / len(job_skills))*100
            else:
                percentage = np.nan
            com_skill_matched.append({'resume_id':resume_data['resume_id'][i],'job_id':job_data['id'][j],'matched_skills':skill_matched,'percentage':percentage})
    return com_skill_matched

def find_top_skills(data):
    try:
        return sorted(data.items(), key = lambda kv:(kv[1], kv[0]),reverse=True)[0][0]
    except:
        return np.nan



def find_title_score(resume_data,job_data):
    all_title_score = []
    for i in range(len(resume_data)):
        for j in range(len(job_data)):
            j_title = job_data.iloc[j]['title']
            r_title = resume_data.iloc[i]['title']
            if pd.isna(r_title) or pd.isna(j_title):
                title_score = np.nan
            else:
                r_title = trigrams[bigrams[preprocess(r_title)]]
                j_title = trigrams[bigrams[preprocess(j_title)]]
                sim_score = []
                for r_word in r_title:
                    for j_word in j_title:
                        try:
                            sim_score.append(w2v_model.similarity(r_word,j_word))
                        except:
                            sim_score.append(0)
                title_score = round(np.mean(sim_score)*100,2)
                if title_score < 0:
                    title_score = 0
            all_title_score.append({'resume_id':resume_data['resume_id'][i],'job_id':job_data['id'][j],'title_score':title_score})
    return all_title_score



def find_top_skill_score(resume_data,job_data):
    all_top_skill_score = []
    for i in range(len(resume_data)):
        for j in range(len(job_data)):
            j_title = job_data.iloc[j]['title']
            r_top_skill = resume_data['top_skill'][i]

            if pd.isna(r_top_skill) or pd.isna(j_title):
                top_skill_score = np.nan
            else:
                r_top_skill = trigrams[bigrams[preprocess(r_top_skill)]]
                j_title = trigrams[bigrams[preprocess(j_title)]]
                sim_score = []
                for r_word in r_top_skill:
                    for j_word in j_title:
                        try:
                            sim_score.append(w2v_model.similarity(r_word,j_word))
                        except:
                            sim_score.append(0)
                top_skill_score = round(np.mean(sim_score)*100,2)
                if top_skill_score < 0:
                    top_skill_score = 0
            all_top_skill_score.append({'resume_id':resume_data['resume_id'][i],'job_id':job_data['id'][j],'top_skill_score':top_skill_score})
    return all_top_skill_score



def find_lat_long(loc):
    try:
        loc_obj = geocode(loc)
        coord = str(loc_obj.latitude) + ',' + str(loc_obj.longitude)
        loc_obj= geolocator.reverse(coord)
        country_name = loc_obj.raw['address']['country']
        lat = loc_obj.latitude
        long = loc_obj.longitude
    except:
        lat = np.nan
        long= np.nan
        country_name = np.nan
    return (lat,long,country_name)


def find_loc_details(data):
    loc_details = []
    for j in range(len(data)):
        city = data['city'][j]
        state = data['state_province'][j]
        country = data['country'][j]

        if pd.isna(city):
            if pd.isna(state):
                if pd.isna(country):
                    loc_details.append({'lat':np.nan,'long':np.nan,'new_country':np.nan})
                else:
                    lat, long, new_country_name = find_lat_long(country)
                    loc_details.append({'lat':lat,'long':long,'new_country':new_country_name})
            else:
                lat, long, new_country_name = find_lat_long(state)
                loc_details.append({'lat':lat,'long':long,'new_country':new_country_name})
        else:
            lat, long, new_country_name = find_lat_long(city)
            loc_details.append({'lat':lat,'long':long,'new_country':new_country_name})
    return loc_details



def calculate_distance(job_lat,job_long,job_country,res_lat,res_long,res_country):
    if res_country != job_country:
        return -1
    else:
        return round(geodesic((job_lat,job_long), (res_lat,res_long)).km,0)



def find_distance_data(resume_data,job_data):
    all_distance = []
    for i in range(len(resume_data)):
        res_city = resume_data['city'][i]
        res_state = resume_data['state_province'][i]
        res_country = resume_data['new_country'][i]
        res_lat = resume_data['lat'][i]
        res_long = resume_data['long'][i]

        for j in range(len(job_data)):
            job_city = job_data['city'][j]
            job_state = job_data['state_province'][j]
            job_country = job_data['new_country'][j]
            job_lat = job_data['lat'][j]
            job_long = job_data['long'][j]
            if pd.isna(job_city) or pd.isna(res_city):
                if pd.isna(job_state) or pd.isna(res_state):
                    distance = np.nan
                else:
                    distance = calculate_distance(job_lat,job_long,job_country,res_lat,res_long,res_country)
            else:
                distance = calculate_distance(job_lat,job_long,job_country,res_lat,res_long,res_country)
            all_distance.append({'resume_id':resume_data['resume_id'][i],'job_id':job_data['id'][j],'resume_country':res_country,'job_country':job_country,'distance':distance})
    return all_distance



def find_loc_scores(distance_data,stored_distance):
    all_loc_scores = []
    unique_country = distance_data[~pd.isna(distance_data['resume_country'])]['resume_country'].unique()
    stored_unique_country = stored_distance[~pd.isna(stored_distance['resume_country'])]['resume_country'].unique()
    
    for con in unique_country:
        if con not in stored_unique_country:
            data = distance_data[distance_data['resume_country'] == con]
        else:
            data = stored_distance[stored_distance['resume_country'] == con]
        
        all_dist = data['distance']
        notnull_distance = all_dist[~pd.isna(all_dist)]
        filtered_distance = notnull_distance[notnull_distance != -1]

        if len(filtered_distance) > 0:
            scaler = MinMaxScaler()
            scaler.fit(filtered_distance.values.reshape((-1,1)))
            
        data = distance_data[distance_data['resume_country'] == con]
        for i in range(len(data)):
            distance = data.iloc[i]['distance']
            if (distance == -1) or (pd.isna(distance)):
                score = distance
            else:
                score = round((1-scaler.transform(np.asarray(distance).reshape((-1,1)))[0][0])*100,2)
                if score < 0:
                    score = 0
            all_loc_scores.append({'job_id':data.iloc[i]['job_id'],'resume_id':data.iloc[i]['resume_id'],'resume_country':data.iloc[i]['resume_country'],
                               'job_country':data.iloc[i]['job_country'],'distance':data.iloc[i]['distance'],'loc_score':score})
    return all_loc_scores



def extract_new_data(query,input_data):
    db = pymysql.connect (host=cfg.mysql["host"], 
                      user=cfg.mysql["user"], 
                      passwd=cfg.mysql["passwd"],
                      db=cfg.mysql["db"],
                      port=cfg.mysql["port"],
                      charset=cfg.mysql["charset"],
                      cursorclass=pymysql.cursors.DictCursor)
    cursor = db.cursor()
    cursor.execute(query,input_data)
    results = cursor.fetchone()
    db.close()
    return results


def find_new_ids(all_id,old_id):
    all_id = list(all_id)
    old_id = list(old_id)
    new_id = []
    for Id in all_id:
        if Id not in old_id:
            new_id.append(Id)
    return new_id


def compute_all_scores(resume_data,job_data,stored_distance):
    skill_score = find_skill_matched(resume_data,job_data)
    top_skill_score = find_top_skill_score(resume_data,job_data)
    overall_skill_score = pd.merge(pd.DataFrame(skill_score),pd.DataFrame(top_skill_score))
    overall_skill_score['overall_skill_score'] = (overall_skill_score['percentage'] + overall_skill_score['top_skill_score'])/2
    title_score = find_title_score(resume_data,job_data)
    title_score = pd.DataFrame(title_score)
    
    distance_data = pd.DataFrame(find_distance_data(resume_data,job_data))
    
    loc_scores = find_loc_scores(distance_data,stored_distance)
    null_country = distance_data[pd.isna(distance_data['resume_country'])]
    
    for i in range(len(null_country)):
        loc_scores.append({'job_id':null_country.iloc[i]['job_id'],'resume_id':null_country.iloc[i]['resume_id'],
                           'resume_country':null_country.iloc[i]['resume_country'],'job_country':null_country.iloc[i]['job_country'],
                           'distance':null_country.iloc[i]['distance'],'loc_score':np.nan})
    
    
    loc_scores = pd.DataFrame(loc_scores)
    all_scores = pd.merge(overall_skill_score,title_score)
    all_scores = pd.merge(all_scores,loc_scores)
    
    all_scores['title_rank_score'] = np.where(((all_scores['title_score'] < 50) | (pd.isna(all_scores['title_score']))),0,0)
    all_scores['title_rank_score'] = np.where(((all_scores['title_score'] > 50) & (all_scores['title_score'] < 75)),1,0)
    all_scores['title_rank_score'] = np.where(all_scores['title_score'] > 75,1,0)
    
    all_scores['skill_rank_score'] = np.where(((all_scores['overall_skill_score'] < 25) | (pd.isna(all_scores['overall_skill_score']))),0,0)
    all_scores['skill_rank_score'] = np.where(((all_scores['overall_skill_score'] > 25) & (all_scores['overall_skill_score'] < 50)),1,0)
    all_scores['skill_rank_score'] = np.where(((all_scores['overall_skill_score'] > 50) & (all_scores['overall_skill_score'] < 75)),2,0)
    all_scores['skill_rank_score'] = np.where(all_scores['overall_skill_score'] > 75,3,0)
    
    all_scores['location_rank_score'] = np.where(((all_scores['loc_score'] < 50) | (pd.isna(all_scores['loc_score']))),0,0)
    all_scores['location_rank_score'] = np.where(((all_scores['loc_score'] > 50) & (all_scores['loc_score'] < 75)),3,0)
    all_scores['location_rank_score'] = np.where(all_scores['loc_score'] > 75,4,0)
    
    all_scores['overall_score(0-10)'] = np.where((all_scores['loc_score'] == -1),-1,all_scores['title_rank_score'] + all_scores['skill_rank_score'] + all_scores['location_rank_score'])

    stored_distance = stored_distance.append(distance_data)
    stored_distance.drop_duplicates(inplace=True)
    stored_distance.reset_index(drop=True,inplace=True)
    
    return (stored_distance, all_scores)



def find_db_input_data(database_score):
    type_ = 'Job-Resume-Match'
    source = "Backend"
    status = "Active"
    input_data = []
    for i in range(len(database_score)):
        name = database_score['title'][i]
        job_id = int(database_score['job_id'][i])
        resume_id = int(database_score['resume_id'][i])
        person_id = int(database_score['person_id'][i])

        skill_score = database_score['overall_skill_score'][i]
        if skill_score is not None:
            skill_score = float(skill_score) 

        title_score = database_score['title_score'][i]
        if title_score is not None:
            title_score = float(title_score) 

        location_score = database_score['loc_score'][i]
        if location_score is not None:
            location_score = float(location_score)

        matched_skills = database_score['matched_skills'][i]
        if matched_skills is not None:
            matched_skills = int(matched_skills) 

        percentage = database_score['percentage'][i]
        if percentage is not None:
            percentage = float(percentage) 

        top_skill_score = database_score['top_skill_score'][i]
        if top_skill_score is not None:
            top_skill_score = float(top_skill_score)

        skill_rank_score = database_score['skill_rank_score'][i]
        if skill_rank_score is not None:
            skill_rank_score = int(skill_rank_score)

        title_rank_score = database_score['title_rank_score'][i]
        if title_rank_score is not None:
            title_rank_score = int(title_rank_score)

        location_rank_score = database_score['location_rank_score'][i]
        if location_rank_score is not None:
            location_rank_score = int(location_rank_score)

        overall_score = database_score['overall_score(0-10)'][i]
        if overall_score is not None:
            overall_score = int(overall_score)


        input_data.append((type_, source, name, status, matched_skills,percentage,
                                      top_skill_score,skill_score,title_score, location_score,
                                      skill_rank_score,title_rank_score,location_rank_score,overall_score,
                                      job_id, resume_id, person_id))
    return input_data


def find_skills_list(skills_data):
    all_skills = list(skills_data.columns)
    for cat in skills_data.values:
        all_skills.extend(cat)
    na_index = []
    for i in range(len(all_skills)):
        if pd.isna(all_skills[i]):
            na_index.append(i)
    for i in na_index[::-1]:
        all_skills.pop(i)
    all_skills = list(set(all_skills))
    return all_skills

def update_database(all_scores,job_data):
    database_score = all_scores[['job_id','resume_id','person_id','matched_skills','percentage',
                                'top_skill_score','overall_skill_score','title_score','loc_score','title_rank_score',
                                'skill_rank_score','location_rank_score','overall_score(0-10)']]
    temp_job = job_data[['id','title']]
    temp_job.columns = ['job_id','title']
    database_score = database_score.merge(temp_job)
    database_score = database_score.where((pd.notnull(database_score)), None)
            
    insert_input_data = find_db_input_data(database_score)
    insert_query = """INSERT INTO job_person_matches (type, source, name, status,matched_skill_count, matched_skill_percentage,
                                                       top_skill_score, skill_score, title_score, location_score,
                                                       skill_rank_score,title_rank_score,location_rank_score,overall_score,
                                                       job_id, resume_id, person_id)
                       VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""
    db = pymysql.connect (host=cfg.mysql["host"], 
							user=cfg.mysql["user"], 
	  						passwd=cfg.mysql["passwd"],
							  db=cfg.mysql["db"],
							  port=cfg.mysql["port"],
							  charset=cfg.mysql["charset"],
							  cursorclass=pymysql.cursors.DictCursor)
    cursor = db.cursor()
    print("inserting")
    cursor.executemany(insert_query, insert_input_data)
    db.commit()
    print("inserted")
    db.close()



def main(resume_data,job_data,stored_distance):
	all_skills = find_skills_list(skills_data)
	query_resume = "select r.id as resume_id, r.person_id from resumes r, person p where r.person_id = p.id and r.person_id is not null and r.resume_details is not null"
	all_resume_id = extract(query_resume)

	query_jobs = "select j.id from jobs j where tenant_id = 6 and j.details is not null and j.id is not null"
	all_job_id = extract(query_jobs)
	new_resume = []
	new_job = []
	if len(all_resume_id) > 0 :
	    query_new_resume = """select r.id as resume_id, r.person_id, p.job_profile as title, r.resume_details, p.city, p.state_province, p.country, p.total_experience_years 
	                      from resumes r, person p 
	                      where r.id = %s and r.person_id = p.id order by r.id desc"""

	    new_res_ids = find_new_ids(all_resume_id['resume_id'],resume_data['resume_id'])
	    if len(new_res_ids) > 0:
	        print("New resume found")
	        new_resume = []
	        for Id in new_res_ids:
	            new_resume.append(extract_new_data(query_new_resume,Id))
	        new_resume = pd.DataFrame(new_resume)
	        
	        new_res_skills = []
	        new_res_top_skills = []
	        for i in range(len(new_resume)):
	            new_res_tok = preprocess(new_resume['resume_details'][i])
	            new_res_tok = add_features(new_res_tok)
	            new_res_skills.append(find_skills(new_res_tok,all_skills))
	            new_res_top_skills.append(find_top_skills(new_res_skills[i]))

	        new_resume['skills'] = pd.DataFrame(pd.Series(new_res_skills))
	        new_resume['top_skill'] = pd.DataFrame(pd.Series(new_res_top_skills))
	        new_res_loc_details = find_loc_details(new_resume)
	        new_resume = pd.concat([new_resume,pd.DataFrame(new_res_loc_details)],axis=1)
	        resume_data = resume_data.append(new_resume,ignore_index=True)
	        resume_data['resume_id'] = resume_data['resume_id'].astype(int)
	        resume_data['person_id'] = resume_data['person_id'].astype(int)
	        resume_data.to_pickle('resume_updated_data.pkl')
	        
	        

	if len(all_job_id) > 0 :
	    query_new_jobs = """select j.id, j.title, j.details, j.experience_level, j.city, j.state_province, j.country 
	                    from jobs j 
	                    where j.id = %s and tenant_id = 6"""
	    
	    new_job_ids = find_new_ids(all_job_id['id'],job_data['id'])
	    if len(new_job_ids) > 0:
	        print("New job found")
	        new_job = []
	        for Id in new_job_ids:
	            new_job.append(extract_new_data(query_new_jobs,Id))
	        new_job = pd.DataFrame(new_job)
	    
	        new_job_skills = []
	        for i in range(len(new_job)):
	            new_job_tok = preprocess(new_job['details'][i])
	            new_job_tok = add_features(new_job_tok)
	            new_job_skills.append(find_skills(new_job_tok,all_skills))

	        new_job['skills'] = pd.DataFrame(pd.Series(new_job_skills))    
	        new_job_loc_details = find_loc_details(new_job)
	        new_job = pd.concat([new_job,pd.DataFrame(new_job_loc_details)],axis=1)
	        job_data = job_data.append(new_job,ignore_index=True)
	        job_data['id'] = job_data['id'].astype(int)
	        job_data.to_pickle('job_updated_data.pkl')


	job_person_table_data = extract('select * from job_person_matches')
	if len(job_person_table_data) == 0:
	    stored_distance, all_scores = compute_all_scores(resume_data,job_data,stored_distance)
	    stored_distance.to_csv('stored_distance_data.csv',index=False)
	    all_scores = all_scores.merge(resume_data[['resume_id','person_id']])
	    update_database(all_scores,job_data)

	else:
	    if len(new_resume) > 0:
	        stored_distance, new_resume_score = compute_all_scores(new_resume,job_data,stored_distance)
	        stored_distance.to_csv('stored_distance_data.csv',index=False)
	        new_resume_score = new_resume_score.merge(resume_data[['resume_id','person_id']])
	        update_database(new_resume_score,job_data)


	    if len(new_job) > 0:
	        stored_distance, new_job_score = compute_all_scores(resume_data,new_job,stored_distance)
	        stored_distance.to_csv('stored_distance_data.csv',index=False)
	        new_job_score = new_job_score.merge(resume_data[['resume_id','person_id']])
	        update_database(new_job_score,job_data)

main(resume_data,job_data,stored_distance)