# Libraries
import os
import re
import pandas as pd
import np
import pymysql.cursors
import pymysql
import time
import json
from cleanco import cleanco
from cleanco import prepare_terms, basename
from couchbase.cluster import Cluster, ClusterOptions
from couchbase_core.cluster import PasswordAuthenticator

cluster = Cluster('couchbase://docs.gigminds.com', ClusterOptions(PasswordAuthenticator('admin', 'Tanya@12')))
cb = cluster.bucket('default')
cb_coll = cb.default_collection()

db = pymysql.connect (host="cldy-hub-db-prod-do-user-1524670-0.a.db.ondigitalocean.com",
                     user="doadmin",
                     passwd="jmbly6e4obtma5z0",
                     db="defaultdb",
                     port=25060,
                     charset='utf8mb4',
                     cursorclass=pymysql.cursors.DictCursor)


#
# Standardize the Organization Name and store it in other_names column
#
i = 0
print (time.ctime(), " : Standardizing the Organization Name...")
with db.cursor() as cursor:
    sql = "SELECT name, id FROM organization where created_at > curdate() - 1 and other_names is null and name is not null;"
    cursor.execute(sql)
    records = cursor.fetchall()
    for record in records:
        org_name = record['name']
        n_org_name = org_name.upper()
        n_org_name = n_org_name.replace(',', '')
        n_org_name = n_org_name.replace(' - ', ' ')
        n_org_name = n_org_name.replace(r"\(.*\)", "")
        n_org_name = n_org_name.replace(' AND ', ' & ')
        n_org_name = n_org_name.strip()
        #n_org_name = n_org_name.encode('utf-8')
        terms = prepare_terms()
        n_org_name = basename(n_org_name, terms, prefix=False, middle=False, suffix=True)

        #n_org_name = n_org_name.apply(lambda x: cleanco(x).clean_name() if type(x) == str else x)
        n_org_name = n_org_name.replace('.', '')
        n_org_name = basename(n_org_name, terms, prefix=False, middle=False, suffix=True)

        #n_org_name = n_org_name.apply(lambda x: cleanco(x).clean_name() if type(x) == str else x)
        #n_org_name = n_org_name.replace('SP ZOO', '')

        #print ("original name: ", org_name, " >>> ", n_org_name)

        up_sql = "update `organization` set `other_names` = %s where `id` = %s"
        cursor.execute(up_sql, (n_org_name, record['id']))                
        db.commit()
        i = i + 1        

print (time.ctime(), " : Updated ", i, " rows.")
print (time.ctime(), " : Completed")
print("...")





#
# Update the Organization Name with the domain name from Person table
#
print (time.ctime(), " : Updating Organization's Domain Name...")
with db.cursor() as cursor:
    sql = "UPDATE organization o SET domain = ( select distinct lower(p.domain) from person p where p.org_id = o.id and p.domain not in ('gmail.com', 'yahoo.com', 'indeedemail.com', 'hotmail.com', 'charter.net') limit 1) WHERE o.domain is null AND o.name is not null"
    count = cursor.execute(sql)
    db.commit()
    print (time.ctime(), " : Updated ", count, " rows.")
print(time.ctime(), " : Completed")
print("...")





#
# Update Organization Metadata
#
i = 0
print (time.ctime(), " : Populating Organization Metadata...")
with db.cursor() as cursor:
    sql = "SELECT name, id, domain, legal_name, overview, year_founded, industry, city, zip_postal, state_province, country, profile_photo, linkedin_url, twitter_url, sales_phone, street_address_1, street_address_2, crunchbase_url, sic, naics, EIN, tech_stack, tech_categories, number_of_employees, estimated_revenue FROM organization where created_at > curdate() - 1 and tech_stack is null and domain is not null"
    cursor.execute(sql)
    records = cursor.fetchall()
    i = 0
    for record in records:
        org_name = record['name']
        org_domain = record['domain']
        org_id = record['id']

        result = cluster.query(
            "SELECT * FROM `default` WHERE id = $1 LIMIT 1",
            org_domain)

        for row in result:
            org_data = row['default']

            #print (org_domain)

            try:
                name = org_data['name'] if org_data['name'] else record['name']
                legal_name = org_data['legalName'] if org_data['legalName'] else record['legal_name']
                overview = org_data['description'] if org_data['description'] else record['overview']
                foundedYear = org_data['foundedYear'] if org_data['foundedYear'] else record['year_founded']
                industry = org_data['category']['industry'] if org_data['category']['industry'] else record['industry']
                address1 = org_data['geo']['streetNumber'] if org_data['geo']['streetNumber'] else record['street_address_1']
                address1 = address1 + " " + org_data['geo']['streetName'] if org_data['geo']['streetName'] else record['street_address_1']
                address2 = org_data['geo']['subPremise'] if org_data['geo']['subPremise'] else record['street_address_2']
                city  = org_data['geo']['city'] if org_data['geo']['city'] else record['city']
                state = org_data['geo']['stateCode'] if org_data['geo']['stateCode'] else record['state_province']
                zip = org_data['geo']['postalCode'] if org_data['geo']['postalCode'] else record['zip_postal']
                country = org_data['geo']['country'] if org_data['geo']['country'] else record['country']
                profilePhoto = org_data['logo'] if org_data['logo'] else record['profile_photo']
                linkedinUrl = "https://linkedin.com/" + org_data['linkedin']['handle']  if org_data['linkedin']['handle'] else record['linkedin_url']
                twitterUrl = "https://twitter.com/" + org_data['twitter']['handle'] if org_data['twitter']['handle'] else record['twitter_url']                
                salesPhone = org_data['phone'] if org_data['phone'] else record['sales_phone']
                crunchbaseUrl = "https://crunchbase.com/" + org_data['crunchbase']['handle'] if org_data['crunchbase']['handle'] else record['crunchbase_url']
                sic = org_data['category']['sicCode'] if org_data['category']['sicCode'] else record['sic']
                naics = org_data['category']['naicsCode'] if org_data['category']['naicsCode'] else record['naics']
                ein = org_data['identifiers']['usEIN'] if org_data['identifiers']['usEIN'] else record['EIN']
                tech_stack = str(org_data['tech'] if org_data['tech'] else record['tech_stack'])
                tech_categories = str(org_data['techCategories'] if org_data['techCategories'] else record['tech_categories'])
                employees = org_data['metrics']['employees'] if org_data['metrics']['employees'] else record['number_of_employees']
                revenue = org_data['metrics']['estimatedAnnualRevenue'] if org_data['metrics']['estimatedAnnualRevenue'] else record['estimated_revenue']
                website = "http://www." + record['domain']

                upSql = "UPDATE organization SET name = %s, legal_name = %s, overview = %s, industry = %s, street_address_1 = %s, street_address_2 = %s, city = %s, state_province = %s, zip_postal = %s, country = %s, profile_photo = %s, linkedin_url = %s, twitter_url = %s, year_founded = %s, sales_phone = %s, crunchbase_url = %s, sic = %s, naics = %s, EIN = %s, tech_stack = %s, tech_categories = %s, number_of_employees = %s, estimated_revenue = %s, website = %s WHERE id = %s"
                upData = (name, legal_name, overview, industry, address1, address2, city, state, zip, country, profilePhoto, linkedinUrl, twitterUrl, foundedYear, salesPhone, crunchbaseUrl, sic, naics, ein, tech_stack, tech_categories, employees, revenue, website, org_id )
                cursor.execute(upSql, upData)
                db.commit()
                i = i + 1
            except:
                print ("Skipping", ' -', org_domain)


            #print (org_data['category']['sicCode'])
            #print (org_data['category']['naicsCode'])
            #print (org_data['location'])
            #print (org_data['timeZone'])
            #print (org_data['geo']['streetNumber'])
            #print (org_data['geo']['streetName'])
            #print (org_data['geo']['subPremise'])
            #print (org_data['geo']['city'])
            #print (org_data['geo']['postalCode'])
            #print (org_data['geo']['stateCode'])
            #print (org_data['geo']['country'])
            #print (org_data['geo']['streetNumber'])
            #print (org_data['logo'])
            #print (org_data['linkedin']['handle'])
            #print (org_data['twitter']['handle'])
            #print (org_data['crunchbase']['handle'])
            #print (org_data['identifiers']['usEIN'])
            #print (org_data['phone'])
            #print (org_data['tech'])
print (time.ctime(), " : Updated ", i, " rows.")
print(time.ctime(), " : Completed")
print("...")





#
# Update the Organization Name in Person table
#
print (time.ctime(), " : Updating Organization Name in Person Table...")
with db.cursor() as cursor:
    sql = "UPDATE person p SET organization = (SELECT name FROM organization o WHERE p.org_id = o.id), update_search_index = 1 WHERE org_id in (select id from organization o where (p.organization != o.name OR p.organization is null) AND p.org_id is not null)"
    count = cursor.execute(sql)
    db.commit()
    print (time.ctime(), " : Updated ", count, " rows.")
print(time.ctime(), " : Completed")
print("...")


#
# Update the Organization Name in Person table if ORG ID IS NULL
#
print (time.ctime(), " : Updating Organization ID and Name in Person Table if ORG ID IS NULL...")
with db.cursor() as cursor:
    sql = "update person set org_id = 17994, organization = '*Not Available*', update_search_index = 1 where org_id is null"
    count = cursor.execute(sql)
    db.commit()
    print (time.ctime(), " : Updated ", count, " rows.")
print(time.ctime(), " : Completed")
print("...")


#
# Update the Organization Name in Jobs table
#
print (time.ctime(), " : Updating Company Name in Jobs Table...")
with db.cursor() as cursor:
    sql = "UPDATE jobs p SET company = (SELECT name FROM organization o WHERE p.org_id = o.id), update_search_index = 1 WHERE org_id in (select id from organization o where (p.company != o.name OR p.company is null) AND p.org_id is not null)"
    count = cursor.execute(sql)
    db.commit()
    print (time.ctime(), " : Updated ", count, " rows.")
print(time.ctime(), " : Completed")
print("...")




#
# Cleanup the Person Email in Person table
#
print (time.ctime(), " : Cleaning up Person's Email Address in Person Table...")
with db.cursor() as cursor:
    sql = "update person set email = SUBSTRING(email, 1, locate('|',email)-1) where email like '%|%'"
    count = cursor.execute(sql)
    db.commit()
    print (time.ctime(), " : Updated ", count, " rows.")

    sql = "update person set email = SUBSTRING(email, 1, locate(',',email)-1) where email like '%,%'"
    count = cursor.execute(sql)
    db.commit()
    print (time.ctime(), " : Updated ", count, " rows.")

    sql = "update person set email = SUBSTRING(email, 1, locate(';',email)-1) where email like '%;%'"
    count = cursor.execute(sql)
    db.commit()
    print (time.ctime(), " : Updated ", count, " rows.")

print(time.ctime(), " : Completed")
print("...")









#
# Merge Duplicate Organizations based on Domain Name
#






#
# Merge Duplicate Person based on Email Address
#




#
# Delete duplicate articles
#
print (time.ctime(), " : Deleting duplicate articles based on title...")
with db.cursor() as cursor:
    sql = "DELETE t1 FROM articles t1 INNER JOIN articles t2 WHERE t1.id < t2.id AND t1.title = t2.title"
    count = cursor.execute(sql)
    db.commit()
    print (time.ctime(), " : Deleted ", count, " rows.")
print(time.ctime(), " : Completed")
print("...")


