3
e_H                 @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZmZ d dl	m
Z
 d dlmZmZ dd Zdd Zd	d
 Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd  ZdS )!    N)word_tokenizesent_tokenize)	stopwords)LancasterStemmerWordNetLemmatizerc             C   s   t j| }|S )z-Convert text into list of tokenized sentences)nltkr   )text	sentences r
   L/var/www/html/talenthub/backend/ai/resume-to-job-match/text_preprocessing.pytext_to_sentence_tokens   s    
r   c             C   s   t j| }|S )z)Convert text into list of tokenized words)r   r   )r   wordsr
   r
   r   text_to_words_tokens   s    
r   c             C   s   t | }|S )z0Convert string text into list of tokenized words)r   )r   r   r
   r
   r   text_to_words   s    r   c       	      C   s   g }x| D ]}ddiddiddiddidd	id
diddiddiddig	}x8|D ]0}x*|j  D ]\}}tj|}|j||}q\W qNW |j }|jdd}|j| q
W |S )z/Clean special tags from list of tokenized wordsz>\s+>z\s+ z\s*<br\s*/?>\s*
z</(div)\s*>\s*z</(p|h\d)\s*>\s*z

z<head>.*<\s*(/head|body)[^>]*> z<a\s+href="([^"]+)"[^>]*>.*</a>z\1z[ \t]*<[^<]*?/?>z^\s+z\n)itemsrecompilesubrstripreplaceappend)	r   	new_wordswordZrulesrulekvregexnew_wordr
   r
   r   remove_special_tags   s&    



r"   c             C   s<   g }x2| D ]*}t jd|jddjdd}|j| q
W |S )z8Remove non-ASCII characters from list of tokenized wordsZNFKDasciiignorezutf-8)unicodedata	normalizeencodedecoder   )r   r   r   r!   r
   r
   r   remove_non_ascii8   s
    
r)   c             C   s0   g }x&| D ]}t jj|dd}|j| q
W |S )z.Remove apostrophe from list of tokenized words'r   )npcharr   r   )r   r   r   r!   r
   r
   r   remove_apostrophe@   s
    
r-   c             C   s(   g }x| D ]}|j  }|j| q
W |S )z@Convert all characters to lowercase from list of tokenized words)lowerr   )r   r   r   r!   r
   r
   r   to_lowercaseI   s
    
r/   c             C   s6   g }x,| D ]$}t jdd|}|dkr
|j| q
W |S )z/Remove punctuation from list of tokenized wordsz[^\w\s]r   )r   r   r   )r   r   r   r!   r
   r
   r   remove_punctuationQ   s    
r0   c             C   s~   g }xt| D ]l}d}x8t t|D ](}tjj||| d}tjj|dd}q W tjj|dd}tjj|dd}|j| q
W |S )z6Remove special characters from list of tokenized wordsz!"#$%&()*+-./:;<=>?@[\]^_`{|}~
r   z  ,r   r*   )rangelenr+   r,   r   r   )r   r   r   Zsymbolsir!   r
   r
   r   remove_special_chars\   s    
r5   c             C   sF   t j }g }x4| D ],}|j r4|j|}|j| q|j| qW |S )zWReplace all interger occurrences in list of tokenized words with textual representation)inflectengineisdigitZnumber_to_wordsr   )r   pr   r   r!   r
   r
   r   replace_numbersm   s    

r:   c             C   s.   g }x$| D ]}|t jdkr
|j| q
W |S )z.Remove stop words from list of tokenized wordsenglish)r   r   r   )r   r   r   r
   r
   r   remove_stopwordsy   s
    
r<   c             C   s0   t  }g }x | D ]}|j|}|j| qW |S )z%Stem words in list of tokenized words)r   stemr   )r   ZstemmerZstemsr   r=   r
   r
   r   stemming_words   s    

r>   c             C   s4   t  }g }x$| D ]}|j|dd}|j| qW |S )z*Lemmatize verbs in list of tokenized wordsr   )pos)r   	lemmatizer   )r   Z
lemmatizerlemmasr   lemmar
   r
   r   lemmatize_verbs   s    
rC   c             C   sD   t | }t|}t|}t|}t|}t|}t|}t|}|S )z1 preprocess the text into list of tokenized words)r   r)   r/   r0   r:   r<   r>   rC   )r   Zwords_tokensr
   r
   r   preprocess_text   s    rD   )r   stringr%   numpyr+   r   r6   r   r   Znltk.corpusr   Z	nltk.stemr   r   r   r   r   r"   r)   r-   r/   r0   r5   r:   r<   r>   rC   rD   r
   r
   r
   r   <module>   s(   			
