"""Bag-of-words cosine similarity between two French text files.

Each text is lower-cased, stripped of punctuation, pronouns and common
function words, its verb forms are mapped to a base form using
``verbs.txt``, and the two resulting word-count vectors are compared.
"""

import re
from collections import Counter
from math import sqrt

_PRONOUNS = [
    "je", "tu", "il", "elle", "nous", "vous", "ils", "elles",
    "me", "te", "se", "le", "la", "lui", "les", "leur", "y", "en",
    "moi", "toi", "soi", "toi-même", "vous-même", "lui-même", "elle-même",
    "eux-même", "elles-même", "nous-même", "me-même", "te-même", "on",
    "qu'il", "qu'ils", "qu'elle", "qu'elles",
]

_MISCELLANEOUS = [
    "de", "du", "des", "d'", "le", "la", "les", "l'", "un", "une", "au",
    "aux", "en", "dans", "sur", "sous", "vers", "à", "chez", "par", "pour",
    "avec", "sans", "entre", "pendant", "depuis", "devant", "derrière",
    "près", "loin", "jusque", "jusqu'à", "à côté de", "et", "ou", "qu", "s'",
    "m'", "t'", "n'", "a", "c'", "t'", "n'y",
]


def _compile_word_pattern(words):
    """Build a case-insensitive whole-word regex matching any of *words*."""
    # Regex alternation takes the first alternative that matches, so longer
    # entries must come first; otherwise "lui" wins over "lui-même" and
    # leaves "-même" behind.
    ordered = sorted(set(words), key=lambda w: (-len(w), w))
    alternatives = '|'.join(re.escape(w) for w in ordered)
    return re.compile(r'\b(?:' + alternatives + r')\b', re.IGNORECASE)


_PRONOUN_RE = _compile_word_pattern(_PRONOUNS)
_MISCELLANEOUS_RE = _compile_word_pattern(_MISCELLANEOUS)


def inf_verbs(text):
    """Replace every known verb form in *text* with its base form.

    The mapping is read from ``verbs.txt`` in the current directory: each
    line is comma-separated, the first field is the replacement and every
    following field is a form that maps to it. Words without an entry are
    left untouched. The result is the words joined by single spaces.

    Raises:
        OSError: if ``verbs.txt`` cannot be opened.
    """
    verbs_dic = {}
    with open('verbs.txt', 'r') as txt:
        for line in txt:
            # NOTE(review): the last TWO characters of each line are dropped
            # before splitting. Presumably every line ends with a separator
            # plus a newline; if lines end with a bare "\n" this truncates
            # the last form. Confirm against the real verbs.txt.
            base, *forms = line[:-2].split(',')
            for form in forms:
                verbs_dic[form] = base
    return ' '.join(verbs_dic.get(word, word) for word in text.split())


def remove_punctuation(text):
    """Return *text* with everything but letters and whitespace removed.

    Whitespace is kept on purpose: the later steps split the text into
    words, so dropping it would merge the whole text into a single token.
    Digits and punctuation are deleted, not replaced by a space.
    """
    return ''.join(ch for ch in text if ch.isalpha() or ch.isspace())


def remove_pronouns(text):
    """Delete French pronouns (whole words, case-insensitive) from *text*.

    Matches are replaced by the empty string, so surrounding spaces remain.
    """
    return _PRONOUN_RE.sub('', text)


def remove_miscellaneous(text):
    """Delete common French function words (articles, prepositions, ...).

    Matching is whole-word and case-insensitive; matches are replaced by the
    empty string, so surrounding spaces remain.
    """
    return _MISCELLANEOUS_RE.sub('', text)


def remove_line_breaks(text):
    """Return *text* with every newline character replaced by a space."""
    return text.replace('\n', ' ')


def text_to_vector(text, all_words):
    """Return the occurrence count in *text* of each word of *all_words*.

    The counts follow the iteration order of *all_words*; words that do not
    occur in *text* count as 0.
    """
    counts = Counter(text.split())
    return [counts[word] for word in all_words]


def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two numeric vectors.

    Components are paired with ``zip``, so extra trailing components of the
    longer vector are ignored. When either vector has zero norm (including
    empty input) the similarity is undefined and 0.0 is returned instead of
    raising ``ZeroDivisionError``.
    """
    dot_product = norm1 = norm2 = 0
    for x, y in zip(vec1, vec2):
        dot_product += x * y
        norm1 += x ** 2
        norm2 += y ** 2
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return dot_product / (sqrt(norm1) * sqrt(norm2))


def cosine_similarity_rec(vec1, vec2):
    """Recursive variant of :func:`cosine_similarity`.

    Accepts any indexable sequences and pairs components up to the shorter
    length. A zero norm is replaced by 1 so that the result is 0 rather than
    a division by zero. Recursion depth grows with the vector length, so
    very long vectors can exceed Python's recursion limit.
    """
    length = min(len(vec1), len(vec2))

    def recur_part(index):
        # Returns [dot product, squared norm of vec1, squared norm of vec2]
        # accumulated over components index..length-1.
        if index == length:
            return [0, 0, 0]
        suivant = recur_part(index + 1)
        x, y = vec1[index], vec2[index]
        return [x * y + suivant[0], x ** 2 + suivant[1], y ** 2 + suivant[2]]

    dot_product, norm1, norm2 = recur_part(0)
    if norm1 == 0:
        norm1 = 1
    if norm2 == 0:
        norm2 = 1
    return dot_product / (sqrt(norm1) * sqrt(norm2))


def load_text(name):
    """Return the full contents of the file ``<name>.txt``.

    Raises:
        OSError: if the file cannot be opened.
    """
    with open(f'{name}.txt', 'r') as file:
        return file.read()


def _clean_text(text):
    """Run the full normalisation pipeline on one raw text."""
    text = remove_punctuation(text.lower())
    text = remove_miscellaneous(remove_pronouns(text))
    return inf_verbs(remove_line_breaks(text))


def main():
    """Compare the two input texts and print their similarity score."""
    # The file names are left empty in the original script; fill them in
    # (without the ".txt" extension) before running.
    text1 = load_text('')
    text2 = load_text('')

    text1_cleaned = _clean_text(text1)
    text2_cleaned = _clean_text(text2)

    # Both vectors are built from the same set object, so their components
    # line up word for word.
    all_words = set(text1_cleaned.split()).union(text2_cleaned.split())
    vec1 = text_to_vector(text1_cleaned, all_words)
    vec2 = text_to_vector(text2_cleaned, all_words)

    similarity = cosine_similarity(vec1, vec2)
    print(f"Les textes ont un score de similarité de {round(similarity, 3)}. Ils ont été convertit en deux vecteurs de {len(vec1)} dimensions.")


if __name__ == "__main__":
    main()