"""Compare two French texts with a bag-of-words cosine similarity.

Each text is lower-cased, stripped of punctuation, pronouns and common
function words, has its verb forms replaced through ``verbs.txt``, and is
then turned into a word-count vector over the shared vocabulary.
"""

import re
from collections import Counter
from math import sqrt

_PRONOUNS = [
    "je", "tu", "il", "elle", "nous", "vous", "ils", "elles",
    "me", "te", "se", "le", "la", "lui", "les", "leur",
    "y", "en", "moi", "toi", "soi",
    "toi-même", "vous-même", "lui-même", "elle-même", "eux-même",
    "elles-même", "nous-même", "me-même", "te-même",
    "on", "qu'il", "qu'ils", "qu'elle", "qu'elles",
]

_MISCELLANEOUS = [
    "de", "du", "des", "d'", "le", "la", "les", "l'", "un", "une",
    "au", "aux", "en", "dans", "sur", "sous", "vers", "à", "chez",
    "par", "pour", "avec", "sans", "entre", "pendant", "depuis",
    "devant", "derrière", "près", "loin", "jusque", "jusqu'à",
    "à côté de", "et", "ou", "qu", "s'", "m'", "t'", "n'", "a",
    "c'", "t'", "n'y",
]


def _compile_word_pattern(words):
    """Compile a case-insensitive regex matching any entry of *words*."""
    # Regex alternation takes the first alternative that matches, not the
    # longest one, so "toi" listed before "toi-même" would leave "-même"
    # behind. Trying the longest entries first avoids that.
    ordered = sorted(words, key=len, reverse=True)
    alternatives = '|'.join(re.escape(word) for word in ordered)
    return re.compile(r'\b(?:' + alternatives + r')\b', flags=re.IGNORECASE)


_PRONOUN_PATTERN = _compile_word_pattern(_PRONOUNS)
_MISCELLANEOUS_PATTERN = _compile_word_pattern(_MISCELLANEOUS)


def inf_verbs(text):
    """Replace every word of *text* that is listed in ``verbs.txt``.

    Each line of ``verbs.txt`` is split on commas; every field after the
    first is mapped to the first field (presumably the infinitive of the
    verb). Words of *text* that have no mapping are kept unchanged.

    Returns the words joined by single spaces.
    Raises OSError if ``verbs.txt`` cannot be opened.
    """
    base_form_of = {}
    with open('verbs.txt', 'r') as verbs_file:
        for line in verbs_file:
            # The last two characters of every line are dropped (the newline
            # plus one more character).
            # NOTE(review): presumably each line carries a one-character
            # terminator before the newline - confirm against verbs.txt.
            fields = line[:-2].split(',')
            for form in fields[1:]:
                base_form_of[form] = fields[0]
    return ' '.join(base_form_of.get(word, word) for word in text.split())


def remove_punctuation(text):
    """Return *text* with only its alphabetic and whitespace characters.

    Whitespace is kept so that words stay separated; every other
    non-alphabetic character (digits, punctuation, apostrophes, hyphens)
    is deleted, so "l'homme" becomes "lhomme".
    """
    return ''.join(ch for ch in text if ch.isalpha() or ch.isspace())


def remove_pronouns(text):
    """Return *text* with every whole-word pronoun from the list deleted.

    Matching is case-insensitive. Surrounding whitespace is left in place.
    """
    return _PRONOUN_PATTERN.sub('', text)


def remove_miscellaneous(text):
    """Return *text* with listed articles, prepositions and fillers deleted.

    Matching is case-insensitive and on word boundaries. Surrounding
    whitespace is left in place.
    """
    return _MISCELLANEOUS_PATTERN.sub('', text)


def remove_line_breaks(text):
    """Return *text* with every newline replaced by a space."""
    return text.replace('\n', ' ')


def text_to_vector(text, all_words):
    """Return the occurrence count in *text* of each word of *all_words*.

    The result follows the iteration order of *all_words*, so two vectors
    are only comparable when built from the same *all_words* object.
    """
    counts = Counter(text.split())
    return [counts[word] for word in all_words]


def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two numeric vectors.

    Components are paired with ``zip``, so extra trailing components of the
    longer vector are ignored. Returns 0.0 when either vector has zero norm
    (including empty input) instead of dividing by zero.
    """
    dot_product = norm1 = norm2 = 0
    for x, y in zip(vec1, vec2):
        dot_product += x * y
        norm1 += x ** 2
        norm2 += y ** 2
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return dot_product / (sqrt(norm1) * sqrt(norm2))


def cosine_similarity_rec(vec1, vec2):
    """Recursive variant of :func:`cosine_similarity`.

    Returns 0.0 when either vector has zero norm. The recursion uses one
    frame per component, so the vector length is bounded by the
    interpreter's recursion limit.
    """
    length = min(len(vec1), len(vec2))

    def accumulate(index):
        # Returns (dot product, squared norm of vec1, squared norm of vec2)
        # over the components from *index* to the end. Recursing on an index
        # avoids copying the vector tails at every level.
        if index == length:
            return 0, 0, 0
        dot_rest, sq1_rest, sq2_rest = accumulate(index + 1)
        x, y = vec1[index], vec2[index]
        return x * y + dot_rest, x ** 2 + sq1_rest, y ** 2 + sq2_rest

    dot_product, norm1, norm2 = accumulate(0)
    # A zero norm is replaced by 1 so the division is defined; the dot
    # product is 0 in that case, hence the result is 0.0.
    norm1 = norm1 or 1
    norm2 = norm2 or 1
    return dot_product / (sqrt(norm1) * sqrt(norm2))


def load_text(name):
    """Return the full content of the file ``<name>.txt``."""
    with open(f'{name}.txt', 'r') as file:
        return file.read()


def _clean(text):
    """Run the whole normalisation pipeline on a raw text."""
    # NOTE(review): punctuation is removed first, so the apostrophe and
    # hyphen entries of the word lists cannot match at this point.
    return inf_verbs(remove_line_breaks(remove_miscellaneous(
        remove_pronouns(remove_punctuation(text.lower())))))


def main():
    """Load both texts, vectorise them and print their similarity."""
    # NOTE(review): both file names are empty, so this reads ".txt" twice;
    # fill in the names of the texts to compare.
    text1 = load_text('')
    text2 = load_text('')

    text1_cleaned = _clean(text1)
    text2_cleaned = _clean(text2)

    # Sorted so both vectors share one fixed, reproducible component order.
    all_words = sorted(set(text1_cleaned.split()).union(set(text2_cleaned.split())))

    vec1 = text_to_vector(text1_cleaned, all_words)
    vec2 = text_to_vector(text2_cleaned, all_words)

    similarity = cosine_similarity(vec1, vec2)
    print(f"Les textes ont un score de similarité de {round(similarity, 3)}. Ils ont été convertit en deux vecteurs de {len(vec1)} dimensions.")


if __name__ == "__main__":
    main()