Téléverser les fichiers vers "/"
This commit is contained in:
parent
b8a9abe78d
commit
bc9a8ed401
1 changed files with 93 additions and 0 deletions
93
cos_similarity.py
Normal file
93
cos_similarity.py
Normal file
|
@ -0,0 +1,93 @@
|
|||
import re
|
||||
from math import sqrt
|
||||
|
||||
def inf_verbs(text, verbs_path='verbs.txt', encoding='utf-8'):
    """Replace every known verb form in *text* with its base form.

    The lookup table is read from *verbs_path*. Each line is split on commas:
    the first field is the replacement (presumably the infinitive) and every
    following field is a form that maps to it. When the same form appears on
    several lines, the last line wins.

    Args:
        text: Text to normalise. It is split on whitespace, so any run of
            whitespace (newlines included) becomes a single space.
        verbs_path: Path of the comma-separated verb table.
        encoding: Encoding used to read the verb table.

    Returns:
        The words of *text* joined by single spaces, known forms replaced.

    Raises:
        OSError: If the verb table cannot be opened.
    """
    verbs_dic = {}
    with open(verbs_path, 'r', encoding=encoding) as txt:
        for line in txt:
            # The last two characters of every line are dropped, exactly as
            # the original implementation did.
            # NOTE(review): presumably a trailing separator plus the newline;
            # confirm against the real verbs.txt, otherwise the last form of
            # each line loses its final letter.
            base, *forms = line[:-2].split(',')
            for form in forms:
                verbs_dic[form] = base
    # Unknown words are kept unchanged instead of relying on a bare except.
    return ' '.join(verbs_dic.get(word, word) for word in text.split())
|
||||
|
||||
|
||||
def remove_punctuation(text):
    """Return *text* keeping only letters and whitespace.

    Punctuation, digits and symbols are dropped. Whitespace is preserved so
    that word boundaries survive: filtering on ``isalpha()`` alone also
    deletes spaces and newlines, which glues the whole text into one word.

    Args:
        text: The string to filter.

    Returns:
        A new string made of the alphabetic and whitespace characters of
        *text*, in their original order.
    """
    return ''.join(char for char in text if char.isalpha() or char.isspace())
|
||||
|
||||
def remove_pronouns(text):
    """Delete French pronouns from *text*, matching whole words only.

    Matching is case-insensitive. Each removed pronoun is replaced by the
    empty string, so the whitespace around it is left in place.

    Args:
        text: The string to filter.

    Returns:
        *text* with every listed pronoun removed.
    """
    pronouns = [
        "je", "tu", "il", "elle", "nous", "vous", "ils", "elles",
        "me", "te", "se", "le", "la", "lui", "les", "leur", "y", "en",
        "moi", "toi", "soi", "toi-même", "vous-même", "lui-même", "elle-même",
        "eux-même", "elles-même", "nous-même", "me-même", "te-même", "on",
        "qu'il", "qu'ils", "qu'elle", "qu'elles",
    ]
    # Regex alternation takes the first alternative that matches, so longer
    # forms must come first: otherwise "lui" wins over "lui-même" and leaves
    # "-même" behind.
    longest_first = sorted(pronouns, key=len, reverse=True)
    pattern = r'\b(?:' + '|'.join(re.escape(p) for p in longest_first) + r')\b'
    return re.sub(pattern, '', text, flags=re.IGNORECASE)
|
||||
|
||||
def remove_miscellaneous(text):
    """Delete French prepositions, articles and similar words from *text*.

    Matching is case-insensitive and anchored on word boundaries. Each
    removed item is replaced by the empty string, so the whitespace around
    it is left in place.

    Args:
        text: The string to filter.

    Returns:
        *text* with every listed item removed.
    """
    prepositions = [
        "de", "du", "des", "d'", "le", "la", "les", "l'", "un", "une", "au",
        "aux", "en", "dans", "sur", "sous", "vers", "à", "chez", "par", "pour",
        "avec", "sans", "entre", "pendant", "depuis", "devant", "derrière",
        "près", "loin", "jusque", "jusqu'à", "à côté de", "et", "ou", "qu", "s'",
        "m'", "t'", "n'", "a", "c'", "t'", "n'y",
    ]
    # Regex alternation takes the first alternative that matches, so longer
    # entries must come first: otherwise "à" wins over "à côté de" and "n'"
    # wins over "n'y", leaving fragments behind.
    longest_first = sorted(prepositions, key=len, reverse=True)
    pattern = r'\b(?:' + '|'.join(re.escape(p) for p in longest_first) + r')\b'
    return re.sub(pattern, '', text, flags=re.IGNORECASE)
|
||||
|
||||
def remove_line_breaks(text):
    """Return *text* with every newline character turned into a space."""
    pieces = text.split('\n')
    return ' '.join(pieces)
|
||||
|
||||
def text_to_vector(text, all_words):
    """Build the word-count vector of *text* over the vocabulary *all_words*.

    Args:
        text: Text that is split on whitespace into words.
        all_words: Iterable of vocabulary words; its iteration order defines
            the order of the vector components.

    Returns:
        A list with one count per vocabulary word (0 when the word does not
        occur in *text*).
    """
    frequencies = {}
    for token in text.split():
        if token in frequencies:
            frequencies[token] += 1
        else:
            frequencies[token] = 1

    vector = []
    for vocab_word in all_words:
        vector.append(frequencies.get(vocab_word, 0))
    return vector
|
||||
|
||||
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two numeric vectors.

    The vectors are walked in lockstep with ``zip``, so any components
    beyond the length of the shorter one are ignored.

    Args:
        vec1: First iterable of numbers.
        vec2: Second iterable of numbers.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or ``0.0`` when either
        vector has zero norm (this includes empty vectors), instead of
        raising ``ZeroDivisionError``.
    """
    dot_product, norm1, norm2 = 0, 0, 0
    for x, y in zip(vec1, vec2):
        dot_product += x * y
        norm1 += x ** 2
        norm2 += y ** 2
    # The angle with a zero vector is undefined; report "no similarity",
    # which matches what cosine_similarity_rec returns in that case.
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return dot_product / (sqrt(norm1) * sqrt(norm2))
|
||||
|
||||
def cosine_similarity_rec(vec1, vec2):
    """Return the cosine similarity of two vectors, computed recursively.

    The index range is split in two halves at every step, so the recursion
    depth grows with the logarithm of the length. A one-element-per-call
    recursion exceeds Python's recursion limit on vectors of roughly a
    thousand components, and slicing at each step copies the vectors.

    Args:
        vec1: First sequence of numbers (must support ``len`` and indexing).
        vec2: Second sequence of numbers (must support ``len`` and indexing).

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)`` over the first
        ``min(len(vec1), len(vec2))`` components. A zero norm is replaced by
        1 before dividing, so the result is ``0.0`` when either vector is
        empty or all zeros.
    """
    def recur_part(start, stop):
        # Returns [dot product, squared norm of vec1, squared norm of vec2]
        # restricted to the indices start <= i < stop.
        if stop <= start:
            return [0, 0, 0]
        if stop - start == 1:
            x, y = vec1[start], vec2[start]
            return [x * y, x ** 2, y ** 2]
        middle = (start + stop) // 2
        left = recur_part(start, middle)
        right = recur_part(middle, stop)
        return [left[0] + right[0], left[1] + right[1], left[2] + right[2]]

    # Only the common prefix is used, like zip() in cosine_similarity.
    result = recur_part(0, min(len(vec1), len(vec2)))
    for i in (1, 2):
        # Avoid dividing by zero; the numerator is 0 whenever a norm is 0.
        result[i] = 1 if result[i] == 0 else result[i]
    return result[0] / (sqrt(result[1]) * sqrt(result[2]))
|
||||
|
||||
def load_text(name, encoding='utf-8'):
    """Read and return the whole content of the file ``<name>.txt``.

    Args:
        name: File path without the ``.txt`` extension.
        encoding: Text encoding of the file. It is set explicitly so that
            accented characters decode the same way on every platform
            instead of depending on the locale default.

    Returns:
        The file content as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in *encoding*.
    """
    with open(f'{name}.txt', 'r', encoding=encoding) as file:
        return file.read()
|
||||
|
||||
# Load the two texts to compare. The names are empty strings here, so
# load_text opens '.txt' for both.
text1 = load_text('')
text2 = load_text('')

# Clean the first text: lowercase, strip punctuation, pronouns and
# miscellaneous words, flatten line breaks, then normalise verb forms.
text1_cleaned = text1.lower()
text1_cleaned = remove_punctuation(text1_cleaned)
text1_cleaned = remove_pronouns(text1_cleaned)
text1_cleaned = remove_miscellaneous(text1_cleaned)
text1_cleaned = remove_line_breaks(text1_cleaned)
text1_cleaned = inf_verbs(text1_cleaned)

# Same steps, in the same order, for the second text.
text2_cleaned = text2.lower()
text2_cleaned = remove_punctuation(text2_cleaned)
text2_cleaned = remove_pronouns(text2_cleaned)
text2_cleaned = remove_miscellaneous(text2_cleaned)
text2_cleaned = remove_line_breaks(text2_cleaned)
text2_cleaned = inf_verbs(text2_cleaned)

# The shared vocabulary is the union of the words of both cleaned texts;
# both vectors are built over this same set object.
all_words = set(text1_cleaned.split()) | set(text2_cleaned.split())
vec1 = text_to_vector(text1_cleaned, all_words)
vec2 = text_to_vector(text2_cleaned, all_words)

similarity = cosine_similarity(vec1, vec2)
print(f"Les textes ont un score de similarité de {round(similarity, 3)}. Ils ont été convertit en deux vecteurs de {len(vec1)} dimensions.")
|
Loading…
Add table
Reference in a new issue