Vecteurs de mots et sémantique – Vers la science des données
exploration d’opinion ou émotion AI

# Import spaCy and load the language library.
import spacy

# Make sure to use a larger model (medium or large) here.
nlp = spacy.load('en_core_web_lg')
# Or, alternatively, load the medium model instead:
# nlp = spacy.load('en_core_web_md')

# Process a sentence and look at the vector of the whole Doc.
doc = nlp(u'The quick brown fox jumped over the lazy dogs.')
# NOTE: as a bare expression this only displays a value in a notebook/REPL;
# wrap it in print() when running as a plain script.
doc.vector
# Create a three-token Doc object:
tokens = nlp(u'lion cat pet')

# Iterate through token combinations and print the similarity score of
# every ordered pair (including each token paired with itself):
for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))
Output:
lion lion 1.0
lion cat 0.526544
lion pet 0.399238
cat lion 0.526544
cat cat 1.0
cat pet 0.750546
pet lion 0.399238
pet cat 0.750546
pet pet 1.0
# Create a three-token Doc object:
tokens = nlp(u'like love hate')

# Iterate through token combinations and print the similarity score of
# every ordered pair (including each token paired with itself):
for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))
Output:
like like 1.0
like love 0.657904
like hate 0.657465
love like 0.657904
love love 1.0
love hate 0.63931
hate like 0.657465
hate love 0.63931
hate hate 1.0
# Inspect vector availability for known words and a made-up one ("nargle").
tokens = nlp(u'dog cat nargle')

# For each token print: its text, whether it has a vector, the norm of that
# vector, and the is_oov (out-of-vocabulary) flag.
for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)
Output:
dog True 7.03367 False
cat True 6.68082 False
nargle False 0.0 True
"queen" - "woman" + "man" = "king"
from scipy import spatial


def cosine_similarity(x, y):
    """Return the cosine similarity of vectors x and y.

    Computed as 1 minus SciPy's cosine distance.
    """
    return 1 - spatial.distance.cosine(x, y)


# Look up the vectors of the three words used in the analogy.
queen = nlp.vocab['queen'].vector
woman = nlp.vocab['woman'].vector
man = nlp.vocab['man'].vector

# Now we find the closest vector in the vocabulary to the result of
# "queen" - "woman" + "man"
new_vector = queen - woman + man

computed_similarities = []
for word in nlp.vocab:
    # Ignore words without vectors, mixed-case words and non-alphabetic entries:
    if word.has_vector and word.is_lower and word.is_alpha:
        similarity = cosine_similarity(new_vector, word.vector)
        computed_similarities.append((word, similarity))

# Sort by similarity, highest first, and show the ten closest words.
computed_similarities = sorted(computed_similarities, key=lambda item: -item[1])
print([w[0].text for w in computed_similarities[:10]])
['queen', 'king', 'kings', 'queens', 'prince', 'lord', 'throne', 'royal', 'god', 'monarch']