import requests

def fetch(url):
    '''Download a URL and return the text of the page.'''
    return requests.get(url).text
# fetch Shakespeare's sonnets
text = fetch('http://www.gutenberg.org/cache/epub/1041/pg1041.txt')
len(text)
# take a look at what we have
# it's dirty
text[1000:3001]
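The noise at both ends is Project Gutenberg boilerplate. One way to trim it, as a sketch: Gutenberg files usually delimit the work with '*** START OF ...' and '*** END OF ...' markers, though the exact wording varies by edition, so check your file first.
# trim the Gutenberg header and footer (assumes the usual markers are present)
start = text.find('*** START OF')
end = text.find('*** END OF')
if start != -1 and end != -1:
    text = text[text.find('\n', start) + 1:end]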
import gensim
# simple_preprocess lowercases, strips punctuation and numbers,
# and keeps tokens of 2-15 characters
tokenized = gensim.utils.simple_preprocess(text)
len(tokenized)
# cleaned up
tokenized[1000:1021]
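To see exactly what simple_preprocess does, run it on a single line (this one is the opening of Sonnet 1):
gensim.utils.simple_preprocess("From fairest creatures we desire increase,")
# ['from', 'fairest', 'creatures', 'we', 'desire', 'increase']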
from collections import Counter
# find the frequency of each word in list
# NLP speak: the most frequently occurring tokens in the corpus
c = Counter(tokenized)
c.most_common(20) # top 20
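The top of that list is all function words. A sketch that filters them out to surface more telling frequencies; the stopword list here is hand-picked for illustration, not taken from any library:
stopwords = {'and', 'the', 'of', 'to', 'in', 'my', 'that', 'thy', 'thou',
             'with', 'for', 'is', 'not', 'but', 'me', 'thee', 'so', 'as'}
[(w, n) for w, n in c.most_common(100) if w not in stopwords][:20]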
# min_count=3: ignore words that appear fewer than 3 times
# vector_size=100: dimensionality of the word vectors
# (this parameter was called size before gensim 4.0)
model = gensim.models.Word2Vec([tokenized], min_count=3, vector_size=100)
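Passing [tokenized] trains on the whole corpus as one long 'sentence'. An alternative sketch, assuming line breaks roughly mark phrase boundaries, is to tokenize line by line so Word2Vec sees more natural context windows (model_by_line is a hypothetical name; the rest of this walkthrough keeps using model):
lines = [gensim.utils.simple_preprocess(line) for line in text.splitlines()]
lines = [l for l in lines if l]  # drop lines that tokenized to nothing
model_by_line = gensim.models.Word2Vec(lines, min_count=3, vector_size=100)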
model.wv.similarity('man', 'woman')
model.wv.similarity('woman', 'woman')  # a word is always perfectly similar to itself: 1.0
# vector representation of the word 'love'
# (indexing the model directly, model['love'], was removed in gensim 4.0; use model.wv)
model.wv['love']
vec1 = model.wv['love']
# cosine similarity between 'love' and top 20 most similar words
# in the context of Shakespeare's Sonnets.
model.wv.similar_by_vector(vec1, topn=20)
This means the vectors for 'and', 'in', 'my', and 'so' ended up closest to the vector for 'love'; in other words, those words appear in the most similar contexts to 'love' across the sonnets. With a corpus this small, common function words tend to dominate such lists.
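Note that similar_by_vector ranks the query word itself first (similarity 1.0). gensim's most_similar takes the word directly and excludes it from the results:
model.wv.most_similar('love', topn=20)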
vec2 = model.wv['die']
model.wv.similar_by_vector(vec2, topn=10)
vec3 = model.wv['death']
model.wv.similar_by_vector(vec3, topn=10)
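Under the hood, every similarity call above is cosine similarity between two vectors. A short numpy sketch that reproduces wv.similarity for 'die' and 'death':
import numpy as np
def cosine(a, b):
    '''cosine similarity: dot product of the unit-normalized vectors'''
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
cosine(vec2, vec3)  # should match model.wv.similarity('die', 'death')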