import numpy as np

def text2vec(text, dictionary):
    """Create a max-normalized float64 term frequency vector for a text.

    Args:
        text: String of whitespace-separated tokens.
        dictionary: Sequence of terms. A term's position in the sequence is
            its component in the returned vector; if a term is listed more
            than once, its first position is used.

    Returns:
        A float64 array of length ``len(dictionary)`` in which every term
        count is divided by the largest count found in the text. When no
        token of the text occurs in the dictionary (or the dictionary is
        empty) the all-zero vector is returned, because there is no maximum
        to normalize by.
    """
    # Build the term -> first position map once instead of scanning the
    # sequence (twice) for every single token.
    positions = {}
    for idx, term in enumerate(dictionary):
        positions.setdefault(term, idx)

    result = np.zeros(len(dictionary), dtype='float64')
    for token in text.split():
        idx = positions.get(token)
        if idx is not None:
            result[idx] += 1

    # Guard against 0/0 (no matching token) and against max() of an empty array.
    peak = result.max() if result.size else 0.0
    if peak == 0:
        return result
    return result / peak


# Toy corpus: every document is a string of whitespace-separated tokens.
docs = [
    "spatz amsel vogel drossel fink falke flug",
    "spatz vogel flug nest amsel amsel amsel",
    "kuckuck nest nest ei ei ei flug amsel amsel vogel",
    "amsel elster elster drossel vogel ei",
    "falke katze nest nest flug vogel",
    "spatz spatz konstruktion nest ei",
]

# Queries expressed with the same vocabulary as the documents.
queries = [
    "spatz vogel nest konstruktion",
    "amsel ei nest",
]

# Gather the distinct lower-cased tokens of all documents ...
bag = {word for text in docs for word in text.lower().split()}
# ... and sort them: a term's position in this list is its vector index.
mydictionary = sorted(bag)

# Show the dictionary on one line (each term followed by a space),
# then finish the line and leave one blank line.
print(''.join(term + ' ' for term in mydictionary), end='')
print('\n')

# Term frequency vectors of the queries, one row per query.
# text2vec divides by the largest count, so these are normalized, not raw.
qf = np.array([text2vec(query, mydictionary) for query in queries])

# Term frequency vectors of the documents, one row per document.
# The plain list is kept in temptf, the stacked 2-D array in tf.
temptf = [text2vec(doc, mydictionary) for doc in docs]
tf = np.array(temptf)
# find the vector of the first doc in tf[0]


# Document frequency: for every dictionary term, count the documents whose
# term frequency vector has a positive entry for that term.
df = np.zeros(len(mydictionary), dtype='float64')
for doc_vec in temptf:
    # The boolean mask adds 1.0 exactly where the term occurs in this document.
    df += doc_vec > 0

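# tf[x] contains the term frequency vector of document x, normalized by its largest count
# df contains the raw document frequency (number of documents containing each term)
# qf[y] contains the term frequency vector of query y, normalized by its largest count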

# Examples:
# print(tf[0]*np.log(len(docs)/df))  # tf*idf for the first doc ...
# print(np.inner(tf[0],qf[0]))  # scalar product
