top_fact.py
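# Rank knowledge-base facts for KB-REF: for every referring expression, score
# the Wikipedia, ConceptNet and WebChild facts of each candidate object by
# cosine similarity between averaged word2vec embeddings, and keep the top 50
# facts per candidate.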
import json
import random
import time

import numpy as np
from gensim.models import word2vec
from nltk import sent_tokenize, word_tokenize

if __name__ == '__main__':
    model = word2vec.Word2Vec.load('./word2vec_model/facts.model')
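    # Word vectors are 300-dimensional; tokens missing from the vocabulary
    # raise KeyError in model.wv.get_vector and are skipped.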
    with open('./data/KB-REF/expression.json') as file:
        expression = json.load(file)
    with open('./data/KB-REF/candidate.json') as file:
        cand = json.load(file)
    with open('./data/KB-REF/Wikipedia.json') as file:
        Wikipedia = json.load(file)
    with open('./data/KB-REF/ConceptNet.json') as file:
        ConceptNet = json.load(file)
    with open('./data/KB-REF/WebChild.json') as file:
        WebChild = json.load(file)
    with open('./data/KB-REF/objects.json') as file:
        objects = json.load(file)
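    # top_facts maps expression key -> {candidate object id -> top-50 fact list}.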
    top_facts = {}
    for k in expression:
        start = time.time()
        middle = {}
        img = k.split('_')[0]
        e = expression[k][0]
        candidates = cand[img]
        # Embed the expression as the mean vector of its in-vocabulary tokens.
        j = 0
        em = np.zeros(300)
        for f in word_tokenize(e):
            try:
                em += model.wv.get_vector(f.lower())
                j += 1
            except KeyError:
                continue
        if j != 0:
            em /= j
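        # Score the facts of each candidate object; id '-1' marks an empty slot.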
        for c in candidates:
            if c != '-1':
                sims = []
                fs = []
                final = []
                o = objects[img][c][0].split('.')[0]
                try:
                    facts = sent_tokenize(Wikipedia[o.lower()])
                    for fact in facts:
                        j = 0
                        nm = np.zeros(300)
                        for f in word_tokenize(fact):
                            try:
                                nm += model.wv.get_vector(f.lower())
                                j += 1
                            except KeyError:
                                continue
                        if j != 0:
                            nm /= j
                            # Cosine similarity, rescaled from [-1, 1] to [0, 1].
                            sim = np.dot(em, nm) / (np.linalg.norm(em) * np.linalg.norm(nm))
                            sim = 0.5 + 0.5 * sim
                            fs.append(fact)
                            sims.append(sim)
                except KeyError:
                    # No Wikipedia entry for this object; try the other KBs.
                    pass
                try:
                    # ConceptNet facts are stored as one string; restore the
                    # sentence breaks and drop the 'has/have ' prefix first.
                    facts = sent_tokenize(ConceptNet[o.lower()].replace('.', '. ').replace('has/have ', ''))
                    for fact in facts:
                        j = 0
                        nm = np.zeros(300)
                        for f in word_tokenize(fact):
                            try:
                                nm += model.wv.get_vector(f.lower())
                                j += 1
                            except KeyError:
                                continue
                        if j != 0:
                            nm /= j
                            sim = np.dot(em, nm) / (np.linalg.norm(em) * np.linalg.norm(nm))
                            sim = 0.5 + 0.5 * sim
                            fs.append(fact)
                            sims.append(sim)
                except KeyError:
                    # No ConceptNet entry for this object.
                    pass
                try:
                    facts = sent_tokenize(WebChild[o.lower()])
                    for fact in facts:
                        j = 0
                        nm = np.zeros(300)
                        for f in word_tokenize(fact):
                            try:
                                nm += model.wv.get_vector(f.lower())
                                j += 1
                            except KeyError:
                                continue
                        if j != 0:
                            nm /= j
                            sim = np.dot(em, nm) / (np.linalg.norm(em) * np.linalg.norm(nm))
                            sim = 0.5 + 0.5 * sim
                            fs.append(fact)
                            sims.append(sim)
                except KeyError:
                    # No WebChild entry for this object.
                    pass
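                # Keep the 50 highest-scoring facts across the three KBs and
                # shuffle their order.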
                sims = np.array(sims)
                inxs = np.argsort(-sims)[:50]
                for ix in inxs:
                    final.append(fs[ix])
                random.shuffle(final)
                middle[c] = final
        top_facts[k] = middle
        print(time.time() - start)
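    # Write the ranked facts for all expressions to disk.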
    with open('./json/top_facts.json', 'w') as file:
        json.dump(top_facts, file)