# webscraper.py
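"""Collect news articles related to a claim URL using the Event Registry API.

For each set of search keywords derived from the input URL, a worker thread
queries Event Registry; results are merged into a shared DataFrame and dumped
to CSV files consumed by the ML pipeline (ml/bodies.csv, ml/claims.csv).
"""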
from eventregistry import *
from threading import Thread, Lock
from py_ms_cognitive import PyMsCognitiveWebSearch
import nltk
import numpy as np
import pandas as pd
import io, json
import sys
# Fetch a list of recently added articles mentioning the entered keywords.
api_key = 'eda39267-9017-481a-860d-0b565c6d8bf3'  # better loaded from an environment variable than hard-coded
er = EventRegistry(apiKey=api_key)

global_df = pd.DataFrame()
mutex = Lock()
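# Design note: each worker thread appends its results to the shared global_df.
# pd.concat on a shared frame is not atomic, so every update must happen while
# holding `mutex`; the workers therefore build a private local_df first and
# only take the lock for the final merge.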
def get_articles(keywords):
    """Query Event Registry for articles matching all keywords and merge them into global_df."""
    global global_df
    # QueryArticles (not the Iter variant) returns the dict consumed below.
    q = QueryArticles(keywords=QueryItems.AND(keywords))
    q.setRequestedResult(RequestArticlesInfo(count=10, sortBy="sourceImportance"))
    local_df = pd.DataFrame()
    res = er.execQuery(q)
    for x, article in enumerate(res['articles']['results']):
        data = {
            'source': article['source']['title'],
            # 'title': article['title'],
            'url': article['url'],
            'text': article['body'],  # restored: web_scrape() writes this column to ml/bodies.csv
        }
        local_df = pd.concat([local_df, pd.DataFrame(data, index=[x])])
    # Only the final merge needs the lock; building local_df is thread-private.
    with mutex:
        global_df = pd.concat([global_df, local_df])
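# Shape of the Event Registry response consumed above (only the fields this
# script reads are shown; see the eventregistry docs for the full schema):
# {
#   "articles": {
#     "results": [
#       {"source": {"title": "..."}, "url": "...", "title": "...", "body": "..."},
#       ...
#     ]
#   }
# }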
class myThread(Thread):
    """Worker thread that runs a single Event Registry query."""
    def __init__(self, query):
        Thread.__init__(self)  # only Thread is imported, not the threading module
        self.query = query

    def run(self):
        get_articles(self.query)
def web_scrape(url):
    """Derive search queries from `url`, scrape matching articles in parallel,
    and dump the results to the CSV files consumed by the ML pipeline."""
    global global_df
    # get_keywords() and get_search_params() are expected to be provided elsewhere in the project.
    kl = get_keywords(url)
    params = get_search_params(kl)
    threads = []
    for query in params:
        t = myThread(query)
        t.start()
        threads.append(t)
    for thread in threads:
        thread.join()
    # global_df = global_df.reset_index(drop=True)
    # with io.open('data.json', 'w', encoding='utf-8') as f:
    #     f.write(json.dumps(global_df, ensure_ascii=False))
    global_df['id'] = range(len(global_df.index))
    bodies = global_df.loc[:, ['id', 'text']]
    bodies.to_csv('ml/bodies.csv')
    # The original referenced an undefined `claim` here; using the input URL as
    # the claim text is an assumption made to keep the function runnable.
    claim = [url] * len(global_df.index)
    claims = pd.DataFrame(claim)
    claims.to_csv('ml/claims.csv')
    urls = global_df.loc[:, ['id', 'source', 'url']]
    urls.to_csv('url.csv')
    return global_df.to_dict(orient='records')
def main(args):
    web_scrape(args[1])

if __name__ == '__main__':
    main(sys.argv)  # was web_scrape(sys.argv), which passed the whole argv list as the URL
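# Example invocation (hypothetical URL):
#   python webscraper.py https://example.com/some-claim-article
# This writes ml/bodies.csv, ml/claims.csv and url.csv, and returns the scraped
# rows as a list of dicts.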