-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetQuestionComments.py
executable file
·103 lines (84 loc) · 2.75 KB
/
getQuestionComments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# -*- coding: utf-8 -*-
import re
import sys
sys.path.append("/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages")
import iso8601
import operator
from processUnis import *
# Matches URLs (http(s)://..., www...., or bare "domain.tld/..." forms) so they
# can be removed from a post before checking it for a question mark.
_URL_PATTERN = re.compile(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))')

# Rows whose university field contains any of these substrings are skipped.
_SKIPPED_UNIVERSITIES = ("Rhodes", "Southwestern", "University of California--Davis")


def _write_comment_rows(out, uni, postID, message, comments):
    """Write one pipe-delimited row per comment of a single post to ``out``.

    ``comments`` is sorted in place by each comment's 'isotime' value
    (presumably chronological order -- confirm against the producer of the
    comment dicts) before the rows are written.
    """
    comments.sort(key=operator.itemgetter('isotime'))
    for comment in comments:
        # '|' is the column delimiter, so it must not appear in the comment.
        comment['message'] = comment['message'].replace("|", "")
        out.write("|".join([
            uni,
            postID,
            message,
            comment['userName'],
            comment['time'],
            comment["commentID"],
            comment['message'].strip(),
        ]) + "\n")


def getQuestionComments(start=1685, stop=3779):
    """Export the comments of every question post in a slice of the input.

    Reads ``FCBQuestionsOnly.csv`` (pipe-delimited, first row is a header).
    For each data row in ``rows[start:stop]``, field 2 is used as the post
    id, field 3 as the university and field 7 as the post message.  URLs are
    removed from the message; if a "?" remains, the post's comments are
    fetched with ``commentsSpecificPost(uni, postID)`` and one row per
    comment is written to ``CommentsQuestionsPhase9.csv``.  Posts whose
    processing raises an exception are recorded in ``Discarded.csv``.

    Both output files are truncated and rewritten on every call.

    Args:
        start: Index of the first data row (header excluded) to process.
        stop: Index one past the last data row to process.

    Returns:
        None.  The index reached is printed as ``Coded Till <index>`` when
        the run ends, including when it is interrupted, so that a later run
        can resume from there.

    NOTE(review): ``commentsSpecificPost`` is not defined in this file; it is
    presumably supplied by ``from processUnis import *``.  The original
    indentation of this function was lost, so the nesting of the "here "
    message, the discard write and the final progress message was
    reconstructed -- confirm against the intended behaviour.
    """
    with open("FCBQuestionsOnly.csv") as source:
        # Drop the header row; start/stop index into the data rows only.
        rows = source.readlines()[1:]

    index = start
    with open("CommentsQuestionsPhase9.csv", 'w') as f, \
            open("Discarded.csv", "w") as g:
        f.write("university | postID | Post | Username | Time | CommentID | Comment\n")
        g.write("uni|Post\n")
        try:
            for line in rows[start:stop]:
                index += 1
                fields = line.split('|')
                if len(fields) < 8:
                    # A blank or truncated row has no message column.
                    print("skipping malformed row %d" % index)
                    continue
                postID = fields[2]
                uni = fields[3]
                if any(name in uni for name in _SKIPPED_UNIVERSITIES):
                    continue
                message = _URL_PATTERN.sub('', fields[7])
                if "?" not in message:
                    continue
                try:
                    comments = commentsSpecificPost(uni, postID)
                    if comments is None:
                        print("except")
                        continue
                    _write_comment_rows(f, uni, postID, message, comments)
                    print("here ")
                except Exception as exc:
                    # Best effort: note the failing post and keep going.
                    # KeyboardInterrupt/SystemExit are deliberately not caught.
                    print("except %r" % (exc,))
                    g.write(uni + "|" + postID + "\n")
        finally:
            print("Coded Till %d" % index)
# Script entry point: the export runs as soon as this file is executed.
# There is no __main__ guard, so importing this module runs it as well.
getQuestionComments()