counterfacto (8746B)
#!/usr/bin/env python2
# Counterfacto is Copyright (c) 2016 by the Dyne.org Foundation
# as part of the PIEnews project
#
# This file is part of Counterfacto
# Written by Ivan J. <parazyd@dyne.org>
#
# This source code is free software; you can redistribute it and/or
# modify it under the terms of the GNU Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This source code is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  Please refer
# to the GNU Public License for more details.
#
# You should have received a copy of the GNU Public License along with
# this source code; if not, write to: Free Software Foundation, Inc.,
# 675 Mass Ave, Cambridge, MA 02139, USA.
#
# This project has received funding from the European Union's Horizon
# 2020 Programme for research, technological development and
# demonstration under grant agreement nr. 687922

"""Counterfacto: flag counterfactual statements in a set of tweets.

Tweets come from a local file (-f), an account timeline (-a) or a search
(-s).  Each tweet is tokenized, part-of-speech tagged, serialised as
``token/tag/token/tag/...`` and matched against a fixed list of regular
expressions, one per counterfactual form.
"""

import io
import re
import sys

# nltk, twitter and twokenize are imported inside the functions that use
# them: usage errors are reported without paying for the nltk import, and
# the -f mode does not require the Twitter client to be installed.

# Path the tagged form of every processed tweet is written to.
taggedFile = 'tagged.txt'

# Number of form codes get_cf_form() can return (0 = none, 1..6 = forms).
FORM_NUM = 7

# Tokens that are always re-tagged as modals (MD), including the informal
# spellings that are common on Twitter.
_CF_MODALS = frozenset((
    'should', "should've", 'shouldve', 'shoulda', 'shulda', 'shuda',
    'shudda', 'shudve',
    'would', "would've", 'wouldve', 'woulda', 'wuda', 'wulda', 'wudda',
    'wudve', 'wlda',
    'could', "could've", 'couldve', 'coulda', 'cudda', 'culda', 'cudve',
    'must', 'mustve',
    'might', "might've", 'mightve',
    'ought', 'may',
    "i'd", 'id', "we'd", 'youd', "you'd", "he'd", "she'd",
))

# Tokens re-tagged as conditional conjunctions (CCJ).  They are the single
# words that can open phrases such as: as long as, even if, if, one
# condition that, provided (that), providing (that), so long as, unless,
# whether... or, supposing, suppose, imagine, but for.
_CCJ_WORDS = frozenset((
    'as', 'if', 'even', 'provided', 'providing', 'suppose', 'supposing',
    'unless', 'whether', 'envision', 'envisioning', 'conceptualize',
    'conceptualizing', 'conjure', 'conjuring', 'visualize', 'visualizing',
))

# A tagged message containing a "?" token is treated as a question.
_QUESTION_RE = re.compile(r'\.*/\?/.', re.IGNORECASE)

_NOUN_TAGS = r'((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/))'
_PAST_TAGS = r'((/VBN/)|(/VBD/))'

# (form code, pattern) pairs, tried in order; the first match wins.
_CF_FORMS = (
    # 1: wish verb form
    (1, re.compile(r'\.*(wish|wishing)/((VB.*/)|(JJ/))', re.IGNORECASE)),
    # 2: conjunction normal (condition first, modal consequence after)
    (2, re.compile(r'\.*/CCJ/.*((/VBD/)|(/VBN/)).*/MD/', re.IGNORECASE)),
    # 3: conjunction converse (modal consequence first, condition after)
    (3, re.compile(r'\.*/MD/.*/CCJ/.*((/VBN/)|(/VBD/))', re.IGNORECASE)),
    # 4: should have.  After "have/" the text holds the tag of "have" and
    # then the next token, so two word segments precede the past tag.
    (4, re.compile(
        r"\.*/((should've)/MD/)"
        r"|(((should)|(shoulda)|(shulda)|(shuda)|(shudda)|(shudve))/MD/"
        r"((have)|(hve)|(ve))/)(\w)*/(\w)*" + _PAST_TAGS,
        re.IGNORECASE)),
    # 5: verb inversion ("had I known ...", "were I ...")
    (5, re.compile(
        r"\.*(had/(\w)*/(\w)*" + _NOUN_TAGS + r".*" + _PAST_TAGS + r".*/MD/)"
        r"|(were/(\w)*/(\w)*" + _NOUN_TAGS + r".*/MD/)"
        r"|(/MD/.*/VB.*/had/(\w)*/(\w)*" + _NOUN_TAGS + r".*" + _PAST_TAGS
        + r")",
        re.IGNORECASE)),
    # 6: modal normal (two modal clauses)
    (6, re.compile(
        r'\.*/MD/.*((/VBN/)|(/VBD/)).*/MD/.*((/VBN/)|(/VBD/)|(/VB/)|(VBZ))',
        re.IGNORECASE)),
)


def _to_ascii(value):
    """Return value as a native str with every non-ASCII character dropped.

    Byte strings are decoded and text strings are encoded with the
    'ignore' error handler, so no input can raise an encoding error.
    Anything that is not a string is passed through str() first.
    """
    if not isinstance(value, (bytes, type(u''))):
        value = str(value)
    if isinstance(value, bytes):
        value = value.decode('ascii', 'ignore')
    ascii_bytes = value.encode('ascii', 'ignore')
    # On Python 2 the native str type is bytes; on Python 3 it is text.
    if str is bytes:
        return ascii_bytes
    return ascii_bytes.decode('ascii')


def _load_credentials(path='credentials'):
    """Execute the credentials file and return the names it defines.

    The file is expected to assign oatoken, oasecret, conskey and
    conssecret.  Raises IOError when the file cannot be read.
    """
    namespace = {}
    with open(path) as fd:
        # SECURITY: the credentials file is run as Python code.  Only use
        # a file you wrote yourself; a data-only format would be safer.
        exec(fd.read(), namespace)
    return namespace


def _twitter_client():
    """Build an authenticated Twitter client from the credentials file.

    Prints a message and exits with status 1 when the file is missing or
    does not define all four OAuth values.
    """
    from twitter import OAuth, Twitter

    try:
        creds = _load_credentials()
    except IOError:
        print('no credentials file found. please create it.')
        sys.exit(1)

    try:
        auth = OAuth(creds['oatoken'], creds['oasecret'],
                     creds['conskey'], creds['conssecret'])
    except KeyError as missing:
        print('credentials file does not define ' + str(missing))
        sys.exit(1)
    return Twitter(auth=auth)


def _write_tweets(path, texts):
    """Write one tweet per line to path, flattened to single-line ASCII."""
    with open(path, 'w') as out:
        for text in texts:
            out.write(_to_ascii(text).replace('\n', ' ') + '\n')


def main():
    """Command-line entry point.

    -f FILE      classify the tweets stored one per line in FILE
    -a ACCOUNT   fetch up to 100 tweets from ACCOUNT's timeline, then
                 classify them
    -s TERM...   fetch up to 100 tweets matching the search terms, then
                 classify them

    Fetched tweets are saved to fetchedtweets-<first argument>.txt.  Bad
    arguments print the usage line and exit with status 1; every other
    error propagates so its real cause stays visible.
    """
    args = sys.argv[1:]
    if len(args) < 2 or args[0] not in ('-f', '-a', '-s'):
        print("usage: " + sys.argv[0] +
              " [-a account] [-f tweetfile] [-s searchterm]")
        sys.exit(1)

    mode, target = args[0], args[1]
    if mode == '-f':
        classify(target)
        return

    api = _twitter_client()
    if mode == '-a':
        statuses = api.statuses.user_timeline(screen_name=target, count=100)
    else:
        searchterm = ' '.join(args[1:])
        statuses = api.search.tweets(q=searchterm, count=100)['statuses']

    tweetfile = 'fetchedtweets-' + target + '.txt'
    _write_tweets(tweetfile, (status['text'] for status in statuses))
    classify(tweetfile)


## {{{ processing functions
def tokenizelocal(path=None):
    """Print every tweet of a file followed by its token list.

    path is the tweet file to read.  When it is omitted the function
    falls back to a module-level ``tweetFile`` file object, which is what
    it always read before; ValueError is raised if neither is available.
    """
    import twokenize

    if path is None:
        handle = globals().get('tweetFile')
        if handle is None:
            raise ValueError('tokenizelocal() needs the path of a tweet file')
        tweets = handle.read().splitlines()
    else:
        with open(path) as handle:
            tweets = handle.read().splitlines()

    for t in tweets:
        print(t + '\n')
        print(str(twokenize.tokenize(t)) + '\n')


def format_tweet(message):
    """Return message as single-line ASCII text.

    Newlines become spaces and non-ASCII characters are dropped.
    """
    return _to_ascii(message).replace('\n', ' ')


def format_tagged(tagged_list):
    """Serialise (token, tag) pairs as ``token/tag/token/tag/...`` + newline.

    Every tag goes through postprocess_tag() first, so modals and
    conditional conjunctions carry the MD / CCJ tags the patterns in
    get_cf_form() look for.
    """
    parts = []
    for pair in tagged_list:
        token, tag = postprocess_tag(pair[0], pair[1])
        parts.append(token + '/' + tag + '/')
    return ''.join(parts) + '\n'


def postprocess_tag(token, tag):
    """Return (token, tag) with the tag overridden for special tokens.

    Known modal spellings become 'MD' and conditional conjunction words
    become 'CCJ'; the modal check takes precedence.  Any other token
    keeps its tag.
    """
    outtag = tag
    if is_twitter_cf_modal(token):
        outtag = 'MD'
    elif tag_CCJ(token):
        outtag = 'CCJ'
    return token, outtag


def get_cf_form(tagged_message):
    """Return the counterfactual form code of a tagged message.

    tagged_message is a string produced by format_tagged().  The result
    is 0 when the message is a question or matches no pattern, otherwise
    the code (1..6) of the first matching pattern:

    1 wish verb form, 2 conjunction normal, 3 conjunction converse,
    4 should have, 5 verb inversion, 6 modal normal.
    """
    # Questions are never counted as counterfactuals.
    if _QUESTION_RE.search(tagged_message) is not None:
        return 0

    for form, pattern in _CF_FORMS:
        if pattern.search(tagged_message) is not None:
            return form

    # If no matches
    return 0


def is_twitter_cf_modal(word):
    """Return True when word is a modal or one of its Twitter spellings.

    The comparison ignores case and any non-ASCII characters in word.
    """
    return _to_ascii(word).lower() in _CF_MODALS


def tag_CCJ(word):
    """Return True when word can open a conditional clause (case-blind)."""
    return word.lower() in _CCJ_WORDS


def get_tagged_message(message, tagger):
    """Tokenize and tag message, returning its format_tagged() string.

    tagger is any object whose tag(tokens) method returns a list of
    (token, tag) pairs, such as nltk's PerceptronTagger.
    """
    import twokenize

    tokens = twokenize.tokenize(format_tweet(message))
    return format_tagged(tagger.tag(tokens))
## }}}


def classify(tweetfile):
    """Tag every tweet in tweetfile and report the counterfactual ones.

    tweetfile holds one tweet per line.  The tagged form of each tweet is
    written to the path in ``taggedFile``, each tweet recognised as
    counterfactual is printed, and a final summary line gives the number
    of counterfactuals out of the number of tweets read.

    Returns the number of counterfactual tweets.
    """
    from nltk.tag.perceptron import PerceptronTagger

    form_counts = [0] * FORM_NUM
    total = 0

    # Non-ASCII bytes are dropped while reading: the tagging pipeline is
    # ASCII-only and such bytes must not abort the run.
    with io.open(tweetfile, 'r', encoding='ascii', errors='ignore') \
            as tweets_in, open(taggedFile, 'w') as tags_out:
        tagger = PerceptronTagger()

        print("Reading file...")
        for tweet in tweets_in:
            tagged_tweet = get_tagged_message(tweet, tagger)
            tags_out.write(tagged_tweet)
            form = get_cf_form(tagged_tweet)

            ## if our tweet is positive, print it
            if form != 0:
                print(tweet)

            form_counts[form] += 1
            total += 1

    print("finished tagging...")

    # Index 0 counts the tweets that matched no form.
    count = sum(form_counts[1:])
    print("counterfactuals: " + str(count) + "/" + str(total))
    return count


if __name__ == '__main__':
    main()