counterfacto - counterfacto - small software tool to analyze twitter and highlight counterfactual statements

counterfacto (8746B)
      1 #!/usr/bin/env python2
      2 # Counterfacto is Copyright (c) 2016 by the Dyne.org Foundation
      3 # as part of the PIEnews project
      4 #
      5 # This file is part of Counterfacto
      6 # Written by Ivan J. <parazyd@dyne.org>
      7 #
      8 # This source code is free software; you can redistribute it and/or
      9 # modify it under the terms of the GNU Public License as published by
     10 # the Free Software Foundation; either version 3 of the License, or
     11 # (at your option) any later version.
     12 #
     13 # This source code is distributed in the hope that it will be useful,
     14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
     15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  Please refer
     16 # to the GNU Public License for more details.
     17 #
     18 # You should have received a copy of the GNU Public License along with
     19 # this source code; if not, write to: Free Software Foundation, Inc.,
     20 # 675 Mass Ave, Cambridge, MA 02139, USA.
     21 #
     22 # This project has received funding from the European Union's Horizon
     23 # 2020 Programme for research, technological development and
     24 # demonstration under grant agreement nr. 687922
     25 
     26 import nltk
     27 from nltk.tag.perceptron import PerceptronTagger
     28 import re
     29 import sys
     30 from twitter import *
     31 import twokenize
     32 
     33 global tweetfile
     34 global taggedFile
     35 taggedFile = 'tagged.txt'
     36 
     37 try:
     38     with open('credentials') as fd:
     39         exec(fd.read())
     40 except:
     41     print('no credentials file found. please create it.')
     42     exit(1)
     43 
     44 def main():
     45     ## credential check json
     46     #print(api.VerifyCredentials())
     47 
     48     try:
     49         if sys.argv[1] == '-f':
     50             tweetfile = sys.argv[2]
     51             classify(tweetfile)
     52 
     53         elif sys.argv[1] == '-a':
     54             api = Twitter(auth=OAuth(oatoken,oasecret,conskey,conssecret))
     55             accountname = sys.argv[2]
     56             statuses = api.statuses.user_timeline(screen_name=accountname,
     57                                                   count=100)
     58 
     59             tweetfile = 'fetchedtweets-' + sys.argv[2] + '.txt'
     60             tweetFile = open(tweetfile, 'w')
     61 
     62             for s in statuses:
     63                 sintweet = s['text']
     64                 sintweet = sintweet.replace('\n', ' ')
     65                 sintweet = sintweet.encode('ascii', 'ignore')
     66                 tweetFile.write(sintweet + '\n')
     67                 #print('wrote tweet')
     68 
     69             tweetFile.close()
     70             classify(tweetfile)
     71 
     72         elif sys.argv[1] == '-s':
     73             api = twitter.Api(consumer_key=twit_consumer_key,
     74                   consumer_secret=twit_consumer_secret,
     75                   access_token_key=twit_access_key,
     76                   access_token_secret=twit_access_secret)
     77 
     78             if len(sys.argv) >= 3:
     79                 searchterm = ' '.join(sys.argv[2:])
     80             else:
     81                 searchterm = sys.argv[2]
     82 
     83             statuses = api.GetSearch(term=searchterm,
     84                                      count=100)
     85                                      #result_type="recent")
     86 
     87             #for s in statuses:
     88             #    print(s.text)
     89             #exit()
     90 
     91             tweetfile = 'fetchedtweets-' + sys.argv[2] + '.txt'
     92             tweetFile = open(tweetfile, 'w')
     93 
     94             for s in statuses:
     95                 sintweet = s.text
     96                 sintweet = sintweet.replace('\n', ' ')
     97                 sintweet = sintweet.encode('ascii', 'ignore')
     98                 tweetFile.write(sintweet + '\n')
     99                 #print('wrote tweet')
    100 
    101             tweetFile.close()
    102             classify(tweetfile)
    103 
    104     except:
    105         print("usage: " + sys.argv[0] + " [-a account] [-f tweetfile] [-s searchterm]")
    106         exit(1)
    107 
    108 ## {{{ processing functions
    109 def tokenizelocal():
    110     tweets = tweetFile.read().splitlines()
    111     for t in tweets:
    112         print(t + '\n')
    113         print(str(twokenize.tokenize(t)) + '\n')
    114 
    115 def format_tweet(message):
    116     m = str(message)
    117     m = m.replace('\n', ' ')
    118     m = m.encode('ascii', 'ignore')
    119     return m
    120 
    121 def format_tagged(tagged_list):
    122     out = ''
    123     for t in tagged_list:
    124         token, tag = postprocess_tag(t[0], t[1])
    125         out = out + token + '/' + tag + '/'
    126     out = out + '\n'
    127     return out
    128 
    129 def postprocess_tag(token, tag):
    130     outtag = tag
    131     if (is_twitter_cf_modal(token)):
    132         outtag = 'MD'
    133     elif (tag_CCJ(token)):
    134         outtag = 'CCJ'
    135     return token, outtag
    136 
    137 def get_cf_form(tagged_message):
    138 
    139     # Filter out questions
    140     pq = re.compile('\.*/\?/.', re.IGNORECASE)
    141     if pq.search(tagged_message) != None:
    142         return 0
    143 
    144     # CASE 1 WISH VERB FORM
    145     p1 = re.compile('\.*(wish|wishing)/((VB.*/)|(JJ/))', re.IGNORECASE)
    146     if p1.search(tagged_message) != None:
    147         return 1
    148 
    149 
    150     # CASE 2 CONJUNTION NORMAL
    151     p2 = re.compile('\.*/CCJ/.*((/VBD/)|(/VBN/)).*/MD/', re.IGNORECASE)
    152     if p2.search(tagged_message) != None:
    153         return 2
    154 
    155 
    156     # CASE 3 CONJUNCTIVE CONVERSE
    157     p3 = re.compile('\.*/MD/.*/CCJ/.*((/VBN/)|(/VBD/))', re.IGNORECASE)
    158     if p3.search(tagged_message) != None:
    159         return 3
    160 
    161 
    162     # CASE 5 Should have
    163     p4 = re.compile('\.*/((should\'ve)/MD/)|(((should)|(shoulda)(shulda)|(shuda)|(shudda)|(shudve))/MD/((have)|(hve)|(ve))/)(\w)*((/VBN/)|(/VBD/))', re.IGNORECASE)
    164     if p4.search(tagged_message) != None:
    165         return 4
    166 
    167     # CASE 6 VERB INVERSION
    168     p5 = re.compile(("\.*(had/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*((/VBN/)|(/VBD/)).*/MD/)"
    169                     "|(were/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*/MD/)"
    170                     "|(/MD/.*/VB.*/had/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*((/VBN/)|(/VBD/)))"), re.IGNORECASE)
    171     if p5.search(tagged_message) != None:
    172         return 5
    173 
    174 
    175     # CASE 6 MODAL NORMAL
    176     p6 = re.compile('\.*/MD/.*((/VBN/)|(/VBD/)).*/MD/.*((/VBN/)|(/VBD/)|(/VB/)|(VBZ))', re.IGNORECASE)
    177     if p6.search(tagged_message) != None:
    178         return 6
    179 
    180     # If no matches
    181     return 0
    182 
    183 
    184 
    185 def is_twitter_cf_modal(word):
    186     w = unicode(word, errors='ignore').encode('utf-8').lower()
    187     if (w == 'should' or
    188         w == 'should\'ve' or
    189         w == 'shouldve' or
    190         w == 'shoulda' or
    191         w == 'shulda' or
    192         w == 'shuda' or
    193         w == 'shudda' or
    194         w == 'shudve' or
    195         w == 'would' or
    196         w == 'would\'ve' or
    197         w == 'wouldve' or
    198         w == 'woulda' or
    199         w == 'wuda' or
    200         w == 'wulda' or
    201         w == 'wudda' or
    202         w == 'wudve' or
    203         w == 'wlda' or
    204         w == 'could' or
    205         w == 'could\'ve' or
    206         w == 'couldve' or
    207         w == 'coulda' or
    208         w == 'cudda' or
    209         w == 'culda' or
    210         w == 'cudve' or
    211         w == 'must' or
    212         w == 'mustve' or
    213         w == 'might' or
    214         w == 'might\'ve' or
    215         w == 'mightve' or
    216         w == 'ought' or
    217         w == 'may' or
    218         w == 'i\'d' or
    219         w == 'id' or
    220         w == 'we\'d' or
    221         w == 'youd' or
    222         w == 'you\'d' or
    223         w == 'he\'d' or
    224         w == 'she\'d'):
    225             return True
    226     return False
    227 
    228 def tag_CCJ(word):
    229     w = word.lower()
    230     '''
    231     as long as, even if, if, one condition that, provided (that),
    232     providing (that), so long as, unless, whether... or, supposing,
    233     suppose, imagine, but for
    234     '''
    235     if(w == 'as' or
    236         w == 'if' or
    237         w == 'even' or
    238         w == 'provided' or
    239         w == 'providing' or
    240         w == 'suppose' or
    241         w == 'supposing' or
    242         w == 'unless' or
    243         w == 'whether' or
    244         w == 'envision' or
    245         w == 'envisioning' or
    246         w == 'conceptualize'or
    247         w == 'conceptualizing' or
    248         w == 'conjure' or
    249         w == 'conjuring' or
    250         w == 'visualize' or
    251         w == 'visualizing'):
    252         return True
    253     return False
    254 
    255 def get_tagged_message(message, tagger):
    256     tagset = None
    257     formatted_message = format_tweet(message)
    258     tokens = twokenize.tokenize(formatted_message)
    259     tags = nltk.tag._pos_tag(tokens, tagset, tagger)
    260     return format_tagged(tags)
    261 ## }}}
    262 
    263 def classify(tweetfile):
    264     tweetFile = open(tweetfile, 'r')
    265     tagFile = open(taggedFile, 'w')
    266 
    267     tagger = PerceptronTagger()
    268     form_num = 7
    269 
    270     cf_count = [[0 for x in range(form_num)] for x in range(form_num)]
    271 
    272     form_vec = []
    273 
    274     print("Reading file...")
    275     tweet = tweetFile.readline()
    276 
    277     while tweet != '':
    278         taggedTweet = get_tagged_message(tweet, tagger)
    279         tagFile.write(taggedTweet)
    280         #print("did tweet")
    281         form = int(get_cf_form(taggedTweet))
    282 
    283         ## if our tweet is positive, print it
    284         if form != 0:
    285             print(tweet)
    286 
    287         form_vec.append(form)
    288 
    289         cf_count[form][0] = cf_count[form][0] + 1
    290 
    291         tweet = tweetFile.readline()
    292 
    293     count = 0
    294     for i in xrange(1,form_num):
    295         count = count + cf_count[i][0]
    296 
    297 
    298     print("finished tagging...")
    299     tweetFile.close()
    300     tagFile.close()
    301 
    302     print("counterfactuals: " + str(count) + "/100")
    303 
    304 main()
	counterfacto small software tool to analyze twitter and highlight counterfactual statements
	git clone git://parazyd.org/counterfacto.git
	Log \| Files \| Refs \| README \| LICENSE