counterfacto

small software tool to analyze twitter and highlight counterfactual statements
git clone git://parazyd.org/counterfacto.git
Log | Files | Refs | README | LICENSE

factolib.py (5455B)


      1 #!/usr/bin/env python2
      2 # Copyright (c) 2017 Ivan J. <parazyd@dyne.org>
      3 
      4 import nltk
      5 import re
      6 import twokenize
      7 from nltk.tag.perceptron import PerceptronTagger
      8 
      9 def tokenizelocal():
     10     tweets = tweetFile.read().splitlines()
     11     for t in tweets:
     12         print(t + '\n')
     13         print(str(twokenize.tokenize(t)) + '\n')
     14 
     15 def format_tweet(message):
     16     m = str(message)
     17     m = m.replace('\n', ' ')
     18     m = m.encode('ascii', 'ignore')
     19     return m
     20 
     21 def format_tagged(tagged_list):
     22     out = ''
     23     for t in tagged_list:
     24         token, tag = postprocess_tag(t[0], t[1])
     25         out = out + token + '/' + tag + '/'
     26     out = out + '\n'
     27     return out
     28 
     29 def postprocess_tag(token, tag):
     30     outtag = tag
     31     if (is_twitter_cf_modal(token)):
     32         outtag = 'MD'
     33     elif (tag_CCJ(token)):
     34         outtag = 'CCJ'
     35     return token, outtag
     36 
     37 def get_cf_form(tagged_message):
     38 
     39     # Filter out questions
     40     pq = re.compile('\.*/\?/.', re.IGNORECASE)
     41     if pq.search(tagged_message) != None:
     42         return 0
     43 
     44     # CASE 1 WISH VERB FORM
     45     p1 = re.compile('\.*(wish|wishing)/((VB.*/)|(JJ/))', re.IGNORECASE)
     46     if p1.search(tagged_message) != None:
     47         return 1
     48 
     49 
     50     # CASE 2 CONJUNTION NORMAL
     51     p2 = re.compile('\.*/CCJ/.*((/VBD/)|(/VBN/)).*/MD/', re.IGNORECASE)
     52     if p2.search(tagged_message) != None:
     53         return 2
     54 
     55 
     56     # CASE 3 CONJUNCTIVE CONVERSE
     57     p3 = re.compile('\.*/MD/.*/CCJ/.*((/VBN/)|(/VBD/))', re.IGNORECASE)
     58     if p3.search(tagged_message) != None:
     59         return 3
     60 
     61 
     62     # CASE 5 Should have
     63     p4 = re.compile('\.*/((should\'ve)/MD/)|(((should)|(shoulda)(shulda)|(shuda)|(shudda)|(shudve))/MD/((have)|(hve)|(ve))/)(\w)*((/VBN/)|(/VBD/))', re.IGNORECASE)
     64     if p4.search(tagged_message) != None:
     65         return 4
     66 
     67     # CASE 6 VERB INVERSION
     68     p5 = re.compile(("\.*(had/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*((/VBN/)|(/VBD/)).*/MD/)"
     69                     "|(were/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*/MD/)"
     70                     "|(/MD/.*/VB.*/had/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*((/VBN/)|(/VBD/)))"), re.IGNORECASE)
     71     if p5.search(tagged_message) != None:
     72         return 5
     73 
     74 
     75     # CASE 6 MODAL NORMAL
     76     p6 = re.compile('\.*/MD/.*((/VBN/)|(/VBD/)).*/MD/.*((/VBN/)|(/VBD/)|(/VB/)|(VBZ))', re.IGNORECASE)
     77     if p6.search(tagged_message) != None:
     78         return 6
     79 
     80     # If no matches
     81     return 0
     82 
     83 
     84 def is_twitter_cf_modal(word):
     85     w = unicode(word, errors='ignore').encode('utf-8').lower()
     86     if (w == 'should' or
     87         w == 'should\'ve' or
     88         w == 'shouldve' or
     89         w == 'shoulda' or
     90         w == 'shulda' or
     91         w == 'shuda' or
     92         w == 'shudda' or
     93         w == 'shudve' or
     94         w == 'would' or
     95         w == 'would\'ve' or
     96         w == 'wouldve' or
     97         w == 'woulda' or
     98         w == 'wuda' or
     99         w == 'wulda' or
    100         w == 'wudda' or
    101         w == 'wudve' or
    102         w == 'wlda' or
    103         w == 'could' or
    104         w == 'could\'ve' or
    105         w == 'couldve' or
    106         w == 'coulda' or
    107         w == 'cudda' or
    108         w == 'culda' or
    109         w == 'cudve' or
    110         w == 'must' or
    111         w == 'mustve' or
    112         w == 'might' or
    113         w == 'might\'ve' or
    114         w == 'mightve' or
    115         w == 'ought' or
    116         w == 'may' or
    117         w == 'i\'d' or
    118         w == 'id' or
    119         w == 'we\'d' or
    120         w == 'youd' or
    121         w == 'you\'d' or
    122         w == 'he\'d' or
    123         w == 'she\'d'):
    124             return True
    125     return False
    126 
    127 def tag_CCJ(word):
    128     w = word.lower()
    129     '''
    130     as long as, even if, if, one condition that, provided (that),
    131     providing (that), so long as, unless, whether... or, supposing,
    132     suppose, imagine, but for
    133     '''
    134     if(w == 'as' or
    135         w == 'if' or
    136         w == 'even' or
    137         w == 'provided' or
    138         w == 'providing' or
    139         w == 'suppose' or
    140         w == 'supposing' or
    141         w == 'unless' or
    142         w == 'whether' or
    143         w == 'envision' or
    144         w == 'envisioning' or
    145         w == 'conceptualize'or
    146         w == 'conceptualizing' or
    147         w == 'conjure' or
    148         w == 'conjuring' or
    149         w == 'visualize' or
    150         w == 'visualizing'):
    151         return True
    152     return False
    153 
    154 def get_tagged_message(message, tagger):
    155     tagset = None
    156     formatted_message = format_tweet(message)
    157     tokens = twokenize.tokenize(formatted_message)
    158     tags = nltk.tag._pos_tag(tokens, tagset, tagger)
    159     return format_tagged(tags)
    160 
    161 def classify(tweetfile, taggedfile):
    162     tweetfile  = open(tweetfile,  "r")
    163     taggedfile = open(taggedfile, "w")
    164     counterfactuals = open('counterfactuals.txt', 'w')
    165 
    166     tagger = PerceptronTagger()
    167     form_num = 8
    168 
    169     cf_count = [[0 for x in range(form_num)] for x in range(form_num)]
    170 
    171     form_vec = []
    172 
    173     print("Reading file...")
    174     tweet = tweetfile.readline()
    175 
    176     while tweet:
    177         taggedTweet = get_tagged_message(tweet, tagger)
    178         taggedfile.write(taggedTweet)
    179         form = int(get_cf_form(taggedTweet))
    180 
    181         if form:
    182             print(tweet)
    183             counterfactuals.write(tweet + '<hr>\n')
    184 
    185         form_vec.append(form)
    186         cf_count[form][0] += 1
    187         tweet = tweetfile.readline()
    188 
    189     count = 0
    190     for i in xrange(1, form_num):
    191         count += cf_count[i][0]
    192 
    193     print("Finished tagging...")
    194     tweetfile.close()
    195     taggedfile.close()
    196 
    197     print("counterfactuals: " + str(count) + "/100")
    198     counterfactuals.write("counterfactuals: " + str(count) + "/100<br>\n")
    199     counterfactuals.close()