factolib.py (5455B)
#!/usr/bin/env python2
# Copyright (c) 2017 Ivan J. <parazyd@dyne.org>
"""Rule-based detection of counterfactual forms in tweets.

A tweet is tokenised with ``twokenize``, part-of-speech tagged, flattened
into a single ``token/TAG/token/TAG/...`` line and then matched against a
fixed list of regular expressions (see ``get_cf_form``).

The code is written to run unchanged on Python 2 (per the shebang) and on
Python 3: no Python 2-only builtins are used.
"""

import re

import nltk
import twokenize
from nltk.tag.perceptron import PerceptronTagger

# Text type of the running interpreter (``unicode`` on Python 2, ``str`` on
# Python 3), obtained without naming the Python 2-only builtin.
_TEXT_TYPE = type(u'')

# Tokens that postprocess_tag() re-tags as modal ('MD').
_CF_MODALS = frozenset((
    'should', "should've", 'shouldve', 'shoulda', 'shulda', 'shuda',
    'shudda', 'shudve',
    'would', "would've", 'wouldve', 'woulda', 'wuda', 'wulda', 'wudda',
    'wudve', 'wlda',
    'could', "could've", 'couldve', 'coulda', 'cudda', 'culda', 'cudve',
    'must', 'mustve',
    'might', "might've", 'mightve',
    'ought', 'may',
    "i'd", 'id', "we'd", 'youd', "you'd", "he'd", "she'd",
))

# Tokens that postprocess_tag() re-tags as conditional conjunction ('CCJ').
_CCJ_WORDS = frozenset((
    'as', 'if', 'even', 'provided', 'providing', 'suppose', 'supposing',
    'unless', 'whether', 'envision', 'envisioning', 'conceptualize',
    'conceptualizing', 'conjure', 'conjuring', 'visualize', 'visualizing',
))

# All patterns below run with search() over the "token/TAG/token/TAG/..."
# line produced by format_tagged().  The leading r'\.*' (zero or more literal
# dots) is kept from the original patterns; under search() it is a no-op.

# A literal "?" token: any tagged message containing one is not classified.
_QUESTION_PATTERN = re.compile(r'\.*/\?/.', re.IGNORECASE)

# (form number, pattern) pairs, tried in this order; the first hit wins.
_CF_FORM_PATTERNS = (
    # FORM 1: WISH VERB FORM
    (1, re.compile(r'\.*(wish|wishing)/((VB.*/)|(JJ/))', re.IGNORECASE)),
    # FORM 2: CONJUNCTION NORMAL
    (2, re.compile(r'\.*/CCJ/.*((/VBD/)|(/VBN/)).*/MD/', re.IGNORECASE)),
    # FORM 3: CONJUNCTIVE CONVERSE
    (3, re.compile(r'\.*/MD/.*/CCJ/.*((/VBN/)|(/VBD/))', re.IGNORECASE)),
    # FORM 4: SHOULD HAVE
    # Two fixes relative to the earlier pattern:
    #   * "(shoulda)(shulda)" lacked the "|" between the alternatives;
    #   * after "have/" the pattern now skips the tag of "have" and the
    #     following token ("(\w)*/(\w)*") before requiring /VBN/ or /VBD/,
    #     the same way the verb-inversion pattern handles "had/".  Without
    #     that, the branch could not match the token/TAG/ layout at all.
    (4, re.compile(
        r"\.*/((should've)/MD/)"
        r"|(((should)|(shoulda)|(shulda)|(shuda)|(shudda)|(shudve))/MD/"
        r"((have)|(hve)|(ve))/(\w)*/(\w)*((/VBN/)|(/VBD/)))",
        re.IGNORECASE)),
    # FORM 5: VERB INVERSION
    (5, re.compile(
        r"\.*(had/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*((/VBN/)|(/VBD/)).*/MD/)"
        r"|(were/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*/MD/)"
        r"|(/MD/.*/VB.*/had/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*((/VBN/)|(/VBD/)))",
        re.IGNORECASE)),
    # FORM 6: MODAL NORMAL ("(VBZ)" now carries its slashes like its siblings)
    (6, re.compile(
        r'\.*/MD/.*((/VBN/)|(/VBD/)).*/MD/.*((/VBN/)|(/VBD/)|(/VB/)|(/VBZ/))',
        re.IGNORECASE)),
)


def tokenizelocal(tweet_file=None):
    """Print every line of a tweet file followed by its twokenize tokens.

    Args:
        tweet_file: Open file-like object with one tweet per line.  When
            omitted, the module-level global ``tweetFile`` is used, which is
            what this function always relied on.

    Raises:
        NameError: If ``tweet_file`` is omitted and no ``tweetFile`` global
            has been set on this module.
    """
    if tweet_file is None:
        # Legacy behaviour: nothing in this module defines tweetFile, so the
        # caller is expected to have injected it.
        tweet_file = tweetFile  # noqa: F821
    for tweet in tweet_file.read().splitlines():
        print(tweet + '\n')
        print(str(twokenize.tokenize(tweet)) + '\n')


def format_tweet(message):
    """Return *message* as a single-line, ASCII-only native string.

    Newlines become spaces and every non-ASCII character is dropped.

    Args:
        message: The tweet; byte strings, text strings and arbitrary objects
            (converted with ``str``) are accepted.

    Returns:
        A ``str`` of the running interpreter containing only ASCII.
    """
    if not isinstance(message, (bytes, _TEXT_TYPE)):
        message = str(message)
    if isinstance(message, bytes):
        # Decode explicitly: calling .encode() on a Python 2 byte string
        # performs a *strict* implicit ASCII decode first, which raised
        # UnicodeDecodeError on any tweet containing non-ASCII bytes.
        message = message.decode('ascii', 'ignore')
    ascii_bytes = message.replace('\n', ' ').encode('ascii', 'ignore')
    if str is bytes:
        # Python 2: the native string type is the byte string.
        return ascii_bytes
    return ascii_bytes.decode('ascii')


def format_tagged(tagged_list):
    """Flatten (token, tag) pairs into one ``token/TAG/token/TAG/...`` line.

    Each pair is first passed through ``postprocess_tag``.

    Args:
        tagged_list: Iterable of indexable pairs ``(token, tag)``.

    Returns:
        The flattened string terminated by a newline.
    """
    pieces = []
    for pair in tagged_list:
        token, tag = postprocess_tag(pair[0], pair[1])
        pieces.append(token + '/' + tag + '/')
    return ''.join(pieces) + '\n'


def postprocess_tag(token, tag):
    """Override the tagger's tag for modal and conditional tokens.

    Args:
        token: The token text.
        tag: The tag assigned by the tagger.

    Returns:
        ``(token, tag)`` where the tag is ``'MD'`` if the token is a
        counterfactual modal, ``'CCJ'`` if it is a conditional conjunction
        (modal takes precedence), and the original tag otherwise.
    """
    if is_twitter_cf_modal(token):
        return token, 'MD'
    if tag_CCJ(token):
        return token, 'CCJ'
    return token, tag


def get_cf_form(tagged_message):
    """Classify a tagged message into a counterfactual form.

    Args:
        tagged_message: String in the format produced by ``format_tagged``.

    Returns:
        ``0`` if the message contains a ``?`` token or matches no pattern;
        otherwise the number (1-6) of the first matching form:
        1 wish verb, 2 conjunction normal, 3 conjunctive converse,
        4 should have, 5 verb inversion, 6 modal normal.
    """
    # Filter out questions
    if _QUESTION_PATTERN.search(tagged_message) is not None:
        return 0

    for form, pattern in _CF_FORM_PATTERNS:
        if pattern.search(tagged_message) is not None:
            return form

    # If no matches
    return 0


def is_twitter_cf_modal(word):
    """Return True if *word* is a modal (or Twitter spelling of one).

    The comparison is case-insensitive.  For byte strings, bytes outside
    ASCII are dropped before comparing.

    Args:
        word: Token as a byte string or text string.
    """
    if isinstance(word, bytes):
        word = word.decode('ascii', 'ignore')
    return word.lower() in _CF_MODALS


def tag_CCJ(word):
    """Return True if *word* is treated as a conditional conjunction.

    The comparison is case-insensitive.  The word list was drawn up from
    phrases such as: as long as, even if, if, one condition that,
    provided (that), providing (that), so long as, unless, whether... or,
    supposing, suppose, imagine, but for.  Only the single words listed in
    ``_CCJ_WORDS`` are actually recognised.

    Args:
        word: Token to test.
    """
    return word.lower() in _CCJ_WORDS


def get_tagged_message(message, tagger):
    """Tokenise and POS-tag *message*, returning the flattened tag line.

    Args:
        message: Raw tweet.
        tagger: Object with a ``tag(tokens)`` method returning
            ``(token, tag)`` pairs, e.g. an NLTK ``PerceptronTagger``.

    Returns:
        The ``token/TAG/...`` line built by ``format_tagged``.
    """
    tokens = twokenize.tokenize(format_tweet(message))
    # Previously this went through the private nltk.tag._pos_tag(tokens,
    # None, tagger).  With no tagset that helper only forwards to
    # tagger.tag(); calling the public method avoids depending on a private
    # signature.  NOTE(review): the private helper's signature is believed
    # to differ between NLTK releases - confirm against the pinned version.
    return format_tagged(tagger.tag(tokens))


def classify(tweetfile, taggedfile,
             counterfactuals_file='counterfactuals.txt'):
    """Tag every tweet in a file and record the counterfactual ones.

    For each line of *tweetfile* the tagged form is appended to
    *taggedfile*.  Lines whose form is non-zero are printed and written to
    *counterfactuals_file* followed by ``<hr>``.  A summary line
    ``counterfactuals: <found>/<total>`` is printed and written at the end.

    Args:
        tweetfile: Path of the input file, one tweet per line.
        taggedfile: Path of the output file for tagged tweets (overwritten).
        counterfactuals_file: Path of the output file for detected
            counterfactuals (overwritten).
    """
    tagger = PerceptronTagger()
    found = 0
    total = 0

    print("Reading file...")
    # The with-block guarantees all three files are closed even when
    # tagging raises part-way through.
    with open(tweetfile, "r") as tweets, \
            open(taggedfile, "w") as tagged_out, \
            open(counterfactuals_file, "w") as cf_out:
        for tweet in tweets:
            total += 1
            tagged_tweet = get_tagged_message(tweet, tagger)
            tagged_out.write(tagged_tweet)

            if get_cf_form(tagged_tweet):
                found += 1
                print(tweet)
                cf_out.write(tweet + '<hr>\n')

        print("Finished tagging...")

        # The denominator is the number of tweets actually read; it used to
        # be the literal 100, which was only right for 100-line inputs.
        summary = "counterfactuals: " + str(found) + "/" + str(total)
        print(summary)
        cf_out.write(summary + "<br>\n")