counterfacto

small software tool to analyze twitter and highlight counterfactual statements
git clone git://parazyd.org/counterfacto.git
Log | Files | Refs | README | LICENSE

commit d284382baf093bbecb220b64ce241c8b663ddf62
Author: parazyd <parazyd@dyne.org>
Date:   Thu, 22 Sep 2016 17:05:17 +0200

initial import

Diffstat:
A README.md    |  63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A counterfacto | 304 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A twokenize.py | 298 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 665 insertions(+), 0 deletions(-)

diff --git a/README.md b/README.md
@@ -0,0 +1,63 @@
+
+# Counterfacto
+
+Counterfactual (noun)
+
+Definition: the tendency to create possible alternatives to life events that have already occurred; something that is contrary to what actually happened.
+
+Effects: it starts off with disappointment, then one will be able to uncover insights or knowledge that can be used to enhance future performance, leading to a better outcome in life.
+
+----------------------------------------------------------------------------------
+
+Counterfacto is a small software tool that can analyse search results on twitter to highlight counterfactual statements on certain topics.
+
+This tool is used by PIEnews.eu researchers for sentiment analysis about poverty and other related topics, to understand actual stories elaborated as counterfactual.
+
+We deem such a tool as a useful experiment, considering the importance of counterfactual analysis for political sentiment assessments and focus on news stories.
+
+## Dependencies
+
+Python is required along the following packages:
+
+```
+python-twitter python-nltk
+```
+
+Then run the `python` console in a terminal and type
+
+```
+import nltk
+nltk.download('averaged_perceptron_tagger')
+```
+
+This will download the nltk_data folder and place it in your `$HOME`.
+
+Your distro may have an outdated nltk (less than 3.2) without the perceptron module, in that case an update from `pip` is needed:
+
+```
+pip install nltk --upgrade
+```
+
+## References
+
+- Learning Representations for Counterfactual Inference (2016) http://jmlr.org/proceedings/papers/v48/johansson16.pdf
+
+- Bounding and Minimizing Counterfactual Error (2016) https://arxiv.org/abs/1606.03976
+
+- "Counterfactuals in the Language of Social Media: A Natural Language Processing Project in Conjunction with the World Well Being Project" (2015) http://www.seas.upenn.edu/~cse400/CSE400_2015_2016/reports/report_15.pdf
+
+## Licensing
+
+Counterfacto is Copyright (C) 2016 by the Dyne.org Foundation
+as part of the PIEnews project
+
+Software written by Ivan J. <parazyd@dyne.org>
+with contributions by Denis Roio <jaromil@dyne.org>
+
+This source code is free software; you can redistribute it and/or modify it under the terms of the GNU Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version.
+
+This source code is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. Please refer to the GNU Public License for more details.
+
+You should have received a copy of the GNU Public License along with this source code; if not, write to: Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+This project has received funding from the European Union’s Horizon 2020 Programme for research, technological development and demonstration under grant agreement nr. 687922
diff --git a/counterfacto b/counterfacto
@@ -0,0 +1,304 @@
+#!/usr/bin/env python2
+# Counterfacto is Copyright (C) 2016 by the Dyne.org Foundation
+# as part of the PIEnews project
+#
+# This file is part of Counterfacto
+# Written by Ivan J. <parazyd@dyne.org>
+#
+# This source code is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This source code is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. Please refer
+# to the GNU Public License for more details.
+#
+# You should have received a copy of the GNU Public License along with
+# this source code; if not, write to: Free Software Foundation, Inc.,
+# 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+# This project has received funding from the European Union's Horizon
+# 2020 Programme for research, technological development and
+# demonstration under grant agreement nr. 687922
+
+import nltk
+from nltk.tag.perceptron import PerceptronTagger
+import re
+import sys
+import twitter
+import twokenize
+
+
+global tweetfile
+global taggedFile
+taggedFile = 'tagged.txt'
+
+## get these by creating a twitter app
+twit_consumer_key = ''
+twit_consumer_secret = ''
+twit_access_key = ''
+twit_access_secret = ''
+
+def main():
+    ## credential check json
+    #print(api.VerifyCredentials())
+
+    try:
+        if sys.argv[1] == '-f':
+            tweetfile = sys.argv[2]
+            classify(tweetfile)
+
+        elif sys.argv[1] == '-a':
+            api = twitter.Api(consumer_key=twit_consumer_key,
+                              consumer_secret=twit_consumer_secret,
+                              access_token_key=twit_access_key,
+                              access_token_secret=twit_access_secret)
+
+            accountname = sys.argv[2]
+            statuses = api.GetUserTimeline(screen_name=accountname,
+                                           count=100)
+
+            tweetfile = 'fetchedtweets-' + sys.argv[2] + '.txt'
+            tweetFile = open(tweetfile, 'w')
+
+            for s in statuses:
+                sintweet = s.text
+                sintweet = sintweet.encode('ascii', 'ignore')
+                tweetFile.write(sintweet + '\n')
+                #print('wrote tweet')
+
+            tweetFile.close()
+            classify(tweetfile)
+
+        elif sys.argv[1] == '-s':
+            api = twitter.Api(consumer_key=twit_consumer_key,
+                              consumer_secret=twit_consumer_secret,
+                              access_token_key=twit_access_key,
+                              access_token_secret=twit_access_secret)
+
+            if len(sys.argv) >= 3:
+                searchterm = ' '.join(sys.argv[2:])
+                statuses = api.GetSearch(term=searchterm,
+                                         count=100,
+                                         result_type="recent")
+
+            #for s in statuses:
+            #    print(s.text)
+            #exit()
+
+            tweetfile = 'fetchedtweets-' + sys.argv[2] + '.txt'
+            tweetFile = open(tweetfile, 'w')
+
+            for s in statuses:
+                sintweet = s.text
+                sintweet = sintweet.replace('\n', ' ')
+                sintweet = sintweet.encode('ascii', 'ignore')
+                tweetFile.write(sintweet + '\n')
+                #print('wrote tweet')
+
+            tweetFile.close()
+            classify(tweetfile)
+
+    except:
+        print("Usage: ./counterfacto {-a|-f|-s} {twitter account | file with tweets | search terms}")
+        exit(1)
+
+## {{{ processing functions
+def tokenizelocal():
+    tweets = tweetFile.read().splitlines()
+    for t in tweets:
+        print(t + '\n')
+        print(str(twokenize.tokenize(t)) + '\n')
+
+def format_tweet(message):
+    m = str(message)
+    m = m.replace('\n', ' ')
+    m = m.encode('ascii', 'ignore')
+    return m
+
+def format_tagged(tagged_list):
+    out = ''
+    for t in tagged_list:
+        token, tag = postprocess_tag(t[0], t[1])
+        out = out + token + '/' + tag + '/'
+    out = out + '\n'
+    return out
+
+def postprocess_tag(token, tag):
+    outtag = tag
+    if (is_twitter_cf_modal(token)):
+        outtag = 'MD'
+    elif (tag_CCJ(token)):
+        outtag = 'CCJ'
+    return token, outtag
+
+def get_cf_form(tagged_message):
+
+    # Filter out questions
+    pq = re.compile('\.*/\?/.', re.IGNORECASE)
+    if pq.search(tagged_message) != None:
+        return 0
+
+    # CASE 1 WISH VERB FORM
+    p1 = re.compile('\.*(wish|wishing)/((VB.*/)|(JJ/))', re.IGNORECASE)
+    if p1.search(tagged_message) != None:
+        return 1
+
+
+    # CASE 2 CONJUNTION NORMAL
+    p2 = re.compile('\.*/CCJ/.*((/VBD/)|(/VBN/)).*/MD/', re.IGNORECASE)
+    if p2.search(tagged_message) != None:
+        return 2
+
+
+    # CASE 3 CONJUNCTIVE CONVERSE
+    p3 = re.compile('\.*/MD/.*/CCJ/.*((/VBN/)|(/VBD/))', re.IGNORECASE)
+    if p3.search(tagged_message) != None:
+        return 3
+
+
+    # CASE 5 Should have
+    p4 = re.compile('\.*/((should\'ve)/MD/)|(((should)|(shoulda)(shulda)|(shuda)|(shudda)|(shudve))/MD/((have)|(hve)|(ve))/)(\w)*((/VBN/)|(/VBD/))', re.IGNORECASE)
+    if p4.search(tagged_message) != None:
+        return 4
+
+    # CASE 6 VERB INVERSION
+    p5 = re.compile(("\.*(had/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*((/VBN/)|(/VBD/)).*/MD/)"
+                     "|(were/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*/MD/)"
+                     "|(/MD/.*/VB.*/had/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*((/VBN/)|(/VBD/)))"), re.IGNORECASE)
+    if p5.search(tagged_message) != None:
+        return 5
+
+
+    # CASE 6 MODAL NORMAL
+    p6 = re.compile('\.*/MD/.*((/VBN/)|(/VBD/)).*/MD/.*((/VBN/)|(/VBD/)|(/VB/)|(VBZ))', re.IGNORECASE)
+    if p6.search(tagged_message) != None:
+        return 6
+
+    # If no matches
+    return 0
+
+
+
+def is_twitter_cf_modal(word):
+    w = unicode(word, errors='ignore').encode('utf-8').lower()
+    if (w == 'should' or
+        w == 'should\'ve' or
+        w == 'shouldve' or
+        w == 'shoulda' or
+        w == 'shulda' or
+        w == 'shuda' or
+        w == 'shudda' or
+        w == 'shudve' or
+        w == 'would' or
+        w == 'would\'ve' or
+        w == 'wouldve' or
+        w == 'woulda' or
+        w == 'wuda' or
+        w == 'wulda' or
+        w == 'wudda' or
+        w == 'wudve' or
+        w == 'wlda' or
+        w == 'could' or
+        w == 'could\'ve' or
+        w == 'couldve' or
+        w == 'coulda' or
+        w == 'cudda' or
+        w == 'culda' or
+        w == 'cudve' or
+        w == 'must' or
+        w == 'mustve' or
+        w == 'might' or
+        w == 'might\'ve' or
+        w == 'mightve' or
+        w == 'ought' or
+        w == 'may' or
+        w == 'i\'d' or
+        w == 'id' or
+        w == 'we\'d' or
+        w == 'youd' or
+        w == 'you\'d' or
+        w == 'he\'d' or
+        w == 'she\'d'):
+        return True
+    return False
+
+def tag_CCJ(word):
+    w = word.lower()
+    '''
+    as long as, even if, if, one condition that, provided (that),
+    providing (that), so long as, unless, whether... or, supposing,
+    suppose, imagine, but for
+    '''
+    if(w == 'as' or
+       w == 'if' or
+       w == 'even' or
+       w == 'provided' or
+       w == 'providing' or
+       w == 'suppose' or
+       w == 'supposing' or
+       w == 'unless' or
+       w == 'whether' or
+       w == 'envision' or
+       w == 'envisioning' or
+       w == 'conceptualize'or
+       w == 'conceptualizing' or
+       w == 'conjure' or
+       w == 'conjuring' or
+       w == 'visualize' or
+       w == 'visualizing'):
+        return True
+    return False
+
+def get_tagged_message(message, tagger):
+    tagset = None
+    formatted_message = format_tweet(message)
+    tokens = twokenize.tokenize(formatted_message)
+    tags = nltk.tag._pos_tag(tokens, tagset, tagger)
+    return format_tagged(tags)
+## }}}
+
+def classify(tweetfile):
+    tweetFile = open(tweetfile, 'r')
+    tagFile = open(taggedFile, 'w')
+
+    tagger = PerceptronTagger()
+    form_num = 7
+
+    cf_count = [[0 for x in range(form_num)] for x in range(form_num)]
+
+    form_vec = []
+
+    print("Reading file...")
+    tweet = tweetFile.readline()
+
+    while tweet != '':
+        taggedTweet = get_tagged_message(tweet, tagger)
+        tagFile.write(taggedTweet)
+        #print("did tweet")
+        form = int(get_cf_form(taggedTweet))
+
+        ## if our tweet is positive, print it
+        if form != 0:
+            print(tweet)
+
+        form_vec.append(form)
+
+        cf_count[form][0] = cf_count[form][0] + 1
+
+        tweet = tweetFile.readline()
+
+    count = 0
+    for i in xrange(1,form_num):
+        count = count + cf_count[i][0]
+
+
+    print("finished tagging...")
+    tweetFile.close()
+    tagFile.close()
+
+    print("counterfactuals: " + str(count))
+
+main()
diff --git a/twokenize.py b/twokenize.py
@@ -0,0 +1,298 @@
+# -*- coding: utf-8 -*-
+"""
+Twokenize -- a tokenizer designed for Twitter text in English and some other European languages.
+This tokenizer code has gone through a long history:
+
+(1) Brendan O'Connor wrote original version in Python, http://github.com/brendano/tweetmotif
+       TweetMotif: Exploratory Search and Topic Summarization for Twitter.
+       Brendan O'Connor, Michel Krieger, and David Ahn.
+       ICWSM-2010 (demo track), http://brenocon.com/oconnor_krieger_ahn.icwsm2010.tweetmotif.pdf
+(2a) Kevin Gimpel and Daniel Mills modified it for POS tagging for the CMU ARK Twitter POS Tagger
+(2b) Jason Baldridge and David Snyder ported it to Scala
+(3) Brendan bugfixed the Scala port and merged with POS-specific changes
+    for the CMU ARK Twitter POS Tagger
+(4) Tobi Owoputi ported it back to Java and added many improvements (2012-06)
+
+Current home is http://github.com/brendano/ark-tweet-nlp and http://www.ark.cs.cmu.edu/TweetNLP
+
+There have been at least 2 other Java ports, but they are not in the lineage for the code here.
+
+Ported to Python by Myle Ott <myleott@gmail.com>.
+"""
+
+from __future__ import print_function
+
+import operator
+import re
+import HTMLParser
+
+def regex_or(*items):
+    return '(?:' + '|'.join(items) + ')'
+
+Contractions = re.compile(u"(?i)(\w+)(n['’′]t|['’′]ve|['’′]ll|['’′]d|['’′]re|['’′]s|['’′]m)$", re.UNICODE)
+Whitespace = re.compile(u"[\s\u0020\u00a0\u1680\u180e\u202f\u205f\u3000\u2000-\u200a]+", re.UNICODE)
+
+punctChars = r"['\"“”‘’.?!…,:;]"
+#punctSeq = punctChars+"+"    #'anthem'. => ' anthem '.
+punctSeq = r"['\"“”‘’]+|[.?!,…]+|[:;]+"    #'anthem'. => ' anthem ' .
+entity = r"&(?:amp|lt|gt|quot);"
+# URLs
+
+
+# BTO 2012-06: everyone thinks the daringfireball regex should be better, but they're wrong.
+# If you actually empirically test it the results are bad.
+# Please see https://github.com/brendano/ark-tweet-nlp/pull/9
+
+urlStart1 = r"(?:https?://|\bwww\.)"
+commonTLDs = r"(?:com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|pro|tel|travel|xxx)"
+ccTLDs = r"(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|" + \
+r"bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|" + \
+r"er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|" + \
+r"hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|" + \
+r"lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|" + \
+r"nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|" + \
+r"sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|" + \
+r"va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)"   #TODO: remove obscure country domains?
+urlStart2 = r"\b(?:[A-Za-z\d-])+(?:\.[A-Za-z0-9]+){0,3}\." + regex_or(commonTLDs, ccTLDs) + r"(?:\."+ccTLDs+r")?(?=\W|$)"
+urlBody = r"(?:[^\.\s<>][^\s<>]*?)?"
+urlExtraCrapBeforeEnd = regex_or(punctChars, entity) + "+?"
+urlEnd = r"(?:\.\.+|[<>]|\s|$)"
+url = regex_or(urlStart1, urlStart2) + urlBody + "(?=(?:"+urlExtraCrapBeforeEnd+")?"+urlEnd+")"
+
+
+# Numeric
+timeLike = r"\d+(?::\d+){1,2}"
+#numNum = r"\d+\.\d+"
+numberWithCommas = r"(?:(?<!\d)\d{1,3},)+?\d{3}" + r"(?=(?:[^,\d]|$))"
+numComb = u"[\u0024\u058f\u060b\u09f2\u09f3\u09fb\u0af1\u0bf9\u0e3f\u17db\ua838\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6\u00a2-\u00a5\u20a0-\u20b9]?\\d+(?:\\.\\d+)+%?".encode('utf-8')
+
+# Abbreviations
+boundaryNotDot = regex_or("$", r"\s", r"[“\"?!,:;]", entity)
+aa1 = r"(?:[A-Za-z]\.){2,}(?=" + boundaryNotDot + ")"
+aa2 = r"[^A-Za-z](?:[A-Za-z]\.){1,}[A-Za-z](?=" + boundaryNotDot + ")"
+standardAbbreviations = r"\b(?:[Mm]r|[Mm]rs|[Mm]s|[Dd]r|[Ss]r|[Jj]r|[Rr]ep|[Ss]en|[Ss]t)\."
+arbitraryAbbrev = regex_or(aa1, aa2, standardAbbreviations)
+separators = "(?:--+|―|—|~|–|=)"
+decorations = u"(?:[♫♪]+|[★☆]+|[♥❤♡]+|[\u2639-\u263b]+|[\ue001-\uebbb]+)".encode('utf-8')
+thingsThatSplitWords = r"[^\s\.,?\"]"
+embeddedApostrophe = thingsThatSplitWords+r"+['’′]" + thingsThatSplitWords + "*"
+
+# Emoticons
+# myleott: in Python the (?iu) flags affect the whole expression
+#normalEyes = "(?iu)[:=]" # 8 and x are eyes but cause problems
+normalEyes = "[:=]" # 8 and x are eyes but cause problems
+wink = "[;]"
+noseArea = "(?:|-|[^a-zA-Z0-9 ])" # doesn't get :'-(
+happyMouths = r"[D\)\]\}]+"
+sadMouths = r"[\(\[\{]+"
+tongue = "[pPd3]+"
+otherMouths = r"(?:[oO]+|[/\\]+|[vV]+|[Ss]+|[|]+)" # remove forward slash if http://'s aren't cleaned
+
+# mouth repetition examples:
+# @aliciakeys Put it in a love song :-))
+# @hellocalyclops =))=))=)) Oh well
+
+# myleott: try to be as case insensitive as possible, but still not perfect, e.g., o.O fails
+#bfLeft = u"(♥|0|o|°|v|\\$|t|x|;|\u0ca0|@|ʘ|•|・|◕|\\^|¬|\\*)".encode('utf-8')
+bfLeft = u"(♥|0|[oO]|°|[vV]|\\$|[tT]|[xX]|;|\u0ca0|@|ʘ|•|・|◕|\\^|¬|\\*)".encode('utf-8')
+bfCenter = r"(?:[\.]|[_-]+)"
+bfRight = r"\2"
+s3 = r"(?:--['\"])"
+s4 = r"(?:<|&lt;|>|&gt;)[\._-]+(?:<|&lt;|>|&gt;)"
+s5 = "(?:[.][_]+[.])"
+# myleott: in Python the (?i) flag affects the whole expression
+#basicface = "(?:(?i)" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5
+basicface = "(?:" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5
+
+eeLeft = r"[\\\ƪԄ\((<>;ヽ\-=~\*]+"
+eeRight= u"[\\-=\\);'\u0022<>ʃ)//ノノ丿╯σっµ~\\*]+".encode('utf-8')
+eeSymbol = r"[^A-Za-z0-9\s\(\)\*:=-]"
+eastEmote = eeLeft + "(?:"+basicface+"|" +eeSymbol+")+" + eeRight
+
+oOEmote = r"(?:[oO]" + bfCenter + r"[oO])"
+
+
+emoticon = regex_or(
+        # Standard version  :) :( :] :D :P
+        "(?:>|&gt;)?" + regex_or(normalEyes, wink) + regex_or(noseArea,"[Oo]") + regex_or(tongue+r"(?=\W|$|RT|rt|Rt)", otherMouths+r"(?=\W|$|RT|rt|Rt)", sadMouths, happyMouths),
+
+        # reversed version (: D:  use positive lookbehind to remove "(word):"
+        # because eyes on the right side is more ambiguous with the standard usage of : ;
+        regex_or("(?<=(?: ))", "(?<=(?:^))") + regex_or(sadMouths,happyMouths,otherMouths) + noseArea + regex_or(normalEyes, wink) + "(?:<|&lt;)?",
+
+        #inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style
+        eastEmote.replace("2", "1", 1), basicface,
+        # iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb]
+        # TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this
+
+        # myleott: o.O and O.o are two of the biggest sources of differences
+        #          between this and the Java version. One little hack won't hurt...
+        oOEmote
+)
+
+Hearts = "(?:<+/?3+)+" #the other hearts are in decorations
+
+Arrows = regex_or(r"(?:<*[-―—=]*>+|<+[-―—=]*>*)", u"[\u2190-\u21ff]+".encode('utf-8'))
+
+# BTO 2011-06: restored Hashtag, AtMention protection (dropped in original scala port) because it fixes
+# "hello (#hashtag)" ==> "hello (#hashtag )"  WRONG
+# "hello (#hashtag)" ==> "hello ( #hashtag )"  RIGHT
+# "hello (@person)" ==> "hello (@person )"  WRONG
+# "hello (@person)" ==> "hello ( @person )"  RIGHT
+# ... Some sort of weird interaction with edgepunct I guess, because edgepunct
+# has poor content-symbol detection.
+
+# This also gets #1 #40 which probably aren't hashtags .. but good as tokens.
+# If you want good hashtag identification, use a different regex.
+Hashtag = "#[a-zA-Z0-9_]+"  #optional: lookbehind for \b
+#optional: lookbehind for \b, max length 15
+AtMention = "[@＠][a-zA-Z0-9_]+"
+
+# I was worried this would conflict with at-mentions
+# but seems ok in sample of 5800: 7 changes all email fixes
+# http://www.regular-expressions.info/email.html
+Bound = r"(?:\W|^|$)"
+Email = regex_or("(?<=(?:\W))", "(?<=(?:^))") + r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}(?=" +Bound+")"
+
+# We will be tokenizing using these regexps as delimiters
+# Additionally, these things are "protected", meaning they shouldn't be further split themselves.
+Protected = re.compile(
+    unicode(regex_or(
+        Hearts,
+        url,
+        Email,
+        timeLike,
+        #numNum,
+        numberWithCommas,
+        numComb,
+        emoticon,
+        Arrows,
+        entity,
+        punctSeq,
+        arbitraryAbbrev,
+        separators,
+        decorations,
+        embeddedApostrophe,
+        Hashtag,
+        AtMention
+    ).decode('utf-8')), re.UNICODE)
+
+# Edge punctuation
+# Want: 'foo' => ' foo '
+# While also:  don't  =>  don't
+# the first is considered "edge punctuation".
+# the second is word-internal punctuation -- don't want to mess with it.
+# BTO (2011-06): the edgepunct system seems to be the #1 source of problems these days.
+# I remember it causing lots of trouble in the past as well.  Would be good to revisit or eliminate.
+
+# Note the 'smart quotes' (http://en.wikipedia.org/wiki/Smart_quotes)
+#edgePunctChars = r"'\"“”‘’«»{}\(\)\[\]\*&" #add \\p{So}? (symbols)
+edgePunctChars = u"'\"“”‘’«»{}\\(\\)\\[\\]\\*&" #add \\p{So}? (symbols)
+edgePunct = "[" + edgePunctChars + "]"
+notEdgePunct = "[a-zA-Z0-9]" # content characters
+offEdge = r"(^|$|:|;|\s|\.|,)" # colon here gets "(hello):" ==> "( hello ):"
+EdgePunctLeft = re.compile(offEdge + "("+edgePunct+"+)("+notEdgePunct+")", re.UNICODE)
+EdgePunctRight = re.compile("("+notEdgePunct+")("+edgePunct+"+)" + offEdge, re.UNICODE)
+
+def splitEdgePunct(input):
+    input = EdgePunctLeft.sub(r"\1\2 \3", input)
+    input = EdgePunctRight.sub(r"\1 \2\3", input)
+    return input
+
+# The main work of tokenizing a tweet.
+def simpleTokenize(text):
+
+    # Do the no-brainers first
+    splitPunctText = splitEdgePunct(text)
+
+    textLength = len(splitPunctText)
+
+    # BTO: the logic here got quite convoluted via the Scala porting detour
+    # It would be good to switch back to a nice simple procedural style like in the Python version
+    # ... Scala is such a pain.  Never again.
+
+    # Find the matches for subsequences that should be protected,
+    # e.g. URLs, 1.0, U.N.K.L.E., 12:53
+    bads = []
+    badSpans = []
+    for match in Protected.finditer(splitPunctText):
+        # The spans of the "bads" should not be split.
+        if (match.start() != match.end()): #unnecessary?
+            bads.append( [splitPunctText[match.start():match.end()]] )
+            badSpans.append( (match.start(), match.end()) )
+
+    # Create a list of indices to create the "goods", which can be
+    # split. We are taking "bad" spans like
+    #     List((2,5), (8,10))
+    # to create
+    #     List(0, 2, 5, 8, 10, 12)
+    # where, e.g., "12" here would be the textLength
+    # has an even length and no indices are the same
+    indices = [0]
+    for (first, second) in badSpans:
+        indices.append(first)
+        indices.append(second)
+    indices.append(textLength)
+
+    # Group the indices and map them to their respective portion of the string
+    splitGoods = []
+    for i in range(0, len(indices), 2):
+        goodstr = splitPunctText[indices[i]:indices[i+1]]
+        splitstr = goodstr.strip().split(" ")
+        splitGoods.append(splitstr)
+
+    # Reinterpolate the 'good' and 'bad' Lists, ensuring that
+    # additonal tokens from last good item get included
+    zippedStr = []
+    for i in range(len(bads)):
+        zippedStr = addAllnonempty(zippedStr, splitGoods[i])
+        zippedStr = addAllnonempty(zippedStr, bads[i])
+    zippedStr = addAllnonempty(zippedStr, splitGoods[len(bads)])
+
+    # BTO: our POS tagger wants "ur" and "you're" to both be one token.
+    # Uncomment to get "you 're"
+    #splitStr = []
+    #for tok in zippedStr:
+    #    splitStr.extend(splitToken(tok))
+    #zippedStr = splitStr
+
+    return zippedStr
+
+def addAllnonempty(master, smaller):
+    for s in smaller:
+        strim = s.strip()
+        if (len(strim) > 0):
+            master.append(strim)
+    return master
+
+# "foo bar " => "foo bar"
+def squeezeWhitespace(input):
+    return Whitespace.sub(" ", input).strip()
+
+# Final pass tokenization based on special patterns
+def splitToken(token):
+    m = Contractions.search(token)
+    if m:
+        return [m.group(1), m.group(2)]
+    return [token]
+
+# Assume 'text' has no HTML escaping.
+def tokenize(text):
+    return simpleTokenize(squeezeWhitespace(text))
+
+
+# Twitter text comes HTML-escaped, so unescape it.
+# We also first unescape &amp;'s, in case the text has been buggily double-escaped.
+def normalizeTextForTagger(text):
+    text = text.replace("&amp;", "&")
+    text = HTMLParser.HTMLParser().unescape(text)
+    return text
+
+# This is intended for raw tweet text -- we do some HTML entity unescaping before running the tagger.
+#
+# This function normalizes the input text BEFORE calling the tokenizer.
+# So the tokens you get back may not exactly correspond to
+# substrings of the original text.
+def tokenizeRawTweetText(text):
+    tokens = tokenize(normalizeTextForTagger(text))
+    return tokens
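
For quick reference, these are the three invocation modes handled by `main()` in the script above. The search term, account name, and file name below are only placeholders, and the four `twit_*` credential variables at the top of `counterfacto` must first be filled in with keys from a Twitter app:

```
./counterfacto -s poverty         # classify up to 100 recent search results
./counterfacto -a some_account    # classify an account's last 100 tweets
./counterfacto -f tweets.txt      # classify a local file, one tweet per line
```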
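And a minimal sketch of the tag-then-match idea the classifier uses, assuming Python 3 and nltk >= 3.2 with the `punkt` and `averaged_perceptron_tagger` data installed. `tag_line` and `wish_form` are illustrative names only, `nltk.word_tokenize` stands in for the bundled `twokenize` (which targets Python 2), and the pattern is CASE 1 from `get_cf_form()`:

```
# A sketch, not part of the repository: tag a sentence, flatten it into the
# 'token/TAG/' string format that counterfacto's format_tagged() produces,
# and test one of its counterfactual patterns against it.
import re

import nltk
from nltk.tag.perceptron import PerceptronTagger

tagger = PerceptronTagger()

def tag_line(text):
    # nltk.word_tokenize used here instead of the bundled twokenize
    tokens = nltk.word_tokenize(text)
    return ''.join(tok + '/' + tag + '/' for tok, tag in tagger.tag(tokens))

# CASE 1 WISH VERB FORM, copied from get_cf_form()
wish_form = re.compile(r'\.*(wish|wishing)/((VB.*/)|(JJ/))', re.IGNORECASE)

tagged = tag_line("I wish I had studied for that exam")
print(tagged)
print(bool(wish_form.search(tagged)))   # True -> flagged as counterfactual
```

The search succeeds because "wish" is tagged as a verb, which is exactly how the first pattern in `get_cf_form()` fires; the remaining cases combine modal (`MD`), conjunction (`CCJ`), and past-tense verb tags in the same fashion.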