commit d284382baf093bbecb220b64ce241c8b663ddf62
Author: parazyd <parazyd@dyne.org>
Date: Thu, 22 Sep 2016 17:05:17 +0200
initial import
Diffstat:
A | README.md | 63 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A | counterfacto | 304 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A | twokenize.py | 298 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 665 insertions(+), 0 deletions(-)
diff --git a/README.md b/README.md
@@ -0,0 +1,63 @@
+
+# Counterfacto
+
+Counterfactual (noun)
+
+Definition: the tendency to create possible alternatives to life events that have already occurred; something that is contrary to what actually happened.
+
+Effects: it begins with disappointment, but one can then uncover insights or knowledge that help improve future performance, leading to a better outcome in life.
+
+----------------------------------------------------------------------------------
+
+Counterfacto is a small software tool that analyses Twitter search results to highlight counterfactual statements about a given topic.
+
+This tool is used by PIEnews.eu researchers for sentiment analysis on poverty and related topics, to understand real stories that people recount as counterfactuals.
+
+We consider such a tool a useful experiment, given the importance of counterfactual analysis for political sentiment assessment and its relevance to news stories.
+
+## Dependencies
+
+Python 2 is required, along with the following packages:
+
+```
+python-twitter python-nltk
+```
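+
+If your distribution does not package these, the same libraries can usually be installed with `pip` instead (assuming the PyPI package names `python-twitter` and `nltk`):
+
+```
+pip install python-twitter nltk
+```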
+
+Then open the `python` console in a terminal and type:
+
+```
+import nltk
+nltk.download('averaged_perceptron_tagger')
+```
+
+This will download the `nltk_data` folder and place it in your `$HOME`.
+
+Your distro may ship an outdated nltk (older than 3.2) without the perceptron module; in that case, upgrade it with `pip`:
+
+```
+pip install nltk --upgrade
+```
+
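+## Usage
+
+Fill in the Twitter API credentials at the top of the `counterfacto` script (obtained by creating a Twitter app), then invoke it as shown by its usage message:
+
+```
+./counterfacto -s <search terms>
+./counterfacto -a <twitter account>
+./counterfacto -f <file with tweets>
+```
+
+Matching tweets are printed to the console, POS-tagged tweets are written to `tagged.txt`, and the total number of detected counterfactual statements is reported at the end. Tweets fetched with `-a` or `-s` are also saved to a `fetchedtweets-*.txt` file.
+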
+## References
+
+- Learning Representations for Counterfactual Inference (2016) http://jmlr.org/proceedings/papers/v48/johansson16.pdf
+
+- Bounding and Minimizing Counterfactual Error (2016) https://arxiv.org/abs/1606.03976
+
+- "Counterfactuals in the Language of Social Media: A Natural Language Processing Project in Conjunction with the World Well Being Project" (2015) http://www.seas.upenn.edu/~cse400/CSE400_2015_2016/reports/report_15.pdf
+
+## Licensing
+
+Counterfacto is Copyright (C) 2016 by the Dyne.org Foundation
+as part of the PIEnews project
+
+Software written by Ivan J. <parazyd@dyne.org>
+with contributions by Denis Roio <jaromil@dyne.org>
+
+This source code is free software; you can redistribute it and/or modify it under the terms of the GNU Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version.
+
+This source code is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. Please refer to the GNU Public License for more details.
+
+You should have received a copy of the GNU Public License along with this source code; if not, write to: Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+This project has received funding from the European Union’s Horizon 2020 Programme for research, technological development and demonstration under grant agreement nr. 687922
diff --git a/counterfacto b/counterfacto
@@ -0,0 +1,304 @@
+#!/usr/bin/env python2
+# Counterfacto is Copyright (C) 2016 by the Dyne.org Foundation
+# as part of the PIEnews project
+#
+# This file is part of Counterfacto
+# Written by Ivan J. <parazyd@dyne.org>
+#
+# This source code is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This source code is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. Please refer
+# to the GNU Public License for more details.
+#
+# You should have received a copy of the GNU Public License along with
+# this source code; if not, write to: Free Software Foundation, Inc.,
+# 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+# This project has received funding from the European Union's Horizon
+# 2020 Programme for research, technological development and
+# demonstration under grant agreement nr. 687922
+
+import nltk
+from nltk.tag.perceptron import PerceptronTagger
+import re
+import sys
+import twitter
+import twokenize
+
+
+## output file for the POS-tagged tweets
+taggedFile = 'tagged.txt'
+
+## get these by creating a twitter app
+twit_consumer_key = ''
+twit_consumer_secret = ''
+twit_access_key = ''
+twit_access_secret = ''
+
+def main():
+ ## credential check json
+ #print(api.VerifyCredentials())
+
+ try:
+ if sys.argv[1] == '-f':
+ tweetfile = sys.argv[2]
+ classify(tweetfile)
+
+ elif sys.argv[1] == '-a':
+ api = twitter.Api(consumer_key=twit_consumer_key,
+ consumer_secret=twit_consumer_secret,
+ access_token_key=twit_access_key,
+ access_token_secret=twit_access_secret)
+
+ accountname = sys.argv[2]
+ statuses = api.GetUserTimeline(screen_name=accountname,
+ count=100)
+
+ tweetfile = 'fetchedtweets-' + sys.argv[2] + '.txt'
+ tweetFile = open(tweetfile, 'w')
+
+ for s in statuses:
+ sintweet = s.text
+ sintweet = sintweet.encode('ascii', 'ignore')
+ tweetFile.write(sintweet + '\n')
+ #print('wrote tweet')
+
+ tweetFile.close()
+ classify(tweetfile)
+
+ elif sys.argv[1] == '-s':
+ api = twitter.Api(consumer_key=twit_consumer_key,
+ consumer_secret=twit_consumer_secret,
+ access_token_key=twit_access_key,
+ access_token_secret=twit_access_secret)
+
+ if len(sys.argv) >= 3:
+ searchterm = ' '.join(sys.argv[2:])
+ statuses = api.GetSearch(term=searchterm,
+ count=100,
+ result_type="recent")
+
+ #for s in statuses:
+ # print(s.text)
+ #exit()
+
+ tweetfile = 'fetchedtweets-' + sys.argv[2] + '.txt'
+ tweetFile = open(tweetfile, 'w')
+
+ for s in statuses:
+ sintweet = s.text
+ sintweet = sintweet.replace('\n', ' ')
+ sintweet = sintweet.encode('ascii', 'ignore')
+ tweetFile.write(sintweet + '\n')
+ #print('wrote tweet')
+
+ tweetFile.close()
+ classify(tweetfile)
+
+ except:
+ print("Usage: ./counterfacto {-a|-f|-s} {twitter account | file with tweets | search terms}")
+ exit(1)
+
+## {{{ processing functions
+def tokenizelocal(tweetfile):
+ ## debugging helper: print each tweet in a file along with its tokenization
+ tweets = open(tweetfile).read().splitlines()
+ for t in tweets:
+ print(t + '\n')
+ print(str(twokenize.tokenize(t)) + '\n')
+
+def format_tweet(message):
+ m = str(message)
+ m = m.replace('\n', ' ')
+ m = m.encode('ascii', 'ignore')
+ return m
+
+def format_tagged(tagged_list):
+ out = ''
+ for t in tagged_list:
+ token, tag = postprocess_tag(t[0], t[1])
+ out = out + token + '/' + tag + '/'
+ out = out + '\n'
+ return out
+
+def postprocess_tag(token, tag):
+ outtag = tag
+ if (is_twitter_cf_modal(token)):
+ outtag = 'MD'
+ elif (tag_CCJ(token)):
+ outtag = 'CCJ'
+ return token, outtag
+
+def get_cf_form(tagged_message):
+
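+ ## tagged_message is one tweet rendered by format_tagged() as a single
+ ## "token/TAG/" string, e.g. roughly "I/PRP/wish/VBP/I/PRP/had/VBD/gone/VBN/",
+ ## with twitter-style modals forced to MD and conditional conjunctions to CCJ.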
+ # Filter out questions
+ pq = re.compile('\.*/\?/.', re.IGNORECASE)
+ if pq.search(tagged_message) != None:
+ return 0
+
+ # CASE 1 WISH VERB FORM
+ p1 = re.compile('\.*(wish|wishing)/((VB.*/)|(JJ/))', re.IGNORECASE)
+ if p1.search(tagged_message) != None:
+ return 1
+
+
+ # CASE 2 CONJUNCTION NORMAL
+ p2 = re.compile('\.*/CCJ/.*((/VBD/)|(/VBN/)).*/MD/', re.IGNORECASE)
+ if p2.search(tagged_message) != None:
+ return 2
+
+
+ # CASE 3 CONJUNCTIVE CONVERSE
+ p3 = re.compile('\.*/MD/.*/CCJ/.*((/VBN/)|(/VBD/))', re.IGNORECASE)
+ if p3.search(tagged_message) != None:
+ return 3
+
+
+ # CASE 4 SHOULD HAVE
+ p4 = re.compile('\.*/((should\'ve)/MD/)|(((should)|(shoulda)|(shulda)|(shuda)|(shudda)|(shudve))/MD/((have)|(hve)|(ve))/)(\w)*((/VBN/)|(/VBD/))', re.IGNORECASE)
+ if p4.search(tagged_message) != None:
+ return 4
+
+ # CASE 5 VERB INVERSION
+ p5 = re.compile(("\.*(had/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*((/VBN/)|(/VBD/)).*/MD/)"
+ "|(were/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*/MD/)"
+ "|(/MD/.*/VB.*/had/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*((/VBN/)|(/VBD/)))"), re.IGNORECASE)
+ if p5.search(tagged_message) != None:
+ return 5
+
+
+ # CASE 6 MODAL NORMAL
+ p6 = re.compile('\.*/MD/.*((/VBN/)|(/VBD/)).*/MD/.*((/VBN/)|(/VBD/)|(/VB/)|(VBZ))', re.IGNORECASE)
+ if p6.search(tagged_message) != None:
+ return 6
+
+ # If no matches
+ return 0
+
+
+
+def is_twitter_cf_modal(word):
+ w = unicode(word, errors='ignore').encode('utf-8').lower()
+ if (w == 'should' or
+ w == 'should\'ve' or
+ w == 'shouldve' or
+ w == 'shoulda' or
+ w == 'shulda' or
+ w == 'shuda' or
+ w == 'shudda' or
+ w == 'shudve' or
+ w == 'would' or
+ w == 'would\'ve' or
+ w == 'wouldve' or
+ w == 'woulda' or
+ w == 'wuda' or
+ w == 'wulda' or
+ w == 'wudda' or
+ w == 'wudve' or
+ w == 'wlda' or
+ w == 'could' or
+ w == 'could\'ve' or
+ w == 'couldve' or
+ w == 'coulda' or
+ w == 'cudda' or
+ w == 'culda' or
+ w == 'cudve' or
+ w == 'must' or
+ w == 'mustve' or
+ w == 'might' or
+ w == 'might\'ve' or
+ w == 'mightve' or
+ w == 'ought' or
+ w == 'may' or
+ w == 'i\'d' or
+ w == 'id' or
+ w == 'we\'d' or
+ w == 'youd' or
+ w == 'you\'d' or
+ w == 'he\'d' or
+ w == 'she\'d'):
+ return True
+ return False
+
+def tag_CCJ(word):
+ w = word.lower()
+ '''
+ as long as, even if, if, one condition that, provided (that),
+ providing (that), so long as, unless, whether... or, supposing,
+ suppose, imagine, but for
+ '''
+ if(w == 'as' or
+ w == 'if' or
+ w == 'even' or
+ w == 'provided' or
+ w == 'providing' or
+ w == 'suppose' or
+ w == 'supposing' or
+ w == 'unless' or
+ w == 'whether' or
+ w == 'envision' or
+ w == 'envisioning' or
+ w == 'conceptualize'or
+ w == 'conceptualizing' or
+ w == 'conjure' or
+ w == 'conjuring' or
+ w == 'visualize' or
+ w == 'visualizing'):
+ return True
+ return False
+
+def get_tagged_message(message, tagger):
+ tagset = None
+ formatted_message = format_tweet(message)
+ tokens = twokenize.tokenize(formatted_message)
+ tags = nltk.tag._pos_tag(tokens, tagset, tagger)
+ return format_tagged(tags)
+## }}}
+
+def classify(tweetfile):
+ tweetFile = open(tweetfile, 'r')
+ tagFile = open(taggedFile, 'w')
+
+ tagger = PerceptronTagger()
+ form_num = 7
+
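+ ## per-form hit counters; only column 0 of each row is actually used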
+ cf_count = [[0 for x in range(form_num)] for x in range(form_num)]
+
+ form_vec = []
+
+ print("Reading file...")
+ tweet = tweetFile.readline()
+
+ while tweet != '':
+ taggedTweet = get_tagged_message(tweet, tagger)
+ tagFile.write(taggedTweet)
+ #print("did tweet")
+ form = int(get_cf_form(taggedTweet))
+
+ ## if our tweet is positive, print it
+ if form != 0:
+ print(tweet)
+
+ form_vec.append(form)
+
+ cf_count[form][0] = cf_count[form][0] + 1
+
+ tweet = tweetFile.readline()
+
+ count = 0
+ for i in xrange(1,form_num):
+ count = count + cf_count[i][0]
+
+
+ print("finished tagging...")
+ tweetFile.close()
+ tagFile.close()
+
+ print("counterfactuals: " + str(count))
+
+main()
diff --git a/twokenize.py b/twokenize.py
@@ -0,0 +1,298 @@
+# -*- coding: utf-8 -*-
+"""
+Twokenize -- a tokenizer designed for Twitter text in English and some other European languages.
+This tokenizer code has gone through a long history:
+
+(1) Brendan O'Connor wrote original version in Python, http://github.com/brendano/tweetmotif
+ TweetMotif: Exploratory Search and Topic Summarization for Twitter.
+ Brendan O'Connor, Michel Krieger, and David Ahn.
+ ICWSM-2010 (demo track), http://brenocon.com/oconnor_krieger_ahn.icwsm2010.tweetmotif.pdf
+(2a) Kevin Gimpel and Daniel Mills modified it for POS tagging for the CMU ARK Twitter POS Tagger
+(2b) Jason Baldridge and David Snyder ported it to Scala
+(3) Brendan bugfixed the Scala port and merged with POS-specific changes
+ for the CMU ARK Twitter POS Tagger
+(4) Tobi Owoputi ported it back to Java and added many improvements (2012-06)
+
+Current home is http://github.com/brendano/ark-tweet-nlp and http://www.ark.cs.cmu.edu/TweetNLP
+
+There have been at least 2 other Java ports, but they are not in the lineage for the code here.
+
+Ported to Python by Myle Ott <myleott@gmail.com>.
+"""
+
+from __future__ import print_function
+
+import operator
+import re
+import HTMLParser
+
+def regex_or(*items):
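+ # e.g. regex_or('a', 'b') -> '(?:a|b)'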
+ return '(?:' + '|'.join(items) + ')'
+
+Contractions = re.compile(u"(?i)(\w+)(n['’′]t|['’′]ve|['’′]ll|['’′]d|['’′]re|['’′]s|['’′]m)$", re.UNICODE)
+Whitespace = re.compile(u"[\s\u0020\u00a0\u1680\u180e\u202f\u205f\u3000\u2000-\u200a]+", re.UNICODE)
+
+punctChars = r"['\"“”‘’.?!…,:;]"
+#punctSeq = punctChars+"+" #'anthem'. => ' anthem '.
+punctSeq = r"['\"“”‘’]+|[.?!,…]+|[:;]+" #'anthem'. => ' anthem ' .
+entity = r"&(?:amp|lt|gt|quot);"
+# URLs
+
+
+# BTO 2012-06: everyone thinks the daringfireball regex should be better, but they're wrong.
+# If you actually empirically test it the results are bad.
+# Please see https://github.com/brendano/ark-tweet-nlp/pull/9
+
+urlStart1 = r"(?:https?://|\bwww\.)"
+commonTLDs = r"(?:com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|pro|tel|travel|xxx)"
+ccTLDs = r"(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|" + \
+r"bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|" + \
+r"er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|" + \
+r"hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|" + \
+r"lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|" + \
+r"nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|" + \
+r"sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|" + \
+r"va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)" #TODO: remove obscure country domains?
+urlStart2 = r"\b(?:[A-Za-z\d-])+(?:\.[A-Za-z0-9]+){0,3}\." + regex_or(commonTLDs, ccTLDs) + r"(?:\."+ccTLDs+r")?(?=\W|$)"
+urlBody = r"(?:[^\.\s<>][^\s<>]*?)?"
+urlExtraCrapBeforeEnd = regex_or(punctChars, entity) + "+?"
+urlEnd = r"(?:\.\.+|[<>]|\s|$)"
+url = regex_or(urlStart1, urlStart2) + urlBody + "(?=(?:"+urlExtraCrapBeforeEnd+")?"+urlEnd+")"
+
+
+# Numeric
+timeLike = r"\d+(?::\d+){1,2}"
+#numNum = r"\d+\.\d+"
+numberWithCommas = r"(?:(?<!\d)\d{1,3},)+?\d{3}" + r"(?=(?:[^,\d]|$))"
+numComb = u"[\u0024\u058f\u060b\u09f2\u09f3\u09fb\u0af1\u0bf9\u0e3f\u17db\ua838\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6\u00a2-\u00a5\u20a0-\u20b9]?\\d+(?:\\.\\d+)+%?".encode('utf-8')
+
+# Abbreviations
+boundaryNotDot = regex_or("$", r"\s", r"[“\"?!,:;]", entity)
+aa1 = r"(?:[A-Za-z]\.){2,}(?=" + boundaryNotDot + ")"
+aa2 = r"[^A-Za-z](?:[A-Za-z]\.){1,}[A-Za-z](?=" + boundaryNotDot + ")"
+standardAbbreviations = r"\b(?:[Mm]r|[Mm]rs|[Mm]s|[Dd]r|[Ss]r|[Jj]r|[Rr]ep|[Ss]en|[Ss]t)\."
+arbitraryAbbrev = regex_or(aa1, aa2, standardAbbreviations)
+separators = "(?:--+|―|—|~|–|=)"
+decorations = u"(?:[♫♪]+|[★☆]+|[♥❤♡]+|[\u2639-\u263b]+|[\ue001-\uebbb]+)".encode('utf-8')
+thingsThatSplitWords = r"[^\s\.,?\"]"
+embeddedApostrophe = thingsThatSplitWords+r"+['’′]" + thingsThatSplitWords + "*"
+
+# Emoticons
+# myleott: in Python the (?iu) flags affect the whole expression
+#normalEyes = "(?iu)[:=]" # 8 and x are eyes but cause problems
+normalEyes = "[:=]" # 8 and x are eyes but cause problems
+wink = "[;]"
+noseArea = "(?:|-|[^a-zA-Z0-9 ])" # doesn't get :'-(
+happyMouths = r"[D\)\]\}]+"
+sadMouths = r"[\(\[\{]+"
+tongue = "[pPd3]+"
+otherMouths = r"(?:[oO]+|[/\\]+|[vV]+|[Ss]+|[|]+)" # remove forward slash if http://'s aren't cleaned
+
+# mouth repetition examples:
+# @aliciakeys Put it in a love song :-))
+# @hellocalyclops =))=))=)) Oh well
+
+# myleott: try to be as case insensitive as possible, but still not perfect, e.g., o.O fails
+#bfLeft = u"(♥|0|o|°|v|\\$|t|x|;|\u0ca0|@|ʘ|•|・|◕|\\^|¬|\\*)".encode('utf-8')
+bfLeft = u"(♥|0|[oO]|°|[vV]|\\$|[tT]|[xX]|;|\u0ca0|@|ʘ|•|・|◕|\\^|¬|\\*)".encode('utf-8')
+bfCenter = r"(?:[\.]|[_-]+)"
+bfRight = r"\2"
+s3 = r"(?:--['\"])"
+s4 = r"(?:<|&lt;|>|&gt;)[\._-]+(?:<|&lt;|>|&gt;)"
+s5 = "(?:[.][_]+[.])"
+# myleott: in Python the (?i) flag affects the whole expression
+#basicface = "(?:(?i)" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5
+basicface = "(?:" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5
+
+eeLeft = r"[\\\ƪԄ\((<>;ヽ\-=~\*]+"
+eeRight= u"[\\-=\\);'\u0022<>ʃ)//ノノ丿╯σっµ~\\*]+".encode('utf-8')
+eeSymbol = r"[^A-Za-z0-9\s\(\)\*:=-]"
+eastEmote = eeLeft + "(?:"+basicface+"|" +eeSymbol+")+" + eeRight
+
+oOEmote = r"(?:[oO]" + bfCenter + r"[oO])"
+
+
+emoticon = regex_or(
+ # Standard version :) :( :] :D :P
+ "(?:>|>)?" + regex_or(normalEyes, wink) + regex_or(noseArea,"[Oo]") + regex_or(tongue+r"(?=\W|$|RT|rt|Rt)", otherMouths+r"(?=\W|$|RT|rt|Rt)", sadMouths, happyMouths),
+
+ # reversed version (: D: use positive lookbehind to remove "(word):"
+ # because eyes on the right side is more ambiguous with the standard usage of : ;
+ regex_or("(?<=(?: ))", "(?<=(?:^))") + regex_or(sadMouths,happyMouths,otherMouths) + noseArea + regex_or(normalEyes, wink) + "(?:<|<)?",
+
+ #inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style
+ eastEmote.replace("2", "1", 1), basicface,
+ # iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb]
+ # TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this
+
+ # myleott: o.O and O.o are two of the biggest sources of differences
+ # between this and the Java version. One little hack won't hurt...
+ oOEmote
+)
+
+Hearts = "(?:<+/?3+)+" #the other hearts are in decorations
+
+Arrows = regex_or(r"(?:<*[-―—=]*>+|<+[-―—=]*>*)", u"[\u2190-\u21ff]+".encode('utf-8'))
+
+# BTO 2011-06: restored Hashtag, AtMention protection (dropped in original scala port) because it fixes
+# "hello (#hashtag)" ==> "hello (#hashtag )" WRONG
+# "hello (#hashtag)" ==> "hello ( #hashtag )" RIGHT
+# "hello (@person)" ==> "hello (@person )" WRONG
+# "hello (@person)" ==> "hello ( @person )" RIGHT
+# ... Some sort of weird interaction with edgepunct I guess, because edgepunct
+# has poor content-symbol detection.
+
+# This also gets #1 #40 which probably aren't hashtags .. but good as tokens.
+# If you want good hashtag identification, use a different regex.
+Hashtag = "#[a-zA-Z0-9_]+" #optional: lookbehind for \b
+#optional: lookbehind for \b, max length 15
+AtMention = "[@＠][a-zA-Z0-9_]+"
+
+# I was worried this would conflict with at-mentions
+# but seems ok in sample of 5800: 7 changes all email fixes
+# http://www.regular-expressions.info/email.html
+Bound = r"(?:\W|^|$)"
+Email = regex_or("(?<=(?:\W))", "(?<=(?:^))") + r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}(?=" +Bound+")"
+
+# We will be tokenizing using these regexps as delimiters
+# Additionally, these things are "protected", meaning they shouldn't be further split themselves.
+Protected = re.compile(
+ unicode(regex_or(
+ Hearts,
+ url,
+ Email,
+ timeLike,
+ #numNum,
+ numberWithCommas,
+ numComb,
+ emoticon,
+ Arrows,
+ entity,
+ punctSeq,
+ arbitraryAbbrev,
+ separators,
+ decorations,
+ embeddedApostrophe,
+ Hashtag,
+ AtMention
+ ).decode('utf-8')), re.UNICODE)
+
+# Edge punctuation
+# Want: 'foo' => ' foo '
+# While also: don't => don't
+# the first is considered "edge punctuation".
+# the second is word-internal punctuation -- don't want to mess with it.
+# BTO (2011-06): the edgepunct system seems to be the #1 source of problems these days.
+# I remember it causing lots of trouble in the past as well. Would be good to revisit or eliminate.
+
+# Note the 'smart quotes' (http://en.wikipedia.org/wiki/Smart_quotes)
+#edgePunctChars = r"'\"“”‘’«»{}\(\)\[\]\*&" #add \\p{So}? (symbols)
+edgePunctChars = u"'\"“”‘’«»{}\\(\\)\\[\\]\\*&" #add \\p{So}? (symbols)
+edgePunct = "[" + edgePunctChars + "]"
+notEdgePunct = "[a-zA-Z0-9]" # content characters
+offEdge = r"(^|$|:|;|\s|\.|,)" # colon here gets "(hello):" ==> "( hello ):"
+EdgePunctLeft = re.compile(offEdge + "("+edgePunct+"+)("+notEdgePunct+")", re.UNICODE)
+EdgePunctRight = re.compile("("+notEdgePunct+")("+edgePunct+"+)" + offEdge, re.UNICODE)
+
+def splitEdgePunct(input):
+ input = EdgePunctLeft.sub(r"\1\2 \3", input)
+ input = EdgePunctRight.sub(r"\1 \2\3", input)
+ return input
+
+# The main work of tokenizing a tweet.
+def simpleTokenize(text):
+
+ # Do the no-brainers first
+ splitPunctText = splitEdgePunct(text)
+
+ textLength = len(splitPunctText)
+
+ # BTO: the logic here got quite convoluted via the Scala porting detour
+ # It would be good to switch back to a nice simple procedural style like in the Python version
+ # ... Scala is such a pain. Never again.
+
+ # Find the matches for subsequences that should be protected,
+ # e.g. URLs, 1.0, U.N.K.L.E., 12:53
+ bads = []
+ badSpans = []
+ for match in Protected.finditer(splitPunctText):
+ # The spans of the "bads" should not be split.
+ if (match.start() != match.end()): #unnecessary?
+ bads.append( [splitPunctText[match.start():match.end()]] )
+ badSpans.append( (match.start(), match.end()) )
+
+ # Create a list of indices to create the "goods", which can be
+ # split. We are taking "bad" spans like
+ # List((2,5), (8,10))
+ # to create
+ # List(0, 2, 5, 8, 10, 12)
+ # where, e.g., "12" here would be the textLength
+ # has an even length and no indices are the same
+ indices = [0]
+ for (first, second) in badSpans:
+ indices.append(first)
+ indices.append(second)
+ indices.append(textLength)
+
+ # Group the indices and map them to their respective portion of the string
+ splitGoods = []
+ for i in range(0, len(indices), 2):
+ goodstr = splitPunctText[indices[i]:indices[i+1]]
+ splitstr = goodstr.strip().split(" ")
+ splitGoods.append(splitstr)
+
+ # Reinterpolate the 'good' and 'bad' Lists, ensuring that
+ # additonal tokens from last good item get included
+ zippedStr = []
+ for i in range(len(bads)):
+ zippedStr = addAllnonempty(zippedStr, splitGoods[i])
+ zippedStr = addAllnonempty(zippedStr, bads[i])
+ zippedStr = addAllnonempty(zippedStr, splitGoods[len(bads)])
+
+ # BTO: our POS tagger wants "ur" and "you're" to both be one token.
+ # Uncomment to get "you 're"
+ #splitStr = []
+ #for tok in zippedStr:
+ # splitStr.extend(splitToken(tok))
+ #zippedStr = splitStr
+
+ return zippedStr
+
+def addAllnonempty(master, smaller):
+ for s in smaller:
+ strim = s.strip()
+ if (len(strim) > 0):
+ master.append(strim)
+ return master
+
+# "foo bar " => "foo bar"
+def squeezeWhitespace(input):
+ return Whitespace.sub(" ", input).strip()
+
+# Final pass tokenization based on special patterns
+def splitToken(token):
+ m = Contractions.search(token)
+ if m:
+ return [m.group(1), m.group(2)]
+ return [token]
+
+# Assume 'text' has no HTML escaping.
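+# e.g. tokenize("hello (#hashtag)") -> ['hello', '(', '#hashtag', ')']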
+def tokenize(text):
+ return simpleTokenize(squeezeWhitespace(text))
+
+
+# Twitter text comes HTML-escaped, so unescape it.
+# We also first unescape &'s, in case the text has been buggily double-escaped.
+def normalizeTextForTagger(text):
+ text = text.replace("&amp;", "&")
+ text = HTMLParser.HTMLParser().unescape(text)
+ return text
+
+# This is intended for raw tweet text -- we do some HTML entity unescaping before running the tagger.
+#
+# This function normalizes the input text BEFORE calling the tokenizer.
+# So the tokens you get back may not exactly correspond to
+# substrings of the original text.
+def tokenizeRawTweetText(text):
+ tokens = tokenize(normalizeTextForTagger(text))
+ return tokens