commit 5d141366aa5bc60d8ee62bc7229daa5b8d6d617b
parent a6e2fee6da44bd214d6db79113be431142bb60bf
Author: parazyd <parazyd@dyne.org>
Date: Thu, 9 Mar 2017 00:14:49 +0100
move parsers to factolib.py
Diffstat:
5 files changed, 266 insertions(+), 25 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,3 @@
credentials
-twokenize.pyc
+*.pyc
+*.txt
diff --git a/README.md b/README.md
@@ -1,5 +1,5 @@
-
-# Counterfacto
+Counterfacto
+============
Counterfactual (noun)
@@ -11,7 +11,7 @@ Effects: it starts off with disappointment, then one will be able to
uncover insights or knowledge that can be used to enhance future
performance, leading to a better outcome in life.
-----------------------------------------------------------------------------------
+-----------------------------------------------------------------------
Counterfacto is a small software tool that can analyse search results
on twitter to highlight counterfactual statements on certain topics.
@@ -24,7 +24,8 @@ We deem such a tool as a useful experiment, considering the importance
of counterfactual analysis for political sentiment assessments and
focus on news stories.
-## Dependencies
+Dependencies
+------------
Python is required along with the following packages:
@@ -43,7 +44,8 @@ After installing the necessary python modules, run `make`, which will
then download the needed data for nltk, and tell you how to use your
twitter credentials in counterfacto
-### Running the web edition of counterfacto
+Running the web edition of counterfacto
+---------------------------------------
To run counterfacto along with its web interface, you will need
additional dependencies:
@@ -52,7 +54,8 @@ additional dependencies:
python-flask
```
-## Usage
+Usage
+-----
```
usage: ./counterfacto [-a account] [-f tweetfile] [-s searchterm]
@@ -64,17 +67,17 @@ The web interface can be run with (port defaults to 5000):
usage: ./counterfacto-web [-p port]
```
-## References
-
-- [Learning Representations for Counterfactual Inference (2016)](http://jmlr.org/proceedings/papers/v48/johansson16.pdf)
-
-- [Bounding and Minimizing Counterfactual Error (2016)](https://arxiv.org/abs/1606.03976)
+References
+----------
-- [Counterfactuals in the Language of Social Media: A Natural Language Processing Project in Conjunction with the World Well Being Project (2015)](http://www.seas.upenn.edu/~cse400/CSE400_2015_2016/reports/report_15.pdf)
+* [Learning Representations for Counterfactual Inference (2016)](http://jmlr.org/proceedings/papers/v48/johansson16.pdf)
+* [Bounding and Minimizing Counterfactual Error (2016)](https://arxiv.org/abs/1606.03976)
+* [Counterfactuals in the Language of Social Media: A Natural Language Processing Project in Conjunction with the World Well Being Project (2015)](http://www.seas.upenn.edu/~cse400/CSE400_2015_2016/reports/report_15.pdf)
-## Licensing
+Licensing
+---------
-Counterfacto is Copyright (C) 2016 by the Dyne.org Foundation
+Counterfacto is Copyright (C) 2016-2017 by the Dyne.org Foundation
as part of the PIEnews project
Software written by Ivan J. <parazyd@dyne.org>
diff --git a/counterfacto-web b/counterfacto-web
@@ -1,10 +1,37 @@
#!/usr/bin/env python2
+# Copyright (c) 2017 Ivan J. <parazyd@dyne.org>
+import json
+import sys
from flask import Flask, render_template, request, json
-import os
+from twitter import *
+
+import factolib
+
+
+global tweetsFile
+global taggedFile
+
+taggedFile = "tagged.txt"
+
+try:
+ with open('credentials') as fd:
+ exec(fd.read())
+except:
+ print("no credentials file found. please create it")
+ sys.exit(1)
app = Flask(__name__)
+def writetweets(tweets, twfile):
+ twfile = open(twfile, "w")
+ for s in tweets:
+ sintweet = s["text"]
+ sintweet = sintweet.replace("\n", " ")
+ sintweet = sintweet.encode("ascii", "ignore")
+ twfile.write(sintweet + "\n")
+ twfile.close()
+
@app.route("/")
def main():
return render_template('index.html')
@@ -17,10 +44,23 @@ def search():
if not _name or not _method:
return "Wrong data. Please try again."
+ api = Twitter(auth=OAuth(oatoken,oasecret,conskey,conssecret))
+
if _method == "account":
- os.system("./counterfacto -a " + _name)
+ statuses = api.statuses.user_timeline(screen_name=_name, count=100)
+ tweetsFile = "fetchedtweets-" + _name + ".txt"
+ writetweets(statuses, tweetsFile)
+ factolib.classify(tweetsFile, taggedFile)
+
elif _method == "searchterm":
- os.system("./counterfacto -s " + _name)
+ statuses = api.search.tweets(q=_name, count=1)
+ tweetsFile = "fetchedsearch.txt"
+ writetweets(statuses, tweetsFile)
+ factolib.classify(tweetsFile, taggedFile)
+
+ cfs = "counterfactuals.txt"
+ with open(cfs) as f:
+ return f.read()
if __name__ == "__main__":
try:
@@ -29,7 +69,5 @@ if __name__ == "__main__":
except:
_port = 5000
- app.run(
- host="0.0.0.0",
- port=int(_port)
- )
+ app.run(host="127.0.0.1", port=int(_port))
+ #subprocess.call(["xdg-open", "http://127.0.0.1:" + _port])
diff --git a/factolib.py b/factolib.py
@@ -0,0 +1,199 @@
+#!/usr/bin/env python2
+# Copyright (c) 2017 Ivan J. <parazyd@dyne.org>
+
+import nltk
+import re
+import twokenize
+from nltk.tag.perceptron import PerceptronTagger
+
+def tokenizelocal():
+ tweets = tweetFile.read().splitlines()
+ for t in tweets:
+ print(t + '\n')
+ print(str(twokenize.tokenize(t)) + '\n')
+
+def format_tweet(message):
+ m = str(message)
+ m = m.replace('\n', ' ')
+ m = m.encode('ascii', 'ignore')
+ return m
+
+def format_tagged(tagged_list):
+ out = ''
+ for t in tagged_list:
+ token, tag = postprocess_tag(t[0], t[1])
+ out = out + token + '/' + tag + '/'
+ out = out + '\n'
+ return out
+
+def postprocess_tag(token, tag):
+ outtag = tag
+ if (is_twitter_cf_modal(token)):
+ outtag = 'MD'
+ elif (tag_CCJ(token)):
+ outtag = 'CCJ'
+ return token, outtag
+
+def get_cf_form(tagged_message):
+
+ # Filter out questions
+ pq = re.compile('\.*/\?/.', re.IGNORECASE)
+ if pq.search(tagged_message) != None:
+ return 0
+
+ # CASE 1 WISH VERB FORM
+ p1 = re.compile('\.*(wish|wishing)/((VB.*/)|(JJ/))', re.IGNORECASE)
+ if p1.search(tagged_message) != None:
+ return 1
+
+
+ # CASE 2 CONJUNCTION NORMAL
+ p2 = re.compile('\.*/CCJ/.*((/VBD/)|(/VBN/)).*/MD/', re.IGNORECASE)
+ if p2.search(tagged_message) != None:
+ return 2
+
+
+ # CASE 3 CONJUNCTIVE CONVERSE
+ p3 = re.compile('\.*/MD/.*/CCJ/.*((/VBN/)|(/VBD/))', re.IGNORECASE)
+ if p3.search(tagged_message) != None:
+ return 3
+
+
+ # CASE 4 SHOULD HAVE
+ p4 = re.compile('\.*/((should\'ve)/MD/)|(((should)|(shoulda)(shulda)|(shuda)|(shudda)|(shudve))/MD/((have)|(hve)|(ve))/)(\w)*((/VBN/)|(/VBD/))', re.IGNORECASE)
+ if p4.search(tagged_message) != None:
+ return 4
+
+ # CASE 5 VERB INVERSION
+ p5 = re.compile(("\.*(had/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*((/VBN/)|(/VBD/)).*/MD/)"
+ "|(were/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*/MD/)"
+ "|(/MD/.*/VB.*/had/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*((/VBN/)|(/VBD/)))"), re.IGNORECASE)
+ if p5.search(tagged_message) != None:
+ return 5
+
+
+ # CASE 6 MODAL NORMAL
+ p6 = re.compile('\.*/MD/.*((/VBN/)|(/VBD/)).*/MD/.*((/VBN/)|(/VBD/)|(/VB/)|(VBZ))', re.IGNORECASE)
+ if p6.search(tagged_message) != None:
+ return 6
+
+ # If no matches
+ return 0
+
+
+def is_twitter_cf_modal(word):
+ w = unicode(word, errors='ignore').encode('utf-8').lower()
+ if (w == 'should' or
+ w == 'should\'ve' or
+ w == 'shouldve' or
+ w == 'shoulda' or
+ w == 'shulda' or
+ w == 'shuda' or
+ w == 'shudda' or
+ w == 'shudve' or
+ w == 'would' or
+ w == 'would\'ve' or
+ w == 'wouldve' or
+ w == 'woulda' or
+ w == 'wuda' or
+ w == 'wulda' or
+ w == 'wudda' or
+ w == 'wudve' or
+ w == 'wlda' or
+ w == 'could' or
+ w == 'could\'ve' or
+ w == 'couldve' or
+ w == 'coulda' or
+ w == 'cudda' or
+ w == 'culda' or
+ w == 'cudve' or
+ w == 'must' or
+ w == 'mustve' or
+ w == 'might' or
+ w == 'might\'ve' or
+ w == 'mightve' or
+ w == 'ought' or
+ w == 'may' or
+ w == 'i\'d' or
+ w == 'id' or
+ w == 'we\'d' or
+ w == 'youd' or
+ w == 'you\'d' or
+ w == 'he\'d' or
+ w == 'she\'d'):
+ return True
+ return False
+
+def tag_CCJ(word):
+ w = word.lower()
+ '''
+ as long as, even if, if, one condition that, provided (that),
+ providing (that), so long as, unless, whether... or, supposing,
+ suppose, imagine, but for
+ '''
+ if(w == 'as' or
+ w == 'if' or
+ w == 'even' or
+ w == 'provided' or
+ w == 'providing' or
+ w == 'suppose' or
+ w == 'supposing' or
+ w == 'unless' or
+ w == 'whether' or
+ w == 'envision' or
+ w == 'envisioning' or
+ w == 'conceptualize'or
+ w == 'conceptualizing' or
+ w == 'conjure' or
+ w == 'conjuring' or
+ w == 'visualize' or
+ w == 'visualizing'):
+ return True
+ return False
+
+def get_tagged_message(message, tagger):
+ tagset = None
+ formatted_message = format_tweet(message)
+ tokens = twokenize.tokenize(formatted_message)
+ tags = nltk.tag._pos_tag(tokens, tagset, tagger)
+ return format_tagged(tags)
+
+def classify(tweetfile, taggedfile):
+ tweetfile = open(tweetfile, "r")
+ taggedfile = open(taggedfile, "w")
+ counterfactuals = open('counterfactuals.txt', 'w')
+
+ tagger = PerceptronTagger()
+ form_num = 8
+
+ cf_count = [[0 for x in range(form_num)] for x in range(form_num)]
+
+ form_vec = []
+
+ print("Reading file...")
+ tweet = tweetfile.readline()
+
+ while tweet:
+ taggedTweet = get_tagged_message(tweet, tagger)
+ taggedfile.write(taggedTweet)
+ form = int(get_cf_form(taggedTweet))
+
+ if form:
+ print(tweet)
+ counterfactuals.write(tweet + '<hr>\n')
+
+ form_vec.append(form)
+ cf_count[form][0] += 1
+ tweet = tweetfile.readline()
+
+ count = 0
+ for i in xrange(1, form_num):
+ count += cf_count[i][0]
+
+ print("Finished tagging...")
+ tweetfile.close()
+ taggedfile.close()
+
+ print("counterfactuals: " + str(count) + "/100")
+ counterfactuals.write("counterfactuals: " + str(count) + "/100<br>\n")
+ counterfactuals.close()
diff --git a/templates/index.html b/templates/index.html
@@ -11,9 +11,9 @@
<script type="text/javascript" href="../static/js/search.js"></script>
</head>
<body>
- <div class="container">
+ <!-- <div class="container">
<h3 class="text-muted">Counterfacto</h3>
- </div>
+ </div> -->
<div class="jumbotron">
<h1>Counterfacto</h1>
@@ -28,7 +28,7 @@
</div>
<footer class="footer">
- <p>© PIEnews / Dyne.org 2016-2116</p>
+ <center><p>© PIEnews / Dyne.org Foundation 2016-2017</p></center>
</footer>
</body>
</html>