Python, Google Charts and Twitter Statistics

I post weekly statistics of DJIA and OMXH25 companies’ tweeting activity where companies are ranked by the number of tweets during the previous week (Mon-Sun).  See this Top tweeters (DJIA) 23.4. – 29.4.2012 as an example.

But how do I do that? I do it automatically, using Python, Python Twitter API, MySQL, Google Charts and cron.

In practice, generating statistics happens in two phases. First phase happens every day when I use Python script and cron-job to get daily tweets by companies and then save them to MySQL database.

The second phase is another Python script and cron-job (once per week, on Mondays), where I read the database and calculate the tweet statistics of the previous week. In the same script I generate HTML with Google Charts Javascript code for the bar chart. And finally, I post a blog entry using the same method as in my scifi marketing experiment.

Google Charts has good examples of how to create charts and I modified this bar chart example for my purposes.

The sample source code of scripts are shown below. The code includes some comments but no installation instructions or details (Python’s design philosophy emphasizes code readability :-).

Get tweets every day and save them to database

#!/usr/local/bin/python

import twitter
import sys
import MySQLdb
import datetime

api = twitter.Api()

consumerKey='twitter_consumer_key'
consumerSecret='twitter_consumer_secret'

accessTokenStr = 'access_token_str'
accessTokenSecret = 'access_token_secret'

accountsFollowed=[
'list of twitter account names',
]

try:
  conn = MySQLdb.connect (host = "localhost",
                             user = "dbuser",
                             passwd = "password",
                             db = "db_name")
  api = twitter.Api(consumer_key=consumerKey,consumer_secret=consumerSecret, access_token_key=accessTokenStr, access_token_secret=accessTokenSecret)

  cursor = conn.cursor()

  for user in accountsFollowed:
    user=user.lower()
    #select id of latest tweet in database
    #database created outside of this program
    cursor.execute ("SELECT max(id)  FROM tweets where user=%s",(user))
    row = cursor.fetchone ()
    latestId=0
    if row != None:
      if row[0]!=None:
        latestId=row[0]
    if latestId==0:
      maxTweetsToGet=5
      statuses = api.GetUserTimeline(user,count=maxTweetsToGet)
    else:
      statuses = api.GetUserTimeline(user,since_id=latestId)

    for s in statuses:
      dt=s.GetCreatedAtInSeconds()
      src=s.GetSource().encode('utf-8')
      txt=s.text.encode('utf-8')
      findex=0
      sindex=src.find('<',findex)
      while sindex>-1:
        eindex=src.find('>',sindex)
        if eindex>-1:
          src=src[0:sindex]+src[eindex+1:]
        findex=sindex+1
        sindex=src.find('<',findex)
      try:
        #insert into database
        cursor.execute ("""
         INSERT INTO  tweets (datetime,user,content,source,id)
         VALUES (%s,%s,%s,%s,%s)
         """, (datetime.datetime.utcfromtimestamp(dt),user.lower(),txt,src,s.GetId() ))
      except MySQLdb.Error, e:
        print "Error %d: %s" % (e.args[0], e.args[1])

  cursor.close()
  conn.commit()
  conn.close()
  sys.exit(0)

except MySQLdb.Error, e:
  print "Error %d: %s" % (e.args[0], e.args[1])
  print "While handling %s" % (user)
  sys.exit (1)

Generate statistics page and post to WordPress

#!/usr/local/bin/python

import sys
import MySQLdb
import datetime
import operator
import stat_functions
import xmlrpclib

accountsFollowedOMXH25=[
'list of twitter account names',
]

try:
  conn = MySQLdb.connect (host = "localhost",
                             user = "dbuser",
                             passwd = "password",
                             db = "db_name")

  cursor = conn.cursor()

  #get tweet count between week ago and yesterday
  yesterdayDelta=datetime.timedelta(days=1)
  weekAgoDelta = datetime.timedelta(days=7)
  currentDate = datetime.date.today()
  yesterday=currentDate-yesterdayDelta
  weekAgo=currentDate-weekAgoDelta
  startDatetime=datetime.datetime(weekAgo.year,weekAgo.month,weekAgo.day,0,0,0)
  endDatetime=datetime.datetime(yesterday.year,yesterday.month,yesterday.day,23,59,59)

  ranking=dict()
  for user in accountsFollowedOMXH25:
    user=user.lower()
    cursor.execute ("SELECT count(datetime)  FROM tweets where user=%s and datetime>=%s and datetime<=%s",(user,startDatetime,endDatetime))
    row= cursor.fetchone()
    tweetCount=row[0]
    ranking[user]=tweetCount

  sortedRanking = sorted(ranking.iteritems(), key=operator.itemgetter(1),reverse=True)

  cursor.close()
  conn.commit()
  conn.close()

  #generate HTML file using external function
  htmlFileName="top_tweeters_OMXH25_%02d%02d%d-%02d%02d%d.html" % (startDatetime.day,startDatetime.month,startDatetime.year,endDatetime.day,endDatetime.month,endDatetime.year)
  #print htmlFileName
  webPath="/toptweeters/"+htmlFileName
  omxh25FileName="/path/to/dir/"+webPath
  title= "OMXH25 company tweets %02d.%02d.%d - %02d.%02d.%d" % (startDatetime.day,startDatetime.month,startDatetime.year,endDatetime.day,endDatetime.month,endDatetime.year)
  stat_functions.writeTweetBarChartHtmlFile(omxh25FileName,title,sortedRanking)

  webServer='http://wordpress.server.com'

  serverUrl='%s/xmlrpc.php' % webServer
  userName='adminuser'
  password='adminpwd'

# Create an object to represent our server.;
  server = xmlrpclib.Server(serverUrl);

#set blog entry content
  category="Visualization"
  if startDatetime.year==endDatetime.year:
    dateRange="%d.%d. - %d.%d.%d" % (startDatetime.day,startDatetime.month,endDatetime.day,endDatetime.month,endDatetime.year)
  else:
    dateRange="%d.%d.%d - %d.%d.%d" % (startDatetime.day,startDatetime.month,startDatetime.year,endDatetime.day,endDatetime.month,endDatetime.year)
  
  title="Top tweeters (OMXH25) %s" % dateRange
  content=""
  for i in range(5):
    account= sortedRanking[i][0]
    content+="@%s" % (account,account.capitalize())
    if i<3:
      content+=", "
    if i==3:
      content+=" and "
  content+=" are the top tweeters between %s." % (dateRange) 
  content+="
" content+='' % (webPath) content+="
" content+="" content+="Statistics are based on company tweets (excluding retweets) as seen on companies' Twitter home page. Accounts are assumed to be official corporate Twitter accounts (other Twitter accounts are excluded, such as division or country accounts). Accounts used in the statistics are listed in this Twitter list." content+="\n" content+="\n" content+='See also this blog post: Top Tweeters Statistics.' publish=True postData={'title':title, 'description': content, 'categories':[category] } result = server.metaWeblog.newPost(1, userName,password,postData,publish) #print ranking. This gets emailed to me after cron executes this script print "%s. Post URL: http://sami.salkosuo.net/?p=%s" % (title,result) print "Tweet ranking:" print for company,tweetCount in sortedRanking: print "%s\t\t%d" %( company.capitalize(),tweetCount) sys.exit(0) except MySQLdb.Error, e: print "Error %d: %s" % (e.args[0], e.args[1]) print "While handling %s" % (user) sys.exit (1)

Write HTML file with Google Charts code

def writeTweetBarChartHtmlFile(fileName,title,rankingDict):
  htmlFile = open(fileName, 'w')
  htmlFile.write("\n")
  htmlFile.write("""
 
    
    
  
  
    
""") htmlFile.write("\n") htmlFile.close()

Leave a Reply

Your email address will not be published. Required fields are marked *