Commentary On Multiple Database Twitter Crawler

#Import libraries to open URLs, interact with the Twitter API, parse JSON,
#use SQLite, and ignore security certificates.
import urllib.request, urllib.parse, urllib.error
import twurl
import json
import sqlite3
import ssl

#The relevant Twitter API endpoint.
TWITTER_URL = 'https://api.twitter.com/1.1/friends/list.json'

#Create (or open) the SQLite database file and get a cursor.
conn = sqlite3.connect('friends.sqlite')
cur = conn.cursor()

#Create the People and Follows tables. People lists every user we've
#encountered, records whose friend lists we've already retrieved, and
#assigns a unique number to each one. (Should be 1-1 between ids and
#names.)
cur.execute('''CREATE TABLE IF NOT EXISTS People
    (id INTEGER PRIMARY KEY, name TEXT UNIQUE, retrieved INTEGER)''')
cur.execute('''CREATE TABLE IF NOT EXISTS Follows
    (from_id INTEGER, to_id INTEGER, UNIQUE(from_id, to_id))''')
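#(A sketch, not part of the crawler: in SQLite an INTEGER PRIMARY KEY
#column is an alias for the rowid, so inserting just a name auto-assigns
#the next id -- which is what keeps ids and names 1-1 -- and cur.lastrowid
#reports the number that was just handed out. The 'drchuck' name is only
#an example:
#cur.execute("INSERT INTO People (name, retrieved) VALUES ('drchuck', 0)")
#print(cur.lastrowid)
#)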

#create context to ignore SSL errors.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
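#(An aside: my understanding is that this shuts off certificate
#verification so urlopen won't die with CERTIFICATE_VERIFY_FAILED on
#machines with a stale or missing certificate store -- fine for an
#exercise, not something to ship.)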

#main drag
while True:

#Prompt the user for an account. If the user just hits "enter," the app
#looks for a user whose follow-list has not yet been retrieved -- this time
#fetching both the name and the id. If it doesn't find any, it alerts the
#user and restarts the loop. If the user enters a name, the app looks up
#its id in the database -- if it doesn't find one, it tries to insert the
#account, printing an error if the insert is not successful (why wouldn't
#it be successful?) and otherwise setting id to the newly assigned row id
#(why?).
    acct = input('Enter a Twitter account, or quit: ')
    if (acct == 'quit'): break
    if (len(acct) < 1):
        cur.execute('SELECT id, name FROM People WHERE retrieved = 0 LIMIT 1')
        try:
            (id, acct) = cur.fetchone()
        except:
            print('No unretrieved Twitter accounts found.')
            continue
    else:
        cur.execute('SELECT id FROM People WHERE name = ? LIMIT 1', (acct,))
        try:
            id = cur.fetchone()[0]
        except:
            cur.execute('''INSERT OR IGNORE INTO People (name, retrieved) VALUES (?,0)''',(acct,))
            conn.commit()
            if cur.rowcount != 1:
                print('Error inserting account:', acct)
                continue
            id = cur.lastrowid
#Notice countnew hasn't yet been defined at this point! Incrementing it
#here would raise a NameError, and the counters are reset below before the
#loop over friends anyway, so no increment belongs here.
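#(Answering my own questions above, as far as I can tell: cur.fetchone()
#returns None when the SELECT matches nothing, so fetchone()[0] raises a
#TypeError and drops us into the except branch -- that's the whole trick.
#And the INSERT only "fails" if the UNIQUE constraint on name fires
#anyway, in which case INSERT OR IGNORE silently skips the row and
#cur.rowcount stays 0. A scratch illustration, separate from the crawler:
#c = sqlite3.connect(':memory:').cursor()
#c.execute('CREATE TABLE t (name TEXT UNIQUE)')
#c.execute("INSERT OR IGNORE INTO t VALUES ('bob')")
#c.execute("INSERT OR IGNORE INTO t VALUES ('bob')")
#print(c.rowcount)   # should print 0 -- the second insert was ignored
#As for the id: we need the new row's id on hand for the Follows inserts
#further down.)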

#Augment the URL with the API keys and attempt to open the account's
#friend list into a handle, "connection", printing an error message if it
#fails.
    url = twurl.augment(TWITTER_URL, {'screen_name': acct, 'count': '100'})
    print('Retrieving account', acct)
    try:
        connection = urllib.request.urlopen(url, context = ctx)
    except Exception as err:
        print('Failed to retrieve', err)
        break

#Read the data from "connection" and decode it as UTF-8; grab the headers
#from the response so we can tell the user how much of the API rate limit
#remains.
    data = connection.read().decode()
    headers = dict(connection.getheaders())

    print('Remaining', headers['x-rate-limit-remaining'])
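#(If I remember right, this endpoint allows 15 requests per 15-minute
#window, so the header above counts down from 15; once it hits 0, the API
#returns error json until the window resets.)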

#Attempt to load json from the data.
    try:
        js = json.loads(data)
    except:
        print('Unable to parse json')
        print(data)
        break

    #Debugging
    #print(json.dumps(js, indent = 4))
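#(For reference -- a sketch from memory, not the exact payload -- a
#healthy response is a dictionary with a 'users' list, roughly:
#    {"users": [{"screen_name": "...", ...}, ...], "next_cursor": 0, ...}
#Error and rate-limit responses are json too, but have no 'users' key,
#which is what the check below catches.)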

#Checks to make sure we have the correct json (why wouldn't we?)
    if 'users' not in js:
        print('Incorrect JSON received')
        print(json.dumps(js, indent=4))
        continue

#Update the database to mark the account we chose above as retrieved.
    cur.execute('UPDATE People SET retrieved=1 WHERE name=?', (acct,))

#Define variables to count the number of users we're encountering for the
#first time and the number we've already found. Go through each user in the
#json and try to find that user in our database -- if we do, bump the count
#of users we've already found; if we don't, add them to People and bump the
#count of new users.
    countnew = 0
    countold = 0
    for u in js['users']:
        friend = u['screen_name']
        print(friend)
        cur.execute('SELECT id FROM People WHERE name = ? LIMIT 1', (friend,))
        try:
            friend_id = cur.fetchone()[0]
            countold = countold + 1
        except:
            cur.execute('''INSERT OR IGNORE INTO People (name, retrieved) VALUES (?,0)''',(friend,))
            conn.commit()
            if cur.rowcount != 1:
                print('Error inserting account:', friend)
                continue
            friend_id = cur.lastrowid
            countnew = countnew + 1

#Add an entry to our Follows table indicating that the user with unique
#id x follows the user with unique id y.
        cur.execute('''INSERT OR IGNORE INTO Follows (from_id, to_id) VALUES (?,?)''', (id, friend_id))

#Print our trackers for new and revisited accounts, alert the user of the
#API limit, and save. (These lines sit outside the for loop so they run
#once per retrieved account, not once per friend.)
    print('New accounts=', countnew, ' revisited=', countold)
    print('Remaining', headers['x-rate-limit-remaining'])
    conn.commit()

#Note: if this is not put outside the main drag, I found that the
#application quits with an error at the next prompt because the cursor has
#been closed.
cur.close()
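
To eyeball the result afterwards, a separate little script along these lines (my own sketch, assuming friends.sqlite is sitting in the working directory) joins Follows back to People and prints who follows whom by name:

#read the crawl results back out of friends.sqlite
import sqlite3
conn = sqlite3.connect('friends.sqlite')
cur = conn.cursor()
cur.execute('''SELECT f.name, t.name FROM Follows
    JOIN People AS f ON Follows.from_id = f.id
    JOIN People AS t ON Follows.to_id = t.id''')
for (from_name, to_name) in cur.fetchall():
    print(from_name, 'follows', to_name)
cur.close()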
