The Big Datawski

A blog that really ties the room together

Testing Ipython

Download the data

In []:
%matplotlib inline
import mta

mta.get_turnstile_data(num_files=9,
                       data_url="http://web.mta.info/developers/turnstile.html",
                       destination='./data/',
                       start_from_id = 130105)

Get the data out of the csvs

In []:
filenames = mta.get_glob()
ts_data = mta.extract_turnstile_data_from_csvs(filenames)
print len(ts_data)

Get turnstile time series

In []:
 turnstile_timeseries = mta.get_turnstile_timeseries(ts_data)

Collapse turnstiles to booth-remotes

In []:
# collapse the data to booth-remotes
#[takes a minute but not as long as the day count extraction]
boothremote_timeseries = mta.collapse_data_to_booth_remote(turnstile_timeseries)
In []:
# how many boothremotes are there?
print 'number of boothremotes: ',len(boothremote_timeseries) 

# what does ONE boothremote look like?
br_id, br_counts = boothremote_timeseries.iteritems().next()
print 'br id of first booth remote: ',br_id
print 'daily counts:'
from pprint import pprint
pprint(br_counts)

PLOT

In []:
import matplotlib.pyplot as plt
# plot the turnstile entry counts per day for one boothremote
dates,counts = mta.dates_and_counts(br_counts)
f = plt.figure(figsize=(10,3))
p = plt.plot(dates,counts)
In []:
import numpy as np

counts_by_week = mta.reshape_flat_counts_to_weekly(counts)
#print counts_by_week

for week in counts_by_week:
    plt.plot(week)
    
xl = plt.xlabel('Day of the week')
yl = plt.ylabel('Number of turnstile entries')
xt = plt.xticks(np.arange(7),['S','S','M','T','W','R','F'])

Function for rainbow weekly ridership curves

In []:
# let's make a function that does everything we just did to make this "rainbow line" plot
# try it out on a random boothremote
import random
br = random.sample(boothremote_timeseries.items(),1)[0]

# what station is that anyway?
# make the station key
station_key = mta.create_booth_remote_station_key('data/turnstile_key.csv')
#print station_key[br_id]

mta.plot_weeks(br,station_key[br[0]])
In []:
# make a function that does all that stuff up there and do it on a random station
import random
reload(mta)

#sta = ('R202','R042')
mta.plot_timeseries_and_weeklies(mta.get_a_random_boothremote(boothremote_timeseries),station_key)
#plot_timeseries_and_weeklies((sta,boothremotes[sta]))
#pprint(boothremotes[sta])
In []:
# let's take the rainbow plot and turn it into averages with error bars

# boothremotes weekly
br = mta.get_a_random_boothremote(boothremote_timeseries)
station_name = station_key[br[0]]
print br[0], station_name 

weekly = mta.reshape_flat_counts_to_weekly([c for d,c in br[1]])
print weekly

# numpy arrays are awesome! by using the axis parameter you can find the mean of all columns at once.
import numpy as np
print np.mean(weekly,0).astype(int)
print np.std(weekly,0).astype(int)
In []:
#so let's make a plot of the means 
from scipy import stats
err = stats.sem(weekly)
#print err

f=plt.figure(figsize=(8,3))
plt.errorbar(np.arange(7),np.mean(weekly,0),yerr=err)
#plt.errorbar(np.arange(7),np.mean(weekly,0),yerr=np.std(weekly,0))
#a = plt.gca()
xl = plt.xlim([-1,7])
xt = plt.xticks(np.arange(8),['S','S','M','T','W','R','F',''])
tt = plt.title(station_name)
In []:
# and let's turn this into a function

def plot_avg_weeklies_with_err_bars(br=None):
    if not br: 
        br=mta.get_a_random_boothremote(boothremote_timeseries)

    weekly = mta.reshape_flat_counts_to_weekly([c for d,c in br[1]])
    err = stats.sem(weekly)
    f=plt.figure(figsize=(8,3))
    plt.errorbar(np.arange(7),np.mean(weekly,0),yerr=err)
    xl = plt.xlim([-1,7])
    xt = plt.xticks(np.arange(8),['S','S','M','T','W','R','F',''])
    tt = plt.title(station_key[br[0]])
    return 

#test it
plot_avg_weeklies_with_err_bars()
In []:
# so there is more than one booth remote per station. Let's see what we can do about that

# we have the station key but it goes the wrong way. let's change it around so that we are indexing on the station name
from collections import defaultdict

station_dict = defaultdict(list)
for boothremote,station in station_key.items():
    station_dict[station].append(boothremote)
    
#pprint(dict(station_dict))

#let's get a random station
def get_a_random_station():
    station = random.sample(station_dict,1)[0]
    return station
print station, station_dict[station]


#'86 ST' is a good one with lots of booth remotes
In []:
 
In []:
 
In []:
# and let's alter our function from above to plot all of the booth remotes for a station
def plot_all_brs_for_station(station=None):
    if not station: 
        station=get_a_random_station()
    f=plt.figure(figsize=(8,3))
    
    for boothremote in station_dict[station]:
        br = boothremote_timeseries[boothremote]
        weekly = mta.reshape_flat_counts_to_weekly([c for d,c in br])
        #print boothremote
        #print weekly
        plt.errorbar(np.arange(7),np.mean(weekly,0),yerr=err,label=boothremote)
        
    xl = plt.xlim([-1,7])
    xt = plt.xticks(np.arange(8),['S','S','M','T','W','R','F',''])
    tt = plt.title(station)
    plt.legend(bbox_to_anchor=(1.35,1))
    #lg = plt.figlegend(f,label=station_dict[station],loc=3)
    return 

plot_all_brs_for_station()
#plot_all_brs_for_station(station='BOWLING GREEN')
In []:
stations = {}
for station,br_names in station_dict.items():
    total_ridership = 0
    daily_totals = np.array([0,0,0,0,0,0,0])
    for br_name in br_names: 
        br = boothremote_timeseries[br_name]
        weekly = mta.reshape_flat_counts_to_weekly([c for d,c in br])
        weekly_sums = np.sum(weekly,0)
        daily_totals = daily_totals+weekly_sums
        total_ridership += np.sum(weekly_sums)
    stations[station]={
    'daily_totals':daily_totals,
    'total_ridership':total_ridership}
    

        
In []:
total_ridership = []
for station,br_names in station_dict.items():
    total_ridership.append((sum([sum([c for d,c in boothremote_timeseries[br_name]]) for br_name in br_names]),station))
    
#average_weekly_ridership = []
#for station,br_names in station_dict.items()[:5]:
    #rect= [[c for d,c in boothremote_timeseries[br_name]] for br_name in br_names]
    #print len(br_names)
    #print rect

        
In []:
pprint(list(reversed(sorted(total_ridership))))
In []:
station_totals = [a for (a,b) in total_ridership]
#print station_totals
h = plt.hist(station_totals,50)

Hello

Hello, Ladies and Gentlemen. What’s up?