Download the data
In []:
%matplotlib inline
import mta
mta.get_turnstile_data(num_files=9,
data_url="http://web.mta.info/developers/turnstile.html",
destination='./data/',
start_from_id = 130105)
Get the data out of the csvs
In []:
filenames = mta.get_glob()
ts_data = mta.extract_turnstile_data_from_csvs(filenames)
print len(ts_data)
Get turnstile time series
In []:
turnstile_timeseries = mta.get_turnstile_timeseries(ts_data)
Collapse turnstiles to booth-remotes
In []:
# collapse the data to booth-remotes
#[takes a minute but not as long as the day count extraction]
boothremote_timeseries = mta.collapse_data_to_booth_remote(turnstile_timeseries)
In []:
# how many boothremotes are there?
print 'number of boothremotes: ',len(boothremote_timeseries)
# what does ONE boothremote look like?
br_id, br_counts = boothremote_timeseries.iteritems().next()
print 'br id of first booth remote: ',br_id
print 'daily counts:'
from pprint import pprint
pprint(br_counts)
PLOT
In []:
import matplotlib.pyplot as plt
# plot the turnstile entry counts per day for one boothremote
dates,counts = mta.dates_and_counts(br_counts)
f = plt.figure(figsize=(10,3))
p = plt.plot(dates,counts)
In []:
import numpy as np
counts_by_week = mta.reshape_flat_counts_to_weekly(counts)
#print counts_by_week
for week in counts_by_week:
plt.plot(week)
xl = plt.xlabel('Day of the week')
yl = plt.ylabel('Number of turnstile entries')
xt = plt.xticks(np.arange(7),['S','S','M','T','W','R','F'])
Function for rainbow weekly ridership curves
In []:
# let's make a function that does everything we just did to make this "rainbow line" plot
# try it out on a random boothremote
import random
br = random.sample(boothremote_timeseries.items(),1)[0]
# what station is that anyway?
# make the station key
station_key = mta.create_booth_remote_station_key('data/turnstile_key.csv')
#print station_key[br_id]
mta.plot_weeks(br,station_key[br[0]])
In []:
# make a function that does all that stuff up there and do it on a random station
import random
reload(mta)
#sta = ('R202','R042')
mta.plot_timeseries_and_weeklies(mta.get_a_random_boothremote(boothremote_timeseries),station_key)
#plot_timeseries_and_weeklies((sta,boothremotes[sta]))
#pprint(boothremotes[sta])
In []:
# let's take the rainbow plot and turn it into averages with error bars
# boothremotes weekly
br = mta.get_a_random_boothremote(boothremote_timeseries)
station_name = station_key[br[0]]
print br[0], station_name
weekly = mta.reshape_flat_counts_to_weekly([c for d,c in br[1]])
print weekly
# numpy arrays are awesome! by using the axis parameter you can find the mean of all columns at once.
import numpy as np
print np.mean(weekly,0).astype(int)
print np.std(weekly,0).astype(int)
In []:
#so let's make a plot of the means
from scipy import stats
err = stats.sem(weekly)
#print err
f=plt.figure(figsize=(8,3))
plt.errorbar(np.arange(7),np.mean(weekly,0),yerr=err)
#plt.errorbar(np.arange(7),np.mean(weekly,0),yerr=np.std(weekly,0))
#a = plt.gca()
xl = plt.xlim([-1,7])
xt = plt.xticks(np.arange(8),['S','S','M','T','W','R','F',''])
tt = plt.title(station_name)
In []:
# and let's turn this into a function
def plot_avg_weeklies_with_err_bars(br=None):
if not br:
br=mta.get_a_random_boothremote(boothremote_timeseries)
weekly = mta.reshape_flat_counts_to_weekly([c for d,c in br[1]])
err = stats.sem(weekly)
f=plt.figure(figsize=(8,3))
plt.errorbar(np.arange(7),np.mean(weekly,0),yerr=err)
xl = plt.xlim([-1,7])
xt = plt.xticks(np.arange(8),['S','S','M','T','W','R','F',''])
tt = plt.title(station_key[br[0]])
return
#test it
plot_avg_weeklies_with_err_bars()
In []:
# so there is more than one booth remote per station. Let's see what we can do about that
# we have the station key but it goes the wrong way. let's change it around so that we are indexing on the station name
from collections import defaultdict
station_dict = defaultdict(list)
for boothremote,station in station_key.items():
station_dict[station].append(boothremote)
#pprint(dict(station_dict))
#let's get a random station
def get_a_random_station():
station = random.sample(station_dict,1)[0]
return station
print station, station_dict[station]
#'86 ST' is a good one with lots of booth remotes
In []:
In []:
In []:
# and let's alter our function from above to plot all of the booth remotes for a station
def plot_all_brs_for_station(station=None):
if not station:
station=get_a_random_station()
f=plt.figure(figsize=(8,3))
for boothremote in station_dict[station]:
br = boothremote_timeseries[boothremote]
weekly = mta.reshape_flat_counts_to_weekly([c for d,c in br])
#print boothremote
#print weekly
plt.errorbar(np.arange(7),np.mean(weekly,0),yerr=err,label=boothremote)
xl = plt.xlim([-1,7])
xt = plt.xticks(np.arange(8),['S','S','M','T','W','R','F',''])
tt = plt.title(station)
plt.legend(bbox_to_anchor=(1.35,1))
#lg = plt.figlegend(f,label=station_dict[station],loc=3)
return
plot_all_brs_for_station()
#plot_all_brs_for_station(station='BOWLING GREEN')
In []:
stations = {}
for station,br_names in station_dict.items():
total_ridership = 0
daily_totals = np.array([0,0,0,0,0,0,0])
for br_name in br_names:
br = boothremote_timeseries[br_name]
weekly = mta.reshape_flat_counts_to_weekly([c for d,c in br])
weekly_sums = np.sum(weekly,0)
daily_totals = daily_totals+weekly_sums
total_ridership += np.sum(weekly_sums)
stations[station]={
'daily_totals':daily_totals,
'total_ridership':total_ridership}
In []:
total_ridership = []
for station,br_names in station_dict.items():
total_ridership.append((sum([sum([c for d,c in boothremote_timeseries[br_name]]) for br_name in br_names]),station))
#average_weekly_ridership = []
#for station,br_names in station_dict.items()[:5]:
#rect= [[c for d,c in boothremote_timeseries[br_name]] for br_name in br_names]
#print len(br_names)
#print rect
In []:
pprint(list(reversed(sorted(total_ridership))))
In []:
station_totals = [a for (a,b) in total_ridership]
#print station_totals
h = plt.hist(station_totals,50)