In [13]:

from IPython import utils
from IPython.core.display import HTML
import os
def css_styling():
 """Load default custom.css file from ipython profile"""
 base = utils.path.get_ipython_dir()
 styles = "<style>\n%s\n</style>" % (open(os.path.join(base,'/home/shanks/Documents/New Folder/base16-ipython-notebook/base16-londontube-light.css'),'r').read())
 return HTML(styles)
css_styling()

Out[13]:

In [1]:

#Importing required Python packages
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from pprint import pprint
import pandas as pd
import zipfile
import requests
import StringIO
import datetime
import seaborn

np.random.seed(sum(map(ord, "aesthetics")))
seaborn.set_context('notebook')

pd.set_option('display.mpl_style', 'default') # Make the graphs a bit prettier
plt.rcParams['figure.figsize'] = (15, 5)

In [20]:

#The Dataset is a zip file. So let's first read in the dataset through requests then pass it on to Pandas through the
#read_csv command
url=requests.get('https://s3.amazonaws.com/tripdata/201402-citibike-tripdata.zip')
z=zipfile.ZipFile(StringIO.StringIO(url.content))

bike=pd.read_csv(z.open('2014-02 - Citi Bike trip data.csv'))

In [21]:

#Display the Top 5 rows in the dataset
print bike.head()

   tripduration            starttime             stoptime  start station id  \
0           382  2014-02-01 00:00:00  2014-02-01 00:06:22               294   
1           372  2014-02-01 00:00:03  2014-02-01 00:06:15               285   
2           591  2014-02-01 00:00:09  2014-02-01 00:10:00               247   
3           583  2014-02-01 00:00:32  2014-02-01 00:10:15               357   
4           223  2014-02-01 00:00:41  2014-02-01 00:04:24               401   

        start station name  start station latitude  start station longitude  \
0      Washington Square E               40.730494               -73.995721   
1       Broadway & E 14 St               40.734546               -73.990741   
2   Perry St & Bleecker St               40.735354               -74.004831   
3       E 11 St & Broadway               40.732618               -73.991580   
4  Allen St & Rivington St               40.720196               -73.989978   

   end station id          end station name  end station latitude  \
0             265  Stanton St & Chrystie St             40.722293   
1             439            E 4 St & 2 Ave             40.726281   
2             251       Mott St & Prince St             40.723180   
3             284     Greenwich Ave & 8 Ave             40.739017   
4             439            E 4 St & 2 Ave             40.726281   

   end station longitude  bikeid    usertype birth year  gender  
0             -73.991475   21101  Subscriber       1991       1  
1             -73.989780   15456  Subscriber       1979       2  
2             -73.994800   16281  Subscriber       1948       2  
3             -74.002638   17400  Subscriber       1981       1  
4             -73.989780   19341  Subscriber       1990       1

In [23]:

#Let's specify some optional parameters to make the dataset more digestable
bike=pd.read_csv(z.open('2014-02 - Citi Bike trip data.csv'), encoding='utf8', parse_dates=['starttime','stoptime'], 
                 index_col=None)
bike.set_index('bikeid',drop='False')
#Note above, we could've simply said index_cols = bikeid but it would drop the actual column. To retain bikeid in the
#dataset, I set index to None initially then replaced it with bikeid, this time setting the option not to drop the column.
#If this is complicated, skip and continue, I will illustrate in detail later.
print bike.head()

   tripduration           starttime            stoptime  start station id  \
0           382 2014-02-01 00:00:00 2014-02-01 00:06:22               294   
1           372 2014-02-01 00:00:03 2014-02-01 00:06:15               285   
2           591 2014-02-01 00:00:09 2014-02-01 00:10:00               247   
3           583 2014-02-01 00:00:32 2014-02-01 00:10:15               357   
4           223 2014-02-01 00:00:41 2014-02-01 00:04:24               401   

        start station name  start station latitude  start station longitude  \
0      Washington Square E               40.730494               -73.995721   
1       Broadway & E 14 St               40.734546               -73.990741   
2   Perry St & Bleecker St               40.735354               -74.004831   
3       E 11 St & Broadway               40.732618               -73.991580   
4  Allen St & Rivington St               40.720196               -73.989978   

   end station id          end station name  end station latitude  \
0             265  Stanton St & Chrystie St             40.722293   
1             439            E 4 St & 2 Ave             40.726281   
2             251       Mott St & Prince St             40.723180   
3             284     Greenwich Ave & 8 Ave             40.739017   
4             439            E 4 St & 2 Ave             40.726281   

   end station longitude  bikeid    usertype birth year  gender  
0             -73.991475   21101  Subscriber       1991       1  
1             -73.989780   15456  Subscriber       1979       2  
2             -73.994800   16281  Subscriber       1948       2  
3             -74.002638   17400  Subscriber       1981       1  
4             -73.989780   19341  Subscriber       1990       1

In [326]:

#The object we've imported bike is called a DataFrame, and is made up of rows and columns. 
#Think of this like excel on wheels.

#Let's get some info on the columns our dataframe.
bike.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 224736 entries, 0 to 224735
Data columns (total 15 columns):
tripduration               224736 non-null int64
starttime                  224736 non-null datetime64[ns]
stoptime                   224736 non-null datetime64[ns]
start station id           224736 non-null int64
start station name         224736 non-null object
start station latitude     224736 non-null float64
start station longitude    224736 non-null float64
end station id             224736 non-null int64
end station name           224736 non-null object
end station latitude       224736 non-null float64
end station longitude      224736 non-null float64
bikeid                     224736 non-null int64
usertype                   224736 non-null object
birth year                 224736 non-null object
gender                     224736 non-null int64
dtypes: datetime64[ns](2), float64(4), int64(5), object(4)

In [53]:

#Some more detail
bike.describe()

Out[53]:

	tripduration	start station id	start station latitude	start station longitude	end station id	end station latitude	end station longitude	gender
count	224736.000000	224736.000000	224736.000000	224736.000000	224736.000000	224736.000000	224736.000000	224736.000000
mean	874.519810	439.203479	40.734366	-73.990386	440.741995	40.734221	-73.990521	1.154617
std	5486.092219	335.723861	0.019031	0.011853	341.497433	0.019048	0.011920	0.436592
min	60.000000	72.000000	40.680342	-74.017134	72.000000	40.680342	-74.017134	0.000000
25%	360.000000	305.000000	40.721854	-73.998522	305.000000	40.721816	-73.999061	1.000000
50%	544.000000	403.000000	40.736197	-73.990617	403.000000	40.735877	-73.990741	1.000000
75%	845.000000	490.000000	40.749156	-73.981918	488.000000	40.749013	-73.981948	1.000000
max	766108.000000	3002.000000	40.770513	-73.950048	3002.000000	40.770513	-73.950048	2.000000

In [24]:

#Accessing Columns
print bike['gender'].head() #or just bike.gender

0    1
1    2
2    2
3    1
4    1
Name: gender, dtype: int64

In [87]:

#Plenty of functions/transformations available - this below lists the unique values.
print bike.gender.unique()

#Calling a histogram on a column by using hist()
bike['start station id'].hist()

#Try hitting tab after bike.colname to see options

[1 2 0]

Out[87]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f8757f3c350>

In [91]:

#Plotting is easy, we could just do a table.colname.plot() but to get something pretty to display, let's do
#a scatter plot
plt.scatter(bike.index[:25], bike['tripduration'][:25], cmap=plt.cm.RdBu)
plt.show()

In [92]:

#Slicing - first 5 rows in the DF
bike[:5]

Out[92]:

	tripduration	starttime	stoptime	start station id	start station name	start station latitude	start station longitude	end station id	end station name	end station latitude	end station longitude	usertype	birth year	gender
bikeid
21101	382	2014-02-01 00:00:00	2014-02-01 00:06:22	294	Washington Square E	40.730494	-73.995721	265	Stanton St & Chrystie St	40.722293	-73.991475	Subscriber	1991	1
15456	372	2014-02-01 00:00:03	2014-02-01 00:06:15	285	Broadway & E 14 St	40.734546	-73.990741	439	E 4 St & 2 Ave	40.726281	-73.989780	Subscriber	1979	2
16281	591	2014-02-01 00:00:09	2014-02-01 00:10:00	247	Perry St & Bleecker St	40.735354	-74.004831	251	Mott St & Prince St	40.723180	-73.994800	Subscriber	1948	2
17400	583	2014-02-01 00:00:32	2014-02-01 00:10:15	357	E 11 St & Broadway	40.732618	-73.991580	284	Greenwich Ave & 8 Ave	40.739017	-74.002638	Subscriber	1981	1
19341	223	2014-02-01 00:00:41	2014-02-01 00:04:24	401	Allen St & Rivington St	40.720196	-73.989978	439	E 4 St & 2 Ave	40.726281	-73.989780	Subscriber	1990	1

In [105]:

#Slicing - first 5 rows of specific column(s)
bike[['gender','tripduration']][:5]

##Note the 2 sets of [] - list of lists

Out[105]:

	gender	tripduration
bikeid
21101	1	382
15456	2	372
16281	2	591
17400	1	583
19341	1	223

In [117]:

#Find the startstation id with the maximum trips
bike['start station id'].value_counts()

#It's station 293 with 2920 trips

Out[117]:

293    2920
519    2719
497    2493
435    2403
521    2171
490    1944
477    1916
382    1808
151    1796
285    1795
402    1694
318    1688
492    1570
444    1562
379    1538
...
298     89
339     82
242     76
218     76
2001    75
216     72
253     69
372     57
119     57
418     56
431     54
278     45
443     41
2005    36
320      4
Length: 329, dtype: int64

In [167]:

#Find the start station with the longest trip

#First find the longest trip
print "Longest Trip: %.6f" %bike.tripduration.max()

#Now retrieve row from DF where duration is the longest
bike[bike.tripduration==bike.tripduration.max()]


#bike[bike.tripduration.idxmax()]

Longest Trip: 766108.000000

Out[167]:

	tripduration	starttime	stoptime	start station id	start station name	start station latitude	start station longitude	end station id	end station name	end station latitude	end station longitude	usertype	birth year	gender
bikeid
15379	766108	2014-02-08 15:52:51	2014-02-17 12:41:19	353	S Portland Ave & Hanson Pl	40.685396	-73.974315	281	Grand Army Plaza & Central Park S	40.764397	-73.973715	Subscriber	1982	2

In [171]:

#Find the start station with the longest trip by a Female Biker

#First find the longest trip by a female biker
print "Longest Trip by a Female Biker: %.6f" %bike[bike.gender==1].tripduration.max()

#Now retrieve row from DF where duration is the longest
bike[bike.tripduration==bike[bike.gender==1].tripduration.max()]


#bike[bike.tripduration.idxmax()]

Longest Trip by a Female Biker: 585281.000000

Out[171]:

	tripduration	starttime	stoptime	start station id	start station name	start station latitude	start station longitude	end station id	end station name	end station latitude	end station longitude	usertype	birth year	gender
bikeid
16149	585281	2014-02-13 16:52:23	2014-02-20 11:27:04	252	MacDougal St & Washington Sq	40.732264	-73.998522	348	W Broadway & Spring St	40.72491	-74.001547	Subscriber	1973	1

In [204]:

#Plot the Top 10 longest trips by Male Subscribers as a Bar Plot

#Let's first get all Male Subscribers
bike[(bike.usertype=="Subscriber") & (bike.gender==2)]

#Now, valuecounts of tripduration for these subscribers - only the Top 10
print "Top 10 Male Subscribers with the longest trips"
print bike[(bike.usertype=="Subscriber") & (bike.gender==2)].tripduration.value_counts()[:10]

#Now we'll just plot this as a bar
bike[(bike.usertype=="Subscriber") & (bike.gender==2)].tripduration.value_counts()[:10].plot(kind='bar')
plt.suptitle('Top 10 Longest Trips by Male Subscribers', fontsize=14)
plt.show()

Top 10 Male Subscribers with the longest trips
427    76
557    71
403    70
385    70
530    68
458    68
372    68
364    67
420    67
531    66
dtype: int64

In [213]:

#The columns of a Data Frame are of type called Series. These are internally just numpy arrays, which we can find out
#by calling the values() method
bike['start station id'].values

Out[213]:

array([294, 285, 247, ..., 497, 353, 252])

In [327]:

#OK let's dig into some Time Series functionality. Pandas again is excellent in this area.

#Say we want to get the day from the timestamp
temp=pd.DatetimeIndex(bike.starttime) 
temp.day
#NOTE: the .day, .month.. functions work only if the Date field is indexed. Since in our case it's not we had to
#temporarily index it. Otherwise we could've just done a bike.starttime.day

#Let's calc and store the weekday. Note we're adding this as a new column to bike
bike['weekday']=temp.weekday

In [336]:

#GROUP BY - now let's say we want to group all bikers by weekday we could do

#Let's add weekday as an index to the DF. We're choosing to append rather than replace the original index.
bike.set_index('weekday',drop='false', append='true')
test=bike.groupby('weekday').aggregate('count')['bikeid']
print test

#Say we wanted to plot this, but we want to change the indexes to something meaningful
test.index=['Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday']
test.plot(kind='bar',color='orange')
plt.show()

weekday
0          27222
1          35022
2          31020
3          31900
4          30371
5          34062
6          35139
Name: bikeid, dtype: int64

In [369]:

#Let's now group all Customers on working days
female_customers=bike[(bike.usertype=='Customer')]
grouped=female_customers.groupby('weekday').aggregate('count')
grouped=grouped[(grouped.index.values>=1) & (grouped.index.values<=5)]
grouped.index=['Monday','Tuesday','Wednesday','Thursday','Friday']
grouped['bikeid'].plot(kind='bar')
plt.suptitle('Customers biking on Working Days', fontsize=14)
plt.show()

In [2]:

#OK great let's now grab some weather data from San Francisco from Weather Underground. Firstly we'll define a custom URL
#to grab specific months of data.

#The URL template will have placeholders for dates so we can fill them up as we like.
url_template='http://www.wunderground.com/history/airport/KSFO/{start_year}/{start_month}/{start_day}' \
             '/CustomHistory.html?dayend={end_day}&monthend={end_month}&yearend={end_year}' \
             '&req_city=NA&req_state=NA&req_statename=NA&MR=1&format=1'

#Let's generate the URL to retrieve data from Sep 1, 2014 to Dec 10,2014
url=url_template.format(start_year=2014, start_month=9, start_day=1, end_year=2014, end_month=12, end_day=10)

#Great, let's now open the URL and read it in as a CSV.
sfo=pd.read_csv(url, header=1, parse_dates=True, index_col='PST')

In [25]:

print sfo.head()

            MaxTemperatureF  MeanTemperatureF  MinTemperatureF  MaxDewPointF  \
2014-08-01               72                66               59            58   
2014-08-02               69                64               59            57   
2014-08-03               70                65               60            57   
2014-08-04               75                68               60            59   
2014-08-05               74                68               62            62   

            MeanDewPointF  MinDewpointF  MaxHumidity  MeanHumidity  \
2014-08-01             57            57           90            76   
2014-08-02             56            55           93            78   
2014-08-03             56            55           84            73   
2014-08-04             57            55           84            73   
2014-08-05             58            57           84            75   

            MinHumidity  MaxSeaLevelPressureIn          ...            \
2014-08-01           61                  29.93          ...             
2014-08-02           63                  29.96          ...             
2014-08-03           61                  29.98          ...             
2014-08-04           61                  30.01          ...             
2014-08-05           66                  30.08          ...             

            MaxVisibilityMiles  MeanVisibilityMiles  MinVisibilityMiles  \
2014-08-01                  10                   10                  10   
2014-08-02                  10                   10                  10   
2014-08-03                  10                   10                  10   
2014-08-04                  10                   10                  10   
2014-08-05                  10                   10                  10   

            MaxWindSpeedMPH  MeanWindSpeedMPH  MaxGustSpeedMPH  \
2014-08-01               24                15               29   
2014-08-02               27                16               33   
2014-08-03               22                11               28   
2014-08-04               16                 9               22   
2014-08-05               23                11               26   

            PrecipitationIn  CloudCover Events  WindDirDegrees<br/>  
2014-08-01             0.00           3    NaN            297<br />  
2014-08-02             0.00           4    NaN            286<br />  
2014-08-03             0.00           6    NaN            254<br />  
2014-08-04             0.00           6    NaN            261<br />  
2014-08-05             0.00           6    NaN            296<br />  

[5 rows x 22 columns]

In [3]:

#Let's remove spaces in column names to make things easier down the line.
sfo = sfo.rename(columns=lambda x: x.replace(' ', ''))
sfo.columns

Out[3]:

Index([u'MaxTemperatureF', u'MeanTemperatureF', u'MinTemperatureF', u'MaxDewPointF', u'MeanDewPointF', u'MinDewpointF', u'MaxHumidity', u'MeanHumidity', u'MinHumidity', u'MaxSeaLevelPressureIn', u'MeanSeaLevelPressureIn', u'MinSeaLevelPressureIn', u'MaxVisibilityMiles', u'MeanVisibilityMiles', u'MinVisibilityMiles', u'MaxWindSpeedMPH', u'MeanWindSpeedMPH', u'MaxGustSpeedMPH', u'PrecipitationIn', u'CloudCover', u'Events', u'WindDirDegrees<br/>'], dtype='object')

In [4]:

#Great, now let's plot the mean temp
with seaborn.axes_style('darkgrid'):
    sfo.MeanTemperatureF.plot(figsize=(15,5), color='brown')
plt.suptitle('San Francisco - Mean Temperatures in Fahrenheit',fontsize=15)
plt.show()

#That was as easy as it could get.

In [65]:

sfo.describe()

Out[65]:

	MaxTemperatureF	MeanTemperatureF	MinTemperatureF	MaxDewPointF	MeanDewPointF	MinDewpointF	MaxHumidity	MeanHumidity	MinHumidity	MaxSeaLevelPressureIn	MeanSeaLevelPressureIn	MinSeaLevelPressureIn	MaxVisibilityMiles	MeanVisibilityMiles	MinVisibilityMiles	MaxWindSpeedMPH	MeanWindSpeedMPH	MaxGustSpeedMPH	CloudCover	hour
count	101.000000	101.000000	101.000000	101.000000	101.000000	101.000000	101.000000	101.000000	101.000000	101.000000	101.000000	101.000000	101	101.000000	101.000000	101.000000	101.000000	101.000000	101.000000	101
mean	72.495050	65.405941	57.831683	59.316832	55.722772	51.821782	91.009901	73.871287	56.237624	30.047327	29.989208	29.931188	10	9.465347	7.445545	19.990099	7.831683	24.821782	3.801980	0
std	7.315222	5.312585	4.406970	3.679486	4.343084	6.358295	7.686996	10.905653	15.820966	0.134907	0.134964	0.138818	0	0.975339	3.579037	5.148777	3.307777	6.219961	2.135508	0
min	60.000000	55.000000	46.000000	49.000000	38.000000	28.000000	67.000000	43.000000	13.000000	29.810000	29.750000	29.680000	10	5.000000	0.000000	8.000000	1.000000	12.000000	0.000000	0
25%	67.000000	61.000000	56.000000	57.000000	54.000000	49.000000	87.000000	69.000000	48.000000	29.950000	29.900000	29.840000	10	9.000000	4.000000	16.000000	5.000000	21.000000	2.000000	0
50%	72.000000	66.000000	58.000000	60.000000	57.000000	54.000000	90.000000	74.000000	57.000000	30.020000	29.970000	29.910000	10	10.000000	10.000000	21.000000	8.000000	25.000000	4.000000	0
75%	76.000000	69.000000	61.000000	61.000000	58.000000	56.000000	100.000000	80.000000	66.000000	30.130000	30.070000	30.020000	10	10.000000	10.000000	24.000000	10.000000	29.000000	6.000000	0
max	95.000000	81.000000	66.000000	66.000000	64.000000	62.000000	100.000000	95.000000	90.000000	30.460000	30.390000	30.330000	10	10.000000	10.000000	32.000000	14.000000	50.000000	8.000000	0

In [17]:

#It's easy to concactenate dataframes

#Let's generate the URL to retrieve data from Aug 1, 2014 to Aug 31,2014
url=url_template.format(start_year=2014, start_month=8, start_day=1, end_year=2014, end_month=8, end_day=31)

#Great, let's now open the URL and read it in as a CSV.
sfo_aug=pd.read_csv(url, header=1, parse_dates=True, index_col='PDT')

#Renaming columns
sfo_aug = sfo_aug.rename(columns=lambda x: x.replace(' ', ''))

#Concatenate Aug and Sep months
sfo = pd.concat([sfo_aug,sfo])

sfo.index.get_values

Out[17]:

<bound method DatetimeIndex.get_values of <class 'pandas.tseries.index.DatetimeIndex'>
[2014-08-01, ..., 2014-12-10]
Length: 194, Freq: None, Timezone: None>

In [18]:

#Saving back to CSV
sfo.to_csv('SFO_Weather_Aug_to_Dec_14.csv')

Dive-in to Pandas

Dive-in to Pandas

Reading from CSV

Comments