DBSCAN for Uber 🚕
In this example, we will use DBSCAN to cluster the locations of Uber pickups in New York City. We will compare the results of DBSCAN to a baseline K-Means model.
In [2]:
Copied!
# Core scientific stack used throughout the notebook.
import pandas as pd
import numpy as np
# `plt` must alias the pyplot submodule: binding the top-level `matplotlib`
# package to `plt` makes calls such as `plt.plot` / `plt.subplots` fail.
import matplotlib.pyplot as plt
In [3]:
Copied!
# Load the April 2014 Uber pickup records straight from the FiveThirtyEight
# GitHub repository (requires network access).
dataset = pd.read_csv(
    "https://raw.githubusercontent.com/fivethirtyeight/uber-tlc-foil-response/master/uber-trip-data/uber-raw-data-apr14.csv"
)
In [4]:
Copied!
# Preview the first five rows to check the column layout
# (Date/Time, Lat, Lon, Base).
dataset.head(n=5)
Out[4]:
Date/Time | Lat | Lon | Base | |
---|---|---|---|---|
0 | 4/1/2014 0:11:00 | 40.7690 | -73.9549 | B02512 |
1 | 4/1/2014 0:17:00 | 40.7267 | -74.0345 | B02512 |
2 | 4/1/2014 0:21:00 | 40.7316 | -73.9873 | B02512 |
3 | 4/1/2014 0:28:00 | 40.7588 | -73.9776 | B02512 |
4 | 4/1/2014 0:33:00 | 40.7594 | -73.9722 | B02512 |
In [5]:
Copied!
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns; at this point those are Lat and Lon.
dataset.describe()
dataset.describe()
Out[5]:
Lat | Lon | |
---|---|---|
count | 564516.000000 | 564516.000000 |
mean | 40.740005 | -73.976817 |
std | 0.036083 | 0.050426 |
min | 40.072900 | -74.773300 |
25% | 40.722500 | -73.997700 |
50% | 40.742500 | -73.984800 |
75% | 40.760700 | -73.970000 |
max | 42.116600 | -72.066600 |
In [6]:
Copied!
# Feature matrix for clustering: only the pickup coordinates.
# Selecting by label picks the same two columns as positions 1 and 2.
X = dataset[["Lat", "Lon"]]
In [7]:
Copied!
# Sanity-check that X holds only the Lat / Lon columns.
X.head(n=5)
Out[7]:
Lat | Lon | |
---|---|---|
0 | 40.7690 | -73.9549 |
1 | 40.7267 | -74.0345 |
2 | 40.7316 | -73.9873 |
3 | 40.7588 | -73.9776 |
4 | 40.7594 | -73.9722 |
In [8]:
Copied!
#fit kmeans to the dataset
#fit kmeans to the dataset
Out[8]:
MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++', init_size=None, max_iter=100, max_no_improvement=10, n_clusters=4, n_init=3, random_state=None, reassignment_ratio=0.01, tol=0.0, verbose=0)
In [9]:
Copied!
#print the cluster centers
#print the cluster centers
Out[9]:
array([[ 40.76363743, -73.97023338], [ 40.695024 , -73.76179 ], [ 40.68196117, -73.95966893], [ 40.72687565, -74.00319827]])
In [10]:
Copied!
# Summary statistics of the coordinates used for clustering, handy for
# comparing the fitted cluster centers against the overall Lat / Lon range.
X.describe()
X.describe()
Out[10]:
Lat | Lon | |
---|---|---|
count | 564516.000000 | 564516.000000 |
mean | 40.740005 | -73.976817 |
std | 0.036083 | 0.050426 |
min | 40.072900 | -74.773300 |
25% | 40.722500 | -73.997700 |
50% | 40.742500 | -73.984800 |
75% | 40.760700 | -73.970000 |
max | 42.116600 | -72.066600 |
In [11]:
Copied!
from bokeh.io import output_file, show
from bokeh.models import ColumnDataSource, GMapOptions
from bokeh.plotting import gmap
%matplotlib inline
#use Bokeh GMap to plot the clusters
from bokeh.io import output_file, show
from bokeh.models import ColumnDataSource, GMapOptions
from bokeh.plotting import gmap
%matplotlib inline
#use Bokeh GMap to plot the clusters
Out[11]:
GlyphRenderer(
id = '1056', …)
In [12]:
Copied!
# Display the Bokeh figure `p`.
# NOTE(review): `p` is not defined in any visible cell — presumably it is the
# gmap figure built in the previous (elided) plotting cell; confirm.
show(p)
show(p)
In [ ]:
Copied!
#export it in png
#export it in png
Out[ ]:
'/Users/benjamindallard/Documents/Freelance/JEDHA/code/gitProd/jedha/S13/map.png'
In [14]:
Copied!
# Parse the pickup timestamps (e.g. "4/1/2014 0:11:00") into datetime64 values.
# Assign by column label so the column is replaced outright: on recent pandas
# versions `dataset.iloc[:, 0] = ...` writes into the existing object-dtype
# column, the dtype stays `object`, and the `.dt` accessor used afterwards fails.
# The explicit format also avoids per-row format inference on ~565k rows.
dataset["Date/Time"] = pd.to_datetime(
    dataset["Date/Time"], format="%m/%d/%Y %H:%M:%S"
)
In [15]:
Copied!
# Day of week of each pickup as an integer (pandas convention: Monday=0 ... Sunday=6).
# `pd.to_datetime` is a no-op on an already-converted column and guarantees a
# datetime64 Series, so `.dt` works even if the column is still object-dtype.
dataset["weekday"] = pd.to_datetime(dataset["Date/Time"]).dt.dayofweek
In [16]:
Copied!
# Re-run the summary now that the integer `weekday` column exists; its
# min / max confirm the values span 0 to 6.
dataset.describe()
dataset.describe()
Out[16]:
Lat | Lon | weekday | |
---|---|---|---|
count | 564516.000000 | 564516.000000 | 564516.00000 |
mean | 40.740005 | -73.976817 | 2.86698 |
std | 0.036083 | 0.050426 | 1.82081 |
min | 40.072900 | -74.773300 | 0.00000 |
25% | 40.722500 | -73.997700 | 1.00000 |
50% | 40.742500 | -73.984800 | 3.00000 |
75% | 40.760700 | -73.970000 | 4.00000 |
max | 42.116600 | -72.066600 | 6.00000 |
In [17]:
Copied!
# Restrict the feature matrix to pickups whose weekday code is 1
# (Tuesday under pandas' Monday=0 convention), keeping only the coordinates.
X = dataset.loc[dataset["weekday"] == 1, ["Lat", "Lon"]]
In [18]:
Copied!
# Peek at the filtered coordinates.
X.head(n=5)
Out[18]:
Lat | Lon | |
---|---|---|
0 | 40.7690 | -73.9549 |
1 | 40.7267 | -74.0345 |
2 | 40.7316 | -73.9873 |
3 | 40.7588 | -73.9776 |
4 | 40.7594 | -73.9722 |
Time analysis
Do the same thing as in the previous notebook, but now for the weekend. Do you notice any difference?
In [29]:
Copied!
#initialize dbscan with eps=0.01 and min_samples=6
#initialize dbscan with eps=0.01 and min_samples=6
Out[29]:
DBSCAN(algorithm='auto', eps=0.01, leaf_size=30, metric='manhattan', metric_params=None, min_samples=6, n_jobs=None, p=None)
In [30]:
Copied!
#print the cluster labels
#print the cluster labels
Out[30]:
(array([-1, 0, 1, 2, 3, 4, 5]), array([ 79, 853, 23, 11, 13, 12, 9]))
In [24]:
Copied!
from bokeh.io import output_file, show
from bokeh.models import ColumnDataSource, GMapOptions
from bokeh.plotting import gmap
#plot the bokeh map
from bokeh.io import output_file, show
from bokeh.models import ColumnDataSource, GMapOptions
from bokeh.plotting import gmap
#plot the bokeh map
Out[24]:
GlyphRenderer(
id = '1557', …)
In [28]:
Copied!
# Display the Bokeh figure `p`.
# NOTE(review): `p` is not defined in any visible cell — presumably it is the
# DBSCAN cluster map built in the preceding (elided) plotting cell; confirm.
show(p)
show(p)