import random
import numpy as np
import pandas as pd
import geopandas as gpd
import seaborn as sns
import matplotlib.pyplot as plt
import physt
import folium
from folium.plugins import HeatMap, HeatMapWithTime
from scipy.spatial import ConvexHull, Voronoi, voronoi_plot_2d
from shapely.geometry import MultiPoint, Point, Polygon
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow import keras

df = pd.read_csv('https://query.data.world/s/oocsxb6dwkdr37iotjt4r6tzeljcgv')
df
df['time_of_trip'] = pd.to_datetime(df['time_of_trip'])
df['time_of_trip'] = df['time_of_trip'].dt.tz_localize('UTC').dt.tz_convert('America/New_York')
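# NOTE: the step above assumes the raw timestamps are recorded in UTC; they
# are then converted to the local America/New_York timezone.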
df = df.drop(columns=['Unnamed: 3'])
df.head()
df = df.dropna()
df['week_day'] = df['time_of_trip'].dt.weekday
df['day_hour'] = df['time_of_trip'].dt.hour
df['year_day'] = df['time_of_trip'].dt.dayofyear
df['year'] = df['time_of_trip'].dt.year
df['start_coord'] = list(map(list, zip(df['start_lat'], df['start_lng'])))
df.head()
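# Exploratory counts: how the trips distribute over the days of the week and
# the hours of the day.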
sns.catplot(x="week_day", y="day_hour",color='lightblue', kind="bar", data=df.groupby(by="week_day").count().reset_index())
plt.ylabel("Count")
sns.catplot(x="day_hour", y="week_day",color='lightblue', kind="bar", data=df.groupby(by="day_hour").count().reset_index())
plt.ylabel("Count");
hist = physt.h2(df["start_lat"], df["start_lng"], "fixed_width", bin_width=1 / 120)
m = hist.plot()
m
startinglocation = df['start_coord'].iloc[0]  # .iloc: label 0 may have been dropped by dropna()
m = folium.Map(location=startinglocation)
HeatMap(df['start_coord'].tolist(), overlay=False, radius=5).add_to(m)
m
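# Animated heatmaps: first one frame per hour of the day, then one frame per
# day of the year.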
df_grouped_day = df.groupby(by='day_hour')
data_day = [list(group['start_coord'].values) for _,group in df_grouped_day]
startinglocation = df['start_coord'].iloc[0]
m = folium.Map(location=startinglocation)
HeatMapWithTime(data_day,overlay=False, radius=5).add_to(m)
m
df_grouped_year = df.groupby(by='year_day')
data_year = [list(group['start_coord'].values) for _,group in df_grouped_year]
startinglocation = df['start_coord'].iloc[0]
m = folium.Map(location=startinglocation)
HeatMapWithTime(data_year,overlay=False, radius=5).add_to(m)
m
# Algorithm: a capacity-constrained variant of k-means. Points claim their
# nearest center in order of distance, but each cluster accepts at most
# len(x)/n_clusters points; overflow spills to the next-nearest center.
# With exact=False, a final pass reassigns every point to its truly nearest
# center, trading exact balance for more natural boundaries.
def customCluster(x, y, n_clusters=4, n_iter=10, exact=False):
    # helper methods
    def progressBar(current, total, barLength=20):
        percent = float(current) * 100 / total
        arrow = '-' * int(percent / 100 * barLength - 1) + '>'
        spaces = ' ' * (barLength - len(arrow))
        print('Progress: [%s%s] %d %%' % (arrow, spaces, percent), end='\r')
    def dist(pointA, pointB):
        return np.sqrt((pointA[0] - pointB[0])**2 + (pointA[1] - pointB[1])**2)
    def calc_dists(x, y, centers):
        # for each observation: its distances to all centers, sorted ascending,
        # plus the center indices in the same (nearest-first) order
        res = {}
        closestcenters = []
        for i, point in enumerate(zip(x, y)):
            d = [dist(point, c) for c in centers]
            closestcenters.append(list(np.argsort(d)))
            res[i] = list(np.sort(d))
        return res, closestcenters
    def calcnewcenters(assigned_centers, x, y):
        # recompute each center as the mean of its assigned points
        res = []
        for points in assigned_centers:
            res.append((np.mean(x[points]), np.mean(y[points])))
        random.shuffle(res)
        return res
    def randomc(low, high):
        return random.random() * (high - low) + low
    x_range = (np.min(x), np.max(x))
    y_range = (np.min(y), np.max(y))
    meandists = []
    centers_init = [(randomc(x_range[0], x_range[1]), randomc(y_range[0], y_range[1])) for _ in range(n_clusters)]
    max_size = len(x) / n_clusters  # capacity cap per cluster
    centers = centers_init
    for i in range(n_iter):
        # shuffle observations
        shuffledindex = np.random.choice(range(len(x)), size=len(x), replace=False)
        x, y = x[shuffledindex], y[shuffledindex]
        # reset assigned centers
        assigned_centers = [[] for _ in range(n_clusters)]
        # calculate distances
        dists, closest_centers = calc_dists(x, y, centers)
        meandist = np.mean([v[0] for v in dists.values()])
        meandists.append(meandist)
        dists_sorted = sorted(dists.items(), key=lambda item: item[1])
        # assign to clusters: observations closest to a center claim it first;
        # once a center is full, spill over to the next-nearest center
        while len(dists_sorted) > 0:
            obs = dists_sorted.pop(0)
            closest_c = closest_centers[obs[0]].pop(0)
            while len(assigned_centers[closest_c]) >= max_size:
                closest_c = closest_centers[obs[0]].pop(0)
            assigned_centers[closest_c].append(obs[0])
        centers = calcnewcenters(assigned_centers, x, y)
        progressBar(i, n_iter - 1)
    if not exact:
        # relax the size constraint: reassign every point to its nearest final center
        adjusted_assigned_centers = [[] for _ in range(n_clusters)]
        dists, closest_centers = calc_dists(x, y, centers)
        for i, v in enumerate(closest_centers):
            adjusted_assigned_centers[v[0]].append(i)
        return [x, y, adjusted_assigned_centers, dists, closest_centers, centers, meandist, meandists]
    return [x, y, assigned_centers, dists, closest_centers, centers, meandist, meandists]
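# Minimal sanity check (sketch on hypothetical toy data): with two clusters
# over four points, the capacity cap of len(x)/n_clusters = 2 forces a 2/2
# split even though three points huddle near one corner.
toy_x = np.array([0.0, 0.1, 0.2, 5.0])
toy_y = np.array([0.0, 0.1, 0.2, 5.0])
toy = customCluster(toy_x, toy_y, n_clusters=2, n_iter=5, exact=True)
print([len(c) for c in toy[2]])  # expected cluster sizes: [2, 2]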
x = np.random.uniform(0,50,1000)
y = np.random.uniform(0,50,1000)
res = customCluster(x,y,n_iter=10, exact=False)
centers = res[5]
assigned_centers = res[2]
x = res[0]
y = res[1]
plt.figure(figsize=(10,10))
for c in assigned_centers:
    plt.scatter(x=x[c], y=y[c], s=30, alpha=0.8)
plt.scatter(x=np.array(centers).T[0], y=np.array(centers).T[1], color="black", s=60)
print("\n")
print(res[-2])
print([len(_) for _ in assigned_centers])
print(np.var([len(_) for _ in assigned_centers]))
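# Apply the clustering to a 50,000-trip sample of real start locations,
# partitioning the service area into 35 roughly equal-sized regions.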
ind=np.random.choice(range(len(df['start_lat'])),size=50000, replace=False)
x=df['start_lat'][ind].values
y=df['start_lng'][ind].values
res = customCluster(x=x,y=y,n_iter=10,n_clusters=35, exact=False)
centers = res[5]
assigned_centers = res[2]
x = res[1]  # longitude on the horizontal axis
y = res[0]  # latitude on the vertical axis
plt.figure(figsize=(10,10))
for c in assigned_centers:
    plt.scatter(x=x[c], y=y[c], s=30, alpha=0.01)
plt.scatter(x=np.array(centers).T[1], y=np.array(centers).T[0], color="black", s=60)
plt.figure(figsize=(10,10))
for i in range(len(assigned_centers)):
    # close each cluster's convex hull and draw its outline
    points = np.vstack([x[assigned_centers[i]], y[assigned_centers[i]]]).T
    hull = ConvexHull(points)
    x_traj = np.append(points[hull.vertices, 0], points[hull.vertices[0], 0])
    y_traj = np.append(points[hull.vertices, 1], points[hull.vertices[0], 1])
    plt.plot(x_traj, y_traj, '-', lw=1)
for c in assigned_centers:
    plt.scatter(x=x[c], y=y[c], s=1, alpha=0.1)
plt.scatter(x=np.array(centers).T[1], y=np.array(centers).T[0], color="black", s=20)
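# Convex hulls only cover the sampled points and leave gaps between clusters;
# Voronoi cells around the final centers partition the whole plane instead.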
epsilon = 0.05
min_lat = df['start_lat'].min() - epsilon
max_lat = df['start_lat'].max() + epsilon
min_lng = df['start_lng'].min() - epsilon
max_lng = df['start_lng'].max() + epsilon
centers = np.array(centers)
centers[:, [0, 1]] = centers[:, [1, 0]]  # reorder each center to (lng, lat) to match the plot axes
vor = Voronoi(centers)
fig1, ax1 = plt.subplots(figsize=(10,10))
voronoi_plot_2d(vor, ax=ax1, show_points=False)
for c in assigned_centers:
    ax1.scatter(x=x[c], y=y[c], s=1, alpha=1)
plt.scatter(x=np.array(centers).T[0], y=np.array(centers).T[1], color="black", s=20)
plt.show()
bounding_box = [[min_lng,min_lat],[min_lng,max_lat],[max_lng,max_lat],[max_lng,min_lat]]
bounding_box_polygon = Polygon(bounding_box)
plt.scatter(*np.array(centers).T)
plt.plot(*bounding_box_polygon.exterior.xy)
# Mirror the centers across all four sides of the bounding box so that every
# original center ends up with a bounded (finite) Voronoi cell inside the box
bb = [min_lng, max_lng, min_lat, max_lat]
points_center = np.array(centers)
points_left = np.copy(points_center)
points_left[:, 0] = bb[0] - (points_left[:, 0] - bb[0])
points_right = np.copy(points_center)
points_right[:, 0] = bb[1] + (bb[1] - points_right[:, 0])
points_down = np.copy(points_center)
points_down[:, 1] = bb[2] - (points_down[:, 1] - bb[2])
points_up = np.copy(points_center)
points_up[:, 1] = bb[3] + (bb[3] - points_up[:, 1])
all_adj_centers = np.vstack([points_center,points_left,points_up,points_right,points_down])
plt.scatter(*all_adj_centers.T)
plt.plot(*bounding_box_polygon.exterior.xy)
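# Each original center now has a reflected twin beyond every box edge, so its
# Voronoi cell cannot extend past that edge: the cells of interest are finite.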
vor_mirrored = Voronoi(all_adj_centers)
fig1, ax1 = plt.subplots(figsize=(10,10))
voronoi_plot_2d(vor_mirrored, ax=ax1, show_points=False)
plt.scatter(x=all_adj_centers.T[0], y=all_adj_centers.T[1], color="black", s=20)
plt.show()
polygons = {}
for point_index, region_index in enumerate(vor_mirrored.point_region):
    vertex_indices = vor_mirrored.regions[region_index]
    if -1 in vertex_indices:  # -1 marks a vertex at infinity; skip unbounded regions
        continue
    points = [list(vor_mirrored.vertices[v]) for v in vertex_indices]
    polygon = Polygon(points)
    if bounding_box_polygon.contains(polygon):
        polygons[point_index] = polygon
fig1, ax1 = plt.subplots(figsize=(10,10))
for k, polygon in polygons.items():
    plt.fill(*polygon.exterior.xy, label=k)
    plt.text(centers[k][0], centers[k][1], k, fontsize=10)
plt.legend()
allpoints = df[['start_lng','start_lat']].values
points_exterior = MultiPoint(allpoints).convex_hull.buffer(0.05)
points_exterior
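# Clip each Voronoi cell to a slightly buffered convex hull of all start
# points, so the regions trace the actual service area rather than the
# rectangular bounding box.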
polygons = [p.intersection(points_exterior) for p in polygons.values()]
fig1, ax1 = plt.subplots(figsize=(10,10))
for k, polygon in enumerate(polygons):  # enumerate: the old dict keys no longer apply after clipping
    plt.fill(*polygon.exterior.xy, label=k)
geo_df = gpd.GeoDataFrame(
    df[['time_of_trip', 'week_day', 'day_hour', 'year_day']],
    geometry=df['start_coord'].apply(lambda c: Point(c[1], c[0]))  # Point takes (lng, lat)
).set_crs(epsg=4326)
res = []
for region in polygons:
    res.append(np.array(geo_df['geometry'].within(region)))
res_arr = np.vstack(res)
res_arr.shape
res_arr.T.shape
# assumes every point falls inside exactly one region; compare regions[1].shape
# with the number of points to verify
regions = np.where(res_arr.T)
regions[1].shape
sns.histplot(regions[1])
plt.xlabel("Region");
plt.ylabel("Number of points");
geo_df['region'] = regions[1]
allpoints = np.array([(p.x, p.y) for p in geo_df['geometry']])
plt.figure(figsize=(10,10))
plt.scatter(*allpoints.T, alpha=1, s=1, c=geo_df['region'])
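# Aggregate to daily trip counts per region; this table becomes the supervised
# dataset for the day-ahead models.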
geo_df_grouped_regionday = geo_df.groupby(by=['region','year_day','week_day']).count()
geo_df_grouped_regionday = geo_df_grouped_regionday.reset_index()
geo_df_grouped_regionday = geo_df_grouped_regionday[['region','week_day','year_day','geometry']]
geo_df_grouped_regionday.columns = ['region','week_day','year_day','count']
geo_df_grouped_regionday
fig = plt.figure(figsize=(10,5))
for region, group in geo_df_grouped_regionday.groupby(by='region'):
    plt.plot(group['year_day'], group['count'], label=region, alpha=0.5)
xposition = range(group['year_day'].min(), group['year_day'].max(), 7)
for xc in xposition:
    plt.axvline(x=xc, color='k', linestyle='--', linewidth=1, alpha=0.5)  # weekly gridlines
n_lags = 5
for region in geo_df_grouped_regionday['region'].unique():
    subdf = geo_df_grouped_regionday[geo_df_grouped_regionday['region'] == region]
    for l in range(1, n_lags + 1):
        geo_df_grouped_regionday.loc[subdf.index, f"lag{l}"] = subdf['count'].shift(l)
geo_df_grouped_regionday = geo_df_grouped_regionday.dropna()
geo_df_grouped_regionday
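# Persistence baseline: predict today's count with yesterday's (lag1). The MAE
# and R2 below are the bar any model has to clear.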
np.mean(np.abs(geo_df_grouped_regionday['count'].values - geo_df_grouped_regionday['lag1'].values ))
r2_score(geo_df_grouped_regionday['count'].values, geo_df_grouped_regionday['lag1'].values)
X_train, X_test, y_train, y_test = train_test_split(geo_df_grouped_regionday, geo_df_grouped_regionday['count'], random_state=1)
feature_cols = X_train.columns.difference(['count'])
X_train_tree, X_test_tree = X_train[feature_cols].values, X_test[feature_cols].values
r2_train = []
r2_test = []
max_depths = range(1, 20)
for max_depth in max_depths:
    treeregr = tree.DecisionTreeRegressor(max_depth=max_depth).fit(X_train_tree, y_train)
    r2_train.append(treeregr.score(X_train_tree, y_train))
    r2_test.append(treeregr.score(X_test_tree, y_test))
plt.figure(figsize=(10,5))
plt.plot(r2_train);
plt.plot(r2_test);
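# Train R2 keeps rising with depth while test R2 levels off; a depth around 7
# is picked for the refit below.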
treeregr = tree.DecisionTreeRegressor(max_depth=7).fit(X_train_tree, y_train)
treeregr.score(X_train_tree, y_train)
treeregr.score(X_test_tree, y_test)
tree_pred = treeregr.predict(X_test_tree)
tree_meanabserror = np.mean(np.abs(tree_pred - y_test))
tree_rootmeansqerror = np.sqrt(np.mean(np.square(tree_pred - y_test)))
print("Tree performance:")
print(f"Mean abs error: {tree_meanabserror}")
print(f"Root Mean squared error: {tree_rootmeansqerror}")
rfregr = RandomForestRegressor(max_depth=7, n_estimators=100,bootstrap=True, random_state=0).fit(X_train_tree, y_train)
rfregr.score(X_test_tree, y_test)
rf_pred = rfregr.predict(X_test_tree)
rf_meanabserror = np.mean(np.abs(rf_pred - y_test))
rf_rootmeansqerror = np.sqrt(np.mean(np.square(rf_pred - y_test)))
print("Random Forest performance:")
print(f"Mean abs error: {rf_meanabserror}")
print(f"Root Mean squared error: {rf_rootmeansqerror}")
region_input_train, region_input_test = X_train['region'], X_test['region']
week_day_input_train, week_day_input_test = X_train['week_day'], X_test['week_day']
num_cols = X_train.columns.difference(['region', 'year_day', 'week_day', 'count'])
num_input_train, num_input_test = X_train[num_cols].values, X_test[num_cols].values
scaler = MinMaxScaler(feature_range=(-1, 1)).fit(num_input_train)
num_input_train, num_input_test = scaler.transform(num_input_train), scaler.transform(num_input_test)
num_input_train.shape
hidden_units = [45,20,5]
region_embedding_size = 10
region_embedding_max = len(geo_df_grouped_regionday['region'].unique()) + 1  # +1 leaves headroom for the highest id
day_embedding_size = 10
day_embedding_max = 7
def base_model():
    region_input = keras.Input(shape=(1,), name='region_id')
    week_day_input = keras.Input(shape=(1,), name='week_day')
    num_input = keras.Input(shape=(n_lags,), name='num_data')  # numerical lag features
    # embed each categorical input, flatten, and concatenate with the numericals
    region_embedded = keras.layers.Embedding(region_embedding_max, region_embedding_size,
                                             input_length=1, name='region_embedding')(region_input)
    day_embedded = keras.layers.Embedding(day_embedding_max, day_embedding_size,
                                          input_length=1, name='day_embedding')(week_day_input)
    region_flattened = keras.layers.Flatten()(region_embedded)
    day_flattened = keras.layers.Flatten()(day_embedded)
    out = keras.layers.Concatenate()([region_flattened, day_flattened, num_input])
    for n_hidden in hidden_units:
        out = keras.layers.Dense(n_hidden, activation='swish')(out)
    # a single output: the predicted daily count
    out = keras.layers.Dense(1, activation='linear', name='prediction')(out)
    model = keras.Model(
        inputs=[region_input, week_day_input, num_input],
        outputs=out
    )
    return model
model = base_model()
model.compile(
tf.keras.optimizers.Adam(learning_rate=0.01),
loss="mse",
metrics=[tf.keras.metrics.RootMeanSquaredError(),tf.keras.metrics.MeanAbsoluteError()]
)
model.summary()
train = [region_input_train, week_day_input_train, num_input_train]
model.fit(train,
          y_train,
          batch_size=1000,
          epochs=200,
          validation_data=([region_input_test, week_day_input_test, num_input_test], y_test),
          validation_batch_size=500,
          verbose=1)
plt.plot(model.history.history['loss'])
plt.plot(model.history.history['val_loss'])
model.evaluate(train, y_train)
evaluate = [region_input_test, week_day_input_test, num_input_test]
pred = model.predict(evaluate)
pd.DataFrame({'real': y_test, 'pred':pred.T[0]})[10:20]
enn_pred = model.predict(evaluate).T[0]
enn_meanabserror = np.mean(np.abs(enn_pred - y_test))
enn_rootmeansqerror = np.sqrt(np.mean(np.square(enn_pred - y_test)))
enn_r2 = r2_score(y_test, enn_pred)
print("ENN performance:")
print(f"Mean abs error: {enn_meanabserror}")
print(f"Root Mean squared error: {enn_rootmeansqerror}")
print(f"R2: {enn_r2}")
geo_df_grouped_regionhour = geo_df.groupby(by=['region','year_day','week_day','day_hour']).count()
geo_df_grouped_regionhour = geo_df_grouped_regionhour.reset_index()
geo_df_grouped_regionhour = geo_df_grouped_regionhour[['region','week_day','year_day','day_hour','geometry']]
geo_df_grouped_regionhour.columns = ['region','week_day','year_day','day_hour','count']
geo_df_grouped_regionhour
def yearday_weekday(yearday):
    # dataset-specific mapping from day-of-year to the week_day encoding used above
    return (yearday % 206 + 4) % 7
hours = np.arange(0, 24)
# densify: add explicit zero-count rows for every (region, day, hour) with no
# trips, so the hourly lags below line up; rows are collected and concatenated
# once, since DataFrame.append was removed in pandas 2.0
missing_rows = []
for region in geo_df_grouped_regionhour['region'].unique():
    for year_day in geo_df_grouped_regionhour['year_day'].unique():
        for hour in hours:
            df_filter = ((geo_df_grouped_regionhour['region'] == region)
                         & (geo_df_grouped_regionhour['year_day'] == year_day)
                         & (geo_df_grouped_regionhour['day_hour'] == hour))
            if not df_filter.any():
                week_day = yearday_weekday(year_day)
                missing_rows.append({'region': region, 'week_day': week_day,
                                     'year_day': year_day, 'day_hour': hour, 'count': 0})
geo_df_grouped_regionhour = pd.concat([geo_df_grouped_regionhour, pd.DataFrame(missing_rows)],
                                      ignore_index=True)
geo_df_grouped_regionhour = geo_df_grouped_regionhour.sort_values(by=['region', 'year_day', 'day_hour'])
geo_df_grouped_regionhour[(geo_df_grouped_regionhour['region'] == 6) & (geo_df_grouped_regionhour['year_day'] == 220)]
n_lags = 5
for region in geo_df_grouped_regionhour['region'].unique():
    subdf = geo_df_grouped_regionhour[geo_df_grouped_regionhour['region'] == region]
    for l in range(1, n_lags + 1):
        geo_df_grouped_regionhour.loc[subdf.index, f"lag{l}"] = subdf['count'].shift(l)
geo_df_grouped_regionhour = geo_df_grouped_regionhour.dropna()
geo_df_grouped_regionhour
np.mean(np.abs(geo_df_grouped_regionhour['count'].values - geo_df_grouped_regionhour['lag1'].values ))
r2_score(geo_df_grouped_regionhour['count'].values, geo_df_grouped_regionhour['lag1'].values)
X_train, X_test, y_train, y_test = train_test_split(geo_df_grouped_regionhour, geo_df_grouped_regionhour['count'], random_state=1)
feature_cols = X_train.columns.difference(['count'])
X_train_tree, X_test_tree = X_train[feature_cols].values, X_test[feature_cols].values
rfregr = RandomForestRegressor(max_depth=7, n_estimators=100,bootstrap=True, random_state=0).fit(X_train_tree, y_train)
rf_pred = rfregr.predict(X_test_tree)
rf_meanabserror = np.mean(np.abs(rf_pred - y_test))
rf_rootmeansqerror = np.sqrt(np.mean(np.square(rf_pred - y_test)))
rf_r2 = r2_score(y_test, rf_pred)
print("Random Forest performance:")
print(f"Mean abs error: {rf_meanabserror}")
print(f"Root Mean squared error: {rf_rootmeansqerror}")
print(f"R2: {rf_r2}")
region_input_train, region_input_test = X_train['region'], X_test['region']
week_day_input_train, week_day_input_test = X_train['week_day'], X_test['week_day']
day_hour_input_train, day_hour_input_test = X_train['day_hour'], X_test['day_hour']
num_cols = X_train.columns.difference(['region', 'year_day', 'week_day', 'day_hour', 'count'])
num_input_train, num_input_test = X_train[num_cols].values, X_test[num_cols].values
scaler = MinMaxScaler(feature_range=(-1, 1)).fit(num_input_train)
num_input_train, num_input_test = scaler.transform(num_input_train), scaler.transform(num_input_test)
hidden_units = [35,20,5]
region_embedding_size = 10
region_embedding_max = len(geo_df_grouped_regionhour['region'].unique()) + 1
day_embedding_size = 10
day_embedding_max = 7
hour_embedding_size = 10
hour_embedding_max = 24
num_features = num_input_train.shape[1]
def hourly_model():
    region_input = keras.Input(shape=(1,), name='region_id')
    week_day_input = keras.Input(shape=(1,), name='week_day')
    day_hour_input = keras.Input(shape=(1,), name='day_hour')
    num_input = keras.Input(shape=(num_features,), name='num_data')  # numerical lag features
    # embed each categorical input, flatten, and concatenate with the numericals
    region_embedded = keras.layers.Embedding(region_embedding_max, region_embedding_size,
                                             input_length=1, name='region_embedding')(region_input)
    day_embedded = keras.layers.Embedding(day_embedding_max, day_embedding_size,
                                          input_length=1, name='day_embedding')(week_day_input)
    day_hour_embedded = keras.layers.Embedding(hour_embedding_max, hour_embedding_size,
                                               input_length=1, name='hour_embedding')(day_hour_input)
    region_flattened = keras.layers.Flatten()(region_embedded)
    day_flattened = keras.layers.Flatten()(day_embedded)
    hour_flattened = keras.layers.Flatten()(day_hour_embedded)
    out = keras.layers.Concatenate()([region_flattened, day_flattened, hour_flattened, num_input])
    out = tf.keras.layers.BatchNormalization()(out)
    # one or more hidden layers
    for n_hidden in hidden_units:
        out = keras.layers.Dense(n_hidden, activation='swish')(out)
    # a single output: the predicted hourly count
    out = keras.layers.Dense(1, activation='linear', name='prediction')(out)
    model = keras.Model(
        inputs=[region_input, week_day_input, day_hour_input, num_input],
        outputs=out
    )
    return model
model = hourly_model()
model.compile(
tf.keras.optimizers.Adam(learning_rate=0.001),
loss="mse",
metrics=[tf.keras.metrics.RootMeanSquaredError(),tf.keras.metrics.MeanAbsoluteError()]
)
train = [region_input_train, week_day_input_train, day_hour_input_train, num_input_train]
model.fit(train,
          y_train,
          batch_size=3000,
          epochs=200,
          validation_data=([region_input_test, week_day_input_test, day_hour_input_test, num_input_test], y_test),
          validation_batch_size=500,
          verbose=1)
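# Quick single-row sanity check (a sketch using the objects already in scope):
# feed a length-1 slice to each input head to get one prediction.
sample = [region_input_test.iloc[:1], week_day_input_test.iloc[:1],
          day_hour_input_test.iloc[:1], num_input_test[:1]]
print(model.predict(sample))  # predicted trip count for the next hour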
plt.plot(model.history.history['loss'])
plt.plot(model.history.history['val_loss'])
evaluate = [region_input_test, week_day_input_test,day_hour_input_test, num_input_test]
enn_pred = model.predict(evaluate).T[0]
enn_meanabserror = np.mean(np.abs(enn_pred - y_test))
enn_rootmeansqerror = np.sqrt(np.mean(np.square(enn_pred - y_test)))
enn_r2 = r2_score(y_test, enn_pred)
print("Next hour prediction ENN performance:")
print(f"Mean abs error: {enn_meanabserror}")
print(f"Root Mean squared error: {enn_rootmeansqerror}")
print(f"R2: {enn_r2}")