In this project, I'll be exploring and clustering neighborhoods in Toronto with the help of the Foursquare API. This small project is tackled in several steps: scraping the postal-code table from Wikipedia, cleaning the data, appending geospatial coordinates, exploring venues through the Foursquare API, and clustering the neighborhoods with k-means.
from bs4 import BeautifulSoup
import numpy as np # library to handle data in a vectorized manner
import re # regex library
import pandas as pd # library for data analysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json # library to handle JSON files
import folium # map rendering library
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
print('Libraries imported.')
webpage_response = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
webpage = webpage_response.content
soup = BeautifulSoup(webpage, "html.parser") # Instantiate BeautifulSoup object
# print(soup.tbody) # Uncomment to analyze which element we're going to scrape
We can see that most of the content in the webpage lives inside the children of the <tbody> tag. So, let's do some more exploratory analysis before actually scraping the contents.
Table headers, such as the labels Postal Code and Borough, are stored in <th> tags.
The contents of the table, by contrast, are stored in <td> tags, which are children of <tr> tags.
We can collect this information in a list, which we'll convert into a pandas dataframe later.
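As a minimal exploratory sketch of the structure just described (assuming the page layout hasn't changed), we can print the tag names inside the first few rows to confirm where the headers and data cells live:
# Peek at the first three table rows; expect <th> cells in the header row
# and <td> cells in the data rows
for row in soup.find('table').find_all('tr')[:3]:
    print([cell.name for cell in row.find_all(['th', 'td'])])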
# Extract and clean the header labels
headers = [th.get_text(strip=True) for th in soup.find_all("th")][:3] # Keep only the first three headers (those of the main table)
# Display end results
print(headers)
# Convert all entries into list
table = soup.find('table')
table_rows = table.find_all('tr')
entries = list()
for tr in table_rows:
    td = tr.find_all('td')
    entries.append([cell.text.strip() for cell in td]) # Strip whitespace from each cell's text
# Convert the list contents into a dataframe, with column name as the headers list
df = pd.DataFrame(entries, columns=headers)
df.head()
We now process the DataFrame so that it meets the desired conditions: no NaN rows, and every remaining row has an assigned borough.
## Drop NaN values
df.dropna(axis=0, inplace=True)
## Keep only the rows that have an assigned borough
to_drop = df.index[df['Borough'] == 'Not assigned'].tolist() # Get the index of every row without an assigned borough
df.drop(to_drop, inplace=True) # Drop values based on index
## Display processed dataframe
print(df.shape)
df.reset_index(drop=True, inplace=True)
df.head()
We're going to import the pre-processed latitude/longitude values from a CSV file (https://cocl.us/Geospatial_data). Then, we're going to merge the imported values into our existing dataframe.
longlat = pd.read_csv("https://cocl.us/Geospatial_data")
longlat.head()
merged_df = df.merge(longlat, how='inner', on='Postal Code') # Similar to a SQL inner join
merged_df.head()
from sklearn.cluster import KMeans # Import clustering library
A quick Google search revealed that Toronto lies at latitude 43.653908, longitude -79.384293. We assign both values in a single line (tuple unpacking) before rendering the map with folium.
latitude, longitude = 43.653908, -79.384293
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)
# add markers to map
for lat, lng, borough, neighborhood in zip(merged_df['Latitude'], merged_df['Longitude'], merged_df['Borough'], merged_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)
map_toronto
Note that I've redacted the credentials before sharing this notebook. I hope this is understandable :)
CLIENT_ID = 'REDACTED' # your Foursquare ID
CLIENT_SECRET = 'REDACTED' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET: ' + CLIENT_SECRET)
We're going to use these functions to explore neighborhoods in Toronto.
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except KeyError:
        categories_list = row['venue.categories']
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']
# function that returns venues in an area, given latitude and longitude
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # print(name) # uncomment to trace progress; kept commented to keep the output tidy
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            100) # Get the results of the top 100 venues
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        # keep only the relevant information for each nearby venue
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])
    # flatten the per-neighborhood lists into one dataframe
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood',
                             'Neighborhood Latitude',
                             'Neighborhood Longitude',
                             'Venue',
                             'Venue Latitude',
                             'Venue Longitude',
                             'Venue Category']
    return nearby_venues
toronto_venues = getNearbyVenues(names=merged_df['Neighborhood'],
                                 latitudes=merged_df['Latitude'],
                                 longitudes=merged_df['Longitude'])
print(toronto_venues.shape)
The dataframe contains each venue's properties along with its corresponding neighborhood.
toronto_venues.head()
We can find out how many unique categories occur among all the returned venues.
print('There are {} unique categories.'.format(len(toronto_venues['Venue Category'].unique())))
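As a quick complementary peek (my own addition, using plain pandas), we can also list the most frequent categories:
# Show the ten most common venue categories across all neighborhoods
print(toronto_venues['Venue Category'].value_counts().head(10))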
Next, we one-hot encode the venue categories. This way, we can analyze each neighborhood without having to scroll through all entries in the toronto_venues dataframe.
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood']
# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]
print(toronto_onehot.shape)
toronto_onehot.head()
We can then group rows by neighborhood name, taking the mean of the frequency of occurrence of each category.
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()
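Before writing the helper below, here is a minimal sanity-check sketch (my own addition): inspect the top categories for whichever neighborhood happens to come first; the choice of row is arbitrary.
# Top five category frequencies for the first grouped neighborhood
first_row = toronto_grouped.iloc[0]
print(first_row['Neighborhood'])
print(first_row.drop('Neighborhood').astype(float).sort_values(ascending=False).head()) # cast to float since iloc returns an object Series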
To display the most common venues in descending order, we need to write a helper function. From there, we can create a new dataframe and display the top 10 venues for each neighborhood.
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:num_top_venues]
num_top_venues = 10
indicators = ['st', 'nd', 'rd']
# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except IndexError:
        columns.append('{}th Most Common Venue'.format(ind+1))
# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
neighborhoods_venues_sorted.head()
We use k-means clustering with 5 clusters.
# set number of clusters
kclusters = 5
toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', axis=1) # keep only the numeric category frequencies
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)
# check unique values of cluster labels, to see whether we got it right
print(set(kmeans.labels_))
# check cluster labels generated for the first ten rows in the dataframe
print(kmeans.labels_[0:10] )
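The choice of 5 clusters is somewhat arbitrary. As an aside (my own addition, not part of the original flow), a minimal elbow-method sketch over the same feature matrix could help justify it; the candidate range of k values below is an assumption:
import matplotlib.pyplot as plt # not imported above; needed only for this sketch

inertias = []
k_candidates = range(1, 11) # arbitrary candidate range
for k in k_candidates:
    km = KMeans(n_clusters=k, random_state=0).fit(toronto_grouped_clustering)
    inertias.append(km.inertia_) # within-cluster sum of squares

plt.plot(k_candidates, inertias, marker='o')
plt.xlabel('Number of clusters k')
plt.ylabel('Inertia')
plt.title('Elbow method for choosing k')
plt.show()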
We can create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
toronto_merged = merged_df.copy() # work on a copy so merged_df stays intact
# join neighborhoods_venues_sorted onto merged_df to add the cluster label and top venues for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
# Fill any missing values with cluster 0
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].fillna(0)
toronto_merged.head() # check the "Cluster Labels" column to see changes
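As a small sanity check (my own addition), we can see how the neighborhoods distribute across the cluster labels:
# Number of neighborhoods assigned to each cluster label
print(toronto_merged['Cluster Labels'].value_counts())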
Finally, let's visualize the resulting clusters.
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)
# set color scheme for the clusters
colors_array = cm.rainbow(np.linspace(0, 1, kclusters)) # one evenly spaced color per cluster
rainbow = [colors.rgb2hex(c) for c in colors_array]
# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)], # index directly by cluster label (0..kclusters-1)
        fill=True,
        fill_color=rainbow[int(cluster)],
        fill_opacity=0.7).add_to(map_clusters)
map_clusters
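Finally, as a hedged sketch of how one might examine a single cluster (the cluster label and the columns shown are arbitrary choices of mine):
# Inspect the neighborhoods in one cluster along with their top venues
cluster_of_interest = 0 # arbitrary; try each label from 0 to kclusters-1
toronto_merged.loc[
    toronto_merged['Cluster Labels'] == cluster_of_interest,
    ['Neighborhood', '1st Most Common Venue', '2nd Most Common Venue', '3rd Most Common Venue']
].head()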