Data Preparation

In [1]:
import pandas as pd
business_path = "/Users/donghe/Desktop/yelp_dataset/yelp_academic_dataset_business.json"
df_business = pd.read_json(business_path, lines=True)
df_business.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160585 entries, 0 to 160584
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   business_id   160585 non-null  object 
 1   name          160585 non-null  object 
 2   address       160585 non-null  object 
 3   city          160585 non-null  object 
 4   state         160585 non-null  object 
 5   postal_code   160585 non-null  object 
 6   latitude      160585 non-null  float64
 7   longitude     160585 non-null  float64
 8   stars         160585 non-null  float64
 9   review_count  160585 non-null  int64  
 10  is_open       160585 non-null  int64  
 11  attributes    145593 non-null  object 
 12  categories    160470 non-null  object 
 13  hours         133244 non-null  object 
dtypes: float64(3), int64(2), object(9)
memory usage: 17.2+ MB
In [2]:
df_business.describe()
df_business.isna().sum() / len(df_business)
Out[2]:
business_id     0.000000
name            0.000000
address         0.000000
city            0.000000
state           0.000000
postal_code     0.000000
latitude        0.000000
longitude       0.000000
stars           0.000000
review_count    0.000000
is_open         0.000000
attributes      0.093359
categories      0.000716
hours           0.170259
dtype: float64
In [3]:
# Formalizing the datatypes
df_business['business_id'] = df_business['business_id'].astype("str")
df_business['name'] = df_business['name'].astype("str")
df_business['address'] = df_business['address'].astype("str")
df_business['city'] = df_business['city'].astype("str")
df_business['state'] = df_business['state'].astype("str")
df_business['latitude'] = df_business['latitude'].astype('float64')
df_business['longitude'] = df_business['longitude'].astype('float64')
df_business['stars'] = df_business['stars'].astype('float64')
df_business['review_count'] = df_business['review_count'].astype('int64')
df_business['is_open'] = df_business['is_open'].astype('bool')
df_business['attributes'] = df_business['attributes'].astype("str")
df_business['categories'] = df_business['categories'].astype("str")
df_business['hours'] = df_business['hours'].astype("str")
In [4]:
df_business.info()
df_business.isna().sum() / len(df_business)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160585 entries, 0 to 160584
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   business_id   160585 non-null  object 
 1   name          160585 non-null  object 
 2   address       160585 non-null  object 
 3   city          160585 non-null  object 
 4   state         160585 non-null  object 
 5   postal_code   160585 non-null  object 
 6   latitude      160585 non-null  float64
 7   longitude     160585 non-null  float64
 8   stars         160585 non-null  float64
 9   review_count  160585 non-null  int64  
 10  is_open       160585 non-null  bool   
 11  attributes    160585 non-null  object 
 12  categories    160585 non-null  object 
 13  hours         160585 non-null  object 
dtypes: bool(1), float64(3), int64(1), object(9)
memory usage: 16.1+ MB
Out[4]:
business_id     0.0
name            0.0
address         0.0
city            0.0
state           0.0
postal_code     0.0
latitude        0.0
longitude       0.0
stars           0.0
review_count    0.0
is_open         0.0
attributes      0.0
categories      0.0
hours           0.0
dtype: float64

Exploratory Visual Analysis

Initial Questions

  1. How are rating stars distributed in Austin?
  2. How do rating stars correlate with different attributes of food businesses in Austin?
  3. How do combinations of different attributes impact rating stars of food businesses in Austin?
In [5]:
df_business = df_business.loc[df_business['city'] == "Austin"]
df_business.head(2)
Out[5]:
business_id name address city state postal_code latitude longitude stars review_count is_open attributes categories hours
8 N3_Gs3DnX4k9SgpwJxdEfw Lane Wells Jewelry Repair 7801 N Lamar Blvd, Ste A140 Austin TX 78752 30.346169 -97.711458 5.0 30 True {'RestaurantsPriceRange2': '1', 'ByAppointment... Shopping, Jewelry Repair, Appraisal Services, ... {'Monday': '12:15-17:0', 'Tuesday': '12:15-17:...
9 tXvdYGvlEceDljN8gt2_3Q Capital City Barber Shop 615 W Slaughter Ln, Ste 113 Austin TX 78748 30.172706 -97.799920 4.0 5 False {'BusinessAcceptsCreditCards': 'False', 'Resta... Barbers, Beauty & Spas {'Monday': '9:0-17:0', 'Tuesday': '9:0-19:0', ...
In [6]:
import altair as alt
alt.data_transformers.disable_max_rows()
Out[6]:
DataTransformerRegistry.enable('default')

Counts of open businesses and closed businesses

In [7]:
alt.Chart(df_business).mark_bar().encode(
    x=alt.X("is_open:N", title="Business Open?"),
    y=alt.Y("count()", title="Business Count"),
).properties(
    title="Counts of businesses: open or not"
)
Out[7]: