import pandas as pd
# Load the Yelp business records; lines=True makes pandas parse the file
# as one JSON object per line.
business_path = "/Users/donghe/Desktop/yelp_dataset/yelp_academic_dataset_business.json"
df_business = pd.read_json(business_path, lines=True)

# First look at the raw frame: column dtypes, summary statistics, and the
# fraction of missing values in each column.
df_business.info()
df_business.describe()
df_business.isna().sum().div(len(df_business))
# Formalizing the datatypes
# Target dtype for every column that gets an explicit cast; the casts are
# applied one column at a time, in the order listed here.
column_dtypes = {
    'business_id': "str",
    'name': "str",
    'address': "str",
    'city': "str",
    'state': "str",
    'latitude': 'float64',
    'longitude': 'float64',
    'stars': 'float64',
    'review_count': 'int64',
    'is_open': 'bool',
    'attributes': "str",
    'categories': "str",
    'hours': "str",
}
for column_name, target_dtype in column_dtypes.items():
    df_business[column_name] = df_business[column_name].astype(target_dtype)

# Re-inspect dtypes and the per-column fraction of missing values after casting.
# NOTE(review): depending on the pandas version, astype("str") may turn
# missing values into literal strings (e.g. "None"/"nan"), in which case the
# ratio below reports 0 for those columns — confirm this is intended.
df_business.info()
df_business.isna().sum().div(len(df_business))
# Keep only the rows whose city is exactly "Austin".
in_austin = df_business['city'] == "Austin"
df_business = df_business.loc[in_austin]

# Preview the first two rows of the filtered frame.
df_business.head(2)
import altair as alt
# Turn off Altair's maximum-row check so the chart can be built from the
# whole DataFrame rather than being rejected for having too many rows.
alt.data_transformers.disable_max_rows()

# Counts of open businesses and closed businesses
# (This heading was a bare prose line, which is a SyntaxError in a .py file;
# it is kept as a comment so the script parses.)
open_status_axis = alt.X("is_open:N", title="Business Open?")
business_count_axis = alt.Y("count()", title="Business Count")

# Bar chart with one bar per is_open value; bar height is the row count.
# The chart is left as a trailing bare expression so a notebook displays it.
(
    alt.Chart(df_business)
    .mark_bar()
    .encode(x=open_status_axis, y=business_count_axis)
    .properties(title="Counts of businesses: open or not")
)