Load dependencies and data from kaggle ¶

import kagglehub
import pandas as pd
import numpy as np
import scipy.stats as stats
import math

import matplotlib.pyplot as plt
import seaborn as sns

c:\Users\mcadr\Coursework\EDA_Github_Portfolio\eda_london-bike-share-analysis\venv\Lib\site-packages\tqdm\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

import warnings
warnings.filterwarnings('ignore')

# Download latest version
path = kagglehub.dataset_download("hmavrodiev/london-bike-sharing-dataset")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\mcadr\.cache\kagglehub\datasets\hmavrodiev\london-bike-sharing-dataset\versions\1

bronze_df = pd.read_csv(path + "/london_merged.csv")
bronze_df.head()

bronze_df.shape

(17414, 10)

bronze_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17414 entries, 0 to 17413
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   timestamp     17414 non-null  object 
 1   cnt           17414 non-null  int64  
 2   t1            17414 non-null  float64
 3   t2            17414 non-null  float64
 4   hum           17414 non-null  float64
 5   wind_speed    17414 non-null  float64
 6   weather_code  17414 non-null  float64
 7   is_holiday    17414 non-null  float64
 8   is_weekend    17414 non-null  float64
 9   season        17414 non-null  float64
dtypes: float64(8), int64(1), object(1)
memory usage: 1.3+ MB

# Check the Cardinality of each column
for col in bronze_df.columns:
  print(f'{col}: {bronze_df[col].nunique()}')

timestamp: 17414
cnt: 3781
t1: 73
t2: 82
hum: 143
wind_speed: 103
weather_code: 7
is_holiday: 2
is_weekend: 2
season: 4

silver_df = bronze_df.copy()

# Check for duplicates
silver_df.duplicated().sum()

np.int64(0)

# Check for null values
silver_df.isnull().sum().any()

np.False_

# Do mappings for visualizations
silver_df['season'] = silver_df['season'].map({0:'spring', 1:'summer', 2:'fall', 3:'winter'})
silver_df['is_holiday'] = silver_df['is_holiday'].map({0:'no', 1:'yes'})
silver_df['is_weekend'] = silver_df['is_weekend'].map({0:'no', 1:'yes'})
silver_df['weather_code'] = silver_df['weather_code'].map({1:'clear', 2:'scattered clouds', 3:'broken clouds', 4:'cloudy', 7:'rain', 10:'thunderstorm', 26:'snowfall', 94:'freezing fog'})

# Update data types
silver_df['timestamp'] = pd.to_datetime(silver_df['timestamp'])
silver_df['weather_code'] = silver_df['weather_code'].astype('str')
silver_df['season'] = silver_df['season'].astype('str')
silver_df['is_holiday'] = silver_df['is_holiday'].astype('str')
silver_df['is_weekend'] = silver_df['is_weekend'].astype('str')

# Change humidity values to percentages
silver_df['humidity_perc'] = silver_df['hum'] / 100
silver_df.drop('hum', axis=1, inplace=True)

silver_df['timestamp'] = pd.to_datetime(silver_df['timestamp'])

# Create seperate 'date'  columns
silver_df['date'] = pd.to_datetime(silver_df['timestamp'].dt.date)

cols = silver_df.columns.tolist()

# Move 'date' and 'time' to the first positions in df
cols = ['date'] +  [col for col in cols if col != 'date']
silver_df = silver_df[cols]

# Save data to excel file for Tableau
silver_df.to_excel('data/silver_Tableau_bike_sharing.xlsx', 
                   sheet_name='Source Data')

# Export clean data to csv for import to postgresql
silver_df.to_csv('./data/silver_bike_sharing.csv', index=False)

# Create new meaningful column names for visualizations
new_cols_dict = {
    'date': 'date',
    'timestamp':'timestamp',
    'cnt':'count',
    't1':'temp_real_C',
    't2':'temp_feels_like_C',
    'hum':'humidity_percent',
    'wind_speed':'wind_speed_kph',
    'weather_code':'weather',
    'is_holiday':'is_holiday',
    'is_weekend':'is_weekend',
    'season':'season'
}

# Rename columns
silver_df.rename(columns=new_cols_dict, inplace=True)

num_cols = silver_df.select_dtypes(include='number').columns

silver_df[num_cols].describe()

def create_histplot(data, columns):
  nrows= math.ceil(len(columns) /2)
  fig, axes = plt.subplots(nrows=nrows, ncols=2, figsize=(12, 8))
  colors = sns.color_palette('Dark2', n_colors=len(columns))

  for idx, col in enumerate(columns):
    ax = axes[idx // 2, idx % 2 ]
    color=colors[idx]
    sns.histplot(x=col, data=data, kde=True,ax=ax, color=color)
    ax.set_title(f'Histogram: {col}')
    ax.set_xlabel(col)
  # Remove empty subplots
  if len(columns) % 2 != 0:
    fig.delaxes(axes[-1, -1])
  plt.tight_layout(h_pad = 3.0)
  plt.show()


create_histplot(silver_df, num_cols)

df1 = silver_df.copy()
cat_cols = [col for col in silver_df.select_dtypes(include='object').columns if col != 'time']

# Convert dtype to categorical for visualizations
for col in cat_cols:
  df1[col] = df1[col].astype('category')

def create_countplot(data, columns):
  nrows= math.ceil(len(columns) /2)
  fig, axes = plt.subplots(nrows=nrows, ncols=2, figsize=(12, 8))

  for idx, col in enumerate(columns):
    ax = axes[idx // 2, idx % 2 ]
    sns.countplot(x=col, data=data, hue=col, ax=ax)

    # Display bar heights
    for p in ax.patches:
      ax.annotate(f'{p.get_height()}',
                  (p.get_x() + p.get_width() / 2. , p.get_height()),
                  ha='center', va='center',
                  xytext=(0, 5),
                  textcoords='offset points')
    ax.set_title(f'Frequency count for {col}', pad=25)
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

    # Rotate x-tick labels
    plt.sca(ax)
    plt.xticks(rotation=90)

    # Remove top and right borders
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
  plt.tight_layout(h_pad = 5)
  plt.show()

create_countplot(df1, cat_cols)

# Average bike sharing counts per day
gold_avg_bike_rides_per_day = silver_df.groupby(silver_df['date'].dt.day_name())['count'].mean().sort_values(ascending=False).reset_index()
gold_avg_bike_rides_per_day = gold_avg_bike_rides_per_day.rename(columns={'date':'day', 'count':'avg_count'})
#gold_avg_bike_rides_per_day

plt.figure(figsize=(10, 6))
ax = sns.barplot(x='day', y='avg_count', data=gold_avg_bike_rides_per_day, hue='day')
for p in ax.patches:
    ax.annotate(format(p.get_height(), '.1f'),
                     (p.get_x() + p.get_width() / 2., p.get_height() + 10),  # Adjust the offset here
                     ha='center', va='bottom')
plt.title('Average Bike Sharing Rides per Day')
plt.xlabel('Day of the Week')
plt.ylabel('Average Bike Sharing Rides')
plt.show()

# Extract hour from time
silver_df['hour'] = silver_df['timestamp'].dt.hour.astype('str')

plt.figure(figsize=(10, 6))
# Create the horizontal bar plot with sum as estimator
ax = sns.barplot(data=silver_df, y='hour', x='count', hue = 'hour',estimator=sum, errorbar=None)


# Add rotated count values on the bars with an offset
for p in ax.patches:
        ax.annotate(format(p.get_width(), '.1f'),  # Swap get_height() with get_width()
                (p.get_width() + 50, p.get_y() + p.get_height() / 2.),  # Adjust the offset here
                ha='left', va='center')

# Remove top and right borders
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.title('Bike Rentals by Time of the Day', loc='left', pad=30)
plt.ylabel('Hour of the Day')
plt.xlabel('Bike rentals')

plt.show()

# Create bins for humidity_percent
bins = [0, 20, 40, 60, 80, 100]
labels = ['0-20%', '21-40%', '41-60%', '61-80%', '81-100%']
silver_df['humidity_bins'] = pd.cut(silver_df['humidity_perc'] * 100, bins=bins, labels=labels, right=False)

# Group by humidity_bins and season, then count
gold_count_by_humidity_and_season = silver_df.groupby(['humidity_bins', 'season'], observed=True)['count'].sum().reset_index()
gold_count_by_humidity_and_season.tail()

# Pivot the data for stacked bar plot
gold_rides_by_humidity_and_season = ( 
        gold_count_by_humidity_and_season 
           .pivot(index='humidity_bins', columns='season', values='count') 
           .fillna(0) )
# Create a stacked barplot
gold_rides_by_humidity_and_season.plot(kind='bar', stacked=True, figsize=(8, 6))
plt.title('Bike Rentals by Season and Humidity Levels')
plt.xlabel('Humidity Levels')
plt.ylabel('Bike Rentals')
plt.legend(title='Season')
plt.show()

gold_avg_bike_counts_by_weather_and_hr = (
    silver_df
        .groupby(['weather', 'hour'])['count']
        .mean()
        .reset_index()
)

gold_rides_by_weather_and_hour = ( 
        gold_avg_bike_counts_by_weather_and_hr
             .pivot(index='hour', columns='weather', values='count') 
)


plt.figure(figsize=(12, 6))
sns.heatmap(gold_rides_by_weather_and_hour, annot=True, fmt='.1f', cmap='coolwarm')
plt.title('Total Bike Shares by Weather Condition and Hour')
plt.ylabel('Hour of Day')
plt.xlabel('Weather Type')
plt.show()

# Compute the daily rental
gold_rental_summary = silver_df.groupby('date').agg(total_rentals=('count', 'sum'), avg_temp=('temp_real_C', 'mean')).reset_index()

# Create the primary plot axes
fig, ax1 = plt.subplots(figsize=(10, 6))

# Plot total rentals on the primary y-axis
sns.lineplot(data=gold_rental_summary, x='date', y='total_rentals', color='b', label='Total Rentals', ax=ax1)

# Create the secondary y-axis
ax2 = ax1.twinx()

# Plot average temperature on the secondary y-axis
sns.lineplot(data=gold_rental_summary, x='date', y='avg_temp', color='red', label='Average Temp', ax=ax2)

# Adding titles and labels
ax1.set_title('Total Bike Shares and Temperature Over Time')
ax1.set_xlabel('Date')
ax1.set_ylabel('Total Bike Shares')
ax2.set_ylabel('Average Temperature (°C)')

# Adding legends
ax1.legend(loc='upper left')
ax2.legend(loc='upper right')

plt.show()

	timestamp	cnt	t1	t2	hum	wind_speed	weather_code	is_weekend	season
0	2015-01-04 00:00:00	182	3.0	2.0	93.0	6.0	3.0	1.0	3.0
1	2015-01-04 01:00:00	138	3.0	2.5	93.0	5.0	1.0	1.0	3.0
2	2015-01-04 02:00:00	134	2.5	2.5	96.5	0.0	1.0	1.0	3.0
3	2015-01-04 03:00:00	72	2.0	2.0	100.0	0.0	1.0	1.0	3.0
4	2015-01-04 04:00:00	47	2.0	0.0	93.0	6.5	1.0	1.0	3.0

	count	temp_real_C	temp_feels_like_C	wind_speed_kph	humidity_perc
count	17414.000000	17414.000000	17414.000000	17414.000000	17414.000000
mean	1143.101642	12.468091	11.520836	15.913063	0.723250
std	1085.108068	5.571818	6.615145	7.894570	0.143132
min	0.000000	-1.500000	-6.000000	0.000000	0.205000
25%	257.000000	8.000000	6.000000	10.000000	0.630000
50%	844.000000	12.500000	12.500000	15.000000	0.745000
75%	1671.750000	16.000000	16.000000	20.500000	0.830000
max	7860.000000	34.000000	34.000000	56.500000	1.000000

Load dependencies and data from kaggle ¶

Data Overview¶

Data Cleaning¶

Data to exported to Tableau¶

Data to be exported to PostgreSQL¶

Exploratory Data Analysis¶

🚴 Univariate Analysis¶

Distribution of numerical variables¶

Distribution of the categorical variables¶

🚴 Bivariate Analysis¶

🚴 Multivariate Analysis¶

How bike rentals vary with humidity levels across different seasons ¶

Hourly Weather conditions vs Bike Rentals¶

Total Bike Rentals Against Temperature Over Time¶

	humidity_bins	season	count
11	61-80%	winter	1809949
12	81-100%	fall	1457928
13	81-100%	spring	593613
14	81-100%	summer	729460
15	81-100%	winter	1358118

Load dependencies and data from kaggle¶

Data Overview¶

Data Cleaning¶

Data to exported to Tableau¶

Data to be exported to PostgreSQL¶

Exploratory Data Analysis¶

🚴 Univariate Analysis¶

Distribution of numerical variables¶

Distribution of the categorical variables¶

🚴 Bivariate Analysis¶

Average bike sharing rides per day¶

Time of the day with the most bike sharing counts¶

🚴 Multivariate Analysis¶

How bike rentals vary with humidity levels across different seasons ¶

Hourly Weather conditions vs Bike Rentals¶

Total Bike Rentals Against Temperature Over Time¶

Load dependencies and data from kaggle ¶