Spinny's Revenue Growth Through Demand and Pricing Optimisation¶
1.1 - Given Problem Statement¶
Spinny, a leading Indian startup in the pre-owned car market, wants to increase revenue by boosting the number of completed transactions.
To achieve this, it is essential to understand the demand for cars across different regions and identify the most in-demand cars and their attributes. Additionally, determining the right pricing for each car, based on its attributes and market demand, is crucial for staying competitive.
Objective:
As a data analyst, your goal is to analyse used car transaction data to:
Assess the demand for pre-owned cars in each region.
Identify the demand for specific car models and their key attributes (e.g., condition, fuel type, mileage).
Determine the optimal pricing for each car, ensuring it is competitive and aligned with market trends.
1.2 - Simplified Breakdown Of Problem Statement¶
📜 Project Overview
Spinny, a pre-owned car marketplace, wants to increase revenue by optimizing pricing and understanding regional demand for used cars. The focus is on identifying high-demand cars, key attributes, and competitive pricing to improve transaction success.
🎯 Main Goal
Analyze car transaction data to:
Assess demand for used cars in different regions.
Identify high-demand models & key attributes that drive sales.
Optimize pricing strategy to balance competitiveness and profitability.
📊 Key Focus Areas
Demand Analysis: Identify regions with high/low demand for used cars.
Car Attribute Insights: Study factors like year, condition, fuel type, odometer readings that influence demand.
Pricing Optimization: Ensure pricing is competitive based on demand, location, and vehicle attributes.
🚀 Data-Driven Approach
Segmentation & Trend Analysis: Group cars by attributes and demand patterns.
Price Benchmarking: Compare pricing across similar vehicles in different regions.
Demand Forecasting: Identify seasonal trends and factors affecting used car sales.
📌 Summary
1️⃣ Develop a data-driven strategy for pricing and demand optimization.
2️⃣ Use historical transaction data to identify high-demand car attributes.
3️⃣ Ensure Spinny stays competitive with well-structured pricing based on market trends.
2.1 Dataset Summary¶
- Dataset source: The dataset for this project was provided as part of the Data Analyst course Graduation Project by Nextleap
- Dataset Name: Spinny Revenue Growth
- Number of Rows: 426880
- Number of Columns: 21
- Description: The dataset details pre-owned car transactions, covering attributes like price, model, fuel type, and location, to analyze demand and optimize Spinny's pricing strategy
2.2 Columns Description¶
- id (Transaction ID)
- Definition: A unique identifier for each car transaction in the dataset.
- Example: If a specific transaction has the ID 7222695916, it uniquely represents that sale.
- Price (INR)
- Definition: The final selling price of the vehicle in Indian Rupees (INR).
- Example: If the price is 770530, it indicates the car was sold for INR 770,530.
- Year
- Definition: The year of manufacture of the vehicle, indicating its age.
- Example: If the year is 2015, it means the car was manufactured in 2015.
- Manufacturer
- Definition: The brand or company that produced the vehicle.
- Example: If the manufacturer is Maruti, it means the car was manufactured by Maruti Suzuki.
- Model
- Definition: The specific model name of the vehicle produced by the manufacturer.
- Example: If the model is Swift, it refers to the Maruti Suzuki Swift model.
- Condition
- Definition: The physical and operational state of the vehicle at the time of sale.
- Example: If the condition is excellent, it indicates the car was in top-notch condition during the transaction.
- Cylinders
- Definition: The number of cylinders in the car's engine, influencing its power and performance.
- Example: If the cylinders is 4, it indicates the car has a 4-cylinder engine.
- Fuel
- Definition: The type of fuel the vehicle uses, such as petrol, diesel, CNG, or electric.
- Example: If the fuel is diesel, the car runs on diesel fuel.
- Odometer
- Definition: The total distance the vehicle has traveled, measured in kilometers.
- Example: If the odometer reads 50,000, the car has been driven for 50,000 km.
- Title_status
- Definition: The status of the car's title, indicating whether it is clean, has missing parts, or has liens.
- Example: If the title_status is clean, it means the vehicle has no outstanding issues.
- Transmission
- Definition: The type of gearbox system the vehicle uses for shifting gears (manual, automatic, other).
- Example: If the transmission is automatic, the car shifts gears automatically; if other, it uses a non-standard system.
- VIN (Vehicle Identification Number)
- Definition: A unique serial number assigned to each vehicle for identification.
- Example: If the VIN is 1HGCM82633A123456, it uniquely identifies the vehicle.
- Drive
- Definition: The drivetrain configuration of the vehicle (e.g., front-wheel drive, rear-wheel drive).
- Example: If the drive is FWD, the car is a front-wheel drive vehicle.
- Size
- Definition: The size classification of the vehicle, such as compact, midsize, or full-size.
- Example: If the size is compact, the car is categorized as a compact vehicle.
- Type
- Definition: The type of vehicle, such as sedan, SUV, hatchback, or truck.
- Example: If the type is SUV, the car belongs to the sport utility vehicle category.
- Paint_color
- Definition: The exterior color of the vehicle.
- Example: If the paint_color is white, the car's body is white in color.
- Image_url
- Definition: A link to an image of the vehicle for reference.
- Example: If the image_url is a valid web address, it displays the car's photo.
- State
- Definition: The Indian state where the car transaction took place.
- Example: If the state is Haryana, the car was sold in Haryana.
- Posting_date
- Definition: The date when the car was listed for sale.
- Example: If the posting_date is 2023-10-01, the car was listed on October 1, 2023.
- Latitude
- Definition: The geographical latitude coordinate of the transaction location.
- Example: If the latitude is 28.8446, it represents the latitude of the transaction in Haryana.
- Longitude
- Definition: The geographical longitude coordinate of the transaction location.
- Example: If the longitude is 75.1167, it represents the longitude of the transaction in Haryana.
3.1 Importing Required Libraries¶
import pandas as pd # For data manipulation and analysis
import numpy as np # For numerical computations
import matplotlib.pyplot as plt # For plotting and visualization
import seaborn as sns # For advanced visualizations
import scipy.stats as stats # for statistical analysis
3.2 Loading the dataset from Nextleap Google Drive¶
# Step 1: Install gdown
!pip install gdown
# Step 2: Import necessary libraries
import gdown
import pandas as pd
# Step 3: Set the file ID and create a download URL
file_id = "1QQaRZizuq3TGhUqHyIc701ChT-GgM1yR"
download_url = f"https://drive.google.com/uc?id={file_id}"
# Step 4: Set the output file name
output_file = "spinny_dataset.csv"
# Step 5: Download the file
gdown.download(download_url, output_file, quiet=False)
# Step 6: Load the CSV file into a Pandas DataFrame
data = pd.read_csv(output_file)
Downloading... From: https://drive.google.com/uc?id=1QQaRZizuq3TGhUqHyIc701ChT-GgM1yR To: /content/spinny_dataset.csv 100%|██████████| 99.9M/99.9M [00:00<00:00, 227MB/s]
3.3 Initial Dataset Exploration¶
3.3.1 Checking Shape of the dataset¶
rows, columns = data.shape
print(f"The dataset contains **{rows} rows** and **{columns} columns**.")
The dataset contains **426880 rows** and **21 columns**.
3.3.2 Viewing all columns in the Dataset¶
# Viewing all column names, as not all are visible in head or sample below
data.columns
Index(['id', 'price', 'year', 'manufacturer', 'model', 'condition', 'cylinders', 'fuel', 'odometer', 'title_status', 'transmission', 'VIN', 'drive', 'size', 'type', 'paint_color', 'image_url', 'state', 'posting_date', 'latitude', 'longitude'], dtype='object')
Here, I’ve categorized the columns according to the problem statement to simplify the analysis:
- Most Relevant: price, year, manufacturer, model, condition, fuel, odometer, state, transmission
- Relevant: type, drive, title_status, paint_color, cylinders, size, posting_date
- Less Relevant: id, VIN, image_url, latitude, longitude
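If helpful for later steps, these groupings can be kept as simple Python lists (a small sketch; the list names are my own, not part of the original notebook):
# Hypothetical helper lists mirroring the relevance groups above
most_relevant = ['price', 'year', 'manufacturer', 'model', 'condition', 'fuel', 'odometer', 'state', 'transmission']
relevant = ['type', 'drive', 'title_status', 'paint_color', 'cylinders', 'size', 'posting_date']
less_relevant = ['id', 'VIN', 'image_url', 'latitude', 'longitude']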
3.3.3 Understanding the dataset by looking at a few rows¶
# Checking the top 5 Rows to see how the dataset looks
data.head(5)
id | price | year | manufacturer | model | condition | cylinders | fuel | odometer | title_status | ... | VIN | drive | size | type | paint_color | image_url | state | posting_date | latitude | longitude | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7222695916 | 770530.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | Haryana | NaN | 28.8446 | 75.1167 |
1 | 7218891961 | 1528210.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | Haryana | NaN | 27.9026 | 77.0382 |
2 | 7221797935 | 2696820.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | West Bengal | NaN | 24.6838 | 85.9695 |
3 | 7222270760 | 192610.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | Andhra Pradesh | NaN | 15.5158 | 81.9342 |
4 | 7210384030 | 629240.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | West Bengal | NaN | 23.7821 | 87.4861 |
5 rows × 21 columns
📌 Observation: Most of the column values are missing, except a few. It's important to check if this is intentional or an error.
- Next Step: Verify this by inspecting 5 randomly selected rows for better clarity.
# Randomly selecting 5 rows from the dataset
data.sample(5)
id | price | year | manufacturer | model | condition | cylinders | fuel | odometer | title_status | ... | VIN | drive | size | type | paint_color | image_url | state | posting_date | latitude | longitude | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
6661 | 7308571224 | 3916750.0 | 2017.0 | bmw | 430i | NaN | 4 cylinders | gas | 32214.0 | clean | ... | WBA4F7C35HG787932 | rwd | NaN | sedan | blue | https://images.craigslist.org/00b0b_dEVueDQ2dN... | Bihar | 2021-04-18 01:02:14+00:00 | 26.0785 | 87.0355 |
192725 | 7309999168 | 2387620.0 | 2020.0 | kia | soul s wagon 4d | good | NaN | other | 27505.0 | clean | ... | KNDJ23AUXL7005901 | NaN | NaN | wagon | silver | https://images.craigslist.org/00909_cdPG9Py9E1... | Dadra and Nagar Haveli and Daman and Diu | 2021-04-20 20:00:30+00:00 | 20.1024 | 72.9349 |
193042 | 7307142093 | 2940840.0 | 2017.0 | bmw | x1 | NaN | NaN | gas | 56452.0 | clean | ... | WBXHT3Z36H4A56656 | NaN | NaN | NaN | NaN | https://images.craigslist.org/00L0L_bbPvpa1KvW... | Uttar Pradesh | 2021-04-15 13:50:42+00:00 | 24.9249 | 79.1547 |
420708 | 7311357255 | 4236880.0 | 2015.0 | ford | f150 supercrew cab xlt | good | 6 cylinders | gas | 65329.0 | clean | ... | 1FTEW1EG0FKE39342 | 4wd | NaN | pickup | blue | https://images.craigslist.org/00N0N_1xMPvfxRAI... | Telangana | 2021-04-23 15:20:44+00:00 | 16.3049 | 78.3479 |
223133 | 7302507866 | 0.0 | 2014.0 | gmc | sierra 1500 4x4 crew cab sle | good | 8 cylinders | other | 110494.0 | clean | ... | 3GTU2UEC7EG175242 | 4wd | NaN | pickup | blue | https://images.craigslist.org/00b0b_1T8Tm7RYOn... | Uttar Pradesh | 2021-04-06 12:00:35+00:00 | 24.6724 | 77.6303 |
5 rows × 21 columns
📌 Observation:
- After inspecting sample(5), very few values are missing in the randomly selected rows, suggesting that missing values are not uniformly distributed across the dataset.
- Need to convert text data to lowercase, since state names start with a capital letter and the VIN column is in uppercase.
3.3.4 Displaying dataset information¶
print("Dataset Information:\n")
data.info()
Dataset Information: <class 'pandas.core.frame.DataFrame'> RangeIndex: 426880 entries, 0 to 426879 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 426880 non-null int64 1 price 426880 non-null float64 2 year 425675 non-null float64 3 manufacturer 409234 non-null object 4 model 421603 non-null object 5 condition 252776 non-null object 6 cylinders 249202 non-null object 7 fuel 423867 non-null object 8 odometer 422480 non-null float64 9 title_status 418638 non-null object 10 transmission 424324 non-null object 11 VIN 265838 non-null object 12 drive 296313 non-null object 13 size 120519 non-null object 14 type 334022 non-null object 15 paint_color 296677 non-null object 16 image_url 426812 non-null object 17 state 426880 non-null object 18 posting_date 426812 non-null object 19 latitude 426880 non-null float64 20 longitude 426880 non-null float64 dtypes: float64(5), int64(1), object(15) memory usage: 68.4+ MB
📌 Observation:
- Datatype: The year column should be converted to int, and posting_date should be converted to datetime for accurate handling and analysis
- Not-Null: The columns with non-null values below 4 lakh, arranged in ascending order are: size, condition, cylinders, VIN, drive, paint_color, and type.
Next Step: Let's count the null values and their percentages to get a better understanding of the missing data and figure out how to handle them.
3.3.5 Missing values data summary¶
# Calculating the missing value for each column
missing_values = data.isnull().sum()
# calculating the missing percentage
missing_percentage = (missing_values / len(data)) * 100
# creating summary table
missing_summary = pd.DataFrame({"missing values in each column": missing_values,
"missing %": missing_percentage }).sort_values(by ="missing %",ascending=False)
# Displaying summary table
print("missing data summary for each column:\n\n", missing_summary)
missing data summary for each column: missing values in each column missing % size 306361 71.767476 cylinders 177678 41.622470 condition 174104 40.785232 VIN 161042 37.725356 drive 130567 30.586347 paint_color 130203 30.501078 type 92858 21.752717 manufacturer 17646 4.133714 title_status 8242 1.930753 model 5277 1.236179 odometer 4400 1.030735 fuel 3013 0.705819 transmission 2556 0.598763 year 1205 0.282281 posting_date 68 0.015930 image_url 68 0.015930 latitude 0 0.000000 id 0 0.000000 state 0 0.000000 price 0 0.000000 longitude 0 0.000000
# Creating a heatmap for visualisation missing values
plt.figure(figsize=(14, 6))
sns.heatmap(data.isnull(), cbar=False, cmap='crest', yticklabels=False)
plt.title("Missing Data Heatmap (blue = missing data)", fontsize=16)
plt.xlabel("Columns", fontsize=14)
plt.ylabel("", fontsize=14)
plt.tight_layout()
plt.show()
📌 Observation: The size column has over 70% missing values, so it's impractical to impute and will likely be dropped during data preparation, considering other factors
Observation: The size column has 71.77% missing values, making it unreliable for analysis due to the difficulty of imputing such a large proportion of missing data without introducing bias.
Relevance: The problem statement emphasizes pricing and demand analysis. While size could provide insights, it is less critical than columns like type, manufacturer, or fuel.
Alternative: The type column, with only 21.75% missing values, is a more robust alternative for categorization and aligns better with the analysis goals.
Recommendation: Drop the size column due to its high missing percentage and focus on the type column for meaningful categorization and insights.
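To sanity-check this recommendation, one can measure how often type is available where size is missing (a minimal sketch using the already-loaded data):
# Share of all rows where 'type' is present but 'size' is missing
mask = data['size'].isnull() & data['type'].notnull()
print(f"Rows with 'type' present but 'size' missing: {mask.mean() * 100:.2f}% of the dataset")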
3.3.6 Unique values count for each column¶
# Counting unique values in each column and presenting them in a DataFrame
unique_values_df = pd.DataFrame({
"Column Name": data.columns,
"Unique Value Count": [data[col].nunique() for col in data.columns]
})
# Display the DataFrame
print("Unique Value Counts:\n", unique_values_df)
Unique Value Counts: Column Name Unique Value Count 0 id 426880 1 price 6253 2 year 114 3 manufacturer 42 4 model 29667 5 condition 6 6 cylinders 8 7 fuel 5 8 odometer 104870 9 title_status 6 10 transmission 3 11 VIN 118264 12 drive 3 13 size 4 14 type 13 15 paint_color 12 16 image_url 241899 17 state 35 18 posting_date 333355 19 latitude 191354 20 longitude 162536
📌 Observation:
- Outliers might exist as the average car life is around 20 years, but 114 unique years are recorded, which seems unrealistic.
- The size column has only 4 unique values and, as previously observed, over 70% missing data, making it unfit for analysis and a good candidate for removal.
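To see which implausible manufacturing years are actually present, a quick peek at the extremes helps (a minimal sketch):
# Inspect the ten oldest manufacturing years recorded
print(np.sort(data['year'].dropna().unique())[:10])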
3.3.7 Statistical summary of price, year & odometer¶
# Checking summary statistics for specific numerical columns
print("\nSummary Statistics (Price, Year, and Odometer Only):")
print(data[['price', 'year', 'odometer']].describe())
Summary Statistics (Price, Year, and Odometer Only): price year odometer count 4.268800e+05 425675.000000 4.224800e+05 mean 9.657028e+06 2011.235191 9.804333e+04 std 1.564439e+09 9.452120 2.138815e+05 min 0.000000e+00 1900.000000 0.000000e+00 25% 7.576800e+05 2008.000000 3.770400e+04 50% 1.791970e+06 2013.000000 8.554800e+04 75% 3.401080e+06 2017.000000 1.335425e+05 max 4.798934e+11 2022.000000 1.000000e+07
⚠ Observation: These stats are just to get an idea, results may be affected by missing values or outliers
- Price: Goes from ₹0 to ₹479 billion, with some prices looking way too high or too low to be real.
- Year: Starts at 1900, which doesn’t make sense, so it needs a closer look.
- Odometer: Ranges from 0 to 10 million km, with some unrealistic readings and placeholder values like 0.
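Before any cleaning, extreme quantiles give a quick sense of how widespread these implausible values are (a minimal sketch):
# Tail quantiles for price and odometer; quantile() ignores NaNs by default
print(data[['price', 'odometer']].quantile([0.001, 0.01, 0.99, 0.999]))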
3.3.8 Getting the timeline of the dataset¶
# Calculating the timeline while ignoring missing values
valid_dates = pd.to_datetime(data['posting_date'], errors='coerce').dropna()
# Get the minimum and maximum dates from valid entries
timeline_start = valid_dates.min().date()
timeline_end = valid_dates.max().date()
print(f"Dataset timeline: {timeline_start} to {timeline_end}")
Dataset timeline: 2021-04-04 to 2021-05-05
4.1 Creating a Backup of the Dataset ( df ) to Work On¶
Creating a copy of the original dataset is a good practice because it:
- Prevents Data Loss: If cleaning goes wrong, the original data remains untouched for reference or rollback.
- Enables Experimentation: We can try different cleaning methods on the copy without risking the original data.
- Maintains Integrity: Keeps a record of the raw data for comparison or future use.
# Creating a copy of the dataset
df = data.copy()
# Confirmation by printing the first few rows of the copied dataset
print("Copy of the dataset has been created with the name 'df'")
Copy of the dataset has been created with the name 'df'
4.2 Structuring the Dataset¶
4.2.1 Ensuring Consistency by Lowercasing Text Data¶
# Convert only string columns to lowercase
for col in df.select_dtypes(include=['object']).columns:
df[col] = df[col].map(lambda x: x.lower() if isinstance(x, str) else x)
4.2.2 Renaming the 'year' Column to 'mfg_year'¶
Renaming the column "year" to "mfg_year" improves clarity and avoids confusion, making it clear that the column refers to the manufacturing year of the vehicle, especially for those unfamiliar with the dataset structure.
# Renaming the 'year' column to 'mfg_year'
df.rename(columns={'year': 'mfg_year'}, inplace=True)
# Confirmation
print("The 'year' column has been renamed to 'mfg_year' for better understanding.")
The 'year' column has been renamed to 'mfg_year' for better understanding.
4.2.3 Removing Special Characters and Keeping Only Letters, Numbers and Single Spaces¶
# Function to clean special characters in string columns
def clean_special_characters(column):
return column.str.replace(r'[^a-zA-Z0-9 ]', '', regex=True) # Keep only letters, numbers, and spaces
# List of columns to exclude
exclude_columns = ['image_url', 'posting_date', 'size']
# Apply the cleaning function to string columns except the excluded ones
for col in df.select_dtypes(include=['object']).columns:
if col not in exclude_columns:
df[col] = clean_special_characters(df[col])
4.3 Datatype Correction¶
Correcting data types ensures the information is stored in the right way, making it easier to work with and avoiding confusion or mistakes later.
4.3.1 Correcting mfg_year from 'float' to 'int64'¶
# Converting 'mfg_year' to an integer-compatible type
df['mfg_year'] = df['mfg_year'].astype('Int64')
# Confirmation message
print("The 'mfg_year' column has been converted from float to Int64")
The 'mfg_year' column has been converted from float to Int64
4.3.2 Correcting posting_date from 'object' to 'date'¶
# Convert posting_date to datetime
df['posting_date'] = pd.to_datetime(df['posting_date'])
# Confirmation
print("The 'posting_date' column has been successfully converted from object to date format.")
The 'posting_date' column has been successfully converted from object to date format.
4.4 Removed Duplicate Rows¶
Removing duplicate rows ensures the dataset is clean, avoids redundancy, and prevents repeated data from skewing the analysis
4.4.1 Checking duplicate rows¶
# Checking for duplicate rows
total_rows = df.shape[0]
num_duplicates = df.duplicated().sum()
# Printing the results
print(f"Total number of rows: {total_rows}")
print(f"Number of duplicate rows: {num_duplicates}")
Total number of rows: 426880 Number of duplicate rows: 0
4.4.2 Removing duplicate rows if present¶
# Removing duplicate rows
df = df.drop_duplicates()
# Confirmation
print(f"Duplicate rows removed: {num_duplicates}")
print(f"Remaining rows in the dataset: {df.shape[0]}")
Duplicate rows removed: 0 Remaining rows in the dataset: 426880
4.5 Dropping Unnecessary Columns¶
4.5.1 Dropping the image_url column¶
- The image_url column is dropped because it is not relevant for analysis related to demand or pricing.
- It serves as a visual reference rather than contributing to meaningful insights into car attributes or transactions.
- Retaining it would only increase data size without adding value to the problem statement's focus on optimizing demand and pricing.
- Dropping image_url helps streamline the dataset, focusing only on columns critical to analysis and decision-making.
# Dropping the 'image_url' column
df = df.drop(columns=['image_url'])
# Displaying confirmation
print("Column 'image_url' has been dropped.")
Column 'image_url' has been dropped.
4.5.2 Dropping the size column¶
- The size column is dropped because it has 71.77% missing values, making it unsuitable for reliable analysis.
- The type column provides more detailed and granular insights into vehicle categories (e.g., sedan, SUV) compared to size.
- The type column has only 21.75% missing values, making it more complete and reliable for analysis.
- Retaining type aligns better with the problem statement's focus on understanding demand and pricing based on car attributes.
- Dropping size helps streamline the dataset and ensures a cleaner, more actionable analysis.
# Dropping the 'size' column
df = df.drop(columns=['size'])
# Displaying confirmation
print("Column 'size' has been dropped.")
Column 'size' has been dropped.
4.5.3 Assessing and Deciding the Relevance of VIN (Vehicle Identification Number)¶
VIN (Vehicle Identification Number) is a 17-character alphanumeric code that is unique to a single vehicle and associated with only one manufacturer. It acts as the vehicle's fingerprint, providing details such as the manufacturer, model, year of production, and other specifications. The first three characters of the VIN (the World Manufacturer Identifier, or WMI) identify the manufacturer of the vehicle, so the WMI can be used to fill in missing manufacturer values (see the sketch below).
While VIN serves as a globally unique identifier, its relevance to the current analysis must be carefully evaluated based on its contribution to the objectives of pricing and demand optimisation.
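As an illustration of the WMI idea, a lookup like the sketch below could recover the manufacturer from the first three VIN characters. The wmi_map entries are a few illustrative examples, not a complete reference table:
# Hypothetical WMI lookup: first 3 VIN characters -> manufacturer
wmi_map = {
    '1hg': 'honda',  # illustrative entries only
    'wba': 'bmw',
    '1ft': 'ford',
}
def manufacturer_from_wmi(vin):
    # Return a manufacturer guess from the VIN's WMI, or None if unknown
    if isinstance(vin, str) and len(vin) == 17:
        return wmi_map.get(vin[:3].lower())
    return None
# Example usage: fill missing manufacturer values from valid VINs
# df['manufacturer'] = df['manufacturer'].fillna(df['VIN'].map(manufacturer_from_wmi))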
4.5.3.1 Counting Repeated VINs¶
Objective: To determine how frequently VINs are repeated in the dataset and assess their impact.
# Count repeated VIN values in the DataFrame
repeated_vin_count = df['VIN'].value_counts()[df['VIN'].value_counts() > 1].count()
# Print the result
print(f"Number of total repeated VINs in dataset is: {repeated_vin_count}")
Number of total repeated VINs in dataset is: 40280
Observation:
A high number of repeated VINs indicates that many vehicles have been listed multiple times. These repetitions might result from repostings, price updates, or placeholder entries. Understanding these repetitions is critical to evaluate their impact on data quality and insights.
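To see what these repetitions look like in practice, one can pull up every listing behind the most frequently repeated VIN (a minimal sketch):
# Inspect all listings sharing the most-repeated VIN
top_vin = df['VIN'].value_counts().idxmax()
print(df.loc[df['VIN'] == top_vin, ['price', 'posting_date', 'state']])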
4.5.3.2 Assessing Rows to Be Removed if Repeated VINs Are Excluded¶
Objective: To evaluate the extent of data loss if repeated VINs are excluded entirely.
# Get the unique repeated VINs
unique_repeated_vins = df['VIN'].value_counts()[df['VIN'].value_counts() > 1].index.tolist()
# Count how many rows will be removed
rows_to_remove = df[df['VIN'].isin(unique_repeated_vins)].shape[0]
# Print the result
print(f"Number of rows that will be removed if we removed repeated VINs will be: {rows_to_remove}")
Number of rows that will be removed if we removed repeated VINs will be: 187854
Observation:
Removing all repeated VINs would eliminate a significant portion of the dataset (over 40%), which could result in the loss of valuable information about market behaviour, price adjustments, and demand trends. This calls for further exploration to decide whether retaining or removing VIN is more suitable.
4.5.3.3 Examining Unique VINs and Patterns¶
Objective: To explore the uniqueness of VIN entries and their contribution to the dataset by finding patterns that identify unusual VINs.
# Display random rows from the DataFrame to inspect VINs
random_vins = df.sample(n=10, random_state=42) # Adjust n for the number of rows we want
print(random_vins[['VIN']]) # Display only the VIN column
VIN 100905 1ftfx1eg9hkd14814 143835 NaN 20235 NaN 300734 NaN 316249 1gnskgkc7kr124145 163902 2c4gp44392r547816 353675 NaN 265426 NaN 231608 1fa6p8th9g5224182 297925 1gc1kwe87ff570325
Observation: The sample shows a mix of plausible 17-character VINs and missing values. To assess VIN quality systematically, each entry will be flagged against the following checks:
- Non-String VIN: Flags VINs that are not strings.
- Invalid Length: Identifies VINs not exactly 17 characters long.
- Forbidden Letters: Detects presence of restricted characters (I, O, Q).
- Non-Alphanumeric Characters: Flags VINs containing symbols or spaces.
- All Numbers or Letters: Identifies VINs composed entirely of digits or letters.
- Imbalanced Letters-Numbers: Flags VINs lacking both letters and numbers.
- Excessive Repetition: Detects VINs with any character repeated more than 10 times.
4.5.3.4 Filtering Rows with Unusual VINs¶
Objective: To isolate rows containing invalid or problematic VINs for further analysis or removal, ensuring data quality
1. Identifying Unusual VIN Patterns
- Purpose: Detect VIN entries that deviate from standard rules using various pattern checks.
- Key Criteria: The function evaluates VINs based on properties like length, forbidden characters, alphanumeric composition, letter-number balance, and repetition.
2. Processing and Filtering Unusual VINs
- Function Used: identify_unusual_vin_patterns classifies VINs as "Valid" or provides the specific issue(s) based on the patterns.
- Implementation: The .apply() method is used to apply the function across all VIN entries, filtering rows with issues into a separate DataFrame (df_temp).
3. Summarising Unusual VINs
- Purpose: Count and summarise the occurrences of unusual VINs to understand their impact on the dataset.
- Methodology: The value_counts() function is used to aggregate occurrences, and the results are structured into a summary table, with an additional row for the total count.
def identify_unusual_vin_patterns(vin):
if not isinstance(vin, str):
return "Non-String"
patterns = [
"Invalid Length" if len(vin) != 17 else "",
"Forbidden Letters (I, O, Q)" if any(c in "IOQ" for c in vin) else "",
"Non-Alphanumeric Characters" if not vin.isalnum() else "",
"All Numbers" if vin.isdigit() else "",
"All Letters" if vin.isalpha() else "",
"Imbalanced Letters-Numbers" if sum(c.isalpha() for c in vin) == 0 or sum(c.isdigit() for c in vin) == 0 else "",
"Excessive Repetition" if max(vin.count(c) for c in set(vin)) > 10 else "",
]
return ", ".join(filter(bool, patterns)) or "Valid"
# Process VINs directly without modifying the original DataFrame
df_temp = df[df['VIN'].apply(identify_unusual_vin_patterns) != "Valid"]
# Summarise unusual VINs
unusual_vin_counts = df_temp['VIN'].value_counts()
unusual_vin_summary = unusual_vin_counts.rename_axis('VIN').reset_index(name='Row Count') # robust across pandas versions
# Add total row
unusual_vin_summary.loc[len(unusual_vin_summary)] = ['Total', len(df_temp)]
# Display results
print(unusual_vin_summary)
print(f"Total number of rows carried by unusual VINs: {len(df_temp)}")
print(f"Number of unique unusual VINs: {len(unusual_vin_counts)}")
Row Count count 0 13131313131313131 85 1 15151515151515151 81 2 14141414141414141 52 3 11111111111111111 24 4 cr315045444 23 ... ... ... 1068 124871n507703 1 1069 f8zs2993 1 1070 3n67k5m336901 1 1071 ccl449f505274 1 1072 Total 162803 [1073 rows x 2 columns] Total number of rows carried by unusual VINs: 162803 Number of unique unusual VINs: 1072
Observation:
A large number of VIN entries were invalid, with some being placeholders like "11111111111111111". These invalid entries highlight data quality issues and undermine the reliability of VIN for deriving meaningful insights.
This shows that much of the VIN data is incorrect and that most repeated VINs are not valid.
4.5.4 Dropping VIN Column Based On Observations¶
Dropping the VIN column ensures a cleaner, more focused dataset aligned with the project objectives. By removing noisy and irrelevant data, this decision enhances the accuracy of demand and pricing analyses while maintaining the integrity of the insights derived from the remaining columns.
Justification for Dropping VIN:
- Data Quality Issues: A significant proportion of VIN entries are invalid, placeholders, or duplicates, reducing their reliability for analysis.
- Limited Relevance: VIN is redundant for key insights as attributes like manufacturer, model, and mfg_year already provide sufficient detail.
- Simplified Dataset: Removing VIN streamlines the dataset, focusing on actionable variables related to pricing and demand.
# Dropping the VIN column
df = df.drop(columns=['VIN'])
# Confirm the removal
print("VIN column successfully removed from the dataset.")
VIN column successfully removed from the dataset.
4.6 Outliers Handling¶
Outlier: A value that is very different from the rest of the data (example - a car with 5,00,000 km running when most are around 50,000–1,00,000 km).
Why Remove:
- They can distort averages and trends.
- Ensure analysis is accurate and reflects real patterns in the data
4.6.1 Price Outlier Detection and Removal with IQR¶
The IQR (Interquartile Range) is the difference between the 75th percentile (Q3) and the 25th percentile (Q1) of the data, basically capturing the range of the middle 50% of values. Outliers are values that are way too small (below Q1 − 1.5 × IQR) or way too big (above Q3 + 1.5 × IQR). By removing these, we get rid of extreme points that can mess up our analysis, keeping the focus on the main data trends.
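Since the same rule is applied to both price and odometer below, the bound calculation can be captured in a small reusable helper (a sketch; the notebook itself computes the bounds inline):
# Reusable IQR fence calculator; quantile() ignores NaNs by default
def iqr_bounds(series, k=1.5):
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    iqr = q3 - q1
    return q1 - k * iqr, q3 + k * iqr
# Example usage:
# low, high = iqr_bounds(df['price'])
# df = df[df['price'].between(low, high)]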
4.6.1.1 Spotting Outliers with Boxplot¶
Boxplots help identify extreme values by showing the range and central tendency of the data.
- Boxplots are superior for simplicity, clarity, and direct representation of key statistics
- summary of the data distribution (median, spread, and outliers)
# Boxplot without creating a new column
plt.figure(figsize=(10, 4))
sns.boxplot(x=df['price'] / 1e6, color='cyan')
plt.xlabel('Price (in Millions)')
plt.show()
4.6.1.2 Outlier Removal and Confirmation¶
We removed extreme values using the IQR method to clean the dataset. The smallest and largest outliers are shown as examples to confirm the process and ensure valid data is retained
# Calculating IQR for Outlier Detection
Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Identifying Outliers (Single Step)
outliers = df[(df['price'] < lower_bound) | (df['price'] > upper_bound)]
# Displaying Extreme End Outliers
extreme_low_outliers = outliers.nsmallest(1, 'price')[['price', 'odometer', 'mfg_year']]
extreme_high_outliers = outliers.nlargest(1, 'price')[['price', 'odometer', 'mfg_year']]
print("Extreme low outlier:")
print(extreme_low_outliers)
print("\nExtreme high outlier:")
print(extreme_high_outliers)
print("\nDataset size before cleaning:", df.shape[0])
# Removing Outliers and Updating the Original Dataset
df = df[(df['price'] >= lower_bound) & (df['price'] <= upper_bound)]
# Printing Results
print("\nDataset size after cleaning:", df.shape[0])
print("Number of outliers removed:", outliers.shape[0])
Extreme low outlier: price odometer mfg_year 303716 7371250.0 435400.0 2004 Extreme high outlier: price odometer mfg_year 318592 4.798934e+11 164000.0 2007 Dataset size before cleaning: 426880 Dataset size after cleaning: 418703 Number of outliers removed: 8177
4.6.1.3 Validating Outlier Removal with Boxplot¶
# Boxplot without creating a new column
plt.figure(figsize=(10, 4))
sns.boxplot(x=df['price'] / 1e6, color='cyan')
plt.xlabel('Price (in Millions)')
plt.show()
📌 Some points near the boundaries still show up in the boxplot because they are within the acceptable range and not extreme enough to be flagged as outliers. By comparing with the boxplot before outlier removal, we can see that these values are valid and represent normal variations in the data
4.6.2 Odometer Outlier Detection and Removal with IQR¶
4.6.2.1 Spotting outliers with violin plot¶
The violin plot is used to confirm that extreme odometer values are effectively handled while providing a clearer view of the data distribution and density.
Why is a Violin Plot Better than a Boxplot?
- Distribution: A violin plot shows the spread and shape of the data, unlike a boxplot which only displays summary statistics.
- Visual Density: It reveals where the data points are concentrated, making patterns or skewness easier to detect.
- More Informative: Combines the benefits of a boxplot with a smooth visualization of the data distribution.
# Visualize the Violin Plot for Odometer (in lakh kilometers) excluding missing values
plt.figure(figsize=(12, 6))
sns.violinplot(x=df[df['odometer'].notnull()]['odometer'] / 100000, color='greenyellow') # Exclude missing values
plt.xlabel('Odometer (in lakh km)')
plt.title('Odometer Distribution (Before removing outliers)')
plt.show()
4.6.2.2 Outlier Removal and Confirmation¶
- We removed extreme odometer values using the IQR method to clean the dataset. The smallest and largest outliers are shown as examples to confirm the process and ensure valid data is retained.
- Also making sure that missing values don't affect the calculations.
# Exclude missing values for odometer for calculations
df_filtered = df[df['odometer'].notnull()]
# Calculate IQR for Odometer Outlier Detection
Q1_odometer = df_filtered['odometer'].quantile(0.25)
Q3_odometer = df_filtered['odometer'].quantile(0.75)
IQR_odometer = Q3_odometer - Q1_odometer
lower_bound_odometer = Q1_odometer - 1.5 * IQR_odometer
upper_bound_odometer = Q3_odometer + 1.5 * IQR_odometer
# Identify Odometer Outliers
odometer_outliers = df_filtered[(df_filtered['odometer'] < lower_bound_odometer) | (df_filtered['odometer'] > upper_bound_odometer)]
# Display Extreme End Odometer Outliers
extreme_low_odometer = odometer_outliers.nsmallest(1, 'odometer')[['price', 'odometer', 'mfg_year']]
extreme_high_odometer = odometer_outliers.nlargest(1, 'odometer')[['price', 'odometer', 'mfg_year']]
print("Extreme low odometer outlier:")
print(extreme_low_odometer)
print("\nExtreme high odometer outlier:")
print(extreme_high_odometer)
print("\nDataset size before odometer outlier removal:", df.shape[0])
# Remove Odometer Outliers and Retain Missing Values in the Original Dataset
df = df[(df['odometer'].isnull()) | ((df['odometer'] >= lower_bound_odometer) & (df['odometer'] <= upper_bound_odometer))]
# Print Results
print("\nDataset size after odometer outlier removal:", df.shape[0])
print("Number of odometer outliers removed:", odometer_outliers.shape[0])
Extreme low odometer outlier: price odometer mfg_year 132381 205460.0 277814.0 1992 Extreme high odometer outlier: price odometer mfg_year 9218 513660.0 10000000.0 1975 Dataset size before odometer outlier removal: 418703 Dataset size after odometer outlier removal: 414473 Number of odometer outliers removed: 4230
4.6.2.3 Validating outlier removal with violin plot¶
# Visualize the Violin Plot for Odometer (in lakh kilometers) excluding missing values
plt.figure(figsize=(12, 6))
sns.violinplot(x=df[df['odometer'].notnull()]['odometer'] / 100000, color='greenyellow') # Excluding missing values
plt.xlabel('Odometer (in lakh km)')
plt.title('Odometer Distribution (After outliers removal)')
plt.show()
4.7 Dropping Rows Based on Conditions¶
4.7.1 Dropping Rows With Missing, Irrelevant & Outdated mfg_year¶
In this step we'll focus on removing rows with outdated and future manufacturing years (mfg_year) that do not align with the project's goals of analyzing demand and pricing trends, ensuring a cleaner and more meaningful dataset.
4.7.1.1 Checking min and max of mfg_year¶
# Count occurrences of each year
year_counts = df['mfg_year'].value_counts().sort_index()
# Compute statistics
mean_year, median_year, std_year = df['mfg_year'].mean(), df['mfg_year'].median(), df['mfg_year'].std()
# Create figure
fig, ax1 = plt.subplots(figsize=(12,6))
# Bar chart for manufacturing year distribution
ax1.bar(year_counts.index, year_counts.values, color='skyblue', alpha=0.6, label="Car Count")
# Line plot for trend visualization
sns.lineplot(x=year_counts.index, y=year_counts.values, marker="o", color="royalblue", linewidth=2, ax=ax1)
# Add statistical markers (Mean, Median, Std Dev)
for val, color, label in [(mean_year, 'blue', 'Mean'), (median_year, 'red', 'Median')]:
ax1.axvline(val, color=color, linestyle='dashed', linewidth=2, label=f'{label}: {int(val)}')
ax1.axvline(mean_year + std_year, color='orange', linestyle='dashed', linewidth=2, label=f'Std Dev (+1σ)')
# Labels & Titles
ax1.set_title("Manufacturing Year Distribution", fontsize=14, fontweight='bold')
ax1.set_xlabel("Manufacturing Year")
ax1.set_ylabel("Number of Cars")
ax1.legend()
ax1.grid(axis="y", linestyle="--", alpha=0.5)
# Show chart
plt.show()
# Checking the minimum and maximum manufacturing year
min_year = df['mfg_year'].min()
max_year = df['mfg_year'].max()
# Displaying the results
print(f"The oldest car posted has mfg_year {min_year}")
print(f"The newest car posted has mfg_year {max_year}")
The oldest car posted has mfg_year 1900 The newest car posted has mfg_year 2022
4.7.1.2 Removing rows where mfg_year > posting_date¶
# Get the initial number of rows
initial_rows = df.shape[0]
# Remove rows where mfg_year is 2022
df = df[df['mfg_year'] != 2022]
# Get the updated number of rows
final_rows = df.shape[0]
# Calculate the total number of rows removed
rows_removed = initial_rows - final_rows
# Print the results
print(f"Total rows removed: {rows_removed}")
print(f"Updated DataFrame rows: {final_rows}")
Total rows removed: 1134 Updated DataFrame rows: 413339
Here we are removing future manufacturing years: every posting in this dataset is from 2021, so mfg_year 2022 is not possible.
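A more general guard (a sketch, assuming posting_date has already been converted to datetime as in 4.3.2) would compare each row's mfg_year with the year it was posted instead of hard-coding 2022:
# Hypothetical generalisation: drop rows whose mfg_year is after the posting year
posting_year = df['posting_date'].dt.year  # NaT becomes NaN
future_mfg = (df['mfg_year'] > posting_year).fillna(False)
df = df[~future_mfg]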
4.7.1.3 Assessing rows to drop: mfg_year < 2006 for relevant analysis¶
Assessing factors¶
- Market Relevance: Focuses exclusively on the core market (2006–2021)
- Regulatory Compliance: Fully aligned with Indian regulations for most cities (diesel ≤ 10 years, petrol ≤ 15 years). While a 5-year extension is possible, with the government's focus on promoting electric vehicles and reducing pollution, such extensions may stop being granted at any time.
- Business Goal Alignment: Highly aligned with Spinny’s focus on pricing and demand for in-demand, marketable cars.
- Insights for Pricing: Focused pricing insights for highly marketable and in-demand cars.
- Demand Analysis: Focuses entirely on core demand, which is Spinny’s primary objective.
- Data Quality: Clean, high-quality data from modern, marketable cars.
Why Not to Include Cars Before 2006? They contribute minimal value to the analysis as they represent outdated models with low demand and limited buyer interest. These vehicles often lack modern features and appeal, making them less competitive in today’s market. Including them may dilute insights, introduce noise in pricing optimization, and misalign with Spinny’s goal of identifying marketable cars and optimizing demand.
Impact on the dataset if we drop rows with mfg_year before 2006 & missing mfg_year¶
# Calculating the total number of rows in the dataset
total_rows = df.shape[0]
# Calculating the number of rows manufactured before 2006
before_2006 = df[df['mfg_year'] < 2006].shape[0]
# Calculating the number of rows manufactured in or after 2006
after_2006 = df[df['mfg_year'] >= 2006].shape[0]
# Counting the missing values in the 'mfg_year' column
missing_values = df['mfg_year'].isna().sum()
# Determining the number of rows remaining after removing 'before 2006' and 'missing'
remaining_rows = after_2006 # Rows from 2006 onwards (excluding missing values)
# Calculating percentages for each category
before_2006_pct = (before_2006 / total_rows) * 100
after_2006_pct = (after_2006 / total_rows) * 100
missing_values_pct = (missing_values / total_rows) * 100
# Printing the results
print(f"total rows in the dataset: {total_rows}")
print(f"percentage of rows manufactured before 2006: {before_2006_pct:.2f}%")
print(f"percentage of rows manufactured from 2006 onwards: {after_2006_pct:.2f}%")
print(f"percentage of missing values in 'mfg_year': {missing_values_pct:.2f}%")
print(f"rows remaining after removing 'before 2006' and 'missing': {remaining_rows}")
total rows in the dataset: 413339 percentage of rows manufactured before 2006: 15.13% percentage of rows manufactured from 2006 onwards: 84.87% percentage of missing values in 'mfg_year': 0.00% rows remaining after removing 'before 2006' and 'missing': 350793
Interpretation:
- Core Focus: After removing cars manufactured before 2006, 84.87% of rows remain, focusing on the core market (2006–2021) with modern, in-demand cars.
- Missing Values in mfg_year: The summary above shows 0.00% missing values in mfg_year at this stage (they were already removed by the earlier cleaning steps); any that remained would be dropped, as manufacturing year is a critical variable for demand and pricing analysis.
- Business Alignment: The refined dataset improves data quality and aligns with Spinny's goals of optimizing pricing, demand, and market trends effectively.
We decided to drop rows with mfg_year before 2006 because these cars are outdated, have low demand, and lack modern features that align with today's buyer preferences. Retaining only cars from 2006 onwards ensures a cleaner dataset, focusing on the core market (2006–2021) that represents 84.87% of the data. This decision improves data quality, aligns with Spinny's goals of optimizing pricing and demand for marketable cars, and ensures compliance with regulatory and market trends.¶
4.7.1.4 Dropping Rows: mfg_year < 2006 and missing values in mfg_year¶
# Step 1: Remove rows with missing values in 'mfg_year'
df = df[df['mfg_year'].notnull()]
# Step 2: Remove rows where 'mfg_year' is less than 2006
df = df[df['mfg_year'] >= 2006]
# Confirmation prints
print(f"Total rows remaining: {df.shape[0]}")
print(f"Minimum mfg_year in the dataset: {df['mfg_year'].min()}")
print(f"Rows with missing mfg_year: {df['mfg_year'].isna().sum()}")
Total rows remaining: 350793 Minimum mfg_year in the dataset: 2006 Rows with missing mfg_year: 0
# Count occurrences of each year
year_counts = df['mfg_year'].value_counts().sort_index()
# Compute statistics
mean_year, median_year, std_year = df['mfg_year'].mean(), df['mfg_year'].median(), df['mfg_year'].std()
# Create figure
fig, ax1 = plt.subplots(figsize=(12,6))
# Bar chart for manufacturing year distribution
ax1.bar(year_counts.index, year_counts.values, color='skyblue', alpha=0.6, label="Car Count")
# Line plot for trend visualization
sns.lineplot(x=year_counts.index, y=year_counts.values, marker="o", color="royalblue", linewidth=2, ax=ax1)
# Add statistical markers (Mean, Median, Std Dev)
for val, color, label in [(mean_year, 'blue', 'Mean'), (median_year, 'red', 'Median')]:
ax1.axvline(val, color=color, linestyle='dashed', linewidth=2, label=f'{label}: {int(val)}')
ax1.axvline(mean_year + std_year, color='orange', linestyle='dashed', linewidth=2, label=f'Std Dev (+1σ)')
# Labels & Titles
ax1.set_title("Manufacturing Year Distribution", fontsize=14, fontweight='bold')
ax1.set_xlabel("Manufacturing Year")
ax1.set_ylabel("Number of Cars")
ax1.legend()
ax1.grid(axis="y", linestyle="--", alpha=0.5)
# Show chart
plt.show()
4.7.2 Dropping Rows Where Price Is Zero¶
Why Dropping Rows with Price = 0 Makes Sense
Relevance to Revenue Growth: The objective explicitly focuses on revenue optimization. Entries with a price of 0 indicate that the car was either not sold or erroneously recorded. These entries cannot contribute to the revenue analysis or pricing models and hence are not useful for achieving the stated goal.
Demand and Pricing Optimization: A key focus is determining optimal pricing. Cars with a price of 0 cannot provide meaningful insights into pricing strategies because they lack valid price points to analyze trends, demand elasticity, or market competitiveness. Including such entries could skew any pricing analysis.
Data Cleaning and Preprocessing: One of the objectives is data cleaning and preprocessing. Handling missing or invalid data is a critical step in preparing the dataset for analysis. Since a price of 0 is not realistic in the context of car transactions, these rows represent anomalies or incomplete data and should be removed to ensure the quality and reliability of subsequent analyses.
Exploratory Data Analysis (EDA): Conducting EDA to uncover trends and patterns is another stated objective. Rows with price = 0 do not reflect valid transactions and would distort analyses of revenue, demand trends, and pricing distributions, leading to inaccurate insights.
Business Objectives and Goals Referenced
The decision to drop rows with price = 0 is directly tied to these stated objectives:
- Boost Revenue: Only transactions with valid prices contribute to understanding revenue growth.
- Optimize Pricing Strategies: Accurate pricing data is essential to identify trends, set competitive prices, and align with market demand.
- Understand Demand: Invalid price entries do not represent true demand patterns or buyer behavior and would compromise the demand analysis.
# Total number of rows in the dataset before cleaning
total_rows = df.shape[0]
# Counting rows where price is zero
rows_with_price_zero = df[df['price'] == 0].shape[0]
# Removing rows where price equals zero
# This modifies the original dataset
df = df[df['price'] != 0]
# Remaining rows after cleaning
remaining_rows = df.shape[0]
# Printing the results
print(f"Total rows in the dataset: {total_rows}")
print(f"Rows removed where price = 0: {rows_with_price_zero}")
print(f"Remaining rows after cleaning: {remaining_rows}")
Total rows in the dataset: 350793 Rows removed where price = 0: 32420 Remaining rows after cleaning: 318373
4.7.3 Dropping Rows Based On Missing Column Thresholds¶
Incomplete Analysis: Rows with more than 6 missing columns lack critical information needed for accurate analysis, such as demand trends, pricing strategies, and regional behavior. Missing data for key attributes (e.g., condition, fuel type, cylinders) limits the ability to identify factors influencing demand or optimize prices.
Distorted Insights: Retaining such rows introduces noise, leading to skewed patterns and unreliable insights during exploratory data analysis (EDA). For instance, demand or pricing trends could appear inaccurate due to incomplete data.
Impact on Statistical Models: Predictive models or hypothesis testing rely on complete data to generate accurate results. Rows with significant missing data can bias models, reduce performance, and lead to poor decisions.
Data Imputation Challenges: Filling or imputing values for so many missing columns is impractical and risks introducing assumptions that may not reflect real-world scenarios, leading to artificial patterns.
Focus on Usable Data: Dropping rows with excessive missing values ensures the dataset is focused on high-quality, reliable data. This aligns with the problem statement’s goal of generating actionable insights to optimize revenue and pricing strategies.
In summary, keeping rows with so many missing columns can compromise the quality, accuracy, and reliability of the analysis, making it more practical and beneficial to exclude them.
# Step 1: Counting the number of missing values for each row
missing_per_row = df.isnull().sum(axis=1) # Sum missing values across columns for each row
# Step 2: Counting the total rows for each unique count of missing columns
missing_summary = missing_per_row.value_counts().sort_index() # Count rows for each missing column count
# Step 3: Displaying results in the desired format
print("Row-wise missing values summary (number of missing columns per row)")
for num_missing, total_rows in missing_summary.items():
print(f"No of Missing Column: {num_missing:<3} Total Rows: {total_rows}")
Row-wise missing values summary (number of missing columns per row) No of Missing Column: 0 Total Rows: 85417 No of Missing Column: 1 Total Rows: 86510 No of Missing Column: 2 Total Rows: 66850 No of Missing Column: 3 Total Rows: 35933 No of Missing Column: 4 Total Rows: 14348 No of Missing Column: 5 Total Rows: 27017 No of Missing Column: 6 Total Rows: 1982 No of Missing Column: 7 Total Rows: 11 No of Missing Column: 8 Total Rows: 110 No of Missing Column: 9 Total Rows: 195
# Step 1: Identifying rows with more than 6 missing columns
rows_to_drop = df[df.isnull().sum(axis=1) > 6].index # Get indices of rows with > 6 missing columns
# Step 2: Dropping these rows from the dataframe
df = df.drop(index=rows_to_drop) # Drops rows from the original dataframe
# Step 3: Displaying confirmation
print(f"Rows dropped: {len(rows_to_drop)}")
print(f"Remaining rows in the dataset: {len(df)}")
Rows dropped: 316 Remaining rows in the dataset: 318057
4.7.4 Dropping Rows with Missing Values in Columns Having Less Than 2% Missing Data¶
# Calculating and summarising missing values
missing_data = df.isnull().sum().to_frame('missing values')
missing_data['missing %'] = (missing_data['missing values'] / len(df)) * 100
# Filtering and sorting columns with missing values
missing_data = missing_data[missing_data['missing values'] > 0].sort_values(by='missing %', ascending=False)
# Displaying the summary
print(missing_data)
missing values missing % cylinders 135686 42.660907 condition 124608 39.177883 drive 97081 30.523145 paint_color 91796 28.861493 type 61647 19.382375 manufacturer 8485 2.667761 title_status 6245 1.963485 model 2013 0.632905 fuel 1815 0.570652 odometer 1702 0.535124 transmission 1300 0.408732
Explanation:
- Calculate Missing Percentages: Calculate the percentage of missing values for all columns.
- Filter Columns: Identify columns where missing values are less than 2%.
- Drop Missing Rows: Remove rows with missing values only in those selected columns.
- Print Results: Display the impact of cleaning on the dataset.
# Dropping rows with missing values from columns with <2% missing values
original_rows = len(df)
columns_to_clean = df.isnull().mean()[df.isnull().mean() < 0.02].index
df = df.dropna(subset=columns_to_clean)
rows_after_cleaning = len(df)
# Printing the results
print(f"Original dataset rows: {original_rows}")
print(f"Rows after cleaning: {rows_after_cleaning}")
print(f"Total rows dropped: {original_rows - rows_after_cleaning}")
# Printing the missing data summary after cleaning with percentages
print("\nMissing data summary after cleaning:")
print(df.isnull().sum()[df.isnull().sum() > 0].apply(
lambda x: f"{x} ({(x / len(df)) * 100:.2f}%)"
))
Original dataset rows: 318057 Rows after cleaning: 305423 Total rows dropped: 12634 Missing data summary after cleaning: manufacturer 8319 (2.72%) condition 118875 (38.92%) cylinders 131377 (43.01%) drive 92149 (30.17%) type 60058 (19.66%) paint_color 84143 (27.55%) dtype: object
🔻 Resetting Index After Dropping Many Rows¶
# Reset the index after dropping rows
df = df.reset_index(drop=True)
# Confirm the index reset
print(df.index) # Display the new index
RangeIndex(start=0, stop=305423, step=1)
4.8 Missing Values Imputation¶
4.8.1 Reallocating missing values from "model" column ( one to many )¶
Some entries in the model column, like "toyota-fwd-sedan", combine values for manufacturer (toyota), drive (fwd), and type (sedan). This method extracts these values and places them in their correct columns, ensuring accurate data without changing the model column.
Unique Value Extraction: Builds a dictionary of valid entries for target columns (fuel, drive, size, type, manufacturer) to ensure accurate matching.
Logic: A row-by-row function checks the model column for matches against these unique values. If a match is found and the target column is empty, the value is copied to its correct column; the function then continues checking and assigns matches for the other columns as well.
Handling complex data entries: Matching and reallocating misplaced values.
Dynamic preprocessing: Can be used to automate tasks when assigning values from one column to multiple columns
# List of target columns
target_columns = ['manufacturer', 'fuel', 'transmission', 'drive', 'type', 'paint_color']
# Step 1: Build a dictionary of unique values for each column
unique_values = {col: df[col].dropna().unique().tolist() for col in target_columns}
# Step 2: Ensure the 'model' column is treated as a string
df['model'] = df['model'].astype(str)
# Step 3: Function to reallocate missing values
def reallocate_values(row):
model_data = row['model'] # Ensure it's a string
for col in target_columns:
for value in unique_values[col]:
# General substring matching for target columns
if isinstance(value, str) and value in model_data and pd.isna(row[col]):
row[col] = value
return row
# Calculate missing values before processing
missing_before = df[target_columns].isna().sum()
# Step 4: Apply the function row-wise
df = df.apply(reallocate_values, axis=1)
# Calculate missing values after processing
missing_after = df[target_columns].isna().sum()
# Combine results into a comparison DataFrame
comparison_df = pd.DataFrame({
'Missing Before': missing_before,
'Missing After': missing_after
})
# Display the comparison for easy review
comparison_df['Difference'] = comparison_df['Missing Before'] - comparison_df['Missing After']
print("Missing Value Comparison Before and After Reallocation:")
print(comparison_df)
Missing Value Comparison Before and After Reallocation: Missing Before Missing After Difference manufacturer 8319 7826 493 fuel 0 0 0 transmission 0 0 0 drive 92149 91723 426 type 60058 57245 2813 paint_color 84143 80715 3428
4.8.2 Reducing missing "manufacturer" values with confirmed "model" based lookup ( one to one )¶
Why Fill manufacturer Using model?
Each model uniquely belongs to a specific manufacturer ("Civic" → Honda), making it the most reliable and logical choice for imputation. Also, manufacturer is an important parameter for analyzing demand across locations.
Mapping Creation: Creates a dictionary (model_to_manufacturer) linking each model to its corresponding manufacturer based on non-missing rows.
Efficiency: Quickly retrieves the manufacturer for a given model using a dictionary lookup, avoiding complex computations.
Preservation: Updates only missing values in the manufacturer column, leaving existing non-missing values unchanged.
Flexibility: Handles missing or unknown model values gracefully by leaving manufacturer unchanged in such cases, making it robust for large datasets.
# Function for imputing missing 'manufacturer' values based on 'model'
def impute_manufacturer_by_model(df):
# Creating a dictionary to map 'model' to 'manufacturer' based on non-missing values
model_to_manufacturer = df.dropna(subset=['manufacturer', 'model']).set_index('model')['manufacturer'].to_dict()
# Imputing only missing values in 'manufacturer' using the mapping, leaving all other rows untouched
df['manufacturer'] = df['manufacturer'].fillna(df['model'].map(model_to_manufacturer))
# Returning the DataFrame with only the required changes
return df
# Checking missing values in 'manufacturer' before
print(f"Missing values in 'manufacturer' before imputing: {df['manufacturer'].isnull().sum()}")
# Applying the function to impute missing 'manufacturer'
df = impute_manufacturer_by_model(df)
# Checking missing values in 'manufacturer' after
print(f"Missing values in 'manufacturer' after imputing: {df['manufacturer'].isnull().sum()}")
Missing values in 'manufacturer' before imputing: 7826 Missing values in 'manufacturer' after imputing: 6828
4.8.3 Using a custom dictionary to further reduce missing manufacturer values ( one to one )¶
Objective: To fill missing values in the manufacturer column by using the model column, ensuring the data is complete for analysing car demand and pricing.
Why This is Important: The manufacturer column plays a key role in understanding which brands are in demand and how they affect pricing. Accurate manufacturer data helps Spinny make better decisions about inventory and pricing strategies.
How We Did It:
We manually created a dictionary linking car models to their manufacturers (e.g., 'camry' → 'toyota', 'civic' → 'honda').
Missing manufacturers were filled by checking the model column against this dictionary.
Business Impact: This step made the data more complete and reliable, helping Spinny gain valuable insights into brand-specific trends and make smarter pricing decisions to grow revenue.
# Dictionary mapping specific car models to manufacturers
models_to_manufacturer = {
'toyota': ['camry', 'corolla', 'rav4', 'tacoma', '4runner', 'highlander', 'land cruiser', 'prius'],
'honda': ['accord', 'civic', 'cr-v', 'pilot', 'odyssey', 'fit', 'hr-v'],
'ford': ['f-150', 'f-250', 'f-350', 'explorer', 'expedition', 'mustang', 'fusion', 'escape'],
'chevrolet': ['silverado', 'tahoe', 'suburban', 'impala', 'malibu', 'corvette', 'camaro'],
'nissan': ['altima', 'sentra', 'maxima', 'rogue', 'murano', 'pathfinder', 'xterra'],
'jeep': ['wrangler', 'cherokee', 'grand cherokee', 'renegade'],
'dodge': ['charger', 'challenger', 'durango', 'ram'],
'bmw': ['3 series', '5 series', '7 series', 'x5', 'x3'],
'mercedes-benz': ['c-class', 'e-class', 's-class', 'gla', 'glc-class', 'gle', 'benz s550', 'glk-class', 'ml350', 'clk 550'],
'subaru': ['outback', 'forester', 'impreza', 'legacy', 'wrx'],
'mazda': ['mazda3', 'mazda6', 'cx-5', 'cx-9', 'mx-5'],
'hyundai': ['elantra', 'sonata', 'tucson', 'santa fe'],
'kia': ['optima', 'sorento', 'sportage', 'soul'],
'volkswagen': ['jetta', 'passat', 'golf', 'tiguan'],
'lexus': ['rx', 'es', 'nx', 'gx', 'ls'],
'hummer': ['h1', 'h2', 'h3'],
'mini': ['cooper', 'clubman', 'countryman'],
'audi': ['a3', 'a4', 'a5', 'a6', 'q3', 'q5', 'q7'],
'scion': ['im hatchback 4d', 'xd hatchback 4d', 'fr-s coupe 2d', 'tc hatchback coupe 2d', 'xb base wagon', 'iq'],
'smart': ['fortwo passion hatchback', 'fortwo electric drive passion', 'fortwo pure'],
'maserati': ['levante', 'quattroporte', 'ghibli', 'granturismo'],
'suzuki': ['xl-7', 'reno', 'forenza', 'grand vitara', 'sx4 sport awd'],
'isuzu': ['npr hd', 'nrr box truck', 'npr crew'],
'freightliner': ['m2 106 medium duty', 'm-line walk-in van'],
}
# Function to fill the manufacturer column based on the model column
def fill_manufacturer(row):
# Check if model is valid and manufacturer is empty
model_value = row['model']
if pd.isnull(row['manufacturer']) and pd.notnull(model_value):
# Match the manufacturer using next() for efficiency
row['manufacturer'] = next(
(manufacturer for manufacturer, patterns in models_to_manufacturer.items()
if any(pattern in model_value for pattern in patterns)),
row['manufacturer'] # Default: keep the existing value (NaN) if no match is found
)
return row
# Checking missing values in 'manufacturer' before
print(f"Missing values in 'manufacturer' before imputing: {df['manufacturer'].isnull().sum()}")
# Applying the function row by row to fill the manufacturer column
df = df.apply(fill_manufacturer, axis=1)
# Checking missing values in 'manufacturer' after
print(f"Missing values in 'manufacturer' after imputing: {df['manufacturer'].isnull().sum()}")
Missing values in 'manufacturer' before imputing: 6828 Missing values in 'manufacturer' after imputing: 3491
4.8.4 Hierarchical Imputation of Missing Values in the 'Condition' Column using bfill and ffill (many to one)¶
This code ensures that the condition column is completely filled using a structured hierarchical imputation approach. By preserving data integrity and type consistency, it prepares the dataset for accurate demand analysis and pricing optimization. This preprocessing step aligns directly with Spinny's objectives of optimizing revenue through better data-driven insights.
Sorting for Logical Grouping:
Sorting by mfg_year and odometer organizes the dataset so that missing values can be imputed from rows with similar characteristics.
Hierarchical Filling:
- Step 2: Filling missing values within the most specific grouping (mfg_year, odometer, type) for maximum accuracy.
- Step 3: Falls back to a broader grouping (mfg_year, odometer) to fill gaps when finer grouping lacks data.
- Step 4: Broadens further to mfg_year for situations where previous groupings are insufficient.
- Step 5: Applies a global backfill and forward fill as a final fallback to ensure no missing values remain.
Type Consistency:
Ensuring that the filled column maintains consistent data types, which is crucial for downstream operations like visualization and modeling. Efficient Handling:
The use of explicit forward-fill (ffill) and backward-fill (bfill) ensures predictable and logical imputation without arbitrary assumptions.
# Function to fill missing values in the 'condition' column without modifying the original DataFrame
def fill_missing_condition(df):
# Set the pandas option to avoid FutureWarnings
pd.set_option('future.no_silent_downcasting', True)
# Create a copy of the DataFrame
df_copy = df.copy()
# Step 1: Sort data for logical filling
df_copy = df_copy.sort_values(by=['mfg_year', 'odometer'])
# Step 2: Fill within the most specific grouping (mfg_year, odometer, type)
# dropna=False keeps rows whose 'type' is missing in their own group, so their
# condition values are not blanked by the transform
df_copy['condition'] = (
df_copy.groupby(['mfg_year', 'odometer', 'type'], dropna=False)['condition']
.transform(lambda x: x.bfill().ffill())
)
# Step 3: Fallback to broader grouping (mfg_year, odometer)
df_copy['condition'] = (
df_copy.groupby(['mfg_year', 'odometer'])['condition']
.transform(lambda x: x.bfill().ffill()) # Use bfill and ffill explicitly
)
# Step 4: Fallback with mfg_year (broader group)
df_copy['condition'] = (
df_copy.groupby(['mfg_year'])['condition']
.transform(lambda x: x.bfill().ffill()) # Use bfill and ffill explicitly
)
# Step 5: Global bfill and ffill as the final fallback
df_copy['condition'] = df_copy['condition'].bfill().ffill()
# Extract the filled 'condition' column
filled_condition = df_copy['condition']
# Explicitly delete the temporary copy to free memory
del df_copy
# Reset the pandas option after processing
pd.reset_option('future.no_silent_downcasting')
# Return the filled 'condition' column
return filled_condition
# Call the function to fill missing values in the 'condition' column without modifying the original dataset
filled_condition = fill_missing_condition(df)
# Print the original and filled 'condition' columns for comparison
print("Original 'condition' column:")
print(df['condition'])
print("\nFilled 'condition' column:")
print(filled_condition)
Original 'condition' column: 0 good 1 good 2 good 3 good 4 excellent ... 305418 good 305419 good 305420 good 305421 good 305422 good Name: condition, Length: 305423, dtype: object Filled 'condition' column: 55215 good 79185 good 103123 good 103386 good 103974 good ... 69815 good 210694 good 254305 good 188408 excellent 142725 good Name: condition, Length: 305423, dtype: object
4.8.5 Filling All Remaining Missing Values In Categorical Columns With 'unknown'¶
# Calculate missing values count and percentage for each column
missing_info = pd.DataFrame({
'Missing Count': df.isnull().sum(),
'Missing Percentage (%)': (df.isnull().sum() / len(df)) * 100
})
# Display the result
print(missing_info)
Missing Count Missing Percentage (%) id 0 0.000000 price 0 0.000000 mfg_year 0 0.000000 manufacturer 3491 1.143005 model 0 0.000000 condition 118875 38.921430 cylinders 131377 43.014770 fuel 0 0.000000 odometer 0 0.000000 title_status 0 0.000000 transmission 0 0.000000 drive 91723 30.031465 type 57245 18.742858 paint_color 80715 26.427283 state 0 0.000000 posting_date 0 0.000000 latitude 0 0.000000 longitude 0 0.000000
Filling 'unknown' into all remaining missing values & printing confirmation
# List of columns to fill missing values with 'unknown'
columns_to_fill = ['paint_color', 'drive', 'condition', 'cylinders', 'type']
# Fill missing values with 'unknown' and apply the changes to the dataframe
df[columns_to_fill] = df[columns_to_fill].fillna('unknown')
# Verify that there are no missing values remaining in the specified columns
print("Number of remaining missing values in the dataset:", df[columns_to_fill].isnull().sum().sum())
Number of remaining missing values in the dataset: 0
4.9 Data Preparation For ML Models¶
4.9.1 Normalizing Price and Odometer Using Min-Max Method¶
Min-Max normalization rescales each feature value X to X' using:
X' = (X - min(X)) / (max(X) - min(X))
Where:
X: Original value
X': Normalized value
min(X): Minimum value of the feature
max(X): Maximum value of the feature
Why Min-Max Normalization?
- Preserves Relationships: Retains relative differences between data points without distortion.
- Machine Learning Compatibility: Essential for algorithms like KNN and gradient-based models, ensuring equal contribution of features.
- Interpretable Range: Scales values between [0, 1], making them easy to interpret.
Why Not Z-Score or Robust Normalization?
- Z-Score: Centers data around 0 with unit variance, suitable for Gaussian distributions but less effective for non-Gaussian data.
- Robust Scaling: Reduces the effect of outliers but does not produce a bounded range like Min-Max normalization.
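For contrast, here is a minimal sketch of the two alternatives above applied to the price column (illustrative only; these scaled series are not stored in the working DataFrame):
# Z-score standardisation: centres at 0 with unit variance, but the range is unbounded
price_zscore = (df['price'] - df['price'].mean()) / df['price'].std()
# Robust scaling: centres at the median and scales by the IQR, also unbounded
q1, q3 = df['price'].quantile([0.25, 0.75])
price_robust = (df['price'] - df['price'].median()) / (q3 - q1)
print(f"Z-score mean: {price_zscore.mean():.4f}, std: {price_zscore.std():.4f}")
print(f"Robust-scaled median: {price_robust.median():.4f}")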
Benefits of Min-Max Normalization
- Uniform Feature Contribution: Prevents large-magnitude features from dominating smaller ones.
- Maintains Proportions: Higher values remain higher after normalization.
- Improved Performance: Ensures faster convergence for gradient descent and better accuracy for distance-based models.
- Intuitive Interpretation: Outputs within [0, 1] are easy to analyze.
When to Avoid Min-Max Normalization
- Extreme Outliers: Can skew normalized values; robust scaling is preferred.
- Dynamic Data: Frequent recalculation may be required if min/max values change over time.
This method is ideal for non-Gaussian data without extreme outliers and when a bounded range is essential for model performance.
# Add new columns for Min-Max Normalisation
df['price_minmax'] = (df['price'] - df['price'].min()) / (df['price'].max() - df['price'].min())
df['odometer_minmax'] = (df['odometer'] - df['odometer'].min()) / (df['odometer'].max() - df['odometer'].min())
# Verify that only the new columns are added
print("Columns after adding normalised values:")
print(df.columns)
# Check the first few rows to confirm the new columns
print("First few rows with new columns:")
print(df[['price', 'price_minmax', 'odometer', 'odometer_minmax']].head())
Columns after adding normalised values: Index(['id', 'price', 'mfg_year', 'manufacturer', 'model', 'condition', 'cylinders', 'fuel', 'odometer', 'title_status', 'transmission', 'drive', 'type', 'paint_color', 'state', 'posting_date', 'latitude', 'longitude', 'price_minmax', 'odometer_minmax'], dtype='object') First few rows with new columns: price price_minmax odometer odometer_minmax 0 4313900.0 0.585802 57923.0 0.208574 1 2901280.0 0.393936 71229.0 0.256487 2 5084440.0 0.690459 19160.0 0.068993 3 3980090.0 0.540463 41124.0 0.148083 4 1926290.0 0.261510 128000.0 0.460912
4.9.2 Optimizing Object Columns: Converting to Categorical for Faster Processing¶
Why Convert Object Columns to Categorical?
Memory Efficiency:
- Object columns take more memory because they store strings as raw values.
- Converting to categorical reduces memory usage by storing unique values (categories) and mapping each row to an integer.
Faster Processing:
- Categorical data is numerically encoded internally, which is faster for processing in many machine learning libraries.
- Operations like grouping, filtering, or joining are more efficient on categorical data.
Improved Model Performance:
- Some machine learning models, like decision trees, gradient boosting, or random forests, can handle categorical data directly when encoded efficiently.
- Categorical conversion is a necessary step before encoding methods like label encoding or one-hot encoding.
# Convert object columns to categorical
for col in df.select_dtypes(include='object').columns:
df[col] = df[col].astype('category')
# Check memory usage after conversion
print("Memory usage after conversion:")
print(df.memory_usage(deep=True))
print(df.info())
Memory usage after conversion: Index 132 id 2443384 price 2443384 mfg_year 2443384 manufacturer 309549 model 2400803 condition 305972 cylinders 306263 fuel 305908 odometer 2443384 title_status 305977 transmission 305722 drive 305711 type 306794 paint_color 306468 state 308862 posting_date 2443384 latitude 2443384 longitude 2443384 price_minmax 2443384 odometer_minmax 2443384 dtype: int64 <class 'pandas.core.frame.DataFrame'> RangeIndex: 305423 entries, 0 to 305422 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 305423 non-null int64 1 price 305423 non-null float64 2 mfg_year 305423 non-null int64 3 manufacturer 301932 non-null category 4 model 305423 non-null category 5 condition 186548 non-null category 6 cylinders 174046 non-null category 7 fuel 305423 non-null category 8 odometer 305423 non-null float64 9 title_status 305423 non-null category 10 transmission 305423 non-null category 11 drive 213700 non-null category 12 type 248178 non-null category 13 paint_color 224708 non-null category 14 state 305423 non-null category 15 posting_date 305423 non-null datetime64[ns, UTC] 16 latitude 305423 non-null float64 17 longitude 305423 non-null float64 18 price_minmax 305423 non-null float64 19 odometer_minmax 305423 non-null float64 dtypes: category(11), datetime64[ns, UTC](1), float64(6), int64(2) memory usage: 25.1 MB None
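To illustrate the encoding step mentioned above, a minimal sketch using two of the converted columns (the choice of fuel and drive here is purely illustrative):
# Label encoding: each category maps to an integer code (missing values become -1)
fuel_codes = df['fuel'].cat.codes
# One-hot encoding: one indicator column per category
drive_dummies = pd.get_dummies(df['drive'], prefix='drive')
print(fuel_codes.head())
print(drive_dummies.head())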
4.10 Feature Engineering¶
4.10.1 cars_age: Added ( posting_date - mfg_year )¶
Purpose of the Code: The code calculates the car’s age (cars_age) by subtracting its manufacturing year (mfg_year) from the year of its posting_date. This provides a direct numeric measure of vehicle age, simplifying analysis.
Why Use cars_age Instead of mfg_year:
Direct Comparisons: cars_age standardizes vehicle age, making comparisons across posting years more intuitive.
Simplifies Trend Analysis: Age is easier to correlate with price depreciation, buyer preferences, and demand trends than mfg_year.
Better for Models: Numeric features like cars_age improve machine learning model accuracy and interpretation.
Efficiency: Precomputing age avoids repeated calculations and enhances workflow during analysis.
How This Helps in Analysis:
- Identifies price vs. age relationships and regional trends.
- Supports clear visualizations and comparisons.
Using cars_age simplifies workflows and aligns analysis with practical decision-making.
# Calculate cars_age and store it as a new column
df['cars_age'] = df['posting_date'].dt.year - df['mfg_year']
# Print the three columns in one row for each record
print(df[['posting_date', 'mfg_year', 'cars_age']])
posting_date mfg_year cars_age 0 2021-05-04 17:31:18+00:00 2014 7 1 2021-05-04 17:31:08+00:00 2010 11 2 2021-05-04 17:31:25+00:00 2020 1 3 2021-05-04 15:41:31+00:00 2017 4 4 2021-05-03 19:02:03+00:00 2013 8 ... ... ... ... 305418 2021-04-04 09:21:31+00:00 2019 2 305419 2021-04-04 09:21:29+00:00 2020 1 305420 2021-04-04 09:21:17+00:00 2020 1 305421 2021-04-04 09:21:11+00:00 2018 3 305422 2021-04-04 09:21:07+00:00 2019 2 [305423 rows x 3 columns]
4.10.2 cylinders_count: Converting Cylinders To Numeric Value As Int64¶
Purpose of the Code: This code creates a new column, cylinders_count, by extracting the numeric part of the cylinders column while preserving the original. Invalid or missing entries become NaN, ensuring the new column is clean and ready for analysis.
How This Helps in Analysis:
Facilitates Comparisons: Enables consistent comparisons of engine sizes across different car types or manufacturers.
Supports Exploratory Analysis: Allows analysis of trends and correlations with other variables (e.g., price, odometer).
Enhances Data Reliability: Ensures accurate calculations and interpretations by excluding invalid or inconsistent data entries.
Enables Flexibility: Having both the original and processed columns provides flexibility in choosing the appropriate format for different analytical tasks.
# Extract integers from the 'cylinders' column and create a new column named 'cylinders_count'
df['cylinders_count'] = df['cylinders'].astype(str).str.extract(r'(\d+)').astype('Int64')
# Confirmation print as a DataFrame with both columns
print(df[['cylinders', 'cylinders_count']].head(5))
cylinders cylinders_count 0 8 cylinders 8 1 8 cylinders 8 2 8 cylinders 8 3 8 cylinders 8 4 6 cylinders 6
4.10.3 state_income: Created a new categorical column grouping states by income level¶
Source of the data for categorisation: StudyIQ
Purpose of this column
- Analyse Regional Demand: Understand how state income levels influence car demand.
- Price Sensitivity & Market Segmentation: Adjust pricing strategies based on purchasing power.
- Optimise Revenue Strategy: Tailor pricing and marketing to different income categories.
- Enhance Model Performance: Use regional economic factors to improve demand forecasting.
- Align with Business Goals: Support Spinny’s objective of revenue growth through demand-based pricing.
# Dictionary mapping income categories to a list of states
state_income_category = {
'high': ['goa', 'sikkim', 'delhi', 'chandigarh', 'haryana', 'telangana',
'karnataka', 'gujarat', 'tamil nadu', 'puducherry'],
'medium': ['andaman and nicobar islands', 'kerala', 'arunachal pradesh', 'mizoram',
'uttarakhand', 'maharashtra', 'himachal pradesh', 'andhra pradesh',
'dadra and nagar haveli and daman and diu', 'ladakh'],
'low': ['punjab', 'jammu and kashmir', 'tripura', 'rajasthan', 'west bengal',
'chhattisgarh', 'madhya pradesh', 'odisha'],
'very low': ['assam', 'meghalaya', 'manipur', 'jharkhand', 'uttar pradesh', 'bihar']
}
# Reverse mapping: Assign each state its respective income category
state_income_mapping = {state: category for category, states in state_income_category.items() for state in states}
# Create a new column 'state_income' while keeping 'state' unchanged
df['state_income'] = df['state'].str.lower().map(state_income_mapping)
# Ensure state_income values are in lowercase
df['state_income'] = df['state_income'].str.lower()
# Display the columns to confirm the new column was added
df.columns
Index(['id', 'price', 'mfg_year', 'manufacturer', 'model', 'condition', 'cylinders', 'fuel', 'odometer', 'title_status', 'transmission', 'drive', 'type', 'paint_color', 'state', 'posting_date', 'latitude', 'longitude', 'price_minmax', 'odometer_minmax', 'cars_age', 'cylinders_count', 'state_income'], dtype='object')
Correlation Heatmap of cars_age, price, odometer & cylinders_count¶
What is Correlation Analysis?
Correlation analysis measures the relationship between two or more variables, showing how changes in one variable affect another.
Why is it Important?
- Identifies key relationships to understand how features like price, car age, and odometer reading are connected.
- Supports pricing strategy by showing how factors like car age or mileage impact resale price.
- Guides feature selection by helping choose relevant variables for predictive models.
- Detects multicollinearity to avoid highly correlated features that may distort machine learning models.
# Select the relevant numeric columns for correlation
columns_of_interest = ['cars_age', 'price', 'odometer', 'cylinders_count']
selected_df = df[columns_of_interest]
# Calculate the correlation matrix
correlation_matrix = selected_df.corr()
# Global setting for figure size (optional)
sns.set(rc={'figure.figsize': (14, 6)})
# Plot the heatmap
plt.figure(figsize=(16, 5))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', cbar=True, square=False)
plt.title("Correlation Heatmap")
plt.show()
cars_age and price: Strong negative correlation (-0.59): Older cars tend to have lower prices.
odometer and price: Moderate negative correlation (-0.51): Higher mileage reduces the resale value of cars.
cars_age and odometer: Strong positive correlation (0.69): Older cars generally have higher mileage.
cylinders_count and price: Moderate positive correlation (0.45): Cars with more cylinders are often priced higher, reflecting engine capacity and performance.
Distributions Overview of Price, Odometer, Car Age, & Cylinders Count¶
plt.figure(figsize=(14, 5))
for col, color, scale in zip(
['price', 'odometer', 'cars_age', 'cylinders_count'],
['blue', 'purple', 'green', 'orange'],
[1e6, 1e5, 1, 1]
):
sns.kdeplot(df[col] / scale if scale != 1 else df[col],
label=col.replace("_", " ").title(),
fill=True, alpha=0.3, color=color, linewidth=3)
plt.title('Density Plot Price (in million), odometer (in lakh km), cars_age & cylinders')
plt.xlabel('Values')
plt.ylabel('Density')
plt.xlim(0, 16)
plt.legend(title='Features', loc='upper right')
plt.show()
Price: Prices range from ₹0 to ₹80 million, with the highest demand observed between ₹0.5 to ₹1.5 million, while price ranges from ₹4.5 to ₹8 million have the lowest demand.
Odometer: Odometer readings range from 0 to 3 lakh km, with most cars falling within 0 to 1 lakh km, and cars with 2 to 3 lakh km readings experiencing the lowest demand.
Car Age: Car ages range from 0 to 16 years, with the highest registrations observed for cars aged 3 to 9 years, and the lowest registrations for newer cars aged 0 to 1 year.
Cylinders: Cars have 3 to 12 cylinders, with the highest registrations for cars with 4, 6, and 8 cylinders, while cars with 3, 5, 10, and 12 cylinders have the lowest or negligible registrations.
5.1 UNIVARIATE ANALYSIS¶
Univariate analysis examines a single variable's distribution, central tendency, and dispersion to understand its characteristics. This process helps detect patterns, identify anomalies, and assess the presence of missing values. Additionally, it reveals dominant categories or frequent occurrences within the data, providing essential insights for subsequent analyses.
5.1.1 Cars with odometer readings between 0 to 0.5 lakh KM dominate the dataset.¶
Reason: Lower mileage is typically associated with less wear and tear, better performance, and higher resale value, making these cars more appealing to buyers
odo_lakh = df['odometer'] / 1e5
print({k: f"{v:,.2f}" for k, v in {
"Mean": odo_lakh.mean(), "Median": odo_lakh.median(),
"Var": odo_lakh.var(), "Std Dev": odo_lakh.std()}.items()})
plt.figure(figsize=(14,5))
sns.histplot(odo_lakh, bins=30, kde=True, color='indigo', alpha=0.6)
plt.title("Odometer Distribution (Lakh KM)"); plt.xlabel("Lakh KM")
plt.ylabel("Frequency"); plt.show()
{'Mean': '0.84', 'Median': '0.79', 'Var': '0.32', 'Std Dev': '0.57'}
Odometer Distribution:
- Most cars have odometer readings between 0.5 lakh KM and 1 lakh KM, indicating a preference for low-mileage vehicles.
- The distribution gradually decreases beyond 1 lakh KM, with very few cars having readings above 2 lakh KM
Skewness:
- The odometer distribution is right-skewed, showing that higher mileage cars are less common or less preferred by buyers.
Outliers:
- There is a small number of cars with odometer readings exceeding 2 lakh KM, indicating outliers or niche inventory segments.
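The skew and outlier observations above can be checked numerically; a small sketch using pandas' built-in skewness estimator:
# Quantify the right skew and the share of high-mileage outliers
odo_lakh = df['odometer'] / 1e5
print(f"Skewness: {odo_lakh.skew():.2f}") # a positive value confirms the right skew
print(f"Share of cars above 2 lakh KM: {(odo_lakh > 2).mean() * 100:.2f}%")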
Result:
Analysis confirms that cars with odometer readings between 0 to 0.5 lakh KM constitute the largest segment in the dataset. This aligns with the hypothesis that buyers prioritise low-mileage vehicles for reliability and long-term use.
📌 Insight: These observations suggest focusing on cars with odometer readings under 1 lakh KM, as they dominate the inventory. For higher-mileage cars (beyond 1 lakh KM), targeted strategies may be needed to address lower demand.
5.1.2 Cars aged between 3 to 8 years dominate the dataset.¶
Reason: Cars in the 3 to 10-year range are typically less expensive than newer models while still being reliable and equipped with relatively modern features. Buyers often prioritise such vehicles for their reduced depreciation and reasonable maintenance costs.
# Count occurrences of each manufacturing year
mfg_year_counts = df['mfg_year'].value_counts().sort_index()
# Convert counts to percentage
total_cars = mfg_year_counts.sum()
mfg_year_percent = (mfg_year_counts / total_cars) * 100 # Convert to percentage
# Plot bar chart with correct `hue` to remove warning
plt.figure(figsize=(14,5))
ax = sns.barplot(x=mfg_year_counts.index, y=mfg_year_counts.values, hue=mfg_year_counts.index, palette="icefire")
ax.legend_.remove() # Remove legend since hue is only for color
# Add percentage annotations on top of bars
for p, percent in zip(ax.patches, mfg_year_percent):
ax.annotate(f'{percent:.1f}%',
(p.get_x() + p.get_width() / 2, p.get_height()),
ha='center', va='bottom', fontsize=10, fontweight='bold')
# Titles and labels
plt.title("Mfg Year Distribution & % contribution", fontsize=14, fontweight='bold')
plt.xlabel("Manufacturing Year"); plt.ylabel("Car Count")
plt.xticks(rotation=45)
# Set proper y-axis ticks dynamically
ax.set_yticks(range(0, max(mfg_year_counts.values) + 5000, 5000))
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.show()
Observations:
- Recent Models: Cars from 2013 to 2018 dominate the inventory, peaking around 2017-2018, showing a preference for vehicles aged 3-8 years at the time of posting. The highest count is for 2018 models, indicating strong demand or availability.
- Newer Models: Cars from 2020 and 2021 are limited in inventory, possibly reflecting supply constraints or lower demand for nearly new vehicles.
- Older Models: Cars manufactured before 2010 represent a smaller share, indicating reduced demand or availability for vehicles older than 11 years.
- Distribution: A bell-shaped distribution with a rise from 2006 to 2018, peaking in later years, followed by a decline from 2019 onwards.
Mean Year: 2013.95 (approximately 2014).
Median Year: 2014.
Standard Deviation: 3.93 (indicating most cars are within a 4-year range of the mean).
Variance: 15.47 (moderate spread in manufacturing years).
Skewness: The distribution is slightly left-skewed, with older models (pre-2010) having limited representation
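A one-line sketch of how these summary statistics can be reproduced from the mfg_year column:
# Reproduce the mean, median, spread and skew quoted above
print(df['mfg_year'].agg(['mean', 'median', 'std', 'var', 'skew']).round(2))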
Result:
Analysis confirms that cars aged between 3 to 10 years form the largest segment in the dataset, validating the hypothesis. This trend highlights the strong demand for mid-age vehicles due to their affordability and perceived value.
📌 Insights:
- Focus on 2013-2018 Models: Prioritise inventory and marketing for these models as they dominate demand.
- Expand Newer Models: Investigate and address low representation of 2020 and 2021 cars if there is unmet demand.
- Clearance Strategies for Older Models: Promote pre-2010 cars with discounts or added value to move low-demand inventory.
- 2019 and Beyond Decline: Likely tied to pandemic disruptions in production/sales, worth further exploration.
5.1.3 Cars with 4-cylinder and 6-cylinder configurations dominate the inventory.¶
Reason: 4-cylinder engines are typically found in compact and budget-friendly cars, while 6-cylinder engines cater to mid-size and performance-oriented buyers. These configurations are widely preferred due to their balance between power, efficiency, and affordability. Higher cylinder counts, like 8, 10, or 12, target niche markets, such as luxury or performance enthusiasts.
cyl_count = df['cylinders_count'].value_counts().sort_index()
plt.figure(figsize=(14,5))
sns.barplot(x=cyl_count.index, y=cyl_count.values, hue=cyl_count.index, palette="Set3", legend=False)
plt.title("Cylinders Count Distribution"); plt.xlabel("Cylinders"); plt.ylabel("Count")
for i, v in enumerate(cyl_count.values):
plt.text(i, v + 0.5, str(v), ha='center', fontsize=10)
plt.show()
Result:
The analysis confirms that:
- 4-cylinder cars (60,254) and 6-cylinder cars (67,835) make up the majority of the inventory.
- 8-cylinder cars (42,952) cater to a smaller but notable segment, likely targeting performance buyers.
- Rare configurations (3, 5, 10, and 12 cylinders) have minimal representation, indicating niche demand.
📌Insight: Focus on marketing and inventory for 4-cylinder and 6-cylinder cars to capture mainstream demand, while using targeted promotions for higher-cylinder cars to address niche segments and optimise inventory turnover
5.1.4 4WD and FWD drive types dominate the inventory.¶
Reason:
- 4WD vehicles are highly versatile, offering better performance in rugged, off-road, or adverse weather conditions. This makes them popular in regions requiring robust capabilities.
- FWD cars are known for their affordability, fuel efficiency, and reliability, making them ideal for urban buyers prioritising cost-effective and practical vehicles.
- RWD cars, while offering better performance and handling, are typically associated with premium or niche segments, reducing their overall market share.
5.1.5 Automatic transmission cars dominate the market.¶
Reason:
- Automatic transmissions are easier to operate and more comfortable, especially in congested urban areas or for less experienced drivers, driving their popularity.
- Manual transmissions are often cheaper and provide more control over vehicle performance, appealing to budget-conscious buyers or driving enthusiasts. Other transmission types have limited market appeal due to specialised use cases or higher costs.
# Count occurrences of each unique value, excluding 'unknown'
drive_counts = df['drive'].value_counts()
drive_counts_filtered = drive_counts[drive_counts.index != 'unknown']
transmission_counts = df['transmission'].value_counts()
transmission_counts_filtered = transmission_counts[transmission_counts.index != 'unknown']
# Define color palettes for better visualization
palette_drive = sns.color_palette("Set2", len(drive_counts_filtered))
palette_transmission = sns.color_palette("Set3_r", len(transmission_counts_filtered))
# Create side-by-side donut charts
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
# Drive Type Donut Chart
axes[0].pie(drive_counts_filtered, labels=drive_counts_filtered.index, autopct='%1.1f%%',
startangle=140, colors=palette_drive, wedgeprops={'edgecolor': 'white'})
axes[0].add_artist(plt.Circle((0, 0), 0.70, fc='white')) # Create the donut effect
axes[0].set_title("Drive Type Distribution")
# Transmission Type Donut Chart
axes[1].pie(transmission_counts_filtered, labels=transmission_counts_filtered.index, autopct='%1.1f%%',
startangle=140, colors=palette_transmission, wedgeprops={'edgecolor': 'white'})
axes[1].add_artist(plt.Circle((0, 0), 0.70, fc='white')) # Create the donut effect
axes[1].set_title("Transmission Type Distribution")
# Show the plot
plt.tight_layout()
plt.show()
# Free up memory by deleting unnecessary variables
del drive_counts, drive_counts_filtered, transmission_counts, transmission_counts_filtered
Observations:
- 4WD: 43.6%
- FWD: 39.2%
- RWD: 17.1%
Result: The analysis confirms that 4WD and FWD cars dominate the inventory, catering to the majority of buyer preferences.
📌Insight:
- Focus inventory management and marketing strategies on 4WD and FWD vehicles to maximise revenue.
- Target niche marketing for RWD cars to reach performance-oriented buyers or luxury segments.
Observations:
- Automatic: 77%
- Manual: 19.3%
- Other: 3.7%
Result: The analysis confirms the dominance of automatic transmissions, with manual transmissions forming a secondary but relevant market segment.
📌Insight:
- Prioritise procurement and pricing strategies around automatic vehicles to cater to the dominant demand.
- Use specialised promotions for manual transmission cars to appeal to specific buyer demographics, such as cost-sensitive buyers or driving enthusiasts.
5.1.6 Gasoline (gas) vehicles dominate the market.¶
Reason:
- Gasoline vehicles: These are widely preferred due to their affordability, availability, and lower initial cost compared to diesel or alternative fuel vehicles.
- Diesel vehicles: Typically cater to buyers seeking better fuel efficiency for high-mileage usage or rural regions where diesel is more practical.
- Electric and hybrid vehicles: Represent an emerging trend but face limited adoption due to high costs, lack of infrastructure, and slower consumer shift towards alternative fuel technologies.
# Count values
fuel_counts = df['fuel'].value_counts()
# Adjust x-positions with proper spacing
x_positions = [i * 5 for i in range(len(fuel_counts))] # Increased horizontal spacing
# Plot bubble chart
plt.figure(figsize=(14, 5)) # Increase figure size to accommodate full circles
plt.scatter(x_positions, [0] * len(fuel_counts), s=fuel_counts.values * 0.3, c=plt.cm.Set3(range(len(fuel_counts))), alpha=0.9)
# Add labels with count values
for x, (label, count) in zip(x_positions, zip(fuel_counts.index, fuel_counts.values)):
plt.text(x, 0, f"{label}\n{count}", ha='center', va='center', fontsize=10, fontweight='bold', color='black')
# Adjust axis limits to ensure full circles are visible
plt.xlim(min(x_positions) - 5, max(x_positions) + 5)
plt.ylim(-100, 100) # Increase vertical space for larger circles
# Final touches
plt.axis('off')
plt.title("Fuel Type Distribution", fontsize=16)
plt.tight_layout()
plt.show()
Observations:
- Gas vehicles dominate the dataset, accounting for the majority of entries. Diesel vehicles show moderate representation, suggesting specific demand segments.
- Electric and hybrid vehicles have minimal presence, reflecting low market penetration.
- Other fuel types contribute a small but notable portion, catering to niche requirements.
Result: The analysis confirms that gas vehicles dominate the fuel type distribution, while diesel vehicles occupy a smaller but significant segment. Electric and hybrid vehicles remain niche due to limited adoption.
📌Insight: Concentrate pricing and inventory management on gas vehicles to align with the dominant demand. Monitor growth in the electric and hybrid segments to identify emerging opportunities in alternative fuels.
5.1.7 Cars in good and excellent condition dominate the market.¶
Reason: Good and excellent condition cars offer a balance of quality and affordability, making them the most appealing to buyers. Other conditions cater to specific needs or budgets, limiting their demand.
# Count occurrences of each unique value in the 'condition' column (case insensitive)
condition_counts = df['condition'].str.lower().value_counts()
# Exclude 'unknown' from the counts for plotting purposes only
condition_counts_filtered = condition_counts[condition_counts.index != 'unknown']
# Plot the distribution
plt.figure(figsize=(14, 5))
sns.barplot(x=condition_counts_filtered.index, y=condition_counts_filtered.values, hue=condition_counts_filtered.index, palette="Set3", dodge=False, legend=False)
plt.title("Condition Distribution")
plt.xlabel("Condition")
plt.ylabel("Count")
plt.show()
# Free up memory by deleting unnecessary variables
del condition_counts, condition_counts_filtered
Observation:
- Good and excellent condition cars dominate, reflecting strong demand.
- Like new vehicles cater to a moderate, quality-focused segment.
- New cars have minimal listings, likely from rare resale cases.
- Fair and salvage cars show limited demand or supply.
Result: The analysis confirms that cars in good and excellent condition dominate the dataset, while other conditions represent smaller niche segments.
📌Insights: Focus on inventory and pricing for good and excellent condition cars to capture majority demand, while exploring niche opportunities for other conditions like new and salvage cars.
5.1.8 Mid-range to premium models dominate the dataset, reflecting their popularity due to brand reliability and affordability in the second-hand car market¶
Reason: People prefer branded second-hand cars as they are more affordable than new ones, and consistent upgrades by affluent buyers lead to a steady supply of popular models like the F-150 and Silverado 1500 in the used car market.
color_palette, figsize = "Set3", (14, 5) # Adjust color & chart size
top_models = df['model'].value_counts().nlargest(10)
colors = plt.get_cmap(color_palette)(np.linspace(0, 1, 10))
plt.figure(figsize=figsize)
plt.barh(top_models.index, top_models.values, color=colors)
plt.gca().invert_yaxis()
plt.xlabel("Count"), plt.ylabel("Model"), plt.title("Top 10 Most Repeated Car Models")
plt.show()
Result: The F-150 dominates the list, followed by Silverado 1500 and 1500, while models like Escape, Altima, and Camry show moderate demand.
📌Insight: Focus on acquiring and marketing branded second-hand cars like the F-150 and Silverado 1500, as their popularity and steady supply from affluent sellers present a significant opportunity to generate high revenue and meet market demand efficiently.
5.1.9 Mid-range and premium manufacturers dominate the dataset¶
Reason: Popular manufacturers like Ford and Chevrolet are known for offering reliable and well-performing models at competitive prices, making them highly preferred in the second-hand car market.
df['manufacturer'].value_counts().nlargest(10) \
.plot.barh(figsize=(14,5), width=0.6, color=plt.cm.cubehelix(np.linspace(0,1,10))) \
.invert_yaxis()
plt.show()
Result: Ford has the highest representation, followed by Chevrolet and Toyota, while brands like BMW and GMC cater to niche or premium segments.
📌Insight: Prioritise acquiring and marketing models from Ford, Chevrolet, and Toyota to generate high revenue, while exploring premium brands like BMW to cater to niche buyer segments.
5.1.10 Sedans and SUVs dominate the market due to their widespread appeal¶
Reason: Sedans and SUVs offer versatility and affordability, appealing to a wide range of buyers, while other types cater to specific needs or niche preferences.
# Filter the data to exclude 'unknown' and sort in ascending order without modifying the original DataFrame
filtered_types = df['type'].str.lower().value_counts()
filtered_types = filtered_types[filtered_types.index != 'unknown'].sort_values()
# Plot the distribution
plt.figure(figsize=(14, 5))
sns.barplot(x=filtered_types.index, y=filtered_types.values, hue=filtered_types.index, palette="Set3", dodge=False, legend=False)
plt.title("Car Type Distribution ")
plt.xlabel("Car Type")
plt.ylabel("Count")
plt.show()
Observation: Sedans dominate the market, followed by SUVs. Pickup trucks and hatchbacks show moderate demand. Other types, like off-road vehicles and minivans, cater to niche markets.
Result:
Analysis confirms that sedans and SUVs dominate the market, while pickups, hatchbacks, and niche vehicles like off-road cars and minivans cater to smaller, specific buyer segments.
📌Insight: Prioritise inventory for sedans and SUVs to capture majority demand, and explore niche segments for potential growth opportunities.
5.1.11 White and black cars dominate the market, as they are widely preferred for their neutral appeal and resale value.¶
Reason: Neutral colours like white and black are versatile, timeless, and maintain high resale value, making them the most popular choices.
# Filter the data to exclude 'unknown' and sort in ascending order without modifying the original DataFrame
filtered_color = df['paint_color'].str.lower().value_counts()
filtered_color = filtered_color[filtered_color.index != 'unknown'].sort_values()
# Plot the distribution
plt.figure(figsize=(14, 5))
sns.barplot(x=filtered_color.index, y=filtered_color.values, hue=filtered_color.index, palette="Set3", dodge=False, legend=False)
plt.title("Paint Color Distribution (Ascending Order)")
plt.xlabel("paint_color")
plt.ylabel("Count")
plt.show()
# Free up memory by deleting unnecessary variables
del filtered_color
Observations:
- White cars have the highest count, followed by black cars.
- Silver, red, and blue are moderately popular. Colours like purple, yellow, and orange are the least preferred.
Result: White and black cars dominate the dataset, while unique colours cater to niche markets.
📌Insight: Focus on inventory for white and black cars to meet majority demand, while leveraging niche colours for targeted campaigns.
5.1.12 Weekly post counts are growing rapidly, reflecting Spinny's expanding presence in the pre-owned car market¶
# Group by week and count posts without creating a new column
weekly_posts = df.groupby(df['posting_date'].dt.isocalendar().week).size()
# Plot with annotations
plt.figure(figsize=(14, 5))
plt.plot(weekly_posts.index, weekly_posts.values, marker='o')
for week, total_posts in zip(weekly_posts.index, weekly_posts.values):
plt.text(week, total_posts + max(weekly_posts) * 0.02, # Offset for visibility
f"{total_posts//1000}K", ha='center', fontsize=9)
plt.xticks(weekly_posts.index, [f"W{w}" for w in weekly_posts.index])
plt.xlabel('Week [April - May]')
plt.ylabel('Total Posts')
plt.title('Weekly Post Counts [2021]')
plt.grid(True)
plt.tight_layout()
plt.show()
Observation:
- Weekly posts increased from 2K in Week 13 to a peak of 102K in Week 17, showcasing Spinny's growth in car sourcing and listing activities.
- The drop to 46K in Week 18 indicates potential stabilisation or reduced inventory additions during that period.
Observation: States such as Maharashtra, Uttar Pradesh, and Tamil Nadu, which have significant urban populations, show the highest post counts, indicating the influence of larger cities and economic hubs.
📌Insight: Focus on marketing efforts in metro states to drive engagement and increase post counts.
5.1.13 Majority of the cars are priced below ₹3 million¶
Reason: In a price-sensitive market like India, buyers typically opt for affordable cars within a budget-friendly range, contributing to a higher concentration of transactions at lower price points. Luxury or premium cars, which are priced higher, cater to a smaller niche segment, resulting in a skewed distribution
# Work on a copy to preserve df
df_copy = df[['price']].dropna().copy()
df_copy['price'] = pd.to_numeric(df_copy['price']) / 1e6 # Convert price to millions
# Compute histogram data & statistics
hist_values, bin_edges = np.histogram(df_copy['price'], bins=30)
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
mean_price, median_price, std_price = df_copy['price'].agg(['mean', 'median', 'std'])
# Plot histogram-like area chart
plt.figure(figsize=(14,6))
plt.fill_between(bin_centers, hist_values, color='darkgreen', alpha=0.6, step="mid")
# Add statistical markers
for val, color, label in [(mean_price, 'blue', 'Mean'), (median_price, 'red', 'Median'),
(mean_price + std_price, 'orange', 'Std Dev (+1σ)')]:
plt.axvline(val, color=color, linestyle='dashed', linewidth=2, label=f'{label}: {val:.2f}M')
# Titles, labels, and grid
plt.title("Price Distribution of Pre-Owned Cars (Million ₹)", fontsize=14, fontweight='bold')
plt.xlabel("Car Price (Million ₹)"); plt.ylabel("Frequency")
plt.legend(); plt.grid(axis="y", linestyle="--", alpha=0.5)
plt.show()
# Free memory
del df_copy, hist_values, bin_edges, bin_centers, mean_price, median_price, std_price, val, color, label
Observation:
- The chart highlights that most cars are priced between ₹0.5 million and ₹3 million, with diminishing registrations as the price increases beyond ₹4 million.
- The price distribution is right-skewed, reflecting limited demand for higher-priced cars.
- Cars priced above ₹4.5 million have the lowest registrations, while cars below this threshold account for the majority of transactions.
Result:
Analysis confirms that the price distribution is right-skewed, with most cars priced below ₹3 million. This trend highlights the dominance of budget-friendly cars in the dataset, aligning with the typical demand pattern in cost-sensitive markets
📌Insight: These insights reinforce the importance of focusing on inventory and pricing strategies within the ₹0.5 million to ₹3 million range.
5.2 HYPOTHESIS TESTING¶
HYPOTHESIS 1 - Regional Demand Variation¶
H₀: Demand for pre-owned cars does not significantly differ across states.
H₁: There is a significant difference in demand for pre-owned cars across states.
Reason: The hypothesis tests whether demand for pre-owned cars is evenly distributed across different states or if certain states exhibit significantly higher demand than others. The reasoning behind this hypothesis includes:
Reason for Analysing ID Count (Transaction Volume per State)
The ID count represents the number of completed transactions, which is a direct indicator of demand for pre-owned cars in each state. Analysing it reveals which states have the highest and lowest demand, helping businesses like Spinny focus inventory, marketing, and pricing decisions where they matter most.
# Aggregate the data by state to count transactions (ID count per state)
state_transaction_data = df.groupby("state", observed=True)["id"].count().reset_index()
state_transaction_data.columns = ["State", "Transaction_Count"]
# Optional: Use palette with hue explicitly to remove warnings (Alternative version)
state_transaction_data["Hue_Group"] = state_transaction_data["State"]
plt.figure(figsize=(14, 8))
sns.barplot(x="State", y="Transaction_Count", hue="Hue_Group", data=state_transaction_data, palette="ocean_r", legend=False)
plt.xticks(rotation=90, fontsize=10)
plt.xlabel("State", fontsize=12)
plt.ylabel("Number of Transactions (ID Count)", fontsize=12)
plt.title("Demand for Pre-Owned Cars Across Different States (Based on Transaction Count)", fontsize=14)
plt.tight_layout()
plt.show()
# Step 3: Perform Chi-Square Test
chi2_stat, p_value = stats.chisquare(state_transaction_data["Transaction_Count"])
# Print results
print(f"Chi-Square Statistic: {chi2_stat}")
print(f"P-value: {p_value}")
Chi-Square Statistic: 299316.30432547646 P-value: 0.0
Interpreting the Chi-Square Test Results
Chi-Square Statistic:
- The calculated Chi-Square value is 299316.30, which is a very high number. This indicates a significant deviation between the observed transaction counts across states and a uniform distribution (if the null hypothesis assumed all states had equal demand).
P-value:
The P-value is 0.0, which is much smaller than any typical significance level (e.g., 0.05 or 0.01). This means the result is statistically significant.
Conclusion
Since the P-value is effectively 0, we reject the null hypothesis (H₀) in favour of the alternative hypothesis (H₁):
There is a significant difference in demand for pre-owned cars across different states/regions.
What This Means?
Some states have significantly higher demand for pre-owned cars, while others have much lower demand.
These insights can guide business decisions, such as:
- Focusing inventory in high-demand states.
- Improving marketing in low-demand regions.
- Adjusting pricing strategies regionally.
HYPOTHESIS 2 - Impact of Odometer Reading on Demand and Price¶
Objective: Understanding the relationship between odometer readings and demand (price as a proxy)
Null Hypothesis (H₀): The odometer reading of a vehicle does not affect its demand or price.
Alternate Hypothesis (H₁): Vehicles with lower odometer readings have higher demand and price than those with higher readings.
Reason:
- Consumer Perception: Lower mileage cars are perceived to be in better condition and less worn out, making them more desirable.
- Market Trends: Buyers are willing to pay a premium for cars with less usage.
- Price as Proxy for Demand: In economic terms, higher demand often translates into higher prices, so price serves as a reasonable proxy for measuring demand.
Violin Plot of Odometer vs Price
Purpose: To visually analyse how price varies across odometer categories.
Why Use a Violin Plot?
- It combines boxplot and kernel density estimates to show both data spread and concentration.
- Helps to identify trends (e.g., whether lower odometer readings correspond to higher prices).
plt.figure(figsize=(12, 6))
# Assign x to hue and disable legend
sns.violinplot(
x=pd.qcut(df['odometer'], q=4, labels=['Low', 'Medium', 'High', 'Very High']),
y=df['price'],
data=df,
hue=pd.qcut(df['odometer'], q=4, labels=['Low', 'Medium', 'High', 'Very High']),
dodge=False,
palette='coolwarm',
legend=False # Disable legend explicitly
)
plt.title('Odometer Reading vs Price Distribution', fontsize=14)
plt.xlabel('Odometer Reading (Mileage Category)', fontsize=12)
plt.ylabel('Price (INR)', fontsize=12)
plt.grid(True)
plt.show()
Interpretation of Chart
- Low Mileage Vehicles: Higher median prices and tighter data spread (indicating consistent high demand).
- High Mileage Vehicles: Wider price variation and lower medians suggest declining demand.
Correlation Heatmap: Odometer and Price
Why This Correlation?
The correlation heatmap is used to explore the linear relationship between odometer readings and price, two numerical variables. It helps us understand how the mileage of a vehicle impacts its price, providing a quantitative view of their relationship.
# Selecting only price and odometer columns
correlation_features = df[['price', 'odometer']]
# Compute the correlation matrix
correlation_matrix = correlation_features.corr()
# Create the heatmap
plt.figure(figsize=(13, 5))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', vmin=-1, vmax=1)
plt.title('Correlation Heatmap: Odometer and Price', fontsize=14)
plt.show()
Observation
- Price vs Odometer Correlation: The correlation coefficient is -0.51, indicating a moderate negative relationship between odometer readings and price.
- Negative Value: Higher odometer readings (more mileage) are associated with lower prices.
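As a quick significance check on this coefficient, a sketch that reuses scipy.stats (imported as stats in the earlier cells):
# Pearson correlation with its p-value for odometer vs price
corr_coef, corr_p = stats.pearsonr(df['odometer'], df['price'])
print(f"Pearson correlation: {corr_coef:.2f}, P-value: {corr_p:.4f}")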
✔ Conclusion:
The null hypothesis (H₀) that odometer readings do not affect price is rejected.
The alternate hypothesis (H₁) is supported, confirming that vehicles with lower odometer readings command higher prices (and, by proxy, higher demand).
Insights:
Increase Revenue: Focus on premium pricing for low-mileage cars.
Boost Sales: Improve marketing for high-mileage cars with value-added offers.
Customer Satisfaction: Align inventory with customer demand for well-maintained, low-mileage vehicles.
HYPOTHESIS 3 - Impact of Cylinders on Vehicle Price¶
Null and Alternative Hypothesis
Null Hypothesis (H₀): The number of cylinders does not affect vehicle price
Alternative Hypothesis (H₁): Vehicles with more cylinders have higher prices
Reason for This Hypothesis
Performance & Engine Power: Cars with more cylinders typically provide better performance, leading to higher pricing.
Luxury & Premium Cars: High-end vehicles (BMW, Mercedes, SUVs, sports cars) often have 6+ cylinders, making them more expensive.
Market Demand & Pricing Strategy: If a strong correlation exists, Spinny can optimise its pricing strategy for vehicles with different engine types.
Visualising Price Trends by Cylinder Count
Explanation
Before conducting statistical tests, we first visualise the relationship between the number of cylinders and vehicle prices using a box plot.
Compare median prices for different cylinder counts.
# Creating a copy of the dataset to avoid modifying the original data
df_copy = df.copy()
# Convert price to millions in a temporary column
df_copy["price_million"] = df_copy["price"] / 1e6 # Convert price to million INR
# Boxplot: Vehicle price distribution across different cylinder counts (in millions)
plt.figure(figsize=(13, 6))
sns.boxplot(data=df_copy, x="cylinders_count", y="price_million", hue="cylinders_count", palette="viridis", legend=False) # Fix warning
# Adding labels and title
plt.title("Vehicle Price Distribution (in Millions) by Number of Cylinders", fontsize=14, fontweight='bold')
plt.xlabel("Number of Cylinders", fontsize=12)
plt.ylabel("Vehicle Price (Million INR)", fontsize=12)
plt.grid(axis='y', linestyle="--", alpha=0.5)
# Show the plot
plt.show()
# Drop the temporary column to ensure original data remains unchanged
df_copy.drop(columns=["price_million"], inplace=True)
Observations
Median Price Trends: The median vehicle price increases as the number of cylinders increases. Vehicles with 12 cylinders have the highest median price, followed by 8-cylinder vehicles.
Price Variability:
Higher-cylinder vehicles (e.g., 6, 8, 12 cylinders) exhibit a wider price range, indicating more variability in pricing. 4-cylinder vehicles show a narrower price range and are more affordable.
Outliers:
4-cylinder vehicles have several outliers, indicating a few highly-priced models in this category.
Scatter Plot: Cylinders vs. Price (in Millions)
Explanation of the heading:
- Scatter Plot: A graphical representation showing individual data points for vehicle prices based on the number of cylinders.
- Cylinders vs. Price: Displays how vehicle prices (y-axis) vary with the number of cylinders (x-axis).
- Price in Millions: The y-axis is scaled to millions for better readability.
# Dropping rows with missing values in cylinders_count and price for a temporary DataFrame
df_corr = df.dropna(subset=["cylinders_count", "price"]).copy()
# Convert price to millions (temporary operation in df_corr)
df_corr["price_million"] = df_corr["price"] / 1e6 # Price in millions
# Scatter plot with regression line
plt.figure(figsize=(13, 6))
sns.regplot(
data=df_corr,
x="cylinders_count",
y="price_million",
scatter_kws={"alpha": 0.5},
line_kws={"color": "red"}
)
plt.title("Scatter Plot: Cylinders vs. Price (in Millions)")
plt.xlabel("Number of Cylinders")
plt.ylabel("Vehicle Price (Million INR)")
plt.show()
Observations
Positive Correlation: The red regression line shows a clear upward trend, indicating that vehicles with more cylinders generally have higher prices.
Price Variability: Higher cylinder counts (8, 12) exhibit greater price variability. Vehicles with fewer cylinders (e.g., 4) are mostly clustered in the lower price range.
Clusters: Noticeable clusters at specific cylinder counts (e.g., 4, 6, 8), reflecting popular engine configurations.
Regression Analysis with scipy.stats
Why Perform Regression? Regression helps us quantify the relationship between two variables, in this case, the number of cylinders (cylinders_count) and vehicle price (in millions). It allows us to:
- Understand how much price increases with each additional cylinder.
- Test if the relationship is statistically significant
# Convert price to millions for regression
df_corr["price_million"] = df_corr["price"] / 1e6
# Perform linear regression using scipy.stats
slope, intercept, r_value, p_value, std_err = stats.linregress(
df_corr["cylinders_count"], df_corr["price_million"]
)
# Display regression results
print(f"Slope (Coefficient): {slope:.4f}")
print(f"Intercept: {intercept:.4f}")
print(f"R-squared: {r_value**2:.4f}")
print(f"P-value: {p_value:.4f}")
print(f"Standard Error: {std_err:.4f}")
Slope (Coefficient): 0.4644 Intercept: -0.2783 R-squared: 0.2009 P-value: 0.0000 Standard Error: 0.0022
Regression Analysis Results
- Slope (0.4644): Each additional cylinder increases the price by ₹464,400 on average.
- Intercept (-0.2783): Defines the regression line but not practically meaningful here.
- R-squared (0.2009): About 20% of price variation is explained by cylinder count (moderate relationship).
- P-value (0.0000): The relationship is statistically significant.
- Standard Error (0.0022): Indicates the slope estimate is precise.
This test highlights that vehicles with more cylinders are significantly pricier, though other factors also influence the price.
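As a quick illustration of how the fitted line can be used (a sketch reusing the slope and intercept computed above):
# Predict the expected price (in millions) from the fitted line: price = intercept + slope * cylinders
cylinders = 6
predicted_million = intercept + slope * cylinders  # ≈ -0.2783 + 0.4644 * 6 ≈ 2.51
print(f"Expected price for a {cylinders}-cylinder vehicle: ₹{predicted_million:.2f} million")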
✔ We reject the null hypothesis (H₀) and conclude that the number of cylinders significantly affects vehicle prices. Vehicles with more cylinders tend to have higher prices.
📌 Higher cylinder count is a significant factor driving vehicle prices, with each additional cylinder increasing the price by ₹464,400 on average. This aligns with the objective of understanding how vehicle features influence pricing, enabling Spinny to optimise its pricing strategies and cater to premium market demand for high-performance vehicles in relevant regions.
HYPOTHESIS 4 - Relationship Between Income Levels and Average Car Prices¶
Objective: To analyze how state income levels influence the demand for premium versus budget-friendly cars. This will help identify regional preferences and optimize inventory and pricing strategies.
Null Hypothesis (H₀): There is no significant difference in the average price of cars sold between high-income and low-income states.
Alternative Hypothesis (H₁): High-income states have a significantly higher average car price compared to low-income states.
Reason
Understanding regional demand and its relation to income levels can help Spinny:
- Optimize inventory allocation by region.
- Tailor pricing strategies to match purchasing power.
- Identify potential markets for premium or budget-friendly cars.
# Calculate the average price for each state income level
average_price_by_income = df.groupby('state_income')['price'].mean()
# Display the result
print("Average price for each state income level:")
print(average_price_by_income)
Average price for each state income level:
state_income
high        2.533686e+06
low         2.533291e+06
medium      2.536091e+06
very low    2.530398e+06
Name: price, dtype: float64
# Calculate and plot
avg_price = df.groupby('state_income')['price'].mean()
sns.barplot(x=avg_price.index, y=avg_price.values, hue=avg_price.index, dodge=False, legend=False)
plt.title("Average Car Prices by State Income Levels")
plt.xlabel("State Income Levels")
plt.ylabel("Average Price (₹)")
plt.show()
✔ We fail to reject the null hypothesis (H₀).
The null hypothesis ("There is no significant difference in the average price of cars sold between high-income and low-income states") cannot be rejected because:
Small Difference in Averages: The average car prices across income levels are nearly identical (all around ₹2.53 million), with minimal variation.
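To back this visual comparison with a formal check, here is a minimal two-sample test sketch (Welch's t-test via scipy.stats, assuming the same df used above):
from scipy import stats
# Compare prices between high- and low-income states
high = df.loc[df["state_income"] == "high", "price"].dropna()
low = df.loc[df["state_income"] == "low", "price"].dropna()
# Welch's t-test: does not assume equal variances between the two groups
t_stat, p_val = stats.ttest_ind(high, low, equal_var=False)
print(f"t-statistic: {t_stat:.4f}, p-value: {p_val:.4f}")
# A p-value above 0.05 would support failing to reject H₀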
Relationship Between Price and Odometer Readings
Null Hypothesis (H₀): Odometer readings do not significantly impact vehicle price.
Alternative Hypothesis (H₁): Vehicles with lower odometer readings command higher prices.
Reason: Lower mileage is associated with better condition, reliability, and reduced maintenance costs, making these vehicles more desirable.
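No dedicated analysis cell accompanies this hypothesis; a minimal correlation sketch (assuming the odometer column name from the dataset description) could look like:
from scipy import stats
# Pearson correlation between mileage and price
subset = df.dropna(subset=["odometer", "price"])
r, p = stats.pearsonr(subset["odometer"], subset["price"])
print(f"Pearson r: {r:.4f}, p-value: {p:.4f}")
# A significant negative r would support H₁ (lower mileage → higher price)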
HYPOTHESIS 5 - Luxury Cars Have Higher Demand in States With Tier-1 Cities Compared to Rural States¶
Null Hypothesis (H₀): There is no significant difference in the demand for luxury cars between metro and rural states.
Alternative Hypothesis (H₁): Luxury cars are in higher demand in metro states compared to rural states.
Reasoning:
- Higher Disposable Income: Metro residents have more purchasing power to afford luxury vehicles.
- Better Infrastructure: Well-maintained roads make luxury cars more practical in metro areas.
- Availability of Dealerships & Service Centres: Metro cities have more luxury car dealerships and service options.
- Financing & Leasing Benefits: Easier access to loans and EMI plans for high-end cars in metro states.
- Consumer Preferences: Metro buyers prefer premium features, while rural buyers focus on durability and cost-effectiveness.
Count of Cars Priced Above 5 Million INR in Metro vs Rural States
This analysis compares the number of pre-owned cars priced above ₹5 million (₹50 lakh) in metro states (Maharashtra, Karnataka, Delhi) versus rural states (Bihar, Odisha, Chhattisgarh). This helps in understanding the concentration of high-value vehicle sales across different regions.
Steps in the analysis:
- Filter cars where price > 5,000,000 INR.
- Segment data into metro and rural states.
- Count the number of transactions in each segment.
- Present findings in a structured summary.
# Create a copy of the dataset to ensure the original remains unchanged
df_copy = df.copy()
# Define metro and rural states
metro_states = ["maharashtra", "karnataka", "delhi"]
rural_states = ["bihar", "odisha", "chhattisgarh"]
# Filter cars priced above 5 million INR in each category
metro_count = df_copy[(df_copy["price"] > 5_000_000) & (df_copy["state"].isin(metro_states))].shape[0]
rural_count = df_copy[(df_copy["price"] > 5_000_000) & (df_copy["state"].isin(rural_states))].shape[0]
# Create a summary table
car_count_summary = pd.DataFrame({
"Region": ["Metro States (Maharashtra, Karnataka, Delhi)",
"Rural States (Bihar, Odisha, Chhattisgarh)"],
"Car Count (> 5M INR)": [metro_count, rural_count]
})
# Display the summary
print(car_count_summary)
                                         Region  Car Count (> 5M INR)
0  Metro States (Maharashtra, Karnataka, Delhi)                  6758
1    Rural States (Bihar, Odisha, Chhattisgarh)                  1526
# Create a pie chart to visualize the data
plt.figure(figsize=(14, 6))
plt.pie(car_count_summary["Car Count (> 5M INR)"],
labels=car_count_summary["Region"],
autopct='%1.1f%%',
colors=['pink', 'yellowgreen'],
startangle=140,
wedgeprops={'edgecolor': 'black'})
# Title for the pie chart
plt.title("Proportion of Cars Priced Above 5 Million INR in Metro vs Rural States")
# Show the chart
plt.show()
Conclusion:
✔ We reject the null hypothesis (H₀), as the data shows a significant difference in the demand for luxury cars between metro and rural states. The proportion of cars priced above ₹5 million is 81.6% in metro states (Maharashtra, Karnataka, Delhi) compared to only 18.4% in rural states (Bihar, Odisha, Chhattisgarh).
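As a quick formal check (a sketch, not part of the original analysis, and assuming metro and rural segments have comparable overall listing volumes), a binomial test against an even split of high-value listings:
from scipy import stats  # binomtest requires scipy >= 1.7
metro_count, rural_count = 6758, 1526  # counts from the summary table above
# Under H₀, a car priced above ₹5M is equally likely to come from either segment
result = stats.binomtest(metro_count, n=metro_count + rural_count, p=0.5)
print(f"p-value: {result.pvalue:.2e}")
# A vanishingly small p-value supports rejecting H₀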
Reasoning:
The data highlights a much higher demand for luxury cars in metro states, likely due to:
Higher Disposable Income: Metro residents have greater purchasing power, enabling them to afford luxury vehicles.
Better Infrastructure: Well-maintained roads in metro states make owning luxury cars more practical.
Insights:
- Target Market: Metro states are the primary markets for luxury car sales, and efforts to boost transactions in these regions could yield better results.
- Marketing Strategy: Luxury car brands should focus advertising, financing, and dealership presence in metro states to tap into the high demand.
HYPOTHESIS 6 - Fuel Price and Demand Analysis for Different Manufactured Year Cars¶
Objective: To optimise pricing strategies and inventory management by analyzing the average price of cars by fuel type across different manufacturing year brackets and evaluating the count of different fuel types in the pre-owned car market.
Hypothesis
H₀ (Null Hypothesis): The average price of vehicles does not significantly vary across different fuel types and manufacturing year brackets, and fuel type count does not influence pricing trends.
H₁ (Alternate Hypothesis): The average price of vehicles significantly varies across different fuel types and manufacturing year brackets, and fuel type count influences pricing trends.
Reasoning:
Fuel Type and Pricing: Different fuel types (Diesel, Petrol, Electric, Hybrid) have varying market values due to fuel efficiency, maintenance costs, and resale demand.
Manufacturing Year Influence: Older vehicles generally have lower prices due to depreciation, while newer ones retain higher value.
Market Demand Trends: The count of fuel types per manufacturing year bracket reflects shifting consumer preferences and emerging trends in the pre-owned car market.
fig, axes = plt.subplots(1, 3, figsize=(15, 6), sharey=True)
colors = sns.color_palette("pastel", n_colors=5) # Adjust palette size to match categories
# Define year ranges
year_ranges = {
'2006-2010': (2006, 2010),
'2011-2015': (2011, 2015),
'2016-2022': (2016, 2022)
}
# Generate individual charts for each year range
for idx, (label, (start_year, end_year)) in enumerate(year_ranges.items()):
subset = df[(df['mfg_year'] >= start_year) & (df['mfg_year'] <= end_year)]
grouped = subset.groupby('fuel', observed=True).agg(
Average_Price=('price', 'mean'),
fuel_Count=('fuel', 'count')
).reset_index()
grouped['Average_Price'] = grouped['Average_Price'] / 1e6 # Convert to million
sns.barplot(x='fuel', y='Average_Price', hue='fuel', data=grouped, ax=axes[idx], palette=colors, legend=False)
ax2 = axes[idx].twinx()
sns.lineplot(x='fuel', y='fuel_Count', data=grouped, ax=ax2, color='blue', marker='o')
# Add text labels for transaction count
for x, y in zip(grouped['fuel'], grouped['fuel_Count']):
ax2.text(x, y, f'{y:,}', ha='center', va='bottom', fontsize=10, color='purple')
axes[idx].set_title(f"Avg Price & Demand by Fuel ({label})")
axes[idx].set_xlabel("Fuel Type")
axes[idx].set_ylabel("Avg Price (Million INR)")
ax2.set_ylabel("Fuel Count")
plt.tight_layout()
plt.show()
Observations, Insights & Recommendations:
Diesel Vehicles Hold Strong Resale Value but Have Moderate Availability:
Diesel cars maintain higher average prices across all year brackets, indicating sustained demand.
Recommendation: Price diesel cars as premium listings and highlight their fuel efficiency and durability to attract buyers.
Gas Vehicles Are Most Available but Show Price Fluctuations:
Gas-powered vehicles have the highest count across all year brackets, but their prices vary.
Recommendation: Monitor gas vehicle price trends and dynamically adjust pricing based on supply and demand to maximize profitability.
Electric & Hybrid Cars Show Rising Adoption in 2016-2022:
The count of electric and hybrid vehicles has increased in recent years, indicating growing consumer interest.
Recommendation: Expand inventory in the electric & hybrid segment to meet future demand and market trends.
Inventory Optimization for Fuel Type Demand:
- Stock more Diesel cars in premium categories where demand remains steady.
- Regulate Gas vehicle inventory based on demand trends to avoid oversupply.
- Increase Electric & Hybrid vehicle stock to align with evolving buyer preferences.
CONCLUSION
We reject the null hypothesis (H₀):
There is a significant impact of fuel type and manufacturing year bracket on average vehicle price and fuel count. Spinny should strategically adjust pricing and inventory based on fuel-type trends to maximize revenue.
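To formalise the fuel-type part of this conclusion, here is a minimal one-way ANOVA sketch (assuming the same fuel and price columns used above):
from scipy import stats
# One-way ANOVA: does mean price differ across fuel types?
groups = [g["price"].dropna() for _, g in df.groupby("fuel", observed=True)]
f_stat, p_val = stats.f_oneway(*groups)
print(f"F-statistic: {f_stat:.2f}, p-value: {p_val:.2e}")
# A p-value below 0.05 supports rejecting H₀ for fuel type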
1️⃣ Optimizing Pricing Strategy¶
- Set pricing within ₹0.5M–₹3M, as this range has the highest transaction volume.
- Adjust gas vehicle pricing dynamically, given their highest count but fluctuating demand.
- Price diesel cars at a premium, as they retain strong resale value and demand stability.
- Introduce tiered pricing for electric & hybrid vehicles, as their demand has risen post-2016.
- Reduce prices for high-mileage (>1 lakh km) and older (>10 years) cars to improve sales.
2️⃣ Enhancing Inventory Management for Market Demand¶
- Stock more low-mileage cars (<1 lakh km), as they dominate buyer preference.
- Prioritize mid-age (3–8 years old) vehicles, as they form the largest segment.
- Expand gasoline vehicle inventory, as they are the most available and in demand.
- Maintain a strong mix of 4-cylinder and 6-cylinder cars, as they dominate registrations.
- Balance stock for automatic transmissions, as they are the most preferred option.
3️⃣ Expanding Market Reach & Regional Growth Strategy¶
- Focus on Tier-1 cities for premium cars, as metro areas show higher luxury car demand.
- Increase budget-friendly vehicle availability in rural regions, where affordability is key.
- Expand listings for Ford, Chevrolet, and Toyota models, as they dominate demand.
- Leverage the rising popularity of SUVs & sedans, which are the most sought-after body types.
- Introduce region-specific pricing strategies, based on historical demand trends and buyer preferences.
4️⃣ Improving Customer Experience & Service Offerings¶
- Prioritize vehicles in "Good" and "Excellent" condition, as they have the highest demand.
- Use targeted promotions for high-mileage and older vehicles to clear low-demand stock.
- Focus on popular models like the Ford F-150 and Chevrolet Silverado 1500, as they dominate transactions.
- Monitor EV and hybrid adoption trends, gradually increasing their inventory.
- Leverage AI-based customer engagement tools, such as personalized recommendations and predictive buying trends.