Acko Health Insurance: Data-Driven Insurance Premium Pricing Strategy¶
Acko is a digital-first insurance company in India that offers a variety of insurance products, including car, bike, health, and travel insurance. It is launching a new health insurance product and requires a data-driven approach to determine optimal premium pricing for different customer segments.
Objective¶
- To determine an optimal premium pricing strategy that balances business profitability with customer affordability, leveraging available demographic, financial, and behavioral data.
- The primary objective is to develop a rule-based logical formula to calculate the final premium price for each customer based on their demographic, risk and affordability profile.
Business Impact¶
- Increased Profitability: Ensures premiums reflect customer risk levels, reducing losses from underpricing and maximizing revenue.
- Better Customer Acquisition & Retention: Affordable, customized premiums attract new customers and improve loyalty by offering fair pricing.
- Faster Market Launch: A simple rule-based model allows quick product rollout and easy updates for new customer segments.
- Enhanced Trust & Compliance: Transparent pricing builds customer trust and ensures alignment with regulatory standards.
Dataset Information¶
- Dataset: Acko Dataset
- Rows: 1,200,000
- Columns: 20
Explanation of Data Columns¶
id (Customer ID): A unique identifier assigned to each customer in the dataset.
Age (Customer Age): The age of the customer in years at the time of policy purchase.
Gender (Customer Gender): The gender identity of the customer, which can be "Man" or "Woman."
Annual Income (Yearly Earnings in INR): The total income earned by the customer in a year, measured in Indian Rupees (INR).
Marital Status (Customer’s Marital Condition): The marital status of the customer, such as "Spouse Present," "Not Married," or "Formerly Married."
Number of Dependents (People Financially Dependent on Customer): The number of dependents (such as children, parents, or others) that rely on the customer financially.
Education Level (Highest Education Attained): The highest level of education completed by the customer, such as "Undergraduate" or "Post Graduate."
Occupation (Customer's Job Type): The profession or employment category of the customer. In some cases, this data may be missing.
Health Score (Overall Health Indicator): A numerical score representing the customer's health condition based on lifestyle factors and medical history. A lower score indicates better fitness; a higher score indicates poorer health.
Location (Customer's Residence Tier): The classification of the customer's residence area into tiers such as Tier-1, Tier-2, or Tier-3 cities.
Policy Type (Type of Insurance Policy Chosen): The category of insurance policy purchased by the customer, such as "Basic," "Premium," or "Comprehensive."
Previous Claims (Number of Past Insurance Claims): The total number of insurance claims the customer has made before purchasing the current policy.
Credit Score (Financial Responsibility Indicator): A numerical representation of the customer’s creditworthiness, indicating their ability to manage finances and make timely payments.
Insurance Duration (Policy Tenure in Years): The number of years the customer has held the insurance policy.
Policy Start Date (Date When Policy Became Active): The date when the insurance policy was purchased and became active.
Customer Feedback (Customer Satisfaction Rating): The rating or feedback provided by the customer about their experience with the insurance policy.
Smoking Status (Whether the Customer Smokes): Indicates whether the customer is a smoker or not, as smoking impacts health risks and insurance premiums.
Exercise Frequency (How Often the Customer Exercises): The frequency of physical exercise performed by the customer, which can impact their health score.
Property Type (Type of Residence): The type of home the customer resides in, such as a detached home or an apartment.
Premium Amount (Final Insurance Premium in INR): The amount the customer pays for their health insurance policy, measured in Indian Rupees (INR).
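The data dictionary above can be captured as a small expected-schema map and checked right after loading. This is only a sketch: the column names come from the table above, but the split into numeric vs. text kinds is an assumption, and `schema_problems` is a hypothetical helper, not part of the original notebook.

```python
import pandas as pd

# Expected kind per column: "num" = numeric, "str" = text/categorical.
# Column names follow the data dictionary above; the kinds are assumptions.
EXPECTED_KINDS = {
    "id": "num", "Age": "num", "Annual Income": "num",
    "Number of Dependents": "num", "Health Score": "num",
    "Previous Claims": "num", "Credit Score": "num",
    "Insurance Duration": "num", "Premium Amount": "num",
    "Gender": "str", "Marital Status": "str", "Education Level": "str",
    "Occupation": "str", "Location": "str", "Policy Type": "str",
    "Policy Start Date": "str", "Customer Feedback": "str",
    "Smoking Status": "str", "Exercise Frequency": "str", "Property Type": "str",
}

def schema_problems(df: pd.DataFrame) -> list:
    """Return a list of columns that are missing or not numeric when expected."""
    problems = []
    for col, kind in EXPECTED_KINDS.items():
        if col not in df.columns:
            problems.append(f"missing column: {col}")
        elif kind == "num" and not pd.api.types.is_numeric_dtype(df[col]):
            problems.append(f"expected numeric: {col}")
    return problems
```

Running `schema_problems(data)` immediately after `pd.read_csv` surfaces renamed or mistyped columns before any cleaning begins.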
Import Data¶
# Step 1: Install gdown
!pip install gdown
# Step 2: Import necessary libraries
import gdown
import pandas as pd
# Step 3: Set the file ID and create a download URL
file_id = "1i4ia9ZNfAXgu6JGTXCUgFn7Pb8wltzLH"
download_url = f"https://drive.google.com/uc?id={file_id}"
# Step 4: Set the output file name
output_file = "acko_dataset.csv"
# Step 5: Download the file
gdown.download(download_url, output_file, quiet=False)
# Step 6: Load the CSV file into a Pandas DataFrame
data = pd.read_csv(output_file)
Downloading... From (original): https://drive.google.com/uc?id=1i4ia9ZNfAXgu6JGTXCUgFn7Pb8wltzLH From (redirected): https://drive.google.com/uc?id=1i4ia9ZNfAXgu6JGTXCUgFn7Pb8wltzLH&confirm=t&uuid=58922796-c0e5-406b-bc6a-07492803306f To: /content/acko_dataset.csv 100%|██████████| 219M/219M [00:04<00:00, 50.2MB/s]
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('acko_dataset.csv')
df.head(10)
id | Age | Gender | Annual Income | Marital Status | Number of Dependents | Education Level | Occupation | Health Score | Location | Policy Type | Previous Claims | Credit Score | Insurance Duration | Policy Start Date | Customer Feedback | Smoking Status | Exercise Frequency | Property Type | Premium Amount | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 19.0 | Woman | 8.642140e+05 | Spouse Present | 1.0 | Undergraduate | Business | 26.598761 | Tier-1 | Premium | 2.0 | 372.0 | 5.0 | 2023-12-23 15:21:39.134960 | Poor | No | Weekly | Detached Home | 1945.913327 |
1 | 1 | 39.0 | Woman | 8.927012e+05 | Spouse Present | 3.0 | Post Graduate | Missing | 21.569731 | Tier-2 | Comprehensive | 1.0 | 694.0 | 2.0 | 2023-06-12 15:21:39.111551 | Average | Yes | Monthly | Detached Home | 10908.896072 |
2 | 2 | 23.0 | Man | 2.201772e+06 | Formerly Married | 3.0 | Undergraduate | Business | 50.177549 | Tier-3 | Premium | 1.0 | NaN | 3.0 | 2023-09-30 15:21:39.221386 | Good | Yes | Weekly | Detached Home | 21563.135198 |
3 | 3 | 21.0 | Man | 3.997542e+06 | Spouse Present | 2.0 | Undergraduate | Missing | 16.938144 | Tier-2 | Basic | 1.0 | 367.0 | 1.0 | 2024-06-12 15:21:39.226954 | Poor | Yes | Daily | Flat | 2653.539143 |
4 | 4 | 21.0 | Man | 3.409986e+06 | Not Married | 1.0 | Undergraduate | Business | 24.376094 | Tier-2 | Premium | 0.0 | 598.0 | 4.0 | 2021-12-01 15:21:39.252145 | Poor | Yes | Weekly | Detached Home | 1269.243463 |
5 | 5 | 29.0 | Man | 1.295259e+06 | Spouse Present | 1.0 | Post Graduate | Missing | 36.053198 | Tier-1 | Premium | 2.0 | 614.0 | 5.0 | 2022-05-20 15:21:39.207847 | Average | No | Weekly | Detached Home | 1252.689000 |
6 | 6 | 41.0 | Man | 1.136688e+06 | Spouse Present | 0.0 | Undergraduate | Missing | NaN | Tier-2 | Basic | 2.0 | 807.0 | 6.0 | 2020-02-21 15:21:39.219432 | Poor | No | Weekly | Detached Home | NaN |
7 | 7 | 48.0 | Woman | 8.942382e+06 | Spouse Present | 2.0 | Secondary Education | Full-Time Worker | 13.769783 | Tier-3 | Comprehensive | 1.0 | 398.0 | 5.0 | 2022-08-08 15:21:39.181605 | Average | No | Rarely | Apartment | 21833.802349 |
8 | 8 | 21.0 | Man | 1.490380e+05 | Formerly Married | 3.0 | Undergraduate | Missing | 24.869551 | Tier-1 | Premium | 1.0 | 685.0 | 8.0 | 2020-12-14 15:21:39.198406 | Average | No | Monthly | Apartment | 10997.945102 |
9 | 9 | 44.0 | Man | 2.510442e+06 | Spouse Present | 2.0 | Undergraduate | Full-Time Worker | 22.473718 | Tier-1 | Comprehensive | 1.0 | 635.0 | 3.0 | 2020-08-02 15:21:39.144722 | Poor | No | Daily | Apartment | 10580.678238 |
Data Cleaning¶
# Check the shape of dataset.
rows, columns = df.shape
print(f"No.of rows :{rows}\nNo.of columns: {columns}")
No.of rows :1200000 No.of columns: 20
Creating New Columns¶
def age_to_group(age):
    # Missing ages (NaN) fail both comparisons and fall through to '51+';
    # Age is imputed later in the cleaning step.
    if age <= 30:
        return '18-30'
    elif age <= 50:
        return '31-50'
    else:
        return '51+'
# Apply function to create Age Group
df['Age Group'] = df['Age'].apply(age_to_group)
print(df['Age Group'].value_counts())
Age Group 31-50 506170 51+ 374386 18-30 319444 Name: count, dtype: int64
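As a side note, the same binning can be done vectorized with `pd.cut`, which also leaves missing ages as NaN instead of silently bucketing them into '51+'. A minimal sketch with toy data, assuming the same 18-30 / 31-50 / 51+ cut points:

```python
import pandas as pd

# Same cut points as age_to_group; missing ages stay NaN here.
ages = pd.Series([19, 39, 23, None, 55], dtype=float)

# Right-inclusive bins: (0, 30] -> 18-30, (30, 50] -> 31-50, (50, inf) -> 51+
groups = pd.cut(ages, bins=[0, 30, 50, float("inf")],
                labels=["18-30", "31-50", "51+"])
print(groups.tolist())
```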
# Define Health Score categories based on distribution
def categorize_health_score(score):
    # Missing scores (NaN) fall through to "High" here; Health Score is imputed later.
    if score <= 20:
        return "Low"
    elif score <= 40:  # covers all scores in (20, 40], including fractional values
        return "Medium"
    else:
        return "High"
df["Health Score Category"] = df["Health Score"].apply(categorize_health_score)
print(df['Health Score Category'].value_counts())
Health Score Category Medium 568593 High 411738 Low 219669 Name: count, dtype: int64
# Dataset information
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1200000 entries, 0 to 1199999 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 1200000 non-null int64 1 Age 1181295 non-null float64 2 Gender 1200000 non-null object 3 Annual Income 1155051 non-null float64 4 Marital Status 1200000 non-null object 5 Number of Dependents 1090328 non-null float64 6 Education Level 1200000 non-null object 7 Occupation 1200000 non-null object 8 Health Score 1125924 non-null float64 9 Location 1200000 non-null object 10 Policy Type 1200000 non-null object 11 Previous Claims 835971 non-null float64 12 Credit Score 1062118 non-null float64 13 Insurance Duration 1199999 non-null float64 14 Policy Start Date 1200000 non-null object 15 Customer Feedback 1122176 non-null object 16 Smoking Status 1200000 non-null object 17 Exercise Frequency 1200000 non-null object 18 Property Type 1200000 non-null object 19 Premium Amount 784968 non-null float64 20 Age Group 1200000 non-null object 21 Health Score Category 1200000 non-null object dtypes: float64(8), int64(1), object(13) memory usage: 201.4+ MB
# Checking for Duplicate Values.
duplicate = df.duplicated().sum()
duplicate
0
#Checking Null Values
df.isnull().sum()
0 | |
---|---|
id | 0 |
Age | 18705 |
Gender | 0 |
Annual Income | 44949 |
Marital Status | 0 |
Number of Dependents | 109672 |
Education Level | 0 |
Occupation | 0 |
Health Score | 74076 |
Location | 0 |
Policy Type | 0 |
Previous Claims | 364029 |
Credit Score | 137882 |
Insurance Duration | 1 |
Policy Start Date | 0 |
Customer Feedback | 77824 |
Smoking Status | 0 |
Exercise Frequency | 0 |
Property Type | 0 |
Premium Amount | 415032 |
Age Group | 0 |
Health Score Category | 0 |
The columns Age, Annual Income, Number of Dependents, Health Score, Previous Claims, Credit Score, Insurance Duration, Customer Feedback, and Premium Amount contain null values.
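The raw counts are easier to compare as a share of the 1.2M rows. A quick sketch of a reusable summary (`missing_report` is a hypothetical helper, not part of the original notebook):

```python
import pandas as pd

def missing_report(df: pd.DataFrame) -> pd.DataFrame:
    """Missing count and percentage per column, worst first; skips complete columns."""
    counts = df.isnull().sum()
    report = pd.DataFrame({
        "missing": counts,
        "pct": (counts / len(df) * 100).round(2),
    })
    return report[report["missing"] > 0].sort_values("missing", ascending=False)
```

For example, `missing_report(df)` would show Premium Amount missing for roughly a third of rows, which matters later when choosing between dropping and imputing.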
Data Types¶
df.dtypes
0 | |
---|---|
id | int64 |
Age | float64 |
Gender | object |
Annual Income | float64 |
Marital Status | object |
Number of Dependents | float64 |
Education Level | object |
Occupation | object |
Health Score | float64 |
Location | object |
Policy Type | object |
Previous Claims | float64 |
Credit Score | float64 |
Insurance Duration | float64 |
Policy Start Date | object |
Customer Feedback | object |
Smoking Status | object |
Exercise Frequency | object |
Property Type | object |
Premium Amount | float64 |
Age Group | object |
Health Score Category | object |
#Convert the data type of the columns
# Convert 'Policy Start Date' to datetime (handling errors)
df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date'], errors='coerce')
# Convert Number of Dependents to integer (nullable Int64 to handle NaN)
df['Number of Dependents'] = df['Number of Dependents'].astype('Int64')
#Convert age datatype into int
if df['Age'].dropna().apply(float.is_integer).all():
df['Age'] = df['Age'].astype('Int64')
pd.options.display.float_format = '{:.2f}'.format
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1200000 entries, 0 to 1199999 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 1200000 non-null int64 1 Age 1181295 non-null Int64 2 Gender 1200000 non-null object 3 Annual Income 1155051 non-null float64 4 Marital Status 1200000 non-null object 5 Number of Dependents 1090328 non-null Int64 6 Education Level 1200000 non-null object 7 Occupation 1200000 non-null object 8 Health Score 1125924 non-null float64 9 Location 1200000 non-null object 10 Policy Type 1200000 non-null object 11 Previous Claims 835971 non-null float64 12 Credit Score 1062118 non-null float64 13 Insurance Duration 1199999 non-null float64 14 Policy Start Date 1200000 non-null datetime64[ns] 15 Customer Feedback 1122176 non-null object 16 Smoking Status 1200000 non-null object 17 Exercise Frequency 1200000 non-null object 18 Property Type 1200000 non-null object 19 Premium Amount 784968 non-null float64 20 Age Group 1200000 non-null object 21 Health Score Category 1200000 non-null object dtypes: Int64(2), datetime64[ns](1), float64(6), int64(1), object(12) memory usage: 203.7+ MB
df.head(10)
id | Age | Gender | Annual Income | Marital Status | Number of Dependents | Education Level | Occupation | Health Score | Location | ... | Credit Score | Insurance Duration | Policy Start Date | Customer Feedback | Smoking Status | Exercise Frequency | Property Type | Premium Amount | Age Group | Health Score Category | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 19 | Woman | 864214.00 | Spouse Present | 1 | Undergraduate | Business | 26.60 | Tier-1 | ... | 372.00 | 5.00 | 2023-12-23 15:21:39.134960 | Poor | No | Weekly | Detached Home | 1945.91 | 18-30 | Medium |
1 | 1 | 39 | Woman | 892701.25 | Spouse Present | 3 | Post Graduate | Missing | 21.57 | Tier-2 | ... | 694.00 | 2.00 | 2023-06-12 15:21:39.111551 | Average | Yes | Monthly | Detached Home | 10908.90 | 31-50 | Medium |
2 | 2 | 23 | Man | 2201772.00 | Formerly Married | 3 | Undergraduate | Business | 50.18 | Tier-3 | ... | NaN | 3.00 | 2023-09-30 15:21:39.221386 | Good | Yes | Weekly | Detached Home | 21563.14 | 18-30 | High |
3 | 3 | 21 | Man | 3997541.99 | Spouse Present | 2 | Undergraduate | Missing | 16.94 | Tier-2 | ... | 367.00 | 1.00 | 2024-06-12 15:21:39.226954 | Poor | Yes | Daily | Flat | 2653.54 | 18-30 | Low |
4 | 4 | 21 | Man | 3409986.00 | Not Married | 1 | Undergraduate | Business | 24.38 | Tier-2 | ... | 598.00 | 4.00 | 2021-12-01 15:21:39.252145 | Poor | Yes | Weekly | Detached Home | 1269.24 | 18-30 | Medium |
5 | 5 | 29 | Man | 1295259.40 | Spouse Present | 1 | Post Graduate | Missing | 36.05 | Tier-1 | ... | 614.00 | 5.00 | 2022-05-20 15:21:39.207847 | Average | No | Weekly | Detached Home | 1252.69 | 18-30 | Medium |
6 | 6 | 41 | Man | 1136687.84 | Spouse Present | 0 | Undergraduate | Missing | NaN | Tier-2 | ... | 807.00 | 6.00 | 2020-02-21 15:21:39.219432 | Poor | No | Weekly | Detached Home | NaN | 31-50 | High |
7 | 7 | 48 | Woman | 8942382.00 | Spouse Present | 2 | Secondary Education | Full-Time Worker | 13.77 | Tier-3 | ... | 398.00 | 5.00 | 2022-08-08 15:21:39.181605 | Average | No | Rarely | Apartment | 21833.80 | 31-50 | Low |
8 | 8 | 21 | Man | 149038.00 | Formerly Married | 3 | Undergraduate | Missing | 24.87 | Tier-1 | ... | 685.00 | 8.00 | 2020-12-14 15:21:39.198406 | Average | No | Monthly | Apartment | 10997.95 | 18-30 | Medium |
9 | 9 | 44 | Man | 2510442.00 | Spouse Present | 2 | Undergraduate | Full-Time Worker | 22.47 | Tier-1 | ... | 635.00 | 3.00 | 2020-08-02 15:21:39.144722 | Poor | No | Daily | Apartment | 10580.68 | 31-50 | Medium |
10 rows × 22 columns
Filling Missing Data¶
df.isnull().sum()
0 | |
---|---|
id | 0 |
Age | 18705 |
Gender | 0 |
Annual Income | 44949 |
Marital Status | 0 |
Number of Dependents | 109672 |
Education Level | 0 |
Occupation | 0 |
Health Score | 74076 |
Location | 0 |
Policy Type | 0 |
Previous Claims | 364029 |
Credit Score | 137882 |
Insurance Duration | 1 |
Policy Start Date | 0 |
Customer Feedback | 77824 |
Smoking Status | 0 |
Exercise Frequency | 0 |
Property Type | 0 |
Premium Amount | 415032 |
Age Group | 0 |
Health Score Category | 0 |
plt.figure(figsize=(12, 6))
sns.heatmap(df.isnull(), cbar=False, cmap='viridis')
plt.title("Missing Data Heatmap")
plt.show()
missing_counts = df.isnull().sum().sort_values(ascending=False)
missing_counts = missing_counts[missing_counts > 0] # Only show missing columns
missing_counts.plot(kind='barh', figsize=(10, 6), color='skyblue')
plt.xlabel("Number of Missing Values")
plt.title("Missing Values Per Column")
plt.show()
Drop the rows where 5 or more values are missing
# Remove rows where more than 4 of the 9 key columns have missing values.
# Key columns to check for missing values
key_columns = ['Age', 'Annual Income', 'Health Score', 'Previous Claims',
'Number of Dependents', 'Credit Score', 'Customer Feedback','Insurance Duration','Premium Amount']
# Count missing values in each row for the key columns
df['missing_count'] = df[key_columns].isnull().sum(axis=1)
# Identify rows where more than 4 of the 9 key columns are missing
missing_rows = df[df['missing_count'] > 4]
# Drop rows where more than 4 key columns are missing
df = df[df['missing_count'] <= 4].drop(columns='missing_count')
# Check the new shape of the dataset
print(df.shape)
(1198440, 22)
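The same filter can be expressed without a helper column via `dropna(thresh=...)`, which keeps rows with at least `len(key_columns) - 4` non-null values among the key columns. A minimal sketch with toy data (5 key columns instead of the notebook's 9):

```python
import numpy as np
import pandas as pd

key_columns = ["a", "b", "c", "d", "e"]
df_toy = pd.DataFrame({
    "a": [1, np.nan], "b": [2, np.nan], "c": [3, np.nan],
    "d": [4, np.nan], "e": [5, np.nan], "other": ["x", "y"],
})

# Keep rows with at most 4 missing key values, i.e. at least
# len(key_columns) - 4 non-null key values.
kept = df_toy.dropna(subset=key_columns, thresh=len(key_columns) - 4)
print(kept.shape)
```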
Fill the missing values
# 1. Age
print(f"Missing values before filling: {df['Age'].isnull().sum()}")
# 1. Calculate overall median age (fallback value)
overall_median_age = df['Age'].median()
# 2. Fill missing ages using group medians (Occupation + Policy Type)
df['Age'] = df.groupby(['Occupation', 'Policy Type'])['Age'].transform(
lambda x: x.fillna(x.median() if len(x) >= 50 else overall_median_age)
)
# 3. Fill any remaining nulls with overall median
df['Age'] = df['Age'].fillna(overall_median_age)
# Verify result
print(f"Missing values after filling: {df['Age'].isnull().sum()}")
Missing values before filling: 18391 Missing values after filling: 0
#Annual Income
print(f"Missing values before filling: {df['Annual Income'].isnull().sum()}")
# 1. Calculate overall median income (fallback value)
overall_median_income = df['Annual Income'].median()
# 2. Fill missing income using group medians (Occupation + Education Level)
df['Annual Income'] = df.groupby(['Occupation', 'Education Level'])['Annual Income'].transform(
lambda x: x.fillna(x.median() if len(x) >= 50 else overall_median_income)
)
# 3. Fill any remaining nulls with overall median
df['Annual Income'] = df['Annual Income'].fillna(overall_median_income)
# Verify result
print(f"Missing values after filling: {df['Annual Income'].isnull().sum()}")
Missing values before filling: 44421 Missing values after filling: 0
# Health Score
print(f"Missing values before filling: {df['Health Score'].isnull().sum()}")
# 1. Define function to get mode (with tie-breaker: worst-case score)
def get_mode(series):
modes = series.mode()
return modes.min() if not modes.empty else np.nan
# 2. Fill missing values with group mode (Smoking Status + Exercise Frequency)
group_modes = df.groupby(['Smoking Status', 'Exercise Frequency'])['Health Score'].transform(get_mode)
df['Health Score'] = df['Health Score'].fillna(group_modes)
# 3. Fill remaining nulls with overall mode
overall_mode = df['Health Score'].mode()[0]
df['Health Score'] = df['Health Score'].fillna(overall_mode)
print(f"Missing values after filling: {df['Health Score'].isnull().sum()}")
Missing values before filling: 73231 Missing values after filling: 0
#Previous Claims
print(f"Missing values before filling: {df['Previous Claims'].isnull().sum()}")
# Fill missing Previous Claims
# 1. If Insurance Duration < 1 year or missing, fill with 0 (assuming new customer)
df['Previous Claims'] = df.apply(
lambda row: 0 if pd.isna(row['Insurance Duration']) or row['Insurance Duration'] < 1 else row['Previous Claims'],
axis=1
)
# 2. For remaining missing values (existing customers), fill with median based on Policy Type
df['Previous Claims'] = df.groupby('Policy Type')['Previous Claims'].transform(
lambda x: x.fillna(x.median())
)
print(f"Missing values after filling: {df['Previous Claims'].isnull().sum()}")
Missing values before filling: 362589 Missing values after filling: 0
#Number of Dependents
print(df['Number of Dependents'].isnull().sum())
# Step 1: Median dependents for 'Spouse Present'
median_dependents_spouse_present = df.loc[df['Marital Status'] == 'Spouse Present', 'Number of Dependents'].median()
# Step 2: Fill missing values based on Marital Status
df.loc[(df['Marital Status'] == 'Spouse Present') & (df['Number of Dependents'].isnull()), 'Number of Dependents'] = median_dependents_spouse_present
df.loc[(df['Marital Status'] == 'Not Married') & (df['Number of Dependents'].isnull()), 'Number of Dependents'] = 0
# Optional: Fill remaining nulls (Unknown/Formerly Married) with overall median
overall_median_dependents = df['Number of Dependents'].median()
df['Number of Dependents'] = df['Number of Dependents'].fillna(overall_median_dependents)
# Check if missing values are filled
print(df['Number of Dependents'].isnull().sum())
108611 0
# Credit Score
print(f"Missing values before filling: {df['Credit Score'].isnull().sum()}")
# 1. Calculate overall median (fallback)
overall_median = df['Credit Score'].median()
# 2. Group by Occupation, fill nulls with group median (if group has ≥50 rows) or overall median
group_medians = df.groupby('Occupation')['Credit Score'].transform(
lambda x: x.median() if len(x) >= 50 else overall_median
)
df['Credit Score'] = df['Credit Score'].fillna(group_medians)
# 3. Fill any remaining nulls with overall median
df['Credit Score'] = df['Credit Score'].fillna(overall_median)
print(f"Missing values after filling: {df['Credit Score'].isnull().sum()}")
Missing values before filling: 136782 Missing values after filling: 0
# Customer Feedback
print(f"Missing values before filling: {df['Customer Feedback'].isnull().sum()}")
# 1. Fill missing feedback with group mode (Policy Type + Location)
df['Customer Feedback'] = df.groupby(['Policy Type', 'Location'])['Customer Feedback'].transform(
lambda x: x.fillna(x.mode()[0]) # Use first mode in case of ties
)
# 2. Fill remaining nulls with overall most common feedback
most_common_feedback = df['Customer Feedback'].mode()[0]
df['Customer Feedback'] = df['Customer Feedback'].fillna(most_common_feedback)
print(f"Missing values after filling: {df['Customer Feedback'].isnull().sum()}")
Missing values before filling: 76792 Missing values after filling: 0
# Premium Amount
# Level 1: Policy Type + Location + Age Group
df['Premium Amount'] = df.groupby(
['Policy Type', 'Location', 'Age Group']
)['Premium Amount'].transform(lambda x: x.fillna(x.median()))
# Level 2: Policy Type + Location (fallback)
df['Premium Amount'] = df.groupby(
['Policy Type', 'Location']
)['Premium Amount'].transform(lambda x: x.fillna(x.median()))
# Level 3: Policy Type (fallback)
df['Premium Amount'] = df.groupby('Policy Type')['Premium Amount'].transform(
lambda x: x.fillna(x.median())
)
# Final Fallback: Overall median
overall_median = df['Premium Amount'].median()
df['Premium Amount'] = df['Premium Amount'].fillna(overall_median)
# Verify results
print(f"Missing values remaining: {df['Premium Amount'].isnull().sum()}")
Missing values remaining: 0
Outliers¶
# List of numerical columns to check for outliers
columns_to_check = [
"Age", "Annual Income", "Number of Dependents", "Health Score",
"Previous Claims", "Credit Score", "Insurance Duration", "Premium Amount"
]
# Detect outliers using IQR method
for column in columns_to_check:
Q1 = df[column].quantile(0.25)
Q3 = df[column].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Identify outliers
outliers = (df[column] < lower_bound) | (df[column] > upper_bound)
# Print outlier values
print(f"Outliers in {column}:")
print(df[outliers][column].values)
print("-" * 50)
Outliers in Age: <IntegerArray> [] Length: 0, dtype: Int64 -------------------------------------------------- Outliers in Annual Income: [ 8942382. 10642586. 5102224. ... 4267852. 7627614. 4225024.] -------------------------------------------------- Outliers in Number of Dependents: <IntegerArray> [] Length: 0, dtype: Int64 -------------------------------------------------- Outliers in Health Score: [] -------------------------------------------------- Outliers in Previous Claims: [3. 3. 3. ... 4. 3. 3.] -------------------------------------------------- Outliers in Credit Score: [] -------------------------------------------------- Outliers in Insurance Duration: [] -------------------------------------------------- Outliers in Premium Amount: [60067.38023978 47219.5120086 80505.37405629 ... 52714.2924196 51794.31105174 67101.50990761] --------------------------------------------------
# List of columns to check
columns_to_check = ["Annual Income", "Premium Amount", "Previous Claims"]
# Dictionary to store outlier counts
outlier_counts = {}
# Detect outliers using IQR method
for column in columns_to_check:
Q1 = df[column].quantile(0.25)
Q3 = df[column].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Count outliers
outliers = ((df[column] < lower_bound) | (df[column] > upper_bound)).sum()
outlier_counts[column] = outliers
# Print the number of outliers in each column
print("Outlier Counts:")
print(outlier_counts)
Outlier Counts: {'Annual Income': 117135, 'Premium Amount': 127320, 'Previous Claims': 62056}
# Calculate the 5th and 95th percentiles for Annual Income
lower_bound_income = np.percentile(df['Annual Income'], 5)
upper_bound_income = np.percentile(df['Annual Income'], 95)
print(f"Annual Income Capping Range: ({lower_bound_income}, {upper_bound_income})")
# Apply 5%-95% capping to Annual Income
df['Annual Income'] = np.clip(df['Annual Income'], lower_bound_income, upper_bound_income)
# Calculate the 4th and 96th percentiles for Premium Amount
lower_bound_premium = np.percentile(df['Premium Amount'], 4)
upper_bound_premium = np.percentile(df['Premium Amount'], 96)
print(f"Premium Amount Capping Range: ({lower_bound_premium}, {upper_bound_premium})")
# Apply 4%-96% capping to Premium Amount
df['Premium Amount'] = np.clip(df['Premium Amount'], lower_bound_premium, upper_bound_premium)
print(f"After Capping - Min: {df['Premium Amount'].min()}, Max: {df['Premium Amount'].max()}")
Annual Income Capping Range: (88150.0, 6306990.600000008) Premium Amount Capping Range: (2690.7025942529003, 81897.70396749469) After Capping - Min: 2690.7025942529003, Max: 81897.70396749469
# Cap Previous Claims at a maximum of 6
df['Previous Claims'] = np.where(df['Previous Claims'] > 6, 6, df['Previous Claims'])
# Check the updated value counts
print(df['Previous Claims'].value_counts().sort_index())
Previous Claims 0.00 305385 1.00 663368 2.00 167631 3.00 49003 4.00 10668 5.00 2016 6.00 369 Name: count, dtype: int64
- Annual Income capping (5th-95th percentile): We calculate the 5th percentile (lower_bound_income) and 95th percentile (upper_bound_income) of the Annual Income column. Any value below the 5th percentile is raised to it, and any value above the 95th percentile is lowered to it, using np.clip().
- Premium Amount capping (4th-96th percentile): Values below the 4th percentile or above the 96th percentile are capped to the nearest bound in the same way.
- Previous Claims cap at 6: Any value in the Previous Claims column greater than 6 is replaced by 6 using np.where().
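The two percentile caps follow the same pattern, so they could be factored into one small helper. A sketch (the `cap_percentiles` name is an assumption, not part of the original notebook; the percentiles mirror the steps above):

```python
import numpy as np
import pandas as pd

def cap_percentiles(s: pd.Series, lower_pct: float, upper_pct: float) -> pd.Series:
    """Clip a numeric series to its [lower_pct, upper_pct] percentile range."""
    lo, hi = np.nanpercentile(s, [lower_pct, upper_pct])
    return s.clip(lo, hi)

# e.g. df['Annual Income']  = cap_percentiles(df['Annual Income'], 5, 95)
#      df['Premium Amount'] = cap_percentiles(df['Premium Amount'], 4, 96)
```

Using `np.nanpercentile` keeps the bounds stable even if any NaNs remain in the column.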
Key Metrics & Calculations¶
# 1 Distribution of Policy Types
print("\n\033[1m Policy Type Distribution: \033[0m")
print(df['Policy Type'].value_counts())
Policy Type Distribution:
Policy Type
Premium 401345
Comprehensive 399087
Basic 398008
Name: count, dtype: int64
# 2 Age Group Distribution
print("\n\033[1m Age Group Distribution: \033[0m")
print(df['Age Group'].value_counts())
Age Group Distribution:
Age Group
31-50 505676
51+ 373681
18-30 319083
Name: count, dtype: int64
# 3 Location Distribution
print("\n\033[1m Location Distribution: \033[0m")
print(df['Location'].value_counts())
Location Distribution:
Location
Tier-3 401023
Tier-2 400416
Tier-1 397001
Name: count, dtype: int64
# 4 Smoking Status Distribution
print("\n\033[1m Smoking Status Distribution: \033[0m")
print(df['Smoking Status'].value_counts())
Smoking Status Distribution:
Smoking Status
Yes 601057
No 597383
Name: count, dtype: int64
# 5 Previous Claims Count Distribution
print("\n\033[1m Previous Claims Count: \033[0m")
print(df['Previous Claims'].value_counts())
Previous Claims Count:
Previous Claims
1.00 663368
0.00 305385
2.00 167631
3.00 49003
4.00 10668
5.00 2016
6.00 369
Name: count, dtype: int64
# 6 Exercise Frequency Distribution
print("\n\033[1m Exercise Frequency Distribution: \033[0m")
print(df['Exercise Frequency'].value_counts())
Exercise Frequency Distribution:
Exercise Frequency
Weekly 305735
Monthly 299454
Rarely 299050
Daily 294201
Name: count, dtype: int64
Data Analysis¶
1. Older Age Groups Pay Higher Health Insurance Premiums¶
# Calculate average Premium Amount per Age Group
age_premium_summary = df.groupby('Age Group')['Premium Amount'].agg(['count', 'mean', 'median'])
# Display the results
print(age_premium_summary)
Age Group | count | mean | median
---|---|---|---
18-30 | 319083 | 15375.54 | 10718.00
31-50 | 505676 | 16420.56 | 11247.49
51+ | 373681 | 31080.88 | 23690.35
age_premium_summary['mean'].plot(kind='bar', color=['#4E79A7', '#F28E2B', '#76B7B2'], figsize=(10, 7))
# Labels and title
plt.xlabel("Age Group", fontsize=12)
plt.ylabel("Average Premium Amount", fontsize=12)
plt.title("Average Premium Amount by Age Group", fontsize=14)
# Show values on bars
for index, value in enumerate(age_premium_summary['mean']):
plt.text(index, value + 1000, f"{value:.2f}", ha='center', fontsize=11, fontweight='bold')
# Show plot
plt.show()
Findings: Premiums increase with age; on median values, the 51+ group pays more than twice as much as the 18-30 group (23,690 vs 10,718).
The 31-50 group holds the highest policy volume, making it a key segment for pricing adjustments. This supports the hypothesis.
2. Health insurance premiums vary across locations due to differences in healthcare access and costs¶
# Calculate average Premium Amount per Location Tier
location_premium_summary = df.groupby('Location')['Premium Amount'].agg(['count', 'mean', 'median'])
# Display the results
print(location_premium_summary)
           count      mean    median
Location
Tier-1    397001  20878.53  12756.78
Tier-2    400416  20801.86  12952.51
Tier-3    401023  20461.92  12915.80
# Define a refined color palette (modern theme)
colors = ['#4B71A5', '#F28E2B', '#76B7B2']
# Plot the bar chart
location_premium_summary['mean'].plot(kind='bar', color=colors, figsize=(10, 7), edgecolor='black')
# Labels and title with padding
plt.xlabel("Location", fontsize=12, fontweight='bold')
plt.ylabel("Average Premium Amount", fontsize=12, fontweight='bold')
plt.title("Average Premium Amount by Location", fontsize=14, fontweight='bold', pad=30) # Increased padding
# Show values on bars
for index, value in enumerate(location_premium_summary['mean']):
plt.text(index, value + 500, f"{value:.2f}", ha='center', fontsize=11, fontweight='bold')
# Show plot
plt.show()
Findings: The average premium amount is relatively similar across Tier-1 (20,878.53), Tier-2 (20,801.86), and Tier-3 (20,461.92), showing minimal variation in location-based pricing.
Hypothesis Status: "Location influences health insurance premiums" – Partially True; the effect exists but is small.
3. Policy Type Influences Health Insurance Premium Amounts¶
# Calculate average Premium Amount per Policy Type Tier
Policy_Type_summary = df.groupby('Policy Type')['Premium Amount'].agg(['count', 'mean', 'median'])
# Display the results
print(Policy_Type_summary)
                count      mean    median
Policy Type
Basic          398008  19324.86  10825.14
Comprehensive  399087  20251.81  11397.64
Premium        401345  22549.71  12952.51
Policy_Type_summary['mean'].plot(kind='bar', color=['#4B6A9B', '#F4A261', '#2A9D8F'], figsize=(10, 7))
# Labels and title
plt.xlabel("Policy Type", fontsize=12)
plt.ylabel("Average Premium Amount", fontsize=12)
plt.title("Average Premium Amount by Policy Type", fontsize=14)
# Show values on bars
for index, value in enumerate(Policy_Type_summary['mean']):
plt.text(index, value + 1000, f"{value:.2f}", ha='center', fontsize=11, fontweight='bold')
# Show plot
plt.show()
Findings: Premium amounts increase with policy coverage, with Basic the lowest (19,324.86) and Premium the highest (22,549.71).
Hypothesis Status: "Policy Type Influences Health Insurance Premium Amounts" – True.
4. Smokers tend to have higher insurance premiums compared to non-smokers.¶
smoking_premium_comparison = df.groupby("Smoking Status")["Premium Amount"].mean().reset_index()
# Display results
print(smoking_premium_comparison)
  Smoking Status  Premium Amount
0             No        19405.28
1            Yes        22013.74
# Plot the computed comparison (avoids re-typing the values by hand)
plt.figure(figsize=(8, 5))
sns.barplot(x="Smoking Status", y="Premium Amount", hue="Smoking Status",
            data=smoking_premium_comparison, palette="Blues", legend=False)
# Labels and title
plt.xlabel("Smoking Status")
plt.ylabel("Average Premium Amount")
plt.title("Impact of Smoking on Insurance Premiums")
plt.show()
Finding: Smokers pay higher health insurance premiums (₹22,013.74) compared to non-smokers (₹19,405.28).
Hypothesis Status: True – Smoking status increases the premium amount.
5. Individuals who exercise more frequently (Daily/Weekly) tend to have lower health insurance premiums compared to those who exercise less frequently (Monthly/Rarely).¶
# Group by Exercise Frequency and calculate mean Premium Amount
exercise_premium = df.groupby("Exercise Frequency")["Premium Amount"].mean().reset_index()
# Sort the categories in the expected order (Daily → Weekly → Monthly → Rarely)
exercise_premium["Exercise Frequency"] = pd.Categorical(exercise_premium["Exercise Frequency"],
categories=["Daily", "Weekly", "Monthly", "Rarely"],
ordered=True)
exercise_premium = exercise_premium.sort_values("Exercise Frequency")
# Plot the data
plt.figure(figsize=(8, 5))
sns.barplot(x="Exercise Frequency", y="Premium Amount", hue="Exercise Frequency",
            data=exercise_premium, palette="viridis", legend=False)
# Add labels and title
plt.xlabel("Exercise Frequency")
plt.ylabel("Average Premium Amount")
plt.title("Impact of Exercise Frequency on Premium Amount")
plt.show()
exercise_premium
|   | Exercise Frequency | Premium Amount |
|---|--------------------|----------------|
| 0 | Daily              | 18960.46       |
| 3 | Weekly             | 18968.28       |
| 1 | Monthly            | 19879.45       |
| 2 | Rarely             | 25057.54       |
Findings: Individuals who exercise less frequently (Monthly or Rarely) have higher average health insurance premiums, with the "Rarely" group paying the most.
Hypothesis Status: True – More frequent exercise (Daily/Weekly) is associated with lower health insurance premiums.
6. Higher the Health Score, higher the Premium Amount.¶
health_premium_analysis = df.groupby("Health Score Category")["Premium Amount"].mean().reset_index()
# Display results
print(health_premium_analysis)
  Health Score Category  Premium Amount
0                  High        22530.12
1                   Low        17419.91
2                Medium        20672.94
# Set plot style and size
plt.figure(figsize=(8, 6))
sns.set_style("whitegrid")
# Create bar plot using health_premium_analysis
sns.barplot(x="Health Score Category", y="Premium Amount", hue="Health Score Category", data=health_premium_analysis, palette="crest", legend=False)
# Add labels and title
plt.xlabel("Health Score Category", fontsize=12)
plt.ylabel("Average Premium Amount", fontsize=12)
plt.title("Average Premium Amount by Health Score Category", fontsize=14)
# Display values on each bar
for index, value in enumerate(health_premium_analysis["Premium Amount"]):
plt.text(index, value + 500, f"{value:.2f}", ha='center', fontsize=11, fontweight='bold')
plt.show()
Findings: Average premiums rise with Health Score category: High (22,530.12) > Medium (20,672.94) > Low (17,419.91).
Hypothesis Status: True – A higher Health Score corresponds to a higher premium amount.
7. Higher Previous Claims Lead to Increased Premium Prices.¶
# Group by Previous Claims and calculate average premium
df_grouped = df.groupby("Previous Claims")["Premium Amount"].mean().reset_index()
# Convert Previous Claims to categorical if it's numeric
df_grouped["Previous Claims"] = df_grouped["Previous Claims"].astype(str)
# Barplot for visualization
plt.figure(figsize=(8, 5))
sns.barplot(x="Previous Claims", y="Premium Amount", hue="Previous Claims",
            data=df_grouped, palette="viridis", legend=False)
plt.xlabel("Previous Claims")
plt.ylabel("Average Premium Amount")
plt.title("Impact of Previous Claims on Premium Amount")
plt.show()
df_grouped
|   | Previous Claims | Premium Amount |
|---|-----------------|----------------|
| 0 | 0.0             | 23097.32       |
| 1 | 1.0             | 18835.50       |
| 2 | 2.0             | 22888.42       |
| 3 | 3.0             | 23065.29       |
| 4 | 4.0             | 23487.99       |
| 5 | 5.0             | 24176.75       |
| 6 | 6.0             | 24572.97       |
Finding: Premium amounts generally increase as the number of previous claims rises, with a clear upward trend from 1 to 6 claims.
Hypothesis Status: Partially True – While premium amounts increase with higher claims, the premium for 0 claims is higher than for 1 claim, indicating some variation in the trend.
Formula Making¶
The code is calculating adjustment factors for Age and Location under the Basic Policy type. These factors help quantify how much the premium amount increases or decreases relative to the base premium. These adjustment factors can later be used to build a pricing formula for calculating premiums based on age and location.
For the base value, we take the median premium of each policy type. The Base Premium serves as the reference point to measure how much premiums vary by Age and Location.
For each policy type, the data is grouped by Age Group, the mean premium per Age Group is calculated, and the Age Factor is computed by comparing each group's mean premium to the Base Premium using this formula:
Age Factor¶
Age Factor = (Mean Premium for Age Group − Base Premium) / Base Premium
For each policy type, the data is similarly grouped by Location, the mean premium per location (Tier-1, Tier-2, Tier-3) is calculated, and the Location Factor is computed using the same formula as the Age Factor.
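As a quick sanity check of the formula, the Basic-policy figures reported in the notebook output (base premium ₹10,825.14; mean premium ₹14,259.28 for ages 18-30) reproduce the 0.32 age factor:

```python
# Sanity check of the adjustment-factor formula using the Basic-policy
# figures from the notebook's output (values copied from those results).
base_premium = 10825.14        # median premium, Basic policy
mean_premium_18_30 = 14259.28  # mean premium, Basic policy, ages 18-30

age_factor = (mean_premium_18_30 - base_premium) / base_premium
print(round(age_factor, 2))  # 0.32
```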
# df contains the full dataset (~1.2 million records)
base_premium_basic = df[df['Policy Type'] == 'Basic']['Premium Amount'].median()
print(f"✅ Base Premium (Basic Policy from Full Data): ₹{base_premium_basic:.2f}")
# Filter only Basic policies
df_basic = df[df['Policy Type'] == 'Basic']
# Calculate Age Factors (mean method)
age_factors_basic = df_basic.groupby('Age Group')['Premium Amount'].mean().reset_index()
age_factors_basic['Age Factor'] = (age_factors_basic['Premium Amount'] - base_premium_basic) / base_premium_basic
print("\n✅ Age Adjustment Factors (Basic Policy - Full Data):")
print(age_factors_basic)
# Calculate Location Factors (mean method)
location_factors_basic = df_basic.groupby('Location')['Premium Amount'].mean().reset_index()
location_factors_basic['Location Factor'] = (location_factors_basic['Premium Amount'] - base_premium_basic) / base_premium_basic
print("\n✅ Location Adjustment Factors (Basic Policy - Full Data):")
print(location_factors_basic)
✅ Base Premium (Basic Policy from Full Data): ₹10825.14

✅ Age Adjustment Factors (Basic Policy - Full Data):
  Age Group  Premium Amount  Age Factor
0     18-30        14259.28        0.32
1     31-50        15300.27        0.41
2       51+        29131.29        1.69

✅ Location Adjustment Factors (Basic Policy - Full Data):
  Location  Premium Amount  Location Factor
0   Tier-1        19542.62             0.81
1   Tier-2        19393.83             0.79
2   Tier-3        19040.39             0.76
# Base Premium (Comprehensive Policy from Full Data)
base_premium_comprehensive = df[df['Policy Type'] == 'Comprehensive']['Premium Amount'].median()
print(f"✅ Base Premium (Comprehensive Policy from Full Data): ₹{base_premium_comprehensive:.2f}")
# Filter only Comprehensive policies
df_comprehensive = df[df['Policy Type'] == 'Comprehensive']
# Calculate Age Factors (mean method)
age_factors_comprehensive = df_comprehensive.groupby('Age Group')['Premium Amount'].mean().reset_index()
age_factors_comprehensive['Age Factor'] = (age_factors_comprehensive['Premium Amount'] - base_premium_comprehensive) / base_premium_comprehensive
print("\n✅ Age Adjustment Factors (Comprehensive Policy - Full Data):")
print(age_factors_comprehensive)
# Calculate Location Factors (mean method)
location_factors_comprehensive = df_comprehensive.groupby('Location')['Premium Amount'].mean().reset_index()
location_factors_comprehensive['Location Factor'] = (location_factors_comprehensive['Premium Amount'] - base_premium_comprehensive) / base_premium_comprehensive
print("\n✅ Location Adjustment Factors (Comprehensive Policy - Full Data):")
print(location_factors_comprehensive)
✅ Base Premium (Comprehensive Policy from Full Data): ₹11397.64

✅ Age Adjustment Factors (Comprehensive Policy - Full Data):
  Age Group  Premium Amount  Age Factor
0     18-30        14953.29        0.31
1     31-50        15951.82        0.40
2       51+        30567.09        1.68

✅ Location Adjustment Factors (Comprehensive Policy - Full Data):
  Location  Premium Amount  Location Factor
0   Tier-1        20323.39             0.78
1   Tier-2        20375.34             0.79
2   Tier-3        20056.35             0.76
# Base Premium (Premium Policy from Full Data)
base_premium_premium = df[df['Policy Type'] == 'Premium']['Premium Amount'].median()
print(f"✅ Base Premium (Premium Policy from Full Data): ₹{base_premium_premium:.2f}")
# Filter only Premium policies
df_premium = df[df['Policy Type'] == 'Premium']
# Calculate Age Factors (mean method)
age_factors_premium = df_premium.groupby('Age Group')['Premium Amount'].mean().reset_index()
age_factors_premium['Age Factor'] = (age_factors_premium['Premium Amount'] - base_premium_premium) / base_premium_premium
print("\n✅ Age Adjustment Factors (Premium Policy - Full Data):")
print(age_factors_premium)
# Calculate Location Factors (mean method)
location_factors_premium = df_premium.groupby('Location')['Premium Amount'].mean().reset_index()
location_factors_premium['Location Factor'] = (location_factors_premium['Premium Amount'] - base_premium_premium) / base_premium_premium
print("\n✅ Location Adjustment Factors (Premium Policy - Full Data):")
print(location_factors_premium)
✅ Base Premium (Premium Policy from Full Data): ₹12952.51

✅ Age Adjustment Factors (Premium Policy - Full Data):
  Age Group  Premium Amount  Age Factor
0     18-30        16899.40        0.30
1     31-50        18001.85        0.39
2       51+        33519.55        1.59

✅ Location Adjustment Factors (Premium Policy - Full Data):
  Location  Premium Amount  Location Factor
0   Tier-1        22759.68             0.76
1   Tier-2        22628.89             0.75
2   Tier-3        22264.68             0.72
For previous_claims_factor, exercise_factors, health_score_factors, and smoking_factor, the factors come from the hypotheses above:
previous_claims_factors = {
0: 0.00,
1: 0.05,
2: 0.10,
3: 0.15,
4: 0.20,
5: 0.25
}
smoking_factor = 0.1312  # applied when a customer's Smoking Status is 'Yes', else 0.00
exercise_factors = {
'Daily': 0.00,
'Weekly': 0.0002,
'Monthly': 0.0482,
'Rarely': 0.3151
}
health_score_factors = {
'High': 0.2930,
'Medium': 0.1863,
'Low': 0.00
}
# Risk Adjustment Data
data = {
'Basic': {
'Age': {
'18-30': (14259.28, 0.32),
'31-50': (15300.27, 0.41),
'51+': (29131.29, 1.69)
},
'Location': {
'Tier-1': (19542.62, 0.81),
'Tier-2': (19393.83, 0.79),
'Tier-3': (19040.39, 0.76)
}
},
'Comprehensive': {
'Age': {
'18-30': (14953.29, 0.31),
'31-50': (15951.82, 0.40),
'51+': (30567.09, 1.68)
},
'Location': {
'Tier-1': (20323.39, 0.78),
'Tier-2': (20375.34, 0.79),
'Tier-3': (20056.35, 0.76)
}
},
'Premium': {
'Age': {
'18-30': (16899.40, 0.30),
'31-50': (18001.85, 0.39),
'51+': (33519.55, 1.59)
},
'Location': {
'Tier-1': (22759.68, 0.76),
'Tier-2': (22628.89, 0.75),
'Tier-3': (22264.68, 0.72)
}
}
}
# Risk Adjustment Factors
def get_risk_factors(smoking_status, exercise_freq, health_score, previous_claims):
smoking_factor = 0.1312 if smoking_status == 'Yes' else 0.00
# Previous Claims Adjustment
previous_claims_factors = {
0: 0.00,
1: 0.05,
2: 0.10,
3: 0.15,
4: 0.20,
5: 0.25
}
# Get factor, default to 0.25 if claims exceed 5
previous_claims_factor = previous_claims_factors.get(previous_claims, 0.25)
exercise_factors = {
'Daily': 0.00,
'Weekly': 0.0002,
'Monthly': 0.0482,
'Rarely': 0.3151
}
exercise_factor = exercise_factors.get(exercise_freq, 0.00)
health_score_factors = {
'High': 0.2930,
'Medium': 0.1863,
'Low': 0.00
}
health_score_factor = health_score_factors.get(health_score, 0.00)
return smoking_factor, exercise_factor, health_score_factor , previous_claims_factor
# Function to calculate premium
def calculate_final_premium(row):
policy_type = row['Policy Type']
age_group = row['Age Group']
location = row['Location']
# Check if policy type, age group, location are valid
if policy_type not in data or age_group not in data[policy_type]['Age'] or location not in data[policy_type]['Location']:
return None
# Get base premium and factors
age_premium, age_factor = data[policy_type]['Age'][age_group]
loc_premium, loc_factor = data[policy_type]['Location'][location]
# Get risk factors
smoking_factor, exercise_factor, health_score_factor, previous_claims_factor = get_risk_factors(row['Smoking Status'], row['Exercise Frequency'], row['Health Score'],row['Previous Claims'])
# Calculate final premium
base_premium = age_premium
total_factor = loc_factor + smoking_factor + exercise_factor + health_score_factor + previous_claims_factor
final_premium = base_premium * (1 + total_factor)
return round(final_premium, 2)
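To make the formula concrete, here is a minimal, self-contained sketch of the same calculation for one hypothetical customer (the profile is illustrative, not taken from the dataset), using the Basic-policy factors derived above:

```python
# Minimal sketch of the premium formula for one hypothetical customer,
# using the Basic-policy factors derived above.
age_base = {'18-30': 14259.28, '31-50': 15300.27, '51+': 29131.29}  # Basic policy
location_factor = {'Tier-1': 0.81, 'Tier-2': 0.79, 'Tier-3': 0.76}  # Basic policy

customer = {
    'Age Group': '18-30', 'Location': 'Tier-1',
    'Smoking Status': 'Yes', 'Exercise Frequency': 'Daily',
    'Health Score Category': 'Low', 'Previous Claims': 0,
}

smoking = 0.1312 if customer['Smoking Status'] == 'Yes' else 0.00
exercise = {'Daily': 0.00, 'Weekly': 0.0002, 'Monthly': 0.0482, 'Rarely': 0.3151}[customer['Exercise Frequency']]
health = {'High': 0.2930, 'Medium': 0.1863, 'Low': 0.00}[customer['Health Score Category']]
claims = 0.05 * min(customer['Previous Claims'], 5)  # 0.05 per claim, capped at 0.25

total_factor = location_factor[customer['Location']] + smoking + exercise + health + claims
final_premium = round(age_base[customer['Age Group']] * (1 + total_factor), 2)
print(final_premium)  # 27680.11
```

The base premium is the age-group mean, and all other factors are additive multipliers on top of it, mirroring `calculate_final_premium`.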
# Assuming your DataFrame is called df
df['Final Premium'] = df.apply(calculate_final_premium, axis=1) # Apply function row-wise
# Check the DataFrame with the new column
print(df.head())
   id  Age Gender  Annual Income    Marital Status  Number of Dependents  \
0   0   19  Woman      864214.00    Spouse Present                     1
1   1   39  Woman      892701.25    Spouse Present                     3
2   2   23    Man     2201772.00  Formerly Married                     3
3   3   21    Man     3997541.99    Spouse Present                     2
4   4   21    Man     3409986.00       Not Married                     1

  Education Level Occupation  Health Score Location  ...  Insurance Duration  \
0   Undergraduate   Business         26.60   Tier-1  ...                5.00
1   Post Graduate    Missing         21.57   Tier-2  ...                2.00
2   Undergraduate   Business         50.18   Tier-3  ...                3.00
3   Undergraduate    Missing         16.94   Tier-2  ...                1.00
4   Undergraduate   Business         24.38   Tier-2  ...                4.00

            Policy Start Date Customer Feedback Smoking Status  \
0  2023-12-23 15:21:39.134960              Poor             No
1  2023-06-12 15:21:39.111551           Average            Yes
2  2023-09-30 15:21:39.221386              Good            Yes
3  2024-06-12 15:21:39.226954              Poor            Yes
4  2021-12-01 15:21:39.252145              Poor            Yes

  Exercise Frequency  Property Type  Premium Amount Age Group  \
0             Weekly  Detached Home         2690.70     18-30
1            Monthly  Detached Home        10908.90     31-50
2             Weekly  Detached Home        21563.14     18-30
3              Daily           Flat         2690.70     18-30
4             Weekly  Detached Home         2690.70     18-30

  Health Score Category  Final Premium
0                Medium       31436.26
1                Medium       32213.11
2                  High       32132.52
3                   Low       28107.89
4                Medium       31794.53

[5 rows x 23 columns]
df['Final Premium']
|         | Final Premium |
|---------|---------------|
| 0       | 31436.26      |
| 1       | 32213.11      |
| 2       | 32132.52      |
| 3       | 28107.89      |
| 4       | 31794.53      |
| ...     | ...           |
| 1199995 | 32583.35      |
| 1199996 | 56249.56      |
| 1199997 | 25783.63      |
| 1199998 | 59329.60      |
| 1199999 | 32605.70      |

1198440 rows × 1 columns
Affordability Of Price¶
df['IPR'] = (df['Final Premium'] / df['Annual Income']) * 100
The Insurance Premium to Income Ratio (IPR) is a financial metric that measures how much of a customer's Annual Income is spent on their health insurance premium. It is expressed as a percentage and helps assess the affordability of the policy.
The thresholds (2, 4, 12) are based on financial guidelines:
≤ 2%: Higher Affordability – Minimal financial burden.
2% to 4%: High Affordability – Reasonable premium cost.
4% to 12%: Moderate Affordability – Manageable but higher expense.
> 12%: Low Affordability – Significant financial strain.
These values are chosen to reflect practical affordability levels and industry benchmarks.
def categorize_affordability(ipr):
if ipr <= 2:
return 'Higher Affordability'
elif 2 < ipr <= 4:
return 'High Affordability'
elif 4 < ipr <= 12: # Increased range from 10 to 12
return 'Moderate Affordability'
else:
return 'Low Affordability'
# Apply the updated affordability categories
df['Affordability_Category'] = df['IPR'].apply(categorize_affordability)
# Check the updated category distribution
print(df['Affordability_Category'].value_counts(normalize=True) * 100)
Affordability_Category
Moderate Affordability   33.09
Higher Affordability     25.35
High Affordability       21.42
Low Affordability        20.14
Name: proportion, dtype: float64
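For a frame of this size, the same binning can also be done in a vectorized way with `pd.cut` instead of a row-wise `apply`; `pd.cut`'s default right-inclusive bins match the `<=` comparisons in `categorize_affordability` exactly. A small self-contained sketch (the sample IPR values are illustrative):

```python
import pandas as pd

# Vectorized alternative to the row-wise apply: bin IPR with pd.cut.
# The sample IPR values below are illustrative, not from the dataset.
ipr = pd.Series([1.5, 3.0, 8.0, 15.0])

labels = ['Higher Affordability', 'High Affordability',
          'Moderate Affordability', 'Low Affordability']
# Default right=True gives (-inf, 2], (2, 4], (4, 12], (12, inf]
category = pd.cut(ipr, bins=[-float('inf'), 2, 4, 12, float('inf')], labels=labels)
print(category.tolist())
# ['Higher Affordability', 'High Affordability', 'Moderate Affordability', 'Low Affordability']
```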
Overall Analysis:¶
- Age directly influences premium amounts—older individuals pay higher premiums—making age a true factor for premium calculation adjustments.
- Tier-1 cities have slightly higher premiums compared to Tier-2 and Tier-3, but the variation is small, indicating location has only a minor effect on premium pricing.
- Premium policyholders pay the highest premiums, followed by Comprehensive, and Basic policies have the lowest premiums. Policy type directly affects premium calculations.
- Smokers consistently pay higher premiums than non-smokers, confirming a direct impact.
- Higher health scores result in higher premiums, while lower health scores correspond to lower premiums.
- Customers with zero claims pay more than those with one claim, but from 1 to 6 prior claims premium amounts increase steadily.
- Regular exercise reduces health risks, leading to lower premiums, confirming that exercise frequency directly affects premium costs.
Recommendations:¶
- Target Low & Moderate Affordability Groups (53.23% Total): Offer customized discounts or flexible payment plans to make premiums more manageable.
- Affordable Policy Variants: Design basic coverage plans tailored for Low and Moderate Affordability groups to capture a broader market.
- Income-Based Premium Plans: Implement tiered pricing tied to income brackets to improve affordability and attract new customers.
- Segment-Specific Marketing: Focus marketing efforts on the Low Affordability segment by highlighting cost-saving benefits and value-added services.
- Focus Offers on the Low Affordability Segment: Discounts & Payment Flexibility – Provide limited-time discounts or EMI options to ease the financial burden.
- Incentives for High Affordability Segment: Premium Upgrades – Offer value-added services (e.g., dental, vision, or wellness programs) to encourage policy upgrades.