NEXTLEAP'S GRADUATION PROJECT -- ACKO ANALYSIS¶
BUSINESS PROBLEM OVERVIEW¶
Acko Health Insurance: Designing a Data-Driven Insurance Premium Pricing Strategy¶
The insurance industry plays a crucial role in protecting individuals and businesses from financial risk, providing peace of mind through coverage for health, life, property, and more. As the sector evolves, insurers increasingly leverage data and technology to offer personalized policies, improve risk assessment, and enhance customer experience. In a competitive market, developing tailored pricing strategies is essential to ensure profitability while meeting customer expectations for affordability and transparency.
In this project, we focus on Acko's health insurance offering, using demographic, financial, and behavioral data to devise a data-driven premium pricing strategy. With no direct access to health-related data like medical history, we will analyze available indicators to estimate risk and define a logical, rule-based formula for premium calculations. This analysis aims to balance business profitability with customer affordability, proposing actionable insights to help Acko optimize its pricing model and better cater to diverse customer segments.
Objective¶
To develop a data-driven, rule-based premium pricing strategy for Acko's new health insurance product that optimizes business profitability while ensuring customer affordability by:
- Leveraging demographic, financial, and behavioral data to assess risk and affordability.
- Creating a structured, rule-based formula for calculating the premium price.
- Ensuring the pricing strategy aligns with both profitability goals and customer financial capacity.
Base Formula to be used¶
Risk-Adjusted Premium = Base Premium + Risk Adjustment
- Base Premium: The initial price based on demographic factors (age, location, etc.).
- Risk Adjustment: A surcharge/discount based on risk and financial factors
This base formula will be further adjusted based on three factors: rule-based logic, analysis of the available data, and market-factor adjustments.
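As a minimal sketch (using hypothetical numbers purely for illustration), the base formula can be read as a simple function of the two components, with the risk adjustment entering as a surcharge or a discount:
# Minimal sketch of the base formula (hypothetical numbers, for illustration only)
def risk_adjusted_premium(base_premium, risk_adjustment):
    # Risk Adjustment is positive for a surcharge and negative for a discount
    return base_premium + risk_adjustment
# Example: a hypothetical base premium of ₹10,000 with a 10% discount applied as the risk adjustment
print(risk_adjusted_premium(10000, -0.10 * 10000))  # 9000.0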
Business Impact:¶
Optimize Profitability Through Risk-Based Pricing: Implementing a scoring system allows Acko to price premiums accurately based on individual and macro-level risk factors.
Increase Policy Adoption Across Income Groups: By leveraging tier-based pricing, Acko can expand its customer base while maintaining affordability, increasing policy volume.
Improve Risk Management & Customer Retention: Adjusting premiums based on claims history, dependents, lifestyle habits (smoking, exercise), and demographics ensures better predictability of claims, reducing high-risk underwriting losses. The dynamic discount system ensures fair premiums based on risk, which enhances customer trust and satisfaction.
Dataset Overview¶
Dataset Name: Acko Dataset
Shape of Dataset
- Number of Rows: 12,00,000 (1.2 million)
- Number of Columns: 20
Description: The dataset records essential metrics for Acko insurance customers, providing insights into their demographics, financial background, health status, and policy information. Each entry corresponds to a unique customer and their insurance-related details, enabling analysis of customer behavior, risk factors, and policy trends.
Column Definitions¶
id: A unique identifier assigned to each customer
Age: The age of the customer in years at the time of policy purchase.
Gender: The gender identity of the customer
Annual Income: The total income earned by the customer in a year, measured in Indian Rupees (INR).
Marital Status: The marital status of the customer
Number of Dependents: People Financially Dependent on Customer
Education Level: The highest level of education completed by the customer
Occupation: The profession or employment category of the customer.
Health Score: A numerical score representing the customer’s health condition based on lifestyle factors and medical history.
Location: The classification of the customer's residence area into tiers such as Tier-1, Tier-2, or Tier-3 cities.
Policy Type: The category of insurance policy purchased by the customer
Previous Claims: The total number of past insurance claims the customer has made before purchasing the current policy.
Credit Score: A numerical representation of the customer’s creditworthiness, indicating their ability to manage finances and make timely payments.
Insurance Duration: The number of years the customer has held the insurance policy.
Policy Start Date: The date when the insurance policy was purchased and became active.
Customer Feedback: The rating or feedback provided by the customer about their experience with the insurance policy.
Smoking Status: Indicates whether the customer is a smoker or not
Exercise Frequency: The frequency of physical exercise performed by the customer
Property Type: The type of home the customer resides in
Premium Amount: The amount the customer pays for their health insurance policy, measured in Indian Rupees (INR).
⚠️ Important Note: ⚠️¶
In Part III (Data Processing), Section E: Business Strategy, the approach requires manual input from Acko's management for the discount percentages, calculated and adjusted for current market conditions. This step prompts the user to enter discount values for each tier, allowing flexibility in defining the discount strategy. When running the entire script in Google Colab, execution will pause at this point until the required inputs are provided.¶
Analysis & Visualization¶
I. Data Importing, Cleaning and Identifying Outliers¶
1. Importing Necessary Libraries¶
import pandas as pd # For data manipulation and analysis
import numpy as np # For numerical computations
import matplotlib.pyplot as plt # For plotting and visualization
import seaborn as sns # For advanced visualizations
2. Loading the dataset from G-Drive¶
# Install gdown
!pip install gdown
# Import necessary libraries
import gdown
import pandas as pd
# Google Drive file ID
file_id = "1i4ia9ZNfAXgu6JGTXCUgFn7Pb8wltzLH"
# Construct the file's download URL
download_url = f"https://drive.google.com/uc?id={file_id}"
# Set the output file name
output_file = "acko_dataset.csv"
# Download the file
gdown.download(download_url, output_file, quiet=False)
# Load the CSV file into a Pandas DataFrame
data = pd.read_csv(output_file)
Downloading... From (original): https://drive.google.com/uc?id=1i4ia9ZNfAXgu6JGTXCUgFn7Pb8wltzLH From (redirected): https://drive.google.com/uc?id=1i4ia9ZNfAXgu6JGTXCUgFn7Pb8wltzLH&confirm=t&uuid=488ec659-2105-4f13-91f0-1c42d7f709b8 To: /content/acko_dataset.csv 100%|██████████| 219M/219M [00:02<00:00, 93.0MB/s]
II. Data Understanding¶
1. Dataset - Sample View¶
#Verifying the imported data
print(f"\033[1mDataset - Sample") # \033[1m is used to print a bold header for clarity
data # Displaying the imported dataset
Dataset - Sample
id | Age | Gender | Annual Income | Marital Status | Number of Dependents | Education Level | Occupation | Health Score | Location | Policy Type | Previous Claims | Credit Score | Insurance Duration | Policy Start Date | Customer Feedback | Smoking Status | Exercise Frequency | Property Type | Premium Amount | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 19.0 | Woman | 8.642140e+05 | Spouse Present | 1.0 | Undergraduate | Business | 26.598761 | Tier-1 | Premium | 2.0 | 372.0 | 5.0 | 2023-12-23 15:21:39.134960 | Poor | No | Weekly | Detached Home | 1945.913327 |
1 | 1 | 39.0 | Woman | 8.927012e+05 | Spouse Present | 3.0 | Post Graduate | Missing | 21.569731 | Tier-2 | Comprehensive | 1.0 | 694.0 | 2.0 | 2023-06-12 15:21:39.111551 | Average | Yes | Monthly | Detached Home | 10908.896072 |
2 | 2 | 23.0 | Man | 2.201772e+06 | Formerly Married | 3.0 | Undergraduate | Business | 50.177549 | Tier-3 | Premium | 1.0 | NaN | 3.0 | 2023-09-30 15:21:39.221386 | Good | Yes | Weekly | Detached Home | 21563.135198 |
3 | 3 | 21.0 | Man | 3.997542e+06 | Spouse Present | 2.0 | Undergraduate | Missing | 16.938144 | Tier-2 | Basic | 1.0 | 367.0 | 1.0 | 2024-06-12 15:21:39.226954 | Poor | Yes | Daily | Flat | 2653.539143 |
4 | 4 | 21.0 | Man | 3.409986e+06 | Not Married | 1.0 | Undergraduate | Business | 24.376094 | Tier-2 | Premium | 0.0 | 598.0 | 4.0 | 2021-12-01 15:21:39.252145 | Poor | Yes | Weekly | Detached Home | 1269.243463 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1199995 | 1199995 | 36.0 | Woman | 7.491760e+05 | Spouse Present | 0.0 | Post Graduate | Full-Time Worker | 15.772907 | Tier-1 | Premium | NaN | 372.0 | 3.0 | 2023-05-03 15:21:39.257696 | Poor | No | Daily | Flat | NaN |
1199996 | 1199996 | 54.0 | Man | 3.077596e+06 | Not Married | NaN | PhD | Business | 17.483482 | Tier-2 | Comprehensive | NaN | 597.0 | 4.0 | 2022-09-10 15:21:39.134960 | Poor | No | Weekly | Flat | NaN |
1199997 | 1199997 | 19.0 | Man | 1.462116e+06 | Formerly Married | 0.0 | Post Graduate | Not Currently Working | 20.724469 | Tier-3 | Basic | 0.0 | NaN | 6.0 | 2021-05-25 15:21:39.106582 | Good | No | Monthly | Apartment | 1459.859503 |
1199998 | 1199998 | 55.0 | Man | NaN | Not Married | 1.0 | PhD | Missing | 24.547381 | Tier-3 | Premium | 1.0 | 407.0 | 4.0 | 2021-09-19 15:21:39.190215 | Poor | No | Daily | Flat | 38622.914451 |
1199999 | 1199999 | 21.0 | Woman | NaN | Formerly Married | 0.0 | PhD | Missing | 18.125323 | Tier-2 | Premium | 0.0 | 502.0 | 6.0 | 2020-08-26 15:21:39.155231 | Good | Yes | Monthly | Detached Home | 37863.127378 |
1200000 rows × 20 columns
2. Displaying the shape of the dataset¶
# Checking the number of rows and column in dataset using shape function.
print("\033[1mThe Number of Rows and Columns in the dataset are:") # \033[1m is used to print a bold header for clarity
data.shape # Displaying the shape of the dataset
The Number of Rows and Columns in the dataset are:
(1200000, 20)
3. Displaying dataset information¶
print("\nDataset Information:") # Printing a newline and a header for dataset information
data.info() # Displaying the info
Dataset Information: <class 'pandas.core.frame.DataFrame'> RangeIndex: 1200000 entries, 0 to 1199999 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 1200000 non-null int64 1 Age 1181295 non-null float64 2 Gender 1200000 non-null object 3 Annual Income 1155051 non-null float64 4 Marital Status 1200000 non-null object 5 Number of Dependents 1090328 non-null float64 6 Education Level 1200000 non-null object 7 Occupation 1200000 non-null object 8 Health Score 1125924 non-null float64 9 Location 1200000 non-null object 10 Policy Type 1200000 non-null object 11 Previous Claims 835971 non-null float64 12 Credit Score 1062118 non-null float64 13 Insurance Duration 1199999 non-null float64 14 Policy Start Date 1200000 non-null object 15 Customer Feedback 1122176 non-null object 16 Smoking Status 1200000 non-null object 17 Exercise Frequency 1200000 non-null object 18 Property Type 1200000 non-null object 19 Premium Amount 784968 non-null float64 dtypes: float64(8), int64(1), object(11) memory usage: 183.1+ MB
4. Checking for Duplicate Values in the Dataset & the Granularity¶
duplicate_count = len(data[data.duplicated()]) # Counting the number of duplicate rows in the dataset
print(f"\033[1mNumber of Duplicate Rows in the Dataset are:") # Printing the number of duplicate rows in bold
duplicate_count # Displaying the duplicate count
Number of Duplicate Rows in the Dataset are:
0
duplicate_counts = data['id'].duplicated().sum() # Counting the number of duplicate values in the 'id' column
print("\033[1mTotal Duplicate IDs:") # Printing the total number of duplicate IDs in bold
duplicate_counts # Displaying the duplicate count
Total Duplicate IDs:
0
The data is unique at the 'ID' level.
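As a compact alternative check (not in the original notebook), pandas' is_unique property confirms the same granularity in one line:
# One-line QC: True when every 'id' occurs exactly once
print(data['id'].is_unique)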
5. Displaying the data summary statistics.¶
# Checking summary statistics of the dataset using describe() function.
print("\033[1m Data Summary Statistics: \033[0m\n") # \033[1m and \033[0m codes are used to print statement in bold.
data.describe() # Displaying the result
Data Summary Statistics:
id | Age | Annual Income | Number of Dependents | Health Score | Previous Claims | Credit Score | Insurance Duration | Premium Amount | |
---|---|---|---|---|---|---|---|---|---|
count | 1.200000e+06 | 1.181295e+06 | 1.155051e+06 | 1.090328e+06 | 1.125924e+06 | 835971.000000 | 1.062118e+06 | 1.199999e+06 | 784968.000000 |
mean | 5.999995e+05 | 4.114556e+01 | 1.664521e+06 | 2.009934e+00 | 3.186879e+01 | 1.002689 | 5.929244e+02 | 5.018219e+00 | 25763.411424 |
std | 3.464103e+05 | 1.353995e+01 | 2.115112e+06 | 1.417338e+00 | 1.239609e+01 | 0.982840 | 1.499819e+02 | 2.594331e+00 | 30563.216524 |
min | 0.000000e+00 | 1.800000e+01 | 1.075000e+01 | 0.000000e+00 | 2.391713e+00 | 0.000000 | 3.000000e+02 | 1.000000e+00 | 292.650059 |
25% | 2.999998e+05 | 3.000000e+01 | 3.968939e+05 | 1.000000e+00 | 2.209691e+01 | 0.000000 | 4.680000e+02 | 3.000000e+00 | 6840.682284 |
50% | 5.999995e+05 | 4.100000e+01 | 8.581660e+05 | 2.000000e+00 | 3.096556e+01 | 1.000000 | 5.950000e+02 | 5.000000e+00 | 14824.932460 |
75% | 8.999992e+05 | 5.300000e+01 | 1.990566e+06 | 3.000000e+00 | 4.114583e+01 | 2.000000 | 7.210000e+02 | 7.000000e+00 | 31316.333081 |
max | 1.199999e+06 | 6.400000e+01 | 1.304357e+07 | 4.000000e+00 | 6.000000e+01 | 9.000000 | 8.490000e+02 | 9.000000e+00 | 240000.000000 |
6. Correcting Datatypes¶
data.dtypes # Displaying the data types of each column in the dataset
0 | |
---|---|
id | int64 |
Age | float64 |
Gender | object |
Annual Income | float64 |
Marital Status | object |
Number of Dependents | float64 |
Education Level | object |
Occupation | object |
Health Score | float64 |
Location | object |
Policy Type | object |
Previous Claims | float64 |
Credit Score | float64 |
Insurance Duration | float64 |
Policy Start Date | object |
Customer Feedback | object |
Smoking Status | object |
Exercise Frequency | object |
Property Type | object |
Premium Amount | float64 |
a. Converting datatype of id to object¶
data['id'] = data['id'].astype('object') # Converting the 'id' column to object type
# QC - Verifying the change
print(data.dtypes['id']) # Printing the data type of the 'id' column to confirm the conversion
object
b. Converting datatype of Number of Dependents to int64¶
data['Number of Dependents'] = data['Number of Dependents'].fillna(0).astype(int) # Filling NaN values with 0 and converting to integer
# QC - Verifying the change
print(data.dtypes['Number of Dependents']) # Printing the data type of 'Number of Dependents' to confirm the conversion
int64
c. Converting datatype of Policy Start Date to datetime64¶
data['Policy Start Date'] = pd.to_datetime(data['Policy Start Date']).dt.normalize() # Converting 'Policy Start Date' to datetime and truncating the time component to midnight
# QC - Verifying the change
print(data.dtypes['Policy Start Date']) # Printing data type of 'Policy Start Date' to confirm conversion
# Getting the minimum and maximum dates to validate the transformation
min_date = data['Policy Start Date'].min()
max_date = data['Policy Start Date'].max()
# Printing results
print("Minimum Start Date:", min_date) # Displaying the earliest policy start date
print("Maximum Start Date:", max_date) # Displaying the latest policy start date
datetime64[ns] Minimum Start Date: 2019-08-17 00:00:00 Maximum Start Date: 2024-08-15 00:00:00
7. Identifying the number of missing/null values in the data¶
# Identifying distribution of missing values and their percentages
missing_values = data.isnull().sum()
missing_percentage = (missing_values / len(data)) * 100
# Combining into a single DataFrame
missing_data = pd.DataFrame({
'Missing Values': missing_values,
'Percentage (%)': missing_percentage.round(1) # Round to 1 decimal place}
})
# Removing rows where Missing Values are 0
missing_data = missing_data[missing_data["Missing Values"] > 0]
# Display results
print("\nMissing Values and Percentages:")
print(missing_data.sort_values(by='Missing Values', ascending=False))
Missing Values and Percentages: Missing Values Percentage (%) Premium Amount 415032 34.6 Previous Claims 364029 30.3 Credit Score 137882 11.5 Customer Feedback 77824 6.5 Health Score 74076 6.2 Annual Income 44949 3.7 Age 18705 1.6 Insurance Duration 1 0.0
# Visualizing the missing values using a heatmap
sns.heatmap(data.isnull(), cbar=False) # Plotting missing values (NaNs) as a heatmap, hiding the color bar
a. Imputing Values for Age¶
# Imputing missing Age values using the mean
data['Age'] = data['Age'].fillna(data['Age'].mean()) # Replacing NaN values with the mean of the Age column
# QC - Verifying if any missing values remain
print(data['Age'].isnull().sum()) # Printing the count of missing values in the Age column (should be 0)
# Checking the range of Age values after imputation
min_age_value = data['Age'].min() # Extracting the minimum Age value
max_age_value = data['Age'].max() # Extracting the maximum Age value
# Printing the min and max Age values
print("Minimum Age:", min_age_value)
print("Maximum Age:", max_age_value)
0 Minimum Age: 18.0 Maximum Age: 64.0
b. Imputing Values for Health Score¶
# Checking correlation of numerical columns with 'Health Score'
correlation = data.select_dtypes(include=np.number).corr()['Health Score'].sort_values(ascending=False) # Computing correlation and sorting in descending order
print(correlation) # Printing the correlation values
Health Score 1.000000 Premium Amount 0.161121 Age 0.115790 Previous Claims 0.067157 Annual Income 0.022784 Credit Score 0.012182 Number of Dependents 0.004436 Insurance Duration 0.002294 Name: Health Score, dtype: float64
- Although Premium Amount has a slightly higher correlation with Health Score (0.161), it is still weak overall; Age (0.116) is close behind and is a more natural predictor of health condition than the premium paid.
- We therefore use grouped median imputation, filling missing Health Scores with the median within each Age group.
# Imputing missing Health Scores using median Health Score within each Age group
data['Health Score'] = data.groupby('Age')['Health Score'].transform(lambda x: x.fillna(x.median())) # Filling NaNs with median value per Age group
# QC - Checking if any missing values remain
print(data['Health Score'].isnull().sum()) # Print count of missing values after imputation
# Getting the minimum and maximum Health Scores to verify the data range
min_value = data['Health Score'].min() # Finding the minimum Health Score
max_value = data['Health Score'].max() # Finding the maximum Health Score
# Printing results
print("Minimum Health Score:", min_value)
print("Maximum Health Score:", max_value)
0 Minimum Health Score: 2.3917132464334734 Maximum Health Score: 60.0
c. Imputing Values for Premium Amount¶
# Computing correlation of 'Premium Amount' with other numeric variables
correlation = data.select_dtypes(include=np.number).corr()['Premium Amount'].sort_values(ascending=False) # Selecting numeric columns, computing correlation with 'Premium Amount', and sorting in descending order
# Printing the correlation values
print(correlation)
Premium Amount 1.000000 Age 0.254410 Health Score 0.161121 Number of Dependents 0.088942 Annual Income 0.009807 Previous Claims -0.000159 Insurance Duration -0.059571 Credit Score -0.073946 Name: Premium Amount, dtype: float64
- Age (0.254) and Health Score (0.161) have the strongest relationships with Premium Amount, though both correlations are still weak to moderate.
- Since Premium Amount is missing in 34.6% of cases, we need a robust imputation strategy.
- We can impute missing values by calculating the median Premium Amount within each Age & Health Score group.
# Filling missing 'Premium Amount' values based on Age and Health Score bins using median
data['Premium Amount'] = data.groupby(['Age', pd.cut(data['Health Score'], bins=[0, 20, 40, 60])], observed=False)['Premium Amount'].transform(lambda x: x.fillna(x.median())) # observed=False passed explicitly to keep current behaviour and silence the pandas FutureWarning
# QC - Checking for remaining null values
print(data['Premium Amount'].isnull().sum()) # Printing count of missing values after imputation
# Getting the minimum and maximum values of 'Premium Amount' to verify the changes
min_pa_value = data['Premium Amount'].min() # Extracting minimum value
max_pa_value = data['Premium Amount'].max() # Extracting maximum value
# Print the results
print("Minimum Premium Amount:", min_pa_value)
print("Maximum Premium Amount:", max_pa_value)
0 Minimum Premium Amount: 292.65005896500105 Maximum Premium Amount: 240000.0
d. Imputing Values for Customer Feedback¶
- As we can't predict a customer's feedback, we can replace the missing values with "No Feedback"
# Replacing all missing values in 'Customer Feedback' with the string "No Feedback"
data['Customer Feedback'] = data['Customer Feedback'].fillna('No Feedback') # Assigning the result back instead of chained inplace fillna (deprecated in pandas)
# QC - Checking if any null values remain
print(data['Customer Feedback'].isnull().sum()) # Should return 0 if all missing values are replaced
# Getting unique values in 'Customer Feedback' to verify changes
unique_feedback = data['Customer Feedback'].unique()
# Printing the unique feedback values
print(unique_feedback)
0 ['Poor' 'Average' 'Good' 'No Feedback']
e. Imputing Values for Number of Dependents¶
#Checking Mean of Number of Dependents
data['Number of Dependents'].mean()
1.8262391666666666
#Checking Median Number of Dependents by Marital Status
data.groupby('Marital Status')['Number of Dependents'].median()
Number of Dependents | |
---|---|
Marital Status | |
Formerly Married | 2.0 |
Not Married | 2.0 |
Spouse Present | 2.0 |
Unknown | 2.0 |
As the median Number of Dependents is 2.0 within every Marital Status group and the overall mean is close to 2, we can use the overall median (2.0) for imputation. (Note that missing values in this column were already filled with 0 during the datatype-correction step, so this fill acts only as a safeguard.)
# Filling missing values in 'Number of Dependents' with 2
data['Number of Dependents'] = data['Number of Dependents'].fillna(2) # Assigning the result back instead of chained inplace fillna (deprecated in pandas)
# QC - Checking if any null values remain
print(data['Number of Dependents'].isnull().sum()) # Should return 0 if all missing values are filled
# Getting minimum and maximum values for validation
min_dep_value = data['Number of Dependents'].min()
max_dep_value = data['Number of Dependents'].max()
# Printing the min and max values
print("Minimum Number of Dependents:", min_dep_value)
print("Maximum Number of Dependents:", max_dep_value)
0 Minimum Number of Dependents: 0 Maximum Number of Dependents: 4
f. Imputing Values for Credit Score¶
# Computing correlation between 'Credit Score' and 'Annual Income'
correlation = data[['Credit Score', 'Annual Income']].corr()
# Printing the correlation matrix
print(correlation)
Credit Score Annual Income Credit Score 1.000000 -0.175664 Annual Income -0.175664 1.000000
- As the correlation between Credit Score and Annual Income is -0.176, which is weak and negative, Annual Income is not a strong predictor of Credit Score. Therefore, higher income does not strongly correspond to higher or lower credit scores.
- So we can use the mean value of Credit Score for imputing the missing values.
# Filling missing values in 'Credit Score' with the column mean
data['Credit Score'] = data['Credit Score'].fillna(data['Credit Score'].mean())
# QC - Checking if any missing values remain
print(data['Credit Score'].isnull().sum())
# Getting minimum and maximum Credit Score values
min_cs_value = data['Credit Score'].min()
max_cs_value = data['Credit Score'].max()
# Printing results
print("Minimum Credit Score:", min_cs_value)
print("Maximum Credit Score:", max_cs_value)
0 Minimum Credit Score: 300.0 Maximum Credit Score: 849.0
g. Imputing Values for Previous Claims¶
# Computing correlation of 'Previous Claims' with other numeric variables
correlation = data.select_dtypes(include=np.number).corr()['Previous Claims'].sort_values(ascending=False)
# Printing the correlation values
print(correlation)
Previous Claims 1.000000 Health Score 0.065243 Annual Income 0.042921 Credit Score 0.034533 Insurance Duration 0.003001 Age 0.001854 Premium Amount 0.000707 Number of Dependents -0.005386 Name: Previous Claims, dtype: float64
- Since Previous Claims has no strong correlation with other features, we assume that missing values mean "no claims" and fill them with 0.
# Filling missing values in 'Previous Claims' with 0
data['Previous Claims'] = data['Previous Claims'].fillna(0) # Assigning the result back instead of chained inplace fillna (deprecated in pandas)
# QC - Checking if any missing values remain
print(data['Previous Claims'].isnull().sum()) # Should print 0 if all missing values are replaced
# Getting the minimum and maximum values of 'Previous Claims' to verify the range
min_pc_value = data['Previous Claims'].min()
max_pc_value = data['Previous Claims'].max()
# Printing the results
print("Minimum Previous Claims:", min_pc_value)
print("Maximum Previous Claims:", max_pc_value)
0 Minimum Previous Claims: 0.0 Maximum Previous Claims: 9.0
h. Imputing Values for Annual Income¶
# Defining age bins
bins = [18, 25, 35, 45, 55, 64]
labels = ["18-25", "26-35", "36-45", "46-55", "56-64"]
# Creating an age group column
data["Age Group"] = pd.cut(data["Age"], bins=bins, labels=labels, include_lowest=True)
# Calculating mean income at Location and Age Group level
income_means = data.groupby(["Location", "Age Group"], observed=False)["Annual Income"].transform("mean")
# Filling missing values with the calculated mean
data["Annual Income"] = data["Annual Income"].fillna(income_means) # Assigning the result back instead of chained inplace fillna (deprecated in pandas)
# Checking if all missing values are filled
print(data["Annual Income"].isnull().sum())
0
i. Verifying the number of missing/null values in the data¶
print(data.isnull().sum())
id 0 Age 0 Gender 0 Annual Income 0 Marital Status 0 Number of Dependents 0 Education Level 0 Occupation 0 Health Score 0 Location 0 Policy Type 0 Previous Claims 0 Credit Score 0 Insurance Duration 1 Policy Start Date 0 Customer Feedback 0 Smoking Status 0 Exercise Frequency 0 Property Type 0 Premium Amount 0 Age Group 0 dtype: int64
8. Introducing Age and Income Groups to aid analyses¶
a. Introducing Income Group from Annual Income¶
# Defining custom income breakpoints for the income bands
bins = [0, 500000, 1000000, 2000000, 5000000, data["Annual Income"].max()]
labels = ["Very Low", "Low", "Medium", "High", "Very High"]
# Assigning each row to an Income Group
data["Income Group"] = pd.cut(data["Annual Income"], bins=bins, labels=labels, include_lowest=True)
# Computing summary statistics for each income group
income_summary = data.groupby("Income Group", observed=False)["Annual Income"].agg([
("Min", "min"),
("Max", "max"),
("75th Percentile", lambda x: x.quantile(0.75)),
("90th Percentile", lambda x: x.quantile(0.90))
]).reset_index()
# Displaying the result
print(income_summary)
Income Group Min Max 75th Percentile 90th Percentile 0 Very Low 10.75 4.999932e+05 3.310140e+05 4.373960e+05 1 Low 500004.00 9.999940e+05 8.738900e+05 9.586154e+05 2 Medium 1000008.00 1.999997e+06 1.667854e+06 1.823767e+06 3 High 2000016.00 4.999884e+06 3.769122e+06 4.270416e+06 4 Very High 5000056.00 1.304357e+07 9.022604e+06 1.048271e+07
9. Checking Outliers¶
a. Checking Outliers for all Numeric columns¶
# Set figure size
plt.figure(figsize=(12, 6)) # Define the size of the figure (width=12, height=6)
# Create multiple boxplots for all numeric columns
data.boxplot(rot=75, color='red') # Generate boxplots for all numeric columns, rotate labels for readability, and set box color to red
# Rotate x-axis labels for better readability
plt.xticks(rotation=45) # Rotate x-axis labels by 45 degrees to prevent overlap
# Set title for the plot
plt.title("Boxplots for All Numeric Columns") # Add a title to the plot
# Display the plot
plt.show() # Render the boxplot
# Creating a boxplot for the "Previous Claims" column
# Set figure size
plt.figure(figsize=(10, 4)) # Define the size of the figure (width=10, height=4)
# Create a boxplot for the "Previous Claims" column
sns.boxplot(data["Previous Claims"]) # Generate a boxplot to visualize the distribution and outliers in the "Previous Claims" column
# Display the plot
plt.show() # Render the boxplot
# Count occurrences of each unique value in the "Previous Claims" column
print(data['Previous Claims'].value_counts())
Previous Claims 0.0 669462 1.0 300811 2.0 167661 3.0 49011 4.0 10668 5.0 2018 6.0 302 7.0 58 8.0 8 9.0 1 Name: count, dtype: int64
- The only metric with a noticeable variation in the outliers is Previous Claims. Since only a handful of values lie at the extreme end, they would not skew a 1.2M-row dataset. Hence, no changes to the outliers.
- Since a few Annual Income outliers are expected in the general population, we are not adjusting for those outliers either.
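To put a number on how small that extreme end is, a quick check (a small sketch based on the value counts shown above) computes the share of customers with 7 or more previous claims, which is well under 0.01% of the 1.2M rows:
# Quantifying the "extreme end" of Previous Claims referred to above
extreme_share = (data["Previous Claims"] >= 7).mean() * 100
print(f"Share of customers with 7+ previous claims: {extreme_share:.4f}%")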
10. Summary of Dataset Observations¶
a. Summary¶
print("\nObservations About the Dataset:")
if duplicate_count > 0:
print(f"- There are {duplicate_count} duplicate rows in the dataset.")
else:
print("- The table's granularity is at ID level. Thus each record represents a customer's data in Acko Database")
if missing_values.sum() > 0:
print("- There are missing values in the dataset. Here’s a summary:")
print(missing_percentage[missing_percentage > 0].round(1).astype(str) + "%")
else:
print("- No missing values found in the dataset.")
print(f"- The time period spans from {min_date} to {max_date} ")
print("- The dataset can be used for further analyses after handling duplicates and imputing missing values.")
Observations About the Dataset: - The table's granularity is at ID level. Thus each record represents a customer's data in Acko Database - There are missing values in the dataset. Here’s a summary: Age 1.6% Annual Income 3.7% Health Score 6.2% Previous Claims 30.3% Credit Score 11.5% Insurance Duration 0.0% Customer Feedback 6.5% Premium Amount 34.6% dtype: object - The time period spans from 2019-08-17 00:00:00 to 2024-08-15 00:00:00 - The dataset can be used for further analyses after handling duplicates and imputing missing values.
b. Unique Values for Each Variable¶
# Unique Values for Each Variable
print("\n### Unique Values for Each Variable ###") #Prints a section header to indicate that unique values per column are being displayed.
for column in data.columns.tolist(): # Iterates through all column names in the dataset.
print(f"No. of unique values in {column}: {data[column].nunique()}.") # Prints the column name along with the count of unique values in that column using .nunique().
### Unique Values for Each Variable ### No. of unique values in id: 1200000. No. of unique values in Age: 48. No. of unique values in Gender: 2. No. of unique values in Annual Income: 247775. No. of unique values in Marital Status: 4. No. of unique values in Number of Dependents: 5. No. of unique values in Education Level: 4. No. of unique values in Occupation: 4. No. of unique values in Health Score: 923540. No. of unique values in Location: 3. No. of unique values in Policy Type: 3. No. of unique values in Previous Claims: 10. No. of unique values in Credit Score: 551. No. of unique values in Insurance Duration: 9. No. of unique values in Policy Start Date: 1826. No. of unique values in Customer Feedback: 4. No. of unique values in Smoking Status: 2. No. of unique values in Exercise Frequency: 4. No. of unique values in Property Type: 3. No. of unique values in Premium Amount: 784564. No. of unique values in Age Group: 5. No. of unique values in Income Group: 5.
c. Copying the dataset for analysis¶
# Copying the dataset for analysis
df = data.copy()
#QC
df.describe().T
count | mean | min | 25% | 50% | 75% | max | std | |
---|---|---|---|---|---|---|---|---|
Age | 1200000.0 | 41.145563 | 18.0 | 30.0 | 41.145563 | 53.0 | 64.0 | 13.434008 |
Annual Income | 1200000.0 | 1664518.999916 | 10.75 | 421658.0 | 888624.0 | 1904577.7408 | 13043574.0 | 2075120.716829 |
Number of Dependents | 1200000.0 | 1.826239 | 0.0 | 0.0 | 2.0 | 3.0 | 4.0 | 1.469939 |
Health Score | 1200000.0 | 31.815757 | 2.391713 | 22.818895 | 30.611358 | 40.44498 | 60.0 | 12.018261 |
Previous Claims | 1200000.0 | 0.698516 | 0.0 | 0.0 | 0.0 | 1.0 | 9.0 | 0.940962 |
Credit Score | 1200000.0 | 592.92435 | 300.0 | 484.0 | 592.92435 | 706.0 | 849.0 | 141.102504 |
Insurance Duration | 1199999.0 | 5.018219 | 1.0 | 3.0 | 5.0 | 7.0 | 9.0 | 2.594331 |
Policy Start Date | 1200000 | 2022-02-12 13:44:51.791998976 | 2019-08-17 00:00:00 | 2020-11-20 00:00:00 | 2022-02-14 00:00:00 | 2023-05-06 00:00:00 | 2024-08-15 00:00:00 | NaN |
Premium Amount | 1200000.0 | 22111.710058 | 292.650059 | 9445.078506 | 13028.110728 | 23751.257479 | 240000.0 | 25479.519693 |
III. Data Processing¶
a. Customer Segmentation¶
In order to segment all available customers and arrive at a sound tiering system, the various attributes must be assigned a score
# Defining the mappings for each categorical column
# Marital Status Scoring (Higher risk = lower score)
marital_scores = {
"Spouse Present": 4, # Lowest risk
"Formerly Married": 3,
"Not Married": 2,
"Unknown": 1 # Highest risk (uncertain)
}
# Education Level Scoring (Higher education = lower risk)
education_scores = {
"PhD": 4, # Lowest risk
"Post Graduate": 3,
"Undergraduate": 2,
"Secondary Education": 1 # Highest risk
}
# Occupation Scoring (Stable jobs = lower risk)
occupation_scores = {
"Full-Time Worker": 4, # Lowest risk
"Business": 3,
"Not Currently Working": 2,
"Missing": 1 # Highest risk (unknown status)
}
# City Tier Scoring (Tier 1 = better healthcare & infrastructure)
city_scores = {
"Tier-1": 3,
"Tier-2": 2,
"Tier-3": 1 # Highest risk (limited facilities)
}
# Policy Type Scoring (More coverage = lower risk)
policy_scores = {
"Premium": 3, # Best coverage (lowest risk)
"Comprehensive": 2,
"Basic": 1 # Least coverage (highest risk)
}
# Property Type Scoring (Stable housing = lower risk)
property_scores = {
"Apartment": 3, # More stable (lower risk)
"Flat": 2,
"Detached Home": 1 # Higher density = slightly more risk
}
#Smoking Scoring
smoking_scores = {
"Yes": 0, # Higher risk
"No": 1 # Lower risk
}
#Exercising Scoring
exercise_scores = {
"Daily": 4, # Lowest risk
"Weekly": 3, # Moderate risk
"Monthly": 2, # Higher risk
"Rarely": 1 # Highest risk
}
#No.of dependents Scoring
dependents_scores = {
0: 4,
1: 4, # Lowest risk
2: 3, # Moderate risk
3: 2, # Higher risk
4: 1 # Highest risk
}
# Function to score Previous Claims
def score_previous_claims(value):
if value == 0:
return 5 # Lowest risk (no claims)
elif value == 1:
return 4 # Low risk
elif value in [2, 3]:
return 3 # Moderate risk
elif value in [4, 5, 6]:
return 2 # High risk
else:
return 1 # Highest risk (7+ claims)
# Adding the new columns to the data DataFrame based on the mappings
data["Marital Score"] = data["Marital Status"].map(marital_scores)
data["Education Score"] = data["Education Level"].map(education_scores)
data["Occupation Score"] = data["Occupation"].map(occupation_scores)
data["City Score"] = data["Location"].map(city_scores)
data["Policy Score"] = data["Policy Type"].map(policy_scores)
data["Property Score"] = data["Property Type"].map(property_scores)
data["Smoking Score"] = data["Smoking Status"].map(smoking_scores)
data["Exercise Score"] = data["Exercise Frequency"].map(exercise_scores)
data["Dependents Score"] = data["Number of Dependents"].map(dependents_scores)
# Applying function for Previous Claims Score
data["Previous Claims Score"] = data["Previous Claims"].apply(score_previous_claims)
# Checking for missing values after mapping
print(data.isnull().sum())
# Viewing the updated data
data.head()
id 0 Age 0 Gender 0 Annual Income 0 Marital Status 0 Number of Dependents 0 Education Level 0 Occupation 0 Health Score 0 Location 0 Policy Type 0 Previous Claims 0 Credit Score 0 Insurance Duration 1 Policy Start Date 0 Customer Feedback 0 Smoking Status 0 Exercise Frequency 0 Property Type 0 Premium Amount 0 Age Group 0 Income Group 0 Marital Score 0 Education Score 0 Occupation Score 0 City Score 0 Policy Score 0 Property Score 0 Smoking Score 0 Exercise Score 0 Dependents Score 0 Previous Claims Score 0 dtype: int64
id | Age | Gender | Annual Income | Marital Status | Number of Dependents | Education Level | Occupation | Health Score | Location | ... | Marital Score | Education Score | Occupation Score | City Score | Policy Score | Property Score | Smoking Score | Exercise Score | Dependents Score | Previous Claims Score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 19.0 | Woman | 8.642140e+05 | Spouse Present | 1 | Undergraduate | Business | 26.598761 | Tier-1 | ... | 4 | 2 | 3 | 3 | 3 | 1 | 1 | 3 | 4 | 3 |
1 | 1 | 39.0 | Woman | 8.927012e+05 | Spouse Present | 3 | Post Graduate | Missing | 21.569731 | Tier-2 | ... | 4 | 3 | 1 | 2 | 2 | 1 | 0 | 2 | 2 | 4 |
2 | 2 | 23.0 | Man | 2.201772e+06 | Formerly Married | 3 | Undergraduate | Business | 50.177549 | Tier-3 | ... | 3 | 2 | 3 | 1 | 3 | 1 | 0 | 3 | 2 | 4 |
3 | 3 | 21.0 | Man | 3.997542e+06 | Spouse Present | 2 | Undergraduate | Missing | 16.938144 | Tier-2 | ... | 4 | 2 | 1 | 2 | 1 | 2 | 0 | 4 | 3 | 4 |
4 | 4 | 21.0 | Man | 3.409986e+06 | Not Married | 1 | Undergraduate | Business | 24.376094 | Tier-2 | ... | 2 | 2 | 3 | 2 | 3 | 1 | 0 | 3 | 4 | 5 |
5 rows × 32 columns
Code Breakdown & Explanation¶
- Defining Risk Scores for Categorical Variables: Each categorical variable is assigned a dictionary-based scoring system. For example:
- Marital Status: A stable marital status (e.g., "Spouse Present") gets a higher score (lower risk) than "Unknown" or "Not Married" (higher risk)
- Education Level: Higher education levels (e.g., "PhD") get higher scores (lower risk) than "Secondary Education" (higher risk)
- Location: Tier-1 locations get higher scores (better infrastructure & jobs - lower risk) than "Tier-3" (higher risk)
- Defining function to Score Previous Claims
- Customers with 0 claims get the highest score (5) → lowest risk.
- Those with 7+ claims get lowest score (1) → highest risk.
- Mapping categorical (textual) values to corresponding scores.
- Checking for missing values and then printing the updated data.
b. Normalization¶
- Normalization is essential here because the newly introduced score columns and the existing numeric columns (such as Health Score and Credit Score) are on different scales.
# Identifying columns with "Score" in their name
score_cols = ['Marital Score', 'Education Score', 'Occupation Score', 'City Score',
'Policy Score', 'Property Score', 'Smoking Score','Health Score','Credit Score']
# Applying Min-Max normalization manually
for col in score_cols:
data[col + "_Normalized"] = (data[col] - data[col].min()) / (data[col].max() - data[col].min())
# Verifying the results
data[[col + "_Normalized" for col in score_cols]].head()
Marital Score_Normalized | Education Score_Normalized | Occupation Score_Normalized | City Score_Normalized | Policy Score_Normalized | Property Score_Normalized | Smoking Score_Normalized | Health Score_Normalized | Credit Score_Normalized | |
---|---|---|---|---|---|---|---|---|---|
0 | 1.000000 | 0.333333 | 0.666667 | 1.0 | 1.0 | 0.0 | 1.0 | 0.420201 | 0.131148 |
1 | 1.000000 | 0.666667 | 0.000000 | 0.5 | 0.5 | 0.0 | 0.0 | 0.332904 | 0.717668 |
2 | 0.666667 | 0.333333 | 0.666667 | 0.0 | 1.0 | 0.0 | 0.0 | 0.829496 | 0.533560 |
3 | 1.000000 | 0.333333 | 0.000000 | 0.5 | 0.0 | 0.5 | 0.0 | 0.252506 | 0.122040 |
4 | 0.333333 | 0.333333 | 0.666667 | 0.5 | 1.0 | 0.0 | 0.0 | 0.381618 | 0.542805 |
Code Breakdown & Explanation¶
- Defines a list of column names that contain "Score" in their name.
- Loops through each column in score_cols to apply normalization.
- Creates a new column with "_Normalized" suffix.
- Applies Min-Max normalization by subtracting the column’s minimum value from each entry and dividing by the range (max - min).
- Ensures all values are scaled between 0 and 1.
- Displays the first few rows of the newly created normalized columns to verify the transformation.
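A quick sanity check (a small addition, not part of the original notebook) confirms that every normalized column now lies within the [0, 1] range:
# QC: the min and max of each normalized column should be 0 and 1 respectively
normalized_cols = [col + "_Normalized" for col in score_cols]
print(data[normalized_cols].agg(['min', 'max']).T)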
# Creating a new column Customer Score by summing all the normalized score columns.
data['Customer Score'] = data[['Marital Score_Normalized', 'Education Score_Normalized',
'Occupation Score_Normalized', 'City Score_Normalized',
'Policy Score_Normalized', 'Property Score_Normalized',
'Smoking Score_Normalized','Health Score_Normalized','Credit Score_Normalized']].sum(axis=1).round(0)
# Applying the condition
data['Customer Score'] = np.where(data['Customer Score'] > 5, '>5', data['Customer Score'])# Replaces any Customer Score greater than 5 with the string '>5'.
# Verifying the result
data[['Customer Score']].head()
Customer Score | |
---|---|
0 | >5 |
1 | 4.0 |
2 | 4.0 |
3 | 3.0 |
4 | 4.0 |
Code Breakdown & Explanation¶
- Computing Customer Score by Summing Normalized Scores
  - Selects all normalized score columns.
  - Computes the sum of these columns row-wise (axis=1).
  - Uses .round(0) to round the sum to the nearest whole number.
  - Stores the result in a new column called Customer Score.
- Applying a Condition to Modify Values: np.where(condition, value_if_true, value_if_false)
  - If Customer Score is greater than 5, it is replaced with the string '>5'.
  - Otherwise, the original numeric score remains unchanged.
- Verifying the Result
c. Data Driven Segmentation¶
Customers are segmented into 3 Risk buckets - Low, Medium or High Risk based on key demographic, financial, and behavioral attributes such as lifestyle choices, status, income, occupation, dependents etc.
# Define segmentation function
def customer_risk_segment(row):
"""Segments customers into Low Risk, Medium Risk, or High Risk based on multiple factors."""
# Defines conditions for High Risk
high_risk_conditions = (
row["Smoking Status"] == "Yes" and
row["Education Level"] in ["Secondary Education", "Undergraduate"] and
row["Occupation"] in ["Not Currently Working", "Missing"] and
row["Number of Dependents"] > 2 and
row["Annual Income"] < 500000
)
# Defines conditions for Low Risk
low_risk_conditions = (
row["Smoking Status"] == "No" and
row["Location"] in ["Tier-1", "Tier-2"] and
row["Occupation"] == "Full-Time Worker" and
row["Marital Status"] in ["Spouse Present","Formerly Married"] and
row["Policy Type"] in ["Premium", "Comprehensive"] and
row["Number of Dependents"] <=2 and
row["Exercise Frequency"] in ["Weekly", "Daily"]
)
if high_risk_conditions:
return "High Risk" #Returns "High Risk" if high-risk conditions are met.
elif low_risk_conditions:
return "Low Risk" # Returns "Low Risk" if low-risk conditions are met.
else:
return "Medium Risk" # Otherwise, classifies as "Medium Risk".
# Apply the segmentation function
data["Risk Segment"] = data.apply(customer_risk_segment, axis=1)
# View segmentation results
data[["Risk Segment"]].head()
Risk Segment | |
---|---|
0 | Medium Risk |
1 | Medium Risk |
2 | Medium Risk |
3 | Medium Risk |
4 | Medium Risk |
Code Breakdown & Explanation¶
- Defines a function named customer_risk_segment that classifies customers into three risk categories: Low, Medium, and High Risk.
- Applies the customer_risk_segment function to each row in the dataset and creates a new column Risk Segment.
- Displays the first few rows of the Risk Segment column to verify the segmentation logic.
data["Risk Segment"].value_counts() #Counting the number of occurrences for each unique value in the Risk Segment column.
count | |
---|---|
Risk Segment | |
Medium Risk | 1160248 |
High Risk | 23036 |
Low Risk | 16716 |
d. Introducing Cross Tab for Customer Tiering¶
Using the defined customer risk segments and attribute scores, the customers are classified into tiers. This approach ensures that premium pricing aligns with both individual risk profiles and macro-level risk classifications.
cross_tab = pd.crosstab(data["Risk Segment"], data["Customer Score"], margins=True) # Creating a cross-tabulation (contingency table) to analyze relationships between two categorical variables.
cross_tab
Customer Score | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 | >5 | All |
---|---|---|---|---|---|---|---|
Risk Segment | |||||||
High Risk | 85 | 1860 | 7564 | 9417 | 3768 | 342 | 23036 |
Low Risk | 0 | 0 | 0 | 17 | 1306 | 15393 | 16716 |
Medium Risk | 639 | 19696 | 127934 | 328437 | 398153 | 285389 | 1160248 |
All | 724 | 21556 | 135498 | 337871 | 403227 | 301124 | 1200000 |
Code Breakdown & Explanation¶
- Create a Cross-Tabulation Table
  - Uses pd.crosstab() to create a cross-tab between Risk Segment and Customer Score.
  - Rows (index) → unique values in Risk Segment.
  - Columns (columns) → unique values in Customer Score.
  - Cell values → the count of occurrences for each combination.
  - margins=True → adds a Total (All) row and column to display the sum of each row and column.
- Display the cross-tabulation
The customer counts following the order Low < High < Medium serve as a quick validation check, aligning with logical expectations.
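That expectation can also be asserted programmatically; the sketch below (not part of the original notebook) makes the Low < High < Medium check explicit using the segment counts computed earlier:
# Making the Low < High < Medium sanity check explicit
seg_counts = data["Risk Segment"].value_counts()
assert seg_counts["Low Risk"] < seg_counts["High Risk"] < seg_counts["Medium Risk"], "Unexpected risk segment distribution"
print("Segment distribution follows Low < High < Medium as expected.")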
tier_mapping = {
"1.0": {"Low Risk": 4, "Medium Risk": 4, "High Risk": 5},
"2.0": {"Low Risk": 3, "Medium Risk": 4, "High Risk": 5},
"3.0": {"Low Risk": 2, "Medium Risk": 4, "High Risk": 5},
"4.0": {"Low Risk": 2, "Medium Risk": 2, "High Risk": 4},
"5.0": {"Low Risk": 1, "Medium Risk": 2, "High Risk": 3},
">5": {"Low Risk": 1, "Medium Risk": 1, "High Risk": 2}
}
# Defining function to assign tier
def assign_tier(row):
return tier_mapping.get(row["Customer Score"], {}).get(row["Risk Segment"], None)
# Applying function to create the "Tier" column
data["Tier"] = data.apply(assign_tier, axis=1)
# Checking the updated dataset
data.head()
id | Age | Gender | Annual Income | Marital Status | Number of Dependents | Education Level | Occupation | Health Score | Location | ... | Occupation Score_Normalized | City Score_Normalized | Policy Score_Normalized | Property Score_Normalized | Smoking Score_Normalized | Health Score_Normalized | Credit Score_Normalized | Customer Score | Risk Segment | Tier | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 19.0 | Woman | 8.642140e+05 | Spouse Present | 1 | Undergraduate | Business | 26.598761 | Tier-1 | ... | 0.666667 | 1.0 | 1.0 | 0.0 | 1.0 | 0.420201 | 0.131148 | >5 | Medium Risk | 1 |
1 | 1 | 39.0 | Woman | 8.927012e+05 | Spouse Present | 3 | Post Graduate | Missing | 21.569731 | Tier-2 | ... | 0.000000 | 0.5 | 0.5 | 0.0 | 0.0 | 0.332904 | 0.717668 | 4.0 | Medium Risk | 2 |
2 | 2 | 23.0 | Man | 2.201772e+06 | Formerly Married | 3 | Undergraduate | Business | 50.177549 | Tier-3 | ... | 0.666667 | 0.0 | 1.0 | 0.0 | 0.0 | 0.829496 | 0.533560 | 4.0 | Medium Risk | 2 |
3 | 3 | 21.0 | Man | 3.997542e+06 | Spouse Present | 2 | Undergraduate | Missing | 16.938144 | Tier-2 | ... | 0.000000 | 0.5 | 0.0 | 0.5 | 0.0 | 0.252506 | 0.122040 | 3.0 | Medium Risk | 4 |
4 | 4 | 21.0 | Man | 3.409986e+06 | Not Married | 1 | Undergraduate | Business | 24.376094 | Tier-2 | ... | 0.666667 | 0.5 | 1.0 | 0.0 | 0.0 | 0.381618 | 0.542805 | 4.0 | Medium Risk | 2 |
5 rows × 44 columns
Code Breakdown & Explanation: Assigning Tiers Based on Customer Score & Risk Segment
- tier_mapping: A dictionary that maps each Customer Score value to a Tier for every Risk Segment category.
  - Each Customer Score (e.g., 1.0, 2.0, etc.) is assigned a corresponding Tier based on the risk level (Low, Medium, High).
  - Low-risk customers are assigned the better tiers (e.g., Tier 1 or Tier 2).
- Function assign_tier(row):
  - Uses .get(row["Customer Score"], {}) to fetch the nested dictionary for that score.
  - Uses .get(row["Risk Segment"], None) to fetch the corresponding tier value; if no match is found, it returns None (handling missing values gracefully).
- Uses .apply(assign_tier, axis=1) to apply the function row-wise, creating a new column "Tier" in the dataset.
data["Tier"].value_counts() # Counting the number of occurrences of each unique Tier value.
count | |
---|---|
Tier | |
2 | 726949 |
1 | 302088 |
4 | 157686 |
5 | 9509 |
3 | 3768 |
e. Business Strategy¶
- The Risk Adjustment Percentage inputs must be entered by Acko management below:
print("\n⚠️ ALERT: The script requires manual input for discount percentages.")
print(" Please enter discount values for each tier when prompted.\n")
# Ensuring Discount Percentage column exists and is numeric
data["Discount Percentage"] = 0.0 # Initialize with 0
# Getting unique tiers dynamically from the data
unique_tiers = data["Tier"].unique()
# Asking for discount percentages for each unique tier and updating the existing column
for tier in unique_tiers:
discount = float(input(f"Enter discount percentage for {tier} (as a number, e.g., 12 for 12%): "))
data.loc[data["Tier"] == tier, "Discount Percentage"] = discount / 100 # Convert to decimal format
# Calculating the risk-adjusted premium dynamically
data["Risk Adjusted Premium"] = data["Premium Amount"] * (1 - data["Discount Percentage"])
# Displaying updated table
print(data[["Tier", "Premium Amount", "Discount Percentage", "Risk Adjusted Premium"]])
⚠️ ALERT: The script requires manual input for discount percentages. Please enter discount values for each tier when prompted. Enter discount percentage for 1 (as a number, e.g., 12 for 12%): 12 Enter discount percentage for 2 (as a number, e.g., 12 for 12%): 10 Enter discount percentage for 4 (as a number, e.g., 12 for 12%): 7 Enter discount percentage for 3 (as a number, e.g., 12 for 12%): 8 Enter discount percentage for 5 (as a number, e.g., 12 for 12%): 7 Tier Premium Amount Discount Percentage Risk Adjusted Premium 0 1 1945.913327 0.12 1712.403728 1 2 10908.896072 0.10 9818.006465 2 2 21563.135198 0.10 19406.821679 3 4 2653.539143 0.07 2467.791403 4 2 1269.243463 0.10 1142.319116 ... ... ... ... ... 1199995 1 9573.841948 0.12 8424.980914 1199996 2 19920.750532 0.10 17928.675479 1199997 2 1459.859503 0.10 1313.873553 1199998 2 38622.914451 0.10 34760.623006 1199999 2 37863.127378 0.10 34076.814640 [1200000 rows x 4 columns]
Code Breakdown & Explanation: Applying Dynamic Discount Based on Tier
- Ensure the Discount Percentage Column Exists
- Retrieve Unique Tiers
- Ask for Discount Input & Update the Column
- Loops through each tier and prompts the user to input a discount percentage.
- Stores the discount (converted to decimal format) in the Discount Percentage column.
- Calculate Risk-Adjusted Premium
- Display Updated Data
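For non-interactive or scheduled runs, where input() would block execution, one possible workaround (a sketch using the same example percentages entered above, not management-approved figures) is to supply the tier discounts as a predefined dictionary instead of prompting:
# Sketch for non-interactive runs: replace the input() loop with a tier -> discount mapping
tier_discounts = {1: 12, 2: 10, 3: 8, 4: 7, 5: 7}  # placeholder percentages matching the example run above
data["Discount Percentage"] = data["Tier"].map(tier_discounts) / 100  # Convert to decimal format
data["Risk Adjusted Premium"] = data["Premium Amount"] * (1 - data["Discount Percentage"])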
f. Risk Adjusted Premium¶
# Calculating Risk Adjusted Premium
data["Risk Adjusted Premium"] = data["Premium Amount"] * (1 - data["Discount Percentage"])
# Displaying the first few rows to verify
data[["Tier", "Premium Amount", "Discount Percentage", "Risk Adjusted Premium"]].head()
Tier | Premium Amount | Discount Percentage | Risk Adjusted Premium | |
---|---|---|---|---|
0 | 1 | 1945.913327 | 0.12 | 1712.403728 |
1 | 2 | 10908.896072 | 0.10 | 9818.006465 |
2 | 2 | 21563.135198 | 0.10 | 19406.821679 |
3 | 4 | 2653.539143 | 0.07 | 2467.791403 |
4 | 2 | 1269.243463 | 0.10 | 1142.319116 |
Finding Correlations & Influential Attributes¶
# Converting categorical variables to numerical encoding
categorical_cols = ["Education Level", "Smoking Status", "Location","Marital Status","Occupation","Policy Type","Exercise Frequency","Property Type"]
dc = data.copy() # Create a copy to avoid modifying the original dataset
for col in categorical_cols:
dc[col] = dc[col].astype("category").cat.codes # Converting categorical variables into numerical codes
# Calculating the correlation matrix
correlation_matrix = dc[["Premium Amount", "Education Level", "Smoking Status", "Annual Income", "Location", "Number of Dependents","Age","Health Score","Occupation","Policy Type",
"Marital Status","Previous Claims","Credit Score","Exercise Frequency","Property Type"]].corr()
# Displaying correlation of Premium Amount with selected features
correlation_premium = correlation_matrix["Premium Amount"].sort_values(ascending=False)
print(correlation_premium)
Premium Amount 1.000000 Age 0.260550 Health Score 0.165821 Previous Claims 0.087215 Smoking Status 0.078461 Number of Dependents 0.070299 Policy Type 0.047385 Exercise Frequency 0.032151 Annual Income 0.009737 Education Level 0.002612 Property Type -0.000772 Marital Status -0.002192 Occupation -0.005001 Location -0.023397 Credit Score -0.057053 Name: Premium Amount, dtype: float64
# Selecting numerical customer categories
num_cols = ["Premium Amount", "Number of Dependents", "Health Score", "Credit Score", "Insurance Duration","Age","Annual Income", "Previous Claims" ]
# Computing correlation matrix
correlation_matrix = data[num_cols].corr()
# Plotting heatmap
plt.figure(figsize=(8, 5))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Correlation Between Premium & Risk Factors")
plt.show()
Age, the strongest single influencer of premium, has a correlation of only 0.26 with Premium Amount. This suggests that no single attribute drives pricing on its own, and that attributes should be combined and analyzed together to reveal meaningful impact.
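As a quick illustration of what combining attributes can look like, the cell below (not part of the original analysis) groups premiums by age band and smoking status together; it assumes the Age Group column engineered earlier in the notebook. Hypotheses 10-12 later take the same combined-attribute approach using heatmaps.
# Illustrative only: average premium by Age Group and Smoking Status taken together
combined = (
    data.groupby(["Age Group", "Smoking Status"], observed=True)["Premium Amount"]
        .mean()
        .unstack()
)
print(combined.round(2))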
# Checking premium amount variation across categorical features
categorical_cols = ["Location", "Property Type", "Policy Type","Occupation"]
for col in categorical_cols:
print(f"\nAverage Premium for {col}:\n")
print(data.groupby(col)["Premium Amount"].mean()) # Calculates the average premium for each category and prints the result
Average Premium for Location:

Location
Tier-1    22952.630351
Tier-2    21900.198307
Tier-3    21490.429932
Name: Premium Amount, dtype: float64

Average Premium for Property Type:

Property Type
Apartment        22137.392018
Detached Home    22108.577150
Flat             22089.183495
Name: Premium Amount, dtype: float64

Average Premium for Policy Type:

Policy Type
Basic            20849.694320
Comprehensive    21668.083019
Premium          23804.534636
Name: Premium Amount, dtype: float64

Average Premium for Occupation:

Occupation
Business                 22081.178831
Full-Time Worker         22292.661750
Missing                  22294.276896
Not Currently Working    21583.619493
Name: Premium Amount, dtype: float64
IV. Hypothesis Testing¶
Hypothesis 1: Majority of Customers Opt for Lower Premium Insurance Plans¶
# Plotting the Histogram with KDE
plt.figure(figsize=(12, 6))
sns.histplot(data=data, x="Premium Amount", bins=50, kde=True, color='Blue', alpha=0.6)
# Formatting the X-axis to show amounts in ₹
plt.xticks(np.arange(0, data["Premium Amount"].max(), 50000),
           labels=[f"{int(x/1000)}K" for x in np.arange(0, data["Premium Amount"].max(), 50000)])
# Labels and Title
plt.xlabel("Premium Amount (₹)", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.title("Premium Amount Distribution", fontsize=14)
# Showing the Plot
plt.show()
# Calculating the percentage of insurances with Premium Amount < 50K
percent_below_50k = (data[data["Premium Amount"] < 50000].shape[0] / data.shape[0]) * 100
# Printing the result
print(f"Percentage of insurances with Premium Amount < ₹50K: {percent_below_50k:.2f}%")
Percentage of insurances with Premium Amount < ₹50K: 90.62%
Observations:¶
Majority of Policies Are Low-Cost: Around 91% of policies have a premium below ₹50K, indicating that most customers prefer affordable insurance plans.
High-Premium Policies Have Limited Uptake: The lower adoption of expensive policies suggests a possible affordability barrier, highlighting the need for targeted incentives or strategic marketing to boost demand.
Hypothesis 2: Older age groups tend to pay higher premium amounts¶
# Grouping data by Age Group and calculate the average Premium Amount
age_premium = data.groupby("Age Group")["Premium Amount"].mean().reset_index()
# Sorting Age Group in case it's not in order
# Convert Age Group to string, remove NaNs and then sort
age_premium["Age Group"] = age_premium["Age Group"].astype(str)
age_premium["Age Group"] = pd.Categorical(age_premium["Age Group"], categories=sorted(age_premium["Age Group"].dropna().unique()), ordered=True)
age_premium = age_premium.sort_values("Age Group")
# Plotting the line chart
plt.figure(figsize=(10, 5))
sns.lineplot(x="Age Group", y="Premium Amount", data=age_premium, marker="o", label="Avg Premium Amount", color="b")
# Labels and Title
plt.xlabel("Age Group")
plt.ylabel("Average Premium Amount")
plt.title("Average Premium Amount v/s Age Groups")
plt.legend()
plt.xticks(rotation=45) # Rotate x-axis labels if necessary
plt.grid(axis="y", linestyle="--", alpha=0.7) # Add grid lines for better readability
# Showing the plot
plt.show()
Observations & Takeaways¶
- The hypothesis is confirmed as average premium amount does increase with age.
- The 56-64 age group pays the highest premiums, while the 18-25 age group pays the least.
- This aligns with risk-based pricing, where older individuals are charged higher premiums due to increased health risks.
Hypothesis 3: Older age groups tend to have a higher number of previous claims compared to younger groups¶
# Aggregating Data: Count of customers and Average Previous Claims by Age Group
agg_data = data.groupby('Age Group').agg(
Count_of_Customers=('id', 'count'),
Avg_Previous_Claims=('Previous Claims', 'mean')
).reset_index()
# Sorting for better visualization
agg_data = agg_data.sort_values(by='Count_of_Customers', ascending=False)
# Setting style
sns.set(style="whitegrid")
# Creating figure and axis
fig, ax = plt.subplots(figsize=(10, 6))
# Creating bar chart with color based on Avg Previous Claims
bars = sns.barplot(
y='Age Group', x='Count_of_Customers', data=agg_data,
palette=sns.color_palette("Blues", as_cmap=True),
hue=agg_data['Avg_Previous_Claims'], dodge=False
)
# Removing the hue legend (since it's not needed in a bar chart)
bars.legend_.remove()
# Adding a color bar to indicate the gradient of avg previous claims
norm = plt.Normalize(agg_data['Avg_Previous_Claims'].min(), agg_data['Avg_Previous_Claims'].max())
sm = plt.cm.ScalarMappable(cmap="Blues", norm=norm)
sm.set_array([])
cbar = plt.colorbar(sm, ax=ax)
cbar.set_label('Average Previous Claims')
# Titles and labels
plt.xlabel("#Insurances")
plt.ylabel("Age Group")
plt.title("Age Group vs. Avg Previous Claims")
# Displaying the plot
plt.show()
Observations and Takeaways¶
Middle-aged groups (36-45 & 46-55) have the highest number of policies & previous claims. Likely due to higher medical needs, dependents, and financial stability to afford policies.
The youngest age group (18-25) has the lowest number of policies and claims. They may not prioritize health insurance, as they perceive lower health risks, and may also be covered under family insurance plans.
Older customers (56-64) have a slight drop in claim averages. They might have long-term health insurance plans covering their needs. They could also be more cautious about claiming to avoid premium hikes.
Hypothesis 4: Geographic factors can influence the type and number of policies taken¶
# Counting the number of policies per location
location_counts = data['Location'].value_counts()
# Defining three shades of blue
colors = ['#89CFF0', '#4682B4', '#1E3A5F']
# Creating the pie chart
plt.figure(figsize=(4,4))
plt.pie(location_counts, labels=location_counts.index, autopct='%1.1f%%', colors=colors, startangle=140)
plt.title('#Insurances - Location Distribution')
plt.show()
# Defining three shades of blue
blue_palette = ['#89CFF0', '#4682B4', '#1E3A5F']
# Creating the count plot
plt.figure(figsize=(8,4))
sns.countplot(x='Location', hue='Policy Type', data=data, palette=blue_palette)
plt.title('Policy Type Distribution Across Locations')
plt.xlabel('Location')
plt.ylabel('Count of Policies')
plt.legend(title='Policy Type')
plt.xticks(rotation=45) # Rotate x-axis labels if needed
plt.show()
Observations & Takeaways¶
- This hypothesis is not supported: policies are evenly distributed across Tier-1, Tier-2, and Tier-3 locations (~33% each), indicating a balanced contribution from each geography.
Hypothesis 5: Average Premium amount varies with different Tiers¶
The Tier here refers to the customer classification calculated earlier, which was based on a combination of individual risk factors and broader demographic, financial, and behavioral attributes.
# Defining five shades of blue
blue_palette = ['#89CFF0', '#4682B4', '#1E3A5F', '#0F52BA', '#5D8AA8']
# Calculating the average premium amount by tier
avg_premium_by_tier = data.groupby('Tier')['Premium Amount'].mean().reset_index()
# Setting plot size
plt.figure(figsize=(8, 6))
# Creating the bar chart
sns.barplot(x='Tier', y='Premium Amount', data=avg_premium_by_tier, palette=blue_palette)
# Adding labels and title
plt.title('Average Premium Amount v/s. Tier')
plt.xlabel('Tier')
plt.ylabel('Average Premium Amount')
# Displaying the chart
plt.show()
Observations & Takeaways¶
- The hypothesis is confirmed as Tier 3 has the highest average premium amount, while Tier 4 has the lowest.
- The tier-based premium distribution aligns with the 2-Axis Scoring System, where a combination of customer-level risk scores and macro-level risk classifications determines pricing.
- The variation in average premiums across tiers highlights the effectiveness of risk-based segmentation in ensuring a balance between affordability and profitability.
Hypothesis 6: Number of Previous Claims influence the Premium Amount¶
# Creating the Previous Claims Group based on the given conditions
data['Previous Claims Group'] = pd.cut(
data['Previous Claims'],
bins=[-1, 0, 1, float('inf')],
labels=['No Claims', '1 Claim', 'Multiple Claims']
)
# Resetting index to ensure unique index values
data = data.reset_index(drop=True)
# Defining shades of blue for consistency
blue_palette = ['#89CFF0', '#4682B4', '#0F52BA']
# Plotting the violin plot
plt.figure(figsize=(10, 6))
sns.violinplot(x='Previous Claims Group', y='Premium Amount', data=data, palette=blue_palette)
# Adding labels and title
plt.title(" Previous Claims vs. Premium Amount", fontsize=14)
plt.xlabel("Previous Claims Group")
plt.ylabel("Premium Amount")
# Showing the plot
plt.tight_layout()
plt.show()
Observations & Takeaways¶
- Similar Premium Distribution Across Groups: The spread and median premium amounts are consistent across "No Claims," "1 Claim," and "Multiple Claims."
- High Premiums Exist in All Groups: Long tails indicate that some customers pay high premiums, regardless of their claim history.
- Most Customers Pay Lower Premiums: The denser midsection of each plot suggests that the majority fall within a lower-premium range.
- Slight Increase for 1 Claim Group: This group shows marginally more high-premium cases, but not enough to form a clear trend.
Hypothesis 7: The Number of Dependents Influences the Average Premium Amount¶
# Grouping data to calculate average premium per number of dependents
avg_premium = data.groupby('Number of Dependents')['Premium Amount'].mean().reset_index()
# Ensuring sorting by number of dependents
avg_premium = avg_premium.sort_values(by='Number of Dependents')
# Defining shades of blue for consistency
blue_palette = ['#B0C4DE', '#6495ED', '#4682B4', '#1E90FF', '#0F52BA']
# Plotting
plt.figure(figsize=(10, 6))
sns.barplot(x='Number of Dependents', y='Premium Amount', data=avg_premium, palette=blue_palette)
# Labels and title
plt.xlabel("Number of Dependents", fontsize=12)
plt.ylabel("Avg. Premium Amount", fontsize=12)
plt.title("Dependents vs. Avg Premium Amount", fontsize=14)
# Show the chart
plt.tight_layout()
plt.show()
Observations and Takeaways¶
- The hypothesis is confirmed as average premium amount is observed to increase with a rise in the number of dependents.
- Policy holders with 4 dependents pay the highest premium, while those with zero or 1 dependent pay lower premiums.
- This pattern aligns with the expectation that higher dependents increase financial risk, leading to higher insurance costs.
Hypothesis 8: Lower Exercise Frequency Leads to Higher Premium Amounts¶
# Grouping data to calculate average premium per exercise frequency
avg_premium = data.groupby('Exercise Frequency')['Premium Amount'].mean().reset_index()
# Sorting in descending order by premium amount
avg_premium = avg_premium.sort_values(by='Premium Amount', ascending=False)
# Define shades of blue for consistency
blue_palette = ['#B0C4DE', '#6495ED', '#4682B4', '#1E90FF', '#0F52BA']
# Plotting
plt.figure(figsize=(10, 6))
sns.barplot(y='Exercise Frequency', x='Premium Amount', data=avg_premium, palette=blue_palette)
# Labels and title
plt.ylabel("Exercise Frequency")
plt.xlabel("Avg. Premium Amount")
plt.title("Exercise Frequency vs. Avg Premium Amount")
# Displaying the chart
plt.show()
Observations & Takeaways¶
- Customers who exercise rarely have the highest average premium.
- Those who exercise daily or weekly tend to have lower premium amounts.
- This aligns with risk-based pricing, where sedentary lifestyles contribute to higher health risks, resulting in higher premiums.
Hypothesis 9: Smokers Pay Higher Health Insurance Premiums¶
#Calculating average premium amount for smokers and non-smokers
smoker_premium_avg = data.groupby("Smoking Status")["Premium Amount"].mean()
# Defining shades of blue for consistency
blue_palette = [ '#4682B4', '#1E90FF']
# Plotting bar chart
plt.figure(figsize=(8, 5))
sns.barplot(x=smoker_premium_avg.index, y=smoker_premium_avg.values, palette=blue_palette, edgecolor='black')
# Labels and Title
plt.xlabel("Smoker Status")
plt.ylabel("Average Premium Amount")
plt.title(" Smokers Pay Higher Premiums")
plt.grid(axis="y", linestyle="--", alpha=0.7)
# Showing plot
plt.show()
Observations & Takeaways¶
- The hypothesis is confirmed as customers who smoke (Yes) have a significantly higher average premium than non-smokers.
- This aligns with risk-based pricing since smoking is a major health risk, increasing the likelihood of claims.
- Non-smokers benefit from lower premiums, reflecting their reduced health risks.
Hypothesis 10: The premium gap between Basic and Premium policies increases with age.¶
# Computing average values
avg_data = data.groupby(["Age Group", "Policy Type"]).agg(
{"Risk Adjusted Premium": "mean", "Premium Amount": "mean"}
).reset_index()
# Set color and marker styles
age_palette = {
"18-25": "blue", "26-35": "orange", "36-45": "red", "46-55": "cyan", "56-64": "green"
}
policy_markers = {"Basic": "o", "Comprehensive": "s", "Premium": "P"}
# Creating scatter plot
plt.figure(figsize=(12, 6))
sns.scatterplot(
data=avg_data,
x="Risk Adjusted Premium",
y="Premium Amount",
hue="Age Group",
style="Policy Type",
palette=age_palette,
markers=policy_markers,
s=150 # Adjust marker size
)
# Labels & Title
plt.title("Avg Risk Adjusted Premium vs. Avg Premium Amount", fontsize=14)
plt.xlabel("Avg Risk Adjusted Premium", fontsize=12)
plt.ylabel("Avg Premium Amount", fontsize=12)
# Showing legend
plt.legend(title="Legend", bbox_to_anchor=(1.05, 1), loc="upper left")
# Displaying plot
plt.show()
Observations & Takeaways¶
Younger age groups (18-25, 26-35) have smaller differences, while older age groups (46-55, 56-64) show a larger premium jump between policy types.
Risk-adjusted premiums and actual premiums increase with age, reinforcing the idea that older individuals pay more regardless of policy type.
Premium policies (denoted by crosses) consistently have higher values, and this gap grows as age increases.
Policy Type Impact on Risk Sensitivity: Basic policies remain more stable across all age groups, whereas Premium policies skyrocket in cost for older customers.
Hypothesis 11: Higher Education Levels and Prime Locations Lead to Higher Premiums¶
# Grouping data
data_grouped = data.groupby(["Location", "Policy Type", "Education Level"])["Premium Amount"].mean().reset_index()
# Pivot table for heatmap
pivot_data = data_grouped.pivot_table(values="Premium Amount", index=["Education Level"], columns=["Location", "Policy Type"], aggfunc="mean")
# Setting figure size
plt.figure(figsize=(12, 6))
# Creating heatmap
sns.heatmap(pivot_data, cmap="coolwarm", annot=True, fmt=".0f", linewidths=0.5)
# Titles and labels
plt.title("Education Level & Location vs. Premium Amount (by Policy Type)")
plt.xlabel("Location & Policy Type")
plt.ylabel("Education Level")
# Showing plot
plt.show()
Observations & Takeaways¶
- PhD holders in Tier 1 locations have the highest average premium amount compared to other education levels and locations.
- There is a 7% difference in premium amount, indicating that education level and location significantly influence premium pricing.
- This trend aligns with higher income expectations and lifestyle factors—more educated individuals in prime locations may opt for higher coverage.
Hypothesis 12: Family Size, Marital Status, and Property Type Influence Premium Pricing¶
# Grouping data
data_grouped = data.groupby(["Number of Dependents", "Marital Status", "Property Type"])["Premium Amount"].mean().reset_index()
# Pivot table for heatmap
pivot_data = data_grouped.pivot_table(values="Premium Amount", index=["Number of Dependents"], columns=["Marital Status", "Property Type"], aggfunc="mean")
# Setting figure size
plt.figure(figsize=(12, 6))
# Creating heatmap
sns.heatmap(pivot_data, cmap="coolwarm", annot=True, fmt=".0f", linewidths=0.5)
# Titles and labels
plt.title("Dependents, Marital Status & Property Type vs. Premium Amount")
plt.xlabel("Marital Status & Property Type")
plt.ylabel("Number of Dependents")
# Showing plot
plt.show()
Observations & Takeaways¶
- Larger families (higher number of dependents) have, on average, 14% higher premium amounts compared to smaller families.
- Unmarried individuals with detached properties tend to have the highest premiums, possibly due to higher perceived risk or lack of shared financial security.
- Property type plays a significant role, with detached properties having a higher premium than apartments or shared housing.
Hypothesis 13: Higher Previous Claims Lead to Greater Premium Variability and Increased Premium Costs for High-Risk Individuals¶
# Sorting data based on Previous Claims for better visualization
data = data.sort_values(by="Previous Claims")
# Defining shades of blue for better aesthetics
blue_palette = ['#B0C4DE', '#6495ED', '#4682B4', '#1E90FF', '#0F52BA']
# Plotting the boxplot
plt.figure(figsize=(10, 5))
sns.boxplot(x="Previous Claims", y="Premium Amount", data=data, palette=blue_palette)
# Labels and Title
plt.xlabel("Previous Claims", fontsize=12)
plt.ylabel("Premium Amount", fontsize=12)
plt.title("Impact of Previous Claims on Premium Amount", fontsize=14)
# Rotate x-axis labels if needed
plt.xticks(rotation=45)
# Adding grid lines for better readability
plt.grid(axis="y", linestyle="--", alpha=0.7)
# Showing the plot
plt.tight_layout()
plt.show()
Observations and Takeaways¶
Stable Median Premium Amount: The median premium remains relatively steady, meaning a single claim does not drastically impact premium pricing.
Higher Claims → Increased Premium Variability: Policyholders with 5+ previous claims show wider premium ranges, suggesting that risk-based premium adjustments are common practice. It also shows that high-risk individuals are charged higher premiums.
Frequent Outliers in Lower Claim Categories: Even for 0-4 claims, we see several high-premium outliers, indicating that factors beyond just claim history (e.g., age, health conditions, risk profile) contribute to premium pricing.
Recommendations & Marketing Incentives/Strategies¶
I. Market Senior Wellness Plans and Loyalty Rewards:¶
Offer perks such as free health check-ups for policyholders aged 45+. This will help boost insurance subscriptions among the older age groups, which trail just behind the middle-aged groups in the number of policies held (Hypothesis 3).
Since larger families tend to take higher-premium policies (Hypothesis 7), Acko could highlight financial security for retirement years and family protection.
II. Smart Insurance Rewards for a Fit & Future-Ready Lifestyle¶
- For younger customers with healthy lifestyle habits, Acko could focus more on adding premium benefits to their subscriptions rather than offering discounts (Hypotheses 8 & 9).
Acko could give cashbacks or fitness subscription offers (e.g., gym, yoga, sports clubs, or personal trainers) to policyholders who maintain active lifestyles.
Acko could also lock in lower premiums for 5+ years for policyholders who maintain a claim-free record and continue healthy behaviours.
Since the younger crowd tends to travel more, Acko could provide travel health insurance benefits, priority doctor consultations, or other medical services for premium policyholders.
Lastly, to showcase a tech-driven approach, Acko could provide wearable-linked insurance discounts by partnering with Apple Watch, Fitbit, etc.
Customers who own and register a verified fitness wearable get an instant discount on their health insurance premium.
III. Strategies for Family-Oriented & Property Owners¶
- Since family size, marital status, and property type influence premium pricing (Hypothesis 12), Acko can tailor its marketing strategies to maximize customer engagement and policy adoption.
- Acko could provide family plan discounts (e.g., “Cover 3+ members & get 10% off”).
- Introduce special packages that include paediatric care and senior parent add-ons so that the customers are tempted to cover both parents and children
- Acko could also encourage customers to bundle home and health insurance for exclusive discounts. This could be done by various means such as:
i. Emergency Home Assistance – Free locksmith, plumbing, or electrical repairs for policyholders with bundled coverage.
ii. Offer a "Bundle & Save" estimator where customers can see potential premium reductions when adding home insurance to their health plan (a small sketch of such an estimator follows this list).
iii. Introductory Bundle Bonus – Provide cashback or premium waivers for the first 6 months when both policies are purchased together.
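As a rough illustration of point ii. above, a "Bundle & Save" estimator could be a small helper like the sketch below; the 10% bundle discount and the function name are assumptions made for demonstration, not Acko figures.
# Hypothetical "Bundle & Save" estimator (illustrative assumptions only)
def bundle_and_save_estimate(health_premium: float, home_premium: float,
                             bundle_discount: float = 0.10) -> dict:
    """Show a customer what bundling home + health cover could save."""
    standalone_total = health_premium + home_premium
    bundled_total = standalone_total * (1 - bundle_discount)
    return {
        "standalone_total": round(standalone_total, 2),
        "bundled_total": round(bundled_total, 2),
        "estimated_savings": round(standalone_total - bundled_total, 2),
    }
# Example: a ₹22,000 health premium bundled with a ₹9,000 home premium
print(bundle_and_save_estimate(22000, 9000))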
IV. Strategies to Drive Higher-Premium Plan Adoption Among Middle & Lower-Income Groups:¶
- It’s clear that lower-premium plans have a higher adoption rate due to affordability constraints (Hypothesis 1). However, middle and lower-income groups represent the largest segment, so even a slight increase in their premium adoption could generate significant revenue growth for Acko. To do this, Acko could:
- Offer weekly or bi-weekly payment options instead of large upfront annual costs.
- Introduce auto-debit micro-payment models where customers pay ₹999-₹1,999 per week instead of one-time lump sums
- Acko could also try Insurance Gamification & Rewards:
Introduce premium unlock milestones, where staying insured for 2+ years grants higher coverage without extra cost.
Referral Bonuses: Customers referring friends/family get premium reductions when the referred person buys a higher-tier plan.
Instead of a flat no-claim bonus, introduce a gamified multiplier system in which, at milestones of claim-free periods, the customer is awarded increasing discounts or additional coverage benefits.
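A minimal sketch of how the weekly micro-payment split and the gamified no-claim multiplier could work is shown below; the 2% step per claim-free year, the 20% cap, and the 52-week split are illustrative assumptions, not figures derived from the analysis.
# Sketch of a gamified no-claim multiplier plus weekly auto-debit installments (illustrative assumptions only)
def no_claim_discount(claim_free_years: int, step: float = 0.02, cap: float = 0.20) -> float:
    """Discount grows with every claim-free year, up to a cap."""
    return min(claim_free_years * step, cap)

def weekly_installment(annual_premium: float, weeks: int = 52) -> float:
    """Split an annual premium into equal weekly auto-debit payments."""
    return round(annual_premium / weeks, 2)

annual_premium = 22000  # roughly the average premium seen in the dataset
for years in [0, 1, 3, 5, 10]:
    discount = no_claim_discount(years)
    discounted = annual_premium * (1 - discount)
    print(f"{years} claim-free years -> {discount:.0%} off, "
          f"₹{discounted:,.0f}/year or ₹{weekly_installment(discounted)}/week")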
Downloading the updated data:¶
# List of columns to exclude (all score and normalized score columns except 'Customer Score')
exclude_columns = ['Marital Score', 'Education Score', 'Occupation Score', 'City Score',
'Policy Score', 'Property Score', 'Smoking Score',
'Marital Score_Normalized', 'Education Score_Normalized',
'Occupation Score_Normalized', 'City Score_Normalized',
'Policy Score_Normalized', 'Property Score_Normalized',
'Smoking Score_Normalized','Health Score_Normalized','Credit Score_Normalized']
# Select all columns except the excluded ones
final_data = data.drop(columns=exclude_columns)
# Verify the new DataFrame
final_data.head()
id | Age | Gender | Annual Income | Marital Status | Number of Dependents | Education Level | Occupation | Health Score | Location | ... | Exercise Score | Dependents Score | Previous Claims Score | Customer Score | Risk Segment | Tier | Discount Percentage | Risk Adjusted Premium | Income Group | Previous Claims Group | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
599999 | 599999 | 61.0 | Man | 3.186832e+06 | Not Married | 3 | Undergraduate | Full-Time Worker | 38.398085 | Tier-1 | ... | 3 | 2 | 5 | >5 | Medium Risk | 1 | 0.12 | 19547.509798 | High | No Claims |
686586 | 686586 | 64.0 | Man | 2.172978e+06 | Not Married | 1 | Undergraduate | Business | 38.584074 | Tier-3 | ... | 4 | 4 | 5 | 4.0 | Medium Risk | 2 | 0.10 | 20149.953723 | High | No Claims |
686589 | 686589 | 30.0 | Woman | 1.036558e+06 | Not Married | 0 | Undergraduate | Business | 11.751145 | Tier-1 | ... | 1 | 4 | 5 | 4.0 | Medium Risk | 2 | 0.10 | 19351.805894 | Medium | No Claims |
686593 | 686593 | 18.0 | Woman | 1.238363e+06 | Not Married | 1 | Post Graduate | Missing | 45.813976 | Tier-1 | ... | 3 | 4 | 5 | 5.0 | Medium Risk | 2 | 0.10 | 11685.333321 | Medium | No Claims |
686594 | 686594 | 39.0 | Man | 8.477180e+05 | Not Married | 3 | PhD | Full-Time Worker | 14.123073 | Tier-2 | ... | 3 | 2 | 5 | >5 | Medium Risk | 1 | 0.12 | 8968.623151 | Low | No Claims |
5 rows × 31 columns
# Refining the list of columns to exclude (all score and normalized score columns except 'Customer Score')
exclude_columns = ['Marital Score', 'Education Score', 'Occupation Score', 'City Score',
'Policy Score', 'Property Score', 'Smoking Score',
'Marital Score_Normalized', 'Education Score_Normalized',
'Occupation Score_Normalized', 'City Score_Normalized',
'Policy Score_Normalized', 'Property Score_Normalized',
'Smoking Score_Normalized','Health Score_Normalized','Credit Score_Normalized', "Exercise Score", "Dependents Score", "Previous Claims Score"]
# Select all columns except the excluded ones
final_data = data.drop(columns=exclude_columns)
# Verify the new DataFrame
final_data.head()
id | Age | Gender | Annual Income | Marital Status | Number of Dependents | Education Level | Occupation | Health Score | Location | ... | Property Type | Premium Amount | Age Group | Customer Score | Risk Segment | Tier | Discount Percentage | Risk Adjusted Premium | Income Group | Previous Claims Group | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 19.0 | Woman | 8.642140e+05 | Spouse Present | 1 | Undergraduate | Business | 26.598761 | Tier-1 | ... | Detached Home | 1945.913327 | 18-25 | >5 | Medium Risk | 1 | 0.12 | 1712.403728 | Low | Multiple Claims |
1 | 1 | 39.0 | Woman | 8.927012e+05 | Spouse Present | 3 | Post Graduate | Missing | 21.569731 | Tier-2 | ... | Detached Home | 10908.896072 | 36-45 | 4.0 | Medium Risk | 2 | 0.10 | 9818.006465 | Low | 1 Claim |
2 | 2 | 23.0 | Man | 2.201772e+06 | Formerly Married | 3 | Undergraduate | Business | 50.177549 | Tier-3 | ... | Detached Home | 21563.135198 | 18-25 | 4.0 | Medium Risk | 2 | 0.10 | 19406.821679 | High | 1 Claim |
3 | 3 | 21.0 | Man | 3.997542e+06 | Spouse Present | 2 | Undergraduate | Missing | 16.938144 | Tier-2 | ... | Flat | 2653.539143 | 18-25 | 3.0 | Medium Risk | 4 | 0.08 | 2441.256011 | High | 1 Claim |
4 | 4 | 21.0 | Man | 3.409986e+06 | Not Married | 1 | Undergraduate | Business | 24.376094 | Tier-2 | ... | Detached Home | 1269.243463 | 18-25 | 4.0 | Medium Risk | 2 | 0.10 | 1142.319116 | High | No Claims |
5 rows × 28 columns
# Save DataFrame to CSV
#final_data.to_csv("Acko_data.csv", index=False) # Saves file without index
#from google.colab import files
#files.download("Acko_data.csv")