import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical computations
import matplotlib.pyplot as plt  # For plotting and visualization
import seaborn as sns  # For advanced visualizations

# Step 1: Install gdown
!pip install gdown

# Step 2: Import necessary libraries
import gdown
import pandas as pd

# Step 3: Set the file ID and create a download URL
file_id = "1i4ia9ZNfAXgu6JGTXCUgFn7Pb8wltzLH"
download_url = f"https://drive.google.com/uc?id={file_id}"

# Step 4: Set the output file name
output_file = "acko_dataset.csv"

# Step 5: Download the file
gdown.download(download_url, output_file, quiet=False)

# Step 6: Load the CSV file into a Pandas DataFrame
data = pd.read_csv(output_file)

Requirement already satisfied: gdown in /usr/local/lib/python3.11/dist-packages (5.2.0)
Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.11/dist-packages (from gdown) (4.13.3)
Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from gdown) (3.17.0)
Requirement already satisfied: requests[socks] in /usr/local/lib/python3.11/dist-packages (from gdown) (2.32.3)
Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from gdown) (4.67.1)
Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.11/dist-packages (from beautifulsoup4->gdown) (2.6)
Requirement already satisfied: typing-extensions>=4.0.0 in /usr/local/lib/python3.11/dist-packages (from beautifulsoup4->gdown) (4.12.2)
Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests[socks]->gdown) (3.4.1)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests[socks]->gdown) (3.10)
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests[socks]->gdown) (2.3.0)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests[socks]->gdown) (2025.1.31)
Requirement already satisfied: PySocks!=1.5.7,>=1.5.6 in /usr/local/lib/python3.11/dist-packages (from requests[socks]->gdown) (1.7.1)

Downloading...
From (original): https://drive.google.com/uc?id=1i4ia9ZNfAXgu6JGTXCUgFn7Pb8wltzLH
From (redirected): https://drive.google.com/uc?id=1i4ia9ZNfAXgu6JGTXCUgFn7Pb8wltzLH&confirm=t&uuid=2a5d4fea-a2a4-4c0f-8796-8315bc58c66f
To: /content/acko_dataset.csv
100%|██████████| 219M/219M [00:02<00:00, 78.8MB/s]

print("First 5 Rows of the Dataset:")
data.head(5)

First 5 Rows of the Dataset:

rows, columns = data.shape
print(f"\nThe dataset contains {rows} rows and {columns} columns.")

The dataset contains 1200000 rows and 20 columns.

random_sample = data[data.notna().all(axis=1)].sample(n=10, random_state=42)  # Randomly select 10 rows with no missing values
print(random_sample)

              id   Age Gender  Annual Income  Marital Status  \
940383    940383  34.0    Man   7.100460e+05     Not Married   
384739    384739  27.0    Man   1.286560e+05     Not Married   
841062    841062  20.0  Woman   6.368160e+05  Spouse Present   
482497    482497  26.0  Woman   7.206594e+05     Not Married   
105947    105947  35.0    Man   2.201772e+06  Spouse Present   
836098    836098  26.0  Woman   4.540721e+05  Spouse Present   
627498    627498  60.0    Man   8.426382e+06  Spouse Present   
1122754  1122754  25.0    Man   5.733600e+05     Not Married   
962397    962397  35.0  Woman   1.320255e+06     Not Married   
278381    278381  47.0    Man   6.327320e+05  Spouse Present   

         Number of Dependents      Education Level        Occupation  \
940383                    4.0  Secondary Education  Full-Time Worker   
384739                    1.0                  PhD          Business   
841062                    2.0                  PhD  Full-Time Worker   
482497                    4.0        Undergraduate           Missing   
105947                    4.0                  PhD          Business   
836098                    0.0        Undergraduate           Missing   
627498                    2.0        Undergraduate  Full-Time Worker   
1122754                   0.0        Undergraduate           Missing   
962397                    1.0        Post Graduate           Missing   
278381                    3.0        Undergraduate  Full-Time Worker   

         Health Score Location    Policy Type  Previous Claims  Credit Score  \
940383      36.694979   Tier-3        Premium              2.0         784.0   
384739      52.368069   Tier-2          Basic              1.0         694.0   
841062      48.977416   Tier-2        Premium              0.0         626.0   
482497      46.865991   Tier-1          Basic              2.0         445.0   
105947      42.556413   Tier-2        Premium              1.0         849.0   
836098      31.726851   Tier-3        Premium              0.0         761.0   
627498      42.781384   Tier-1          Basic              2.0         691.0   
1122754     33.049018   Tier-2  Comprehensive              2.0         487.0   
962397      11.816854   Tier-1        Premium              1.0         776.0   
278381      30.242150   Tier-3        Premium              1.0         561.0   

         Insurance Duration           Policy Start Date Customer Feedback  \
940383                  5.0  2023-09-21 15:21:39.190215              Poor   
384739                  5.0  2021-03-05 15:21:39.217387           Average   
841062                  1.0  2022-06-23 15:21:39.279729              Good   
482497                  4.0  2020-06-22 15:21:39.134960              Poor   
105947                  2.0  2022-10-14 15:21:39.167099              Good   
836098                  4.0  2020-01-02 15:21:39.228521              Poor   
627498                  2.0  2023-09-01 15:21:39.173834              Good   
1122754                 6.0  2024-06-19 15:21:39.124659              Good   
962397                  5.0  2021-02-17 15:21:39.155231              Good   
278381                  5.0  2020-08-29 15:21:39.219432              Poor   

        Smoking Status Exercise Frequency  Property Type  Premium Amount  
940383              No              Daily           Flat    24143.634352  
384739             Yes             Weekly           Flat    58629.569372  
841062              No             Weekly           Flat    38107.625453  
482497             Yes             Weekly           Flat     2632.489722  
105947             Yes              Daily           Flat     3503.121200  
836098              No             Weekly  Detached Home     7425.437892  
627498              No            Monthly           Flat   102616.812058  
1122754            Yes            Monthly  Detached Home     1238.769345  
962397             Yes              Daily      Apartment    28459.080582  
278381             Yes             Weekly      Apartment    31378.963219

print("\nDataset Information:")
data.info()

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 20 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1200000 non-null  int64  
 1   Age                   1181295 non-null  float64
 2   Gender                1200000 non-null  object 
 3   Annual Income         1155051 non-null  float64
 4   Marital Status        1200000 non-null  object 
 5   Number of Dependents  1090328 non-null  float64
 6   Education Level       1200000 non-null  object 
 7   Occupation            1200000 non-null  object 
 8   Health Score          1125924 non-null  float64
 9   Location              1200000 non-null  object 
 10  Policy Type           1200000 non-null  object 
 11  Previous Claims       835971 non-null   float64
 12  Credit Score          1062118 non-null  float64
 13  Insurance Duration    1199999 non-null  float64
 14  Policy Start Date     1200000 non-null  object 
 15  Customer Feedback     1122176 non-null  object 
 16  Smoking Status        1200000 non-null  object 
 17  Exercise Frequency    1200000 non-null  object 
 18  Property Type         1200000 non-null  object 
 19  Premium Amount        784968 non-null   float64
dtypes: float64(8), int64(1), object(11)
memory usage: 183.1+ MB

data["Policy Start Date"] = pd.to_datetime(data["Policy Start Date"])

duplicate_count = len(data[data.duplicated()])
print(f"Number of Duplicate Rows in the Dataset: {duplicate_count}")

Number of Duplicate Rows in the Dataset: 0

# Now you can proceed with the missing value check:
missing_values = data.isnull().sum()
print("\nMissing Values in Each Column:")
print(missing_values)

Missing Values in Each Column:
id                           0
Age                      18705
Gender                       0
Annual Income            44949
Marital Status               0
Number of Dependents    109672
Education Level              0
Occupation                   0
Health Score             74076
Location                     0
Policy Type                  0
Previous Claims         364029
Credit Score            137882
Insurance Duration           1
Policy Start Date            0
Customer Feedback        77824
Smoking Status               0
Exercise Frequency           0
Property Type                0
Premium Amount          415032
dtype: int64

missing_values = data.isnull().sum().sum()
print(missing_values)

1242170

missing_values = data.isnull().sum()
missing_values = missing_values[missing_values > 0]

plt.figure(figsize=(8, 5))
missing_values.plot(kind='bar', color='blue')
plt.title('Missing Values by Column')
plt.xlabel('Columns')
plt.ylabel('Count')
plt.show()

print("\nObservations About the Dataset:")
if duplicate_count > 0:
    print(f"- There are {duplicate_count} duplicate rows in the dataset.")
else:
    print("- No duplicate rows found in the dataset.")

if missing_values.sum() > 0:
    print("- There are missing values in the dataset. Here’s a summary:")
    print(missing_values[missing_values > 0])
else:
    print("- No missing values found in the dataset.")

print("- The dataset is ready for further analysis after handling duplicates and missing values.")

Observations About the Dataset:
- No duplicate rows found in the dataset.
- There are missing values in the dataset. Here’s a summary:
Age                      18705
Annual Income            44949
Number of Dependents    109672
Health Score             74076
Previous Claims         364029
Credit Score            137882
Insurance Duration           1
Customer Feedback        77824
Premium Amount          415032
dtype: int64
- The dataset is ready for further analysis after handling duplicates and missing values.

# Dataset Columns
print("Dataset Columns:")
print(data.columns)

Dataset Columns:
Index(['id', 'Age', 'Gender', 'Annual Income', 'Marital Status',
       'Number of Dependents', 'Education Level', 'Occupation', 'Health Score',
       'Location', 'Policy Type', 'Previous Claims', 'Credit Score',
       'Insurance Duration', 'Policy Start Date', 'Customer Feedback',
       'Smoking Status', 'Exercise Frequency', 'Property Type',
       'Premium Amount'],
      dtype='object')

# Dataset Describe
print("\nDataset Summary Statistics:")
print(data.describe(include='all'))

Dataset Summary Statistics:
                  id           Age   Gender  Annual Income Marital Status  \
count   1.200000e+06  1.181295e+06  1200000   1.155051e+06        1200000   
unique           NaN           NaN        2            NaN              4   
top              NaN           NaN      Man            NaN    Not Married   
freq             NaN           NaN   602571            NaN         552049   
mean    5.999995e+05  4.114556e+01      NaN   1.664521e+06            NaN   
min     0.000000e+00  1.800000e+01      NaN   1.075000e+01            NaN   
25%     2.999998e+05  3.000000e+01      NaN   3.968939e+05            NaN   
50%     5.999995e+05  4.100000e+01      NaN   8.581660e+05            NaN   
75%     8.999992e+05  5.300000e+01      NaN   1.990566e+06            NaN   
max     1.199999e+06  6.400000e+01      NaN   1.304357e+07            NaN   
std     3.464103e+05  1.353995e+01      NaN   2.115112e+06            NaN   

        Number of Dependents Education Level        Occupation  Health Score  \
count           1.090328e+06         1200000           1200000  1.125924e+06   
unique                   NaN               4                 4           NaN   
top                      NaN   Undergraduate  Full-Time Worker           NaN   
freq                     NaN          627193            373716           NaN   
mean            2.009934e+00             NaN               NaN  3.186879e+01   
min             0.000000e+00             NaN               NaN  2.391713e+00   
25%             1.000000e+00             NaN               NaN  2.209691e+01   
50%             2.000000e+00             NaN               NaN  3.096556e+01   
75%             3.000000e+00             NaN               NaN  4.114583e+01   
max             4.000000e+00             NaN               NaN  6.000000e+01   
std             1.417338e+00             NaN               NaN  1.239609e+01   

       Location Policy Type  Previous Claims  Credit Score  \
count   1200000     1200000    835971.000000  1.062118e+06   
unique        3           3              NaN           NaN   
top      Tier-3     Premium              NaN           NaN   
freq     401542      401846              NaN           NaN   
mean        NaN         NaN         1.002689  5.929244e+02   
min         NaN         NaN         0.000000  3.000000e+02   
25%         NaN         NaN         0.000000  4.680000e+02   
50%         NaN         NaN         1.000000  5.950000e+02   
75%         NaN         NaN         2.000000  7.210000e+02   
max         NaN         NaN         9.000000  8.490000e+02   
std         NaN         NaN         0.982840  1.499819e+02   

        Insurance Duration              Policy Start Date Customer Feedback  \
count         1.199999e+06                        1200000           1122176   
unique                 NaN                            NaN                 3   
top                    NaN                            NaN           Average   
freq                   NaN                            NaN            377905   
mean          5.018219e+00  2022-02-13 05:06:30.972380672               NaN   
min           1.000000e+00     2019-08-17 15:21:39.080371               NaN   
25%           3.000000e+00  2020-11-20 15:21:39.121168896               NaN   
50%           5.000000e+00  2022-02-14 15:21:39.151731968               NaN   
75%           7.000000e+00  2023-05-06 15:21:39.182597120               NaN   
max           9.000000e+00     2024-08-15 15:21:39.287115               NaN   
std           2.594331e+00                            NaN               NaN   

       Smoking Status Exercise Frequency  Property Type  Premium Amount  
count         1200000            1200000        1200000   784968.000000  
unique              2                  4              3             NaN  
top               Yes             Weekly  Detached Home             NaN  
freq           601873             306179         400349             NaN  
mean              NaN                NaN            NaN    25763.411424  
min               NaN                NaN            NaN      292.650059  
25%               NaN                NaN            NaN     6840.682284  
50%               NaN                NaN            NaN    14824.932460  
75%               NaN                NaN            NaN    31316.333081  
max               NaN                NaN            NaN   240000.000000  
std               NaN                NaN            NaN    30563.216524

# Unique Values for Each Variable
print("\n### Unique Values for Each Variable ###")
for column in data.columns.tolist():
    print(f"No. of unique values in {column}: {data[column].nunique()}.")

### Unique Values for Each Variable ###
No. of unique values in id: 1200000.
No. of unique values in Age: 47.
No. of unique values in Gender: 2.
No. of unique values in Annual Income: 247760.
No. of unique values in Marital Status: 4.
No. of unique values in Number of Dependents: 5.
No. of unique values in Education Level: 4.
No. of unique values in Occupation: 4.
No. of unique values in Health Score: 923518.
No. of unique values in Location: 3.
No. of unique values in Policy Type: 3.
No. of unique values in Previous Claims: 10.
No. of unique values in Credit Score: 550.
No. of unique values in Insurance Duration: 9.
No. of unique values in Policy Start Date: 167381.
No. of unique values in Customer Feedback: 3.
No. of unique values in Smoking Status: 2.
No. of unique values in Exercise Frequency: 4.
No. of unique values in Property Type: 3.
No. of unique values in Premium Amount: 784492.

# Copying the dataset for analysis
data = data.copy()

# Checking basic stats
print("Dataset Shape:", data.shape)
print("Dataset Columns:", data.columns)

Dataset Shape: (1200000, 20)
Dataset Columns: Index(['id', 'Age', 'Gender', 'Annual Income', 'Marital Status',
       'Number of Dependents', 'Education Level', 'Occupation', 'Health Score',
       'Location', 'Policy Type', 'Previous Claims', 'Credit Score',
       'Insurance Duration', 'Policy Start Date', 'Customer Feedback',
       'Smoking Status', 'Exercise Frequency', 'Property Type',
       'Premium Amount'],
      dtype='object')

# Ensure 'Policy Start Date' is in datetime format
data["Policy Start Date"] = pd.to_datetime(data["Policy Start Date"], errors='coerce')

# 1️⃣ Marital & Dependents Status
data["Marital & Dependents Status"] = np.where(
    (data["Marital Status"] == "Not Married") & (data["Number of Dependents"] == 0),
    "Single",
    "Family"
)

# 2️⃣ Risk Score (Health Score divided by Credit Score)
data["Risk Score"] = data["Health Score"] / data["Credit Score"]

# 3️⃣ Premium Category (Categorizing based on Premium Amount)
data["Premium Category"] = data["Premium Amount"].apply(lambda x: "High" if x > 10000 else "Low")

# 4️⃣ Policy Age (Years) (Calculating how old the policy is)
data["Policy Age (Years)"] = (pd.to_datetime("today") - data["Policy Start Date"]).dt.days // 365

# 5️⃣ Financial Responsibility Score (Income divided by dependents +1 to avoid division by zero)
data["Financial Responsibility Score"] = (data["Annual Income"] / (data["Number of Dependents"] + 1)).fillna(0)

# 6️⃣ Healthy Lifestyle Score (Combining Exercise Frequency & Smoking Status)
exercise_map = {"Daily": 5, "Weekly": 3, "Monthly": 1, "None": 0}
smoking_map = {"Yes": -2, "No": 0}

data["Healthy Lifestyle Score"] = data["Exercise Frequency"].map(exercise_map).fillna(0) + data["Smoking Status"].map(smoking_map).fillna(0)

# 7️⃣ Claim Frequency (Previous Claims divided by Insurance Duration)
data["Claim Frequency"] = data["Previous Claims"] / data["Insurance Duration"]

# Display the first few rows to verify
print(data.head())

   id   Age Gender  Annual Income    Marital Status  Number of Dependents  \
0   0  19.0  Woman   8.642140e+05    Spouse Present                   1.0   
1   1  39.0  Woman   8.927012e+05    Spouse Present                   3.0   
2   2  23.0    Man   2.201772e+06  Formerly Married                   3.0   
3   3  21.0    Man   3.997542e+06    Spouse Present                   2.0   
4   4  21.0    Man   3.409986e+06       Not Married                   1.0   

  Education Level Occupation  Health Score Location  ... Exercise Frequency  \
0   Undergraduate   Business     26.598761   Tier-1  ...             Weekly   
1   Post Graduate    Missing     21.569731   Tier-2  ...            Monthly   
2   Undergraduate   Business     50.177549   Tier-3  ...             Weekly   
3   Undergraduate    Missing     16.938144   Tier-2  ...              Daily   
4   Undergraduate   Business     24.376094   Tier-2  ...             Weekly   

   Property Type  Premium Amount  Marital & Dependents Status Risk Score  \
0  Detached Home     1945.913327                       Family   0.071502   
1  Detached Home    10908.896072                       Family   0.031080   
2  Detached Home    21563.135198                       Family        NaN   
3           Flat     2653.539143                       Family   0.046153   
4  Detached Home     1269.243463                       Family   0.040763   

  Premium Category Policy Age (Years) Financial Responsibility Score  \
0              Low                  1                   4.321070e+05   
1             High                  1                   2.231753e+05   
2             High                  1                   5.504430e+05   
3              Low                  0                   1.332514e+06   
4              Low                  3                   1.704993e+06   

  Healthy Lifestyle Score  Claim Frequency  
0                     3.0         0.400000  
1                    -1.0         0.500000  
2                     1.0         0.333333  
3                     3.0         1.000000  
4                     1.0         0.000000  

[5 rows x 27 columns]

# Dataset Columns
print("Dataset Columns:")
print(data.columns)

Dataset Columns:
Index(['id', 'Age', 'Gender', 'Annual Income', 'Marital Status',
       'Number of Dependents', 'Education Level', 'Occupation', 'Health Score',
       'Location', 'Policy Type', 'Previous Claims', 'Credit Score',
       'Insurance Duration', 'Policy Start Date', 'Customer Feedback',
       'Smoking Status', 'Exercise Frequency', 'Property Type',
       'Premium Amount', 'Marital & Dependents Status', 'Risk Score',
       'Premium Category', 'Policy Age (Years)',
       'Financial Responsibility Score', 'Healthy Lifestyle Score',
       'Claim Frequency'],
      dtype='object')

data.columns = data.columns.str.strip()

print(data.head())

   id   Age Gender  Annual Income    Marital Status  Number of Dependents  \
0   0  19.0  Woman   8.642140e+05    Spouse Present                   1.0   
1   1  39.0  Woman   8.927012e+05    Spouse Present                   3.0   
2   2  23.0    Man   2.201772e+06  Formerly Married                   3.0   
3   3  21.0    Man   3.997542e+06    Spouse Present                   2.0   
4   4  21.0    Man   3.409986e+06       Not Married                   1.0   

  Education Level Occupation  Health Score Location  ... Exercise Frequency  \
0   Undergraduate   Business     26.598761   Tier-1  ...             Weekly   
1   Post Graduate    Missing     21.569731   Tier-2  ...            Monthly   
2   Undergraduate   Business     50.177549   Tier-3  ...             Weekly   
3   Undergraduate    Missing     16.938144   Tier-2  ...              Daily   
4   Undergraduate   Business     24.376094   Tier-2  ...             Weekly   

   Property Type  Premium Amount  Marital & Dependents Status Risk Score  \
0  Detached Home     1945.913327                       Family   0.071502   
1  Detached Home    10908.896072                       Family   0.031080   
2  Detached Home    21563.135198                       Family        NaN   
3           Flat     2653.539143                       Family   0.046153   
4  Detached Home     1269.243463                       Family   0.040763   

  Premium Category Policy Age (Years) Financial Responsibility Score  \
0              Low                  1                   4.321070e+05   
1             High                  1                   2.231753e+05   
2             High                  1                   5.504430e+05   
3              Low                  0                   1.332514e+06   
4              Low                  3                   1.704993e+06   

  Healthy Lifestyle Score  Claim Frequency  
0                     3.0         0.400000  
1                    -1.0         0.500000  
2                     1.0         0.333333  
3                     3.0         1.000000  
4                     1.0         0.000000  

[5 rows x 27 columns]

def detect_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = data[(data[column] < lower_bound) | (data[column] > upper_bound)]
    return outliers[[column]]

# Detecting outliers for key columns
outliers_dict = {}
for col in ["Age", "Annual Income", "Number of Dependents", "Health Score",
            "Previous Claims", "Credit Score", "Insurance Duration", "Premium Amount"]:
    outliers_dict[col] = detect_outliers_iqr(data, col)

# Display outliers count for each column
for col, outliers in outliers_dict.items():
    print(f"Column: {col} → Outliers Count: {len(outliers)}")

Column: Age → Outliers Count: 0
Column: Annual Income → Outliers Count: 108267
Column: Number of Dependents → Outliers Count: 0
Column: Health Score → Outliers Count: 0
Column: Previous Claims → Outliers Count: 369
Column: Credit Score → Outliers Count: 0
Column: Insurance Duration → Outliers Count: 0
Column: Premium Amount → Outliers Count: 68621

from scipy.stats import zscore

# Compute Z-scores
data_numeric = data[["Age", "Annual Income", "Number of Dependents", "Health Score",
                 "Previous Claims", "Credit Score", "Insurance Duration", "Premium Amount"]]

z_scores = np.abs(zscore(data_numeric))

# Get rows where any value has a Z-score above 3
outlier_rows = data[(z_scores > 3).any(axis=1)]
print("Total Outliers Detected (Z-Score Method):", len(outlier_rows))

Total Outliers Detected (Z-Score Method): 0

# List of numerical columns
num_cols = ["Age", "Annual Income", "Number of Dependents", "Health Score",
            "Previous Claims", "Credit Score", "Insurance Duration", "Premium Amount"]

# Plot boxplots
plt.figure(figsize=(12, 6))
for i, col in enumerate(num_cols, 1):
    plt.subplot(2, 4, i)
    sns.boxplot(y=data[col])
    plt.title(col)

plt.tight_layout()
plt.show()

# Creating a dictionary to store metrics
metrics = {
    "Total Records": [data.shape[0]],
    "Total Columns": [data.shape[1]],
    "Missing Values (%)": [data.isnull().sum().sum() / (data.shape[0] * data.shape[1]) * 100],
    "Unique Customers": [data["id"].nunique()],
    "Average Age": [data["Age"].mean()],
    "Median Annual Income": [data["Annual Income"].median()],
    "Average Credit Score": [data["Credit Score"].mean()],
    "Premium Amount (Mean)": [data["Premium Amount"].mean()],
    "Premium Amount (Median)": [data["Premium Amount"].median()],
    "Health Score (Avg)": [data["Health Score"].mean()],
    "Average Insurance Duration": [data["Insurance Duration"].mean()],
    "Total Previous Claims": [data["Previous Claims"].sum()],
    "Claim Ratio": [(data["Previous Claims"].sum() / data["Insurance Duration"].sum())],
    "Policy Type Distribution": [data["Policy Type"].value_counts(normalize=True).to_dict()],
    "Smoking Rate (%)": [(data["Smoking Status"] == "Yes").mean() * 100],
    "Exercise Frequency Distribution": [data["Exercise Frequency"].value_counts(normalize=True).to_dict()]
}

# Convert dictionary to DataFrame
metrics_data = pd.DataFrame.from_dict(metrics, orient='index', columns=["Value"])

# Display formatted metrics
print(metrics_data)

                                                                             Value
Total Records                                                              1200000
Total Columns                                                                   27
Missing Values (%)                                                        5.582448
Unique Customers                                                           1200000
Average Age                                                              41.145563
Median Annual Income                                                      858166.0
Average Credit Score                                                     592.92435
Premium Amount (Mean)                                                 25763.411424
Premium Amount (Median)                                                14824.93246
Health Score (Avg)                                                       31.868794
Average Insurance Duration                                                5.018219
Total Previous Claims                                                     838219.0
Claim Ratio                                                               0.139196
Policy Type Distribution         {'Premium': 0.3348716666666667, 'Comprehensive...
Smoking Rate (%)                                                         50.156083
Exercise Frequency Distribution  {'Weekly': 0.25514916666666665, 'Monthly': 0.2...

def calculate_base_premium(age, location_risk, policy_type):
    if age <= 30:
        base = 5000
    elif age <= 50:
        base = 7500
    else:
        base = 12000

    location_factor = 1.2 if location_risk == "High" else 1.0
    policy_factor = 1.5 if policy_type == "Comprehensive" else 1.0

    return base * location_factor * policy_factor

def calculate_risk_adjustment(health_score, smoking, exercise, claims, credit_score):
    adjustment = 1.0

    if health_score < 40:
        adjustment += 0.15
    if smoking == "Yes":
        adjustment += 0.20
    if exercise == "None":
        adjustment += 0.10
    if claims > 2:
        adjustment += 0.25
    if credit_score < 600:
        adjustment += 0.20

    return adjustment

def calculate_total_premium(age, location_risk, policy_type, health_score, smoking, exercise, claims, credit_score):
    base_premium = calculate_base_premium(age, location_risk, policy_type)
    risk_adjustment = calculate_risk_adjustment(health_score, smoking, exercise, claims, credit_score)

    total_premium = base_premium * risk_adjustment
    return round(total_premium, 2)

# Assuming the columns in your DataFrame are named exactly as expected by the function
data['risk_adjusted_premium'] = data.apply(lambda row: calculate_total_premium(
    row['Age'],
    row['Location'],  # Assuming 'Location' represents 'location_risk'
    row['Policy Type'],
    row['Health Score'],
    row['Smoking Status'],  # Assuming 'Smoking Status' represents 'smoking'
    row['Exercise Frequency'],  # Assuming 'Exercise Frequency' represents 'exercise'
    row['Previous Claims'],  # Assuming 'Previous Claims' represents 'claims'
    row['Credit Score']
), axis=1)

# Boxplot: Distribution of risk-adjusted premiums
plt.figure(figsize=(10, 5))
sns.boxplot(y=data["risk_adjusted_premium"], palette="coolwarm")
plt.title("Distribution of Risk-Adjusted Premiums")
plt.ylabel("Risk-Adjusted Premium")
plt.show()

# Histogram: Frequency distribution of risk-adjusted premiums
plt.figure(figsize=(10, 5))
sns.histplot(data["risk_adjusted_premium"], bins=30, kde=True, color="skyblue")
plt.title("Risk-Adjusted Premium Distribution")
plt.xlabel("Risk-Adjusted Premium")
plt.ylabel("Frequency")
plt.show()

<ipython-input-87-44c48d97a448>:3: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(y=data["risk_adjusted_premium"], palette="coolwarm")

sns.histplot(data['Premium Amount'], bins=30, kde=True)
plt.title("Distribution of Premium Amounts")
plt.xlabel("Premium Amount")
plt.ylabel("Exercise Frequency")
plt.show()

sns.histplot(data['Age'], bins=20, kde=True)
plt.title("Age Distribution of Customers")
plt.xlabel("Age")
plt.ylabel("Exercise Frequency")
plt.show()

sns.histplot(data['Health Score'], bins=20, kde=True)
plt.title("Health Score Distribution")
plt.xlabel("Health Score")
plt.ylabel("Exercise Frequency")
plt.show()

# Group data by Age and calculate the average Premium Amount
age_premium_avg = data.groupby("Age")["Premium Amount"].mean().reset_index()

plt.figure(figsize=(10, 5))
sns.lineplot(x=age_premium_avg["Age"], y=age_premium_avg["Premium Amount"], marker="o")
plt.title("Average Premium Amount by Age")
plt.xlabel("Age")
plt.ylabel("Average Premium Amount")
plt.grid(True)
plt.show()

sns.boxplot(x=data['Policy Type'], y=data['Premium Amount'])
plt.title("Policy Type vs. Premium Amount")
plt.xlabel("Policy Type")
plt.ylabel("Premium Amount")
plt.show()

sns.boxplot(x=data['Location'], y=data['Premium Amount'])
plt.title("Location Tier vs. Premium Amount")
plt.xlabel("Location Tier")
plt.ylabel("Premium Amount")
plt.show()

data.groupby('Marital Status')['Premium Amount'].sum().plot(kind='pie', autopct='%1.1f%%')
plt.title("Premium Distribution by Marital Status")
plt.show()

plt.figure(figsize=(12, 6))
sns.heatmap(data.select_dtypes(include=np.number).corr(), annot=True, cmap='coolwarm', fmt=".2f") # Select only numeric columns
plt.title("Feature Correlation Matrix")
plt.show()

# Group by dependents and calculate the average premium
dependents_premium = data.groupby('Number of Dependents')['Premium Amount'].mean()

# Create trend line
z = np.polyfit(dependents_premium.index, dependents_premium.values, 1)
p = np.poly1d(z)

# Plot the line chart
plt.plot(dependents_premium.index, dependents_premium.values, marker='o', linestyle='-', color='blue', label="Avg Premium")

# Plot the trend line
plt.plot(dependents_premium.index, p(dependents_premium.index), linestyle='--', color='red', label="Trend Line")

# Labels and title
plt.title("Number of Dependents vs. Average Premium Amount")
plt.xlabel("Number of Dependents")
plt.ylabel("Average Premium Amount")
plt.legend()
plt.grid(True)
plt.show()

sns.boxplot(x=data['Exercise Frequency'], y=data['Health Score'])
plt.title("Exercise Frequency vs. Health Score")
plt.xlabel("Exercise Frequency")
plt.ylabel("Health Score")
plt.show()

data.groupby(pd.cut(data['Credit Score'], bins=5))['Previous Claims'].mean().plot(kind='bar', color='green')
plt.title("Credit Score vs. Average Previous Claims")
plt.xlabel("Credit Score Range")
plt.ylabel("Average Number of Claims")
plt.show()

<ipython-input-98-a54bc60fabab>:1: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  data.groupby(pd.cut(data['Credit Score'], bins=5))['Previous Claims'].mean().plot(kind='bar', color='green')

pd.crosstab(data['Marital Status'], data['Policy Type']).plot(kind='bar', stacked=True, colormap='viridis')
plt.title("Policy Type Preference by Marital Status")
plt.xlabel("Marital Status")
plt.ylabel("Count of Customers")
plt.legend(title="Policy Type")
plt.show()

plt.figure(figsize=(8, 5))
sns.histplot(data["Policy Age (Years)"], bins=20, kde=True, color="red")
plt.title("Distribution of Policy Age")
plt.xlabel("Policy Age (Years)")
plt.ylabel("Number of Customers")
plt.show()

plt.figure(figsize=(8, 5))
sns.violinplot(data=data, x="Healthy Lifestyle Score", y="Premium Amount", palette="coolwarm")
plt.title("Healthy Lifestyle Score vs. Premium Amount")
plt.xlabel("Healthy Lifestyle Score")
plt.ylabel("Premium Amount")
plt.show()

<ipython-input-101-c2c77587bceb>:2: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(data=data, x="Healthy Lifestyle Score", y="Premium Amount", palette="coolwarm")

from scipy.stats import pearsonr

# Remove NaN and infinite values
data = data.replace([np.inf, -np.inf], np.nan)
data = data.dropna(subset=['Age', 'Premium Amount', 'Credit Score'])

# Scatter plot with regression line (Age vs. Premium Amount)
plt.figure(figsize=(8, 5))
sns.regplot(data=data, x="Age", y="Premium Amount", scatter_kws={'alpha': 0.5}, line_kws={'color': 'red'})
plt.title("Age vs. Premium Amount")
plt.xlabel("Customer Age")
plt.ylabel("Premium Amount")
plt.show()

# Compute Pearson correlation (Age & Premium Amount)
corr_age, p_age = pearsonr(data['Age'], data['Premium Amount'])
print(f"Correlation between Age and Premium Amount: {corr_age:.2f}, p-value: {p_age:.4f}")

# Scatter plot (Credit Score vs. Premium Amount)
plt.figure(figsize=(8, 5))
sns.regplot(data=data, x="Credit Score", y="Premium Amount", scatter_kws={'alpha': 0.5}, line_kws={'color': 'green'})
plt.title("Credit Score vs. Premium Amount")
plt.xlabel("Credit Score")
plt.ylabel("Premium Amount")
plt.show()

# Compute Pearson correlation (Credit Score & Premium Amount)
corr_credit, p_credit = pearsonr(data['Credit Score'], data['Premium Amount'])
print(f"Correlation between Credit Score and Premium Amount: {corr_credit:.2f}, p-value: {p_credit:.4f}")

# Grouped analysis: Average premium per age group & credit score category
data["Age Group"] = pd.cut(data["Age"], bins=[18, 30, 40, 50, 60, 80], labels=["18-30", "31-40", "41-50", "51-60", "61-80"])
data["Credit Category"] = pd.cut(data["Credit Score"], bins=[300, 500, 650, 750, 850], labels=["Poor", "Fair", "Good", "Excellent"])

# Average Premium by Age Group & Credit Score Category
age_credit_premium = data.groupby(["Age Group", "Credit Category"])["Premium Amount"].mean().unstack()

# Heatmap for Age Group vs. Credit Score & Premium Amount
plt.figure(figsize=(8, 5))
sns.heatmap(age_credit_premium, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Average Premium Amount by Age Group and Credit Score")
plt.xlabel("Credit Score Category")
plt.ylabel("Age Group")
plt.show()

Correlation between Age and Premium Amount: 0.25, p-value: 0.0000

Correlation between Credit Score and Premium Amount: -0.07, p-value: 0.0000

<ipython-input-102-37867fe3de62>:36: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  age_credit_premium = data.groupby(["Age Group", "Credit Category"])["Premium Amount"].mean().unstack()

from scipy.stats import f_oneway

# Data Cleaning: Remove missing or invalid values
data = data.dropna(subset=['Education Level', 'Occupation', 'Premium Amount'])

# Box Plot: Premium Distribution by Education Level & Occupation
plt.figure(figsize=(12, 6))
sns.boxplot(data=data, x="Education Level", y="Premium Amount", hue="Occupation", palette="Set2")
plt.title("Premium Amount Distribution by Education Level & Occupation")
plt.xlabel("Education Level")
plt.ylabel("Premium Amount")
plt.xticks(rotation=45)
plt.legend(title="Occupation", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.show()

# Compute Average Premium per Education Level & Occupation
education_occupation_premium = data.groupby(["Education Level", "Occupation"])["Premium Amount"].mean().reset_index()

# Bar Plot: Average Premium by Education Level & Occupation
plt.figure(figsize=(12, 6))
sns.barplot(data=education_occupation_premium, x="Education Level", y="Premium Amount", hue="Occupation", palette="Blues_d")
plt.title("Average Premium Amount by Education Level & Occupation")
plt.xlabel("Education Level")
plt.ylabel("Average Premium Amount")
plt.xticks(rotation=45)
plt.legend(title="Occupation", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.show()

from scipy.stats import pearsonr, f_oneway

# Remove NaN and infinite values
data = data.replace([np.inf, -np.inf], np.nan)
data = data.dropna(subset=['Annual Income', 'Premium Amount', 'Credit Score', 'Health Score'])

# Scatter plot: Annual Income vs. Premium Amount with regression line
plt.figure(figsize=(8,5))
sns.regplot(data=data, x="Annual Income", y="Premium Amount", scatter_kws={'alpha':0.5}, line_kws={'color':'red'})
plt.title("Annual Income vs. Premium Amount")
plt.xlabel("Annual Income")
plt.ylabel("Premium Amount")
plt.show()

# Compute Pearson correlation for multiple variables
corr_income, p_income = pearsonr(data['Annual Income'], data['Premium Amount'])
corr_credit, p_credit = pearsonr(data['Credit Score'], data['Premium Amount'])
corr_health, p_health = pearsonr(data['Health Score'], data['Premium Amount'])

print(f"Correlation between Annual Income and Premium Amount: {corr_income:.2f}, p-value: {p_income:.4f}")
print(f"Correlation between Credit Score and Premium Amount: {corr_credit:.2f}, p-value: {p_credit:.4f}")
print(f"Correlation between Health Score and Premium Amount: {corr_health:.2f}, p-value: {p_health:.4f}")

# Grouped analysis: Average premium per income group
data["Income Group"] = pd.cut(data["Annual Income"], bins=[0, 30000, 60000, 100000, 200000, np.inf],
                            labels=["<30K", "30K-60K", "60K-100K", "100K-200K", "200K+"])

income_premium = data.groupby("Income Group")["Premium Amount"].mean().reset_index()

# Bar plot for Income Groups vs. Average Premium
plt.figure(figsize=(8,5))
sns.barplot(data=income_premium, x="Income Group", y="Premium Amount", palette="Blues_d")
plt.title("Average Premium Amount by Income Group")
plt.xlabel("Income Group")
plt.ylabel("Average Premium Amount")
plt.show()

Correlation between Annual Income and Premium Amount: 0.01, p-value: 0.0000
Correlation between Credit Score and Premium Amount: -0.07, p-value: 0.0000
Correlation between Health Score and Premium Amount: 0.17, p-value: 0.0000

<ipython-input-104-b980c95f10e2>:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  income_premium = data.groupby("Income Group")["Premium Amount"].mean().reset_index()
<ipython-input-104-b980c95f10e2>:32: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=income_premium, x="Income Group", y="Premium Amount", palette="Blues_d")

# Remove NaN and infinite values
data = data.replace([np.inf, -np.inf], np.nan)
data = data.dropna(subset=['Credit Score', 'Health Score', 'Premium Amount'])

# Create categorical bins for Credit Score and Health Score
data["Credit Score Group"] = pd.cut(data["Credit Score"], bins=[300, 500, 700, 900],
                                    labels=["Low (300-500)", "Medium (500-700)", "High (700-900)"])

data["Health Score Group"] = pd.cut(data["Health Score"], bins=[0, 40, 70, 100],
                                    labels=["Poor (0-40)", "Average (40-70)", "Good (70-100)"])

# Box Plot: Premium Amount Distribution across Credit Score & Health Score Groups
plt.figure(figsize=(10,6))
sns.boxplot(data=data, x="Credit Score Group", y="Premium Amount", hue="Health Score Group", palette="coolwarm")
plt.title("Premium Amount Distribution by Credit Score & Health Score Group")
plt.xlabel("Credit Score Group")
plt.ylabel("Premium Amount")
plt.legend(title="Health Score Group")
plt.show()

import scipy.stats as stats

# Remove NaN values
data = data.dropna(subset=['Smoking Status', 'Premium Amount', 'Age'])

# Create Age Groups
bins = [18, 30, 45, 60, np.inf]  # Age brackets
labels = ["18-30", "30-45", "45-60", "60+"]
data["Age Group"] = pd.cut(data["Age"], bins=bins, labels=labels, right=False)

# Box Plot: Premium Amount by Smoking Status & Age Group
plt.figure(figsize=(10, 6))
sns.boxplot(data=data, x="Smoking Status", y="Premium Amount", hue="Age Group", palette="coolwarm")
plt.title("Premium Amount Distribution by Smoking Status & Age Group")
plt.xlabel("Smoking Status (Non-Smoker vs. Smoker)")
plt.ylabel("Premium Amount")
plt.legend(title="Age Group")
plt.show()

# Bar Plot: Average Premium by Smoking Status & Age Group
avg_premium = data.groupby(["Smoking Status", "Age Group"])["Premium Amount"].mean().reset_index()

plt.figure(figsize=(10, 6))
sns.barplot(data=avg_premium, x="Smoking Status", y="Premium Amount", hue="Age Group", palette="Reds_d")
plt.title("Average Premium Amount by Smoking Status & Age Group")
plt.xlabel("Smoking Status (Non-Smoker vs. Smoker)")
plt.ylabel("Average Premium Amount")
plt.legend(title="Age Group")
plt.show()

<ipython-input-106-ed8912f6ea39>:21: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  avg_premium = data.groupby(["Smoking Status", "Age Group"])["Premium Amount"].mean().reset_index()

import scipy.stats as stats
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Remove NaN values
data = data.dropna(subset=['Exercise Frequency', 'Age Group', 'Premium Amount'])

# Box plot to visualize premium differences based on exercise frequency and age group
plt.figure(figsize=(10, 6))
sns.boxplot(data=data, x="Exercise Frequency", y="Premium Amount", hue="Age Group", palette="coolwarm")
plt.title("Distribution of Premium Amount by Exercise Frequency & Age Group")
plt.xlabel("Exercise Frequency (Low to High)")
plt.ylabel("Premium Amount")
plt.legend(title="Age Group")
plt.show()

# Bar plot for average premium by exercise frequency and age group
avg_premium = data.groupby(["Exercise Frequency", "Age Group"])["Premium Amount"].mean().reset_index()

plt.figure(figsize=(10, 6))
sns.barplot(data=avg_premium, x="Exercise Frequency", y="Premium Amount", hue="Age Group", palette="Greens_d")
plt.title("Average Premium Amount by Exercise Frequency & Age Group")
plt.xlabel("Exercise Frequency (Low to High)")
plt.ylabel("Average Premium Amount")
plt.legend(title="Age Group")
plt.show()

<ipython-input-107-6b596418ab5a>:19: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  avg_premium = data.groupby(["Exercise Frequency", "Age Group"])["Premium Amount"].mean().reset_index()

# Set figure size
plt.figure(figsize=(10, 6))

# Create a violin plot with three columns: Policy Type, Premium Amount, and Age Group
sns.violinplot(data=data, x="Policy Type", y="Premium Amount", hue="Age Group", palette="muted", inner="quartile", split=True)

# Add labels and title
plt.title("Premium Distribution by Policy Type & Age Group")
plt.xlabel("Policy Type")
plt.ylabel("Premium Amount")
plt.legend(title="Age Group")

# Show plot
plt.show()

# Set figure size
plt.figure(figsize=(10, 6))

# Loop through age groups to plot separate KDEs
age_groups = ["18-30", "30-45", "45-60", "60+"]
colors = ["blue", "green", "orange", "red"]

for age, color in zip(age_groups, colors):
    sns.kdeplot(data=data[(data["Policy Type"] == "Comprehensive") & (data["Age Group"] == age)]["Premium Amount"],
                label=f"Comprehensive - {age}", shade=True, color=color, linestyle="dashed")
    sns.kdeplot(data=data[(data["Policy Type"] == "Basic") & (data["Age Group"] == age)]["Premium Amount"],
                label=f"Basic - {age}", shade=True, color=color)

# Add labels and title
plt.title("Density Plot of Premium Amounts by Policy Type & Age Group")
plt.xlabel("Premium Amount")
plt.ylabel("Density")
plt.legend()
plt.show()

<ipython-input-109-10fb98385408>:9: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(data=data[(data["Policy Type"] == "Comprehensive") & (data["Age Group"] == age)]["Premium Amount"],
<ipython-input-109-10fb98385408>:11: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(data=data[(data["Policy Type"] == "Basic") & (data["Age Group"] == age)]["Premium Amount"],
<ipython-input-109-10fb98385408>:9: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(data=data[(data["Policy Type"] == "Comprehensive") & (data["Age Group"] == age)]["Premium Amount"],
<ipython-input-109-10fb98385408>:11: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(data=data[(data["Policy Type"] == "Basic") & (data["Age Group"] == age)]["Premium Amount"],
<ipython-input-109-10fb98385408>:9: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(data=data[(data["Policy Type"] == "Comprehensive") & (data["Age Group"] == age)]["Premium Amount"],
<ipython-input-109-10fb98385408>:11: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(data=data[(data["Policy Type"] == "Basic") & (data["Age Group"] == age)]["Premium Amount"],
<ipython-input-109-10fb98385408>:9: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(data=data[(data["Policy Type"] == "Comprehensive") & (data["Age Group"] == age)]["Premium Amount"],
<ipython-input-109-10fb98385408>:11: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(data=data[(data["Policy Type"] == "Basic") & (data["Age Group"] == age)]["Premium Amount"],

# Remove NaN and infinite values
data = data.replace([np.inf, -np.inf], np.nan)
data = data.dropna(subset=['Insurance Duration', 'Premium Amount', 'Customer Feedback'])

# Violin Plot: Premium Distribution by Insurance Duration & Customer Feedback
plt.figure(figsize=(10, 6))
sns.violinplot(data=data, x="Insurance Duration", y="Premium Amount", hue="Customer Feedback",
               palette="coolwarm", split=True, inner="quartile")

# Add labels and title
plt.title("Premium Distribution by Insurance Duration & Customer Feedback")
plt.xlabel("Insurance Duration (Years)")
plt.ylabel("Premium Amount")
plt.legend(title="Customer Feedback")
plt.show()

import scipy.stats as stats

# Remove NaN and infinite values
data = data.replace([np.inf, -np.inf], np.nan)
data = data.dropna(subset=['Annual Income', 'Premium Amount'])

# Creating Income Groups for better analysis
data["Income Group"] = pd.cut(data["Annual Income"], bins=[0, 50000, 100000, 200000, 500000],
                            labels=["Low (0-50K)", "Mid (50K-100K)", "High (100K-200K)", "Very High (200K-500K)"])

# KDE Plot: Density of Premiums by Income Group
plt.figure(figsize=(8, 5))
sns.kdeplot(data=data, x="Premium Amount", hue="Income Group", fill=True, alpha=0.5)
plt.title("Density of Premium Amounts Across Income Groups")
plt.xlabel("Premium Amount")
plt.ylabel("Density")
plt.show()

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Remove NaN and infinite values
data = data.replace([np.inf, -np.inf], np.nan)
data = data.dropna(subset=['Property Type', 'Premium Amount', 'Location'])

# Boxen Plot: Premium Distribution by Property Type & Location
plt.figure(figsize=(10, 6))
sns.boxenplot(data=data, x="Property Type", y="Premium Amount", hue="Location", palette="coolwarm")
plt.title("Premium Distribution by Property Type & Location")
plt.xlabel("Property Type")
plt.ylabel("Premium Amount")
plt.xticks(rotation=45)
plt.legend(title="Location", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

# KDE Plot: Density of Premium Amounts by Property Type & Location
plt.figure(figsize=(10, 6))

# Create a dictionary to map location to linestyle
location_linestyle_map = {
    data['Location'].unique()[0]: '-',
    data['Location'].unique()[1]: '--',
    data['Location'].unique()[2]: ':',
}


for location in data['Location'].unique():
    sns.kdeplot(
        data=data[data['Location'] == location],
        x="Premium Amount",
        hue="Property Type",
        fill=True,
        alpha=0.5,
        linestyle=location_linestyle_map.get(location, '-'),  # Use get with a default
        label=f"{location}"  # Add labels to the legend
    )

plt.title("Density of Premium Amounts by Property Type & Location")
plt.xlabel("Premium Amount")
plt.ylabel("Density")
plt.legend(title="Property Type & Location", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Convert 'Policy Start Date' to datetime format
data["Policy Start Date"] = pd.to_datetime(data["Policy Start Date"], errors="coerce")

# Extract Month from 'Policy Start Date'
data["Policy Start Month"] = data["Policy Start Date"].dt.month

# Aggregate policy counts per month, property type, and location
monthly_sales = data.groupby(["Policy Start Month", "Property Type", "Location"]).size().reset_index(name="Policy Count")

# Line Plot: Monthly Policy Sales Trend by Property Type
plt.figure(figsize=(12, 6))
sns.lineplot(x="Policy Start Month", y="Policy Count", hue="Property Type", data=monthly_sales, marker="o", palette="tab10")
plt.xticks(ticks=range(1, 13), labels=["Jan", "Feb", "Mar", "Apr", "May", "Jun",
                                        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"])
plt.title("Monthly Policy Purchases Trend by Property Type")
plt.xlabel("Month")
plt.ylabel("Number of Policies Sold")
plt.grid(True)
plt.legend(title="Property Type")
plt.show()

# Bar Plot: Monthly Policy Sales by Location
plt.figure(figsize=(12, 6))
sns.barplot(x="Policy Start Month", y="Policy Count", hue="Location", data=monthly_sales, palette="coolwarm")
plt.xticks(ticks=range(1, 13), labels=["Jan", "Feb", "Mar", "Apr", "May", "Jun",
                                        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"])
plt.title("Policy Purchases by Month and Location")
plt.xlabel("Month")
plt.ylabel("Number of Policies Sold")
plt.legend(title="Location")
plt.show()

	id	Age	Gender	Annual Income	Marital Status	Number of Dependents	Education Level	Occupation	Health Score	Location	Policy Type	Previous Claims	Credit Score	Insurance Duration	Policy Start Date	Customer Feedback	Smoking Status	Exercise Frequency	Property Type	Premium Amount
0	0	19.0	Woman	8.642140e+05	Spouse Present	1.0	Undergraduate	Business	26.598761	Tier-1	Premium	2.0	372.0	5.0	2023-12-23 15:21:39.134960	Poor	No	Weekly	Detached Home	1945.913327
1	1	39.0	Woman	8.927012e+05	Spouse Present	3.0	Post Graduate	Missing	21.569731	Tier-2	Comprehensive	1.0	694.0	2.0	2023-06-12 15:21:39.111551	Average	Yes	Monthly	Detached Home	10908.896072
2	2	23.0	Man	2.201772e+06	Formerly Married	3.0	Undergraduate	Business	50.177549	Tier-3	Premium	1.0	NaN	3.0	2023-09-30 15:21:39.221386	Good	Yes	Weekly	Detached Home	21563.135198
3	3	21.0	Man	3.997542e+06	Spouse Present	2.0	Undergraduate	Missing	16.938144	Tier-2	Basic	1.0	367.0	1.0	2024-06-12 15:21:39.226954	Poor	Yes	Daily	Flat	2653.539143
4	4	21.0	Man	3.409986e+06	Not Married	1.0	Undergraduate	Business	24.376094	Tier-2	Premium	0.0	598.0	4.0	2021-12-01 15:21:39.252145	Poor	Yes	Weekly	Detached Home	1269.243463

Acko Health Insurance: Data-Driven Insurance Premium Pricing Strategy¶

Acko Health Insurance product and Premimum pricing Analysis¶

Introduction:¶

1. Data Understanding¶

2. Data Preparation¶

3. Exploratory Data Analysis (EDA)¶

4. Charting and Insights¶

5. Insights and Recommendations¶

Unlocking Customer Demand and Optimizing Pricing in the Health Insurance Market¶

Objectives:¶

Business Impact¶

Dataset Overview¶

Dataset Overview¶

Column Definitions¶

Analysis & Visualisation !¶

1. Importing and Cleaning Data¶

Importing Necessary Libraries¶

Loading the Dataset from google drive¶

Viewing the First Few Rows of the Dataset¶

Checking the Shape of the Dataset¶

Random Sample¶

Displaying Dataset Information¶

Data Type Corrections¶

Checking for Duplicate Values in the Dataset¶

Checking for Missing/Null Values¶

Summary of Dataset Observations¶

2. Data Types¶

Unique Values for each variable.¶

3. Data Wrangling¶

Data Wrangling Code¶

Converting and Creating columns¶

Outliers¶

Visualizing Outliers with Boxplots¶

Metrics¶

Explanation of Metrics¶

Understanding the Pricing Challenge¶

Columns segmentation for Key Insurance Factors¶

1. Customer Personal Information:¶

2. Financial & Professional Details:¶

3. Health & Lifestyle Factors¶

4. Insurance Policy Details¶

5.Others Factors Affecting Premium¶

Risk-Adjusted Premium Calculation: Rule-Based Approach¶

EDA(Exploratory Data Analysis)¶

1. Distribution of Premium Amounts¶

2. Age Distribution¶

3. Health Score Distribution¶

4.Age vs. Premium Amount¶

Premium Amounts by Policy Type¶

Location Tier vs. Premium Amount¶

Marital Status vs. Premium Amount¶

Correlation Analysis (Feature Importance)¶

Dependents vs. Premium Amount¶

Health Score vs. Exercise Frequency¶

Credit Score vs. Previous Claims¶

Marital Status vs. Policy Type Preference¶

Policy Tenure Analysis¶

Healthy Lifestyle Score vs. Premium Amount¶

Hypothesis¶

1.Customer Demographics & Premium Pricing¶

Hypothesis-1 - Impact of Age & Credit Score on Insurance Premiums¶

Conclusion:¶

Hypothesis-2 - Impact of Education Level on Premium Amount: Distribution, Averages, and Statistical Analysis¶

Hypothesis:3 - Annual Income on Premium Amount and Pricing Strategies¶

2 . Risk Factors & Adjustments¶

Hypothesis-1- Box Plot for Premium Distribution by Credit Score & Health Score Groups¶

Hypothesis-2 - Smoking Status and Age on Insurance Premium Amounts¶

Hypothesis-3 -Exercise Frequency on Insurance Premium Discounts¶

3. Policy Type & Affordability¶

Hypothesis -1 - Premium Distribution Across Policy Types & Age Groups¶

Conclusion:¶

KDE Plot: Premium Distribution by Policy Type & Age Group¶

Hypothesis-2 - Distribution of Premiums by Insurance Duration¶

4 . Pricing Strategy & Business Insights¶

Hypothesis -1 -Income Levels on Premium Sensitivity and Dynamic Pricing¶

Hypothesis-2 - Property Type on Premium Amounts¶

Hypoyhesis-3 - Seasonal Trends in Policy Purchases: Analyzing Monthly Sales Patterns¶

Key Findings & Recommendations: Insurance Premium Analysis¶

Key Findings:¶