%pip install pandas
import matplotlib.pyplot as plt
import seaborn as sns

Requirement already satisfied: pandas in /opt/anaconda3/lib/python3.12/site-packages (2.2.2)
Requirement already satisfied: numpy>=1.26.0 in /opt/anaconda3/lib/python3.12/site-packages (from pandas) (1.26.4)
Requirement already satisfied: python-dateutil>=2.8.2 in /opt/anaconda3/lib/python3.12/site-packages (from pandas) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in /opt/anaconda3/lib/python3.12/site-packages (from pandas) (2024.1)
Requirement already satisfied: tzdata>=2022.7 in /opt/anaconda3/lib/python3.12/site-packages (from pandas) (2023.3)
Requirement already satisfied: six>=1.5 in /opt/anaconda3/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)
Note: you may need to restart the kernel to use updated packages.

import pandas as pd

file_path="./bumble.csv"

df=pd.read_csv(file_path)

df_info=df.info()
df_head=df.head()
df_info,df_head

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59946 entries, 0 to 59945
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          59946 non-null  int64  
 1   status       59946 non-null  object 
 2   gender       59946 non-null  object 
 3   body_type    54650 non-null  object 
 4   diet         35551 non-null  object 
 5   drinks       56961 non-null  object 
 6   education    53318 non-null  object 
 7   ethnicity    54266 non-null  object 
 8   height       59943 non-null  float64
 9   income       59946 non-null  int64  
 10  job          51748 non-null  object 
 11  last_online  59946 non-null  object 
 12  location     59946 non-null  object 
 13  pets         40025 non-null  object 
 14  religion     39720 non-null  object 
 15  sign         48890 non-null  object 
 16  speaks       59896 non-null  object 
dtypes: float64(1), int64(2), object(14)
memory usage: 7.8+ MB

(None,
    age     status gender       body_type               diet    drinks  \
 0   22     single      m  a little extra  strictly anything  socially   
 1   35     single      m         average       mostly other     often   
 2   38  available      m            thin           anything  socially   
 3   23     single      m            thin         vegetarian  socially   
 4   29     single      m        athletic                NaN  socially   
 
                            education            ethnicity  height  income  \
 0      working on college/university         asian, white    75.0      -1   
 1              working on space camp                white    70.0   80000   
 2     graduated from masters program                  NaN    68.0      -1   
 3      working on college/university                white    71.0   20000   
 4  graduated from college/university  asian, black, other    66.0      -1   
 
                            job       last_online  \
 0               transportation  2012-06-28-20-30   
 1         hospitality / travel  2012-06-29-21-41   
 2                          NaN  2012-06-27-09-10   
 3                      student  2012-06-28-14-22   
 4  artistic / musical / writer  2012-06-27-21-26   
 
                           location                       pets  \
 0  south san francisco, california  likes dogs and likes cats   
 1              oakland, california  likes dogs and likes cats   
 2        san francisco, california                   has cats   
 3             berkeley, california                 likes cats   
 4        san francisco, california  likes dogs and likes cats   
 
                                    religion  \
 0     agnosticism and very serious about it   
 1  agnosticism but not too serious about it   
 2                                       NaN   
 3                                       NaN   
 4                                       NaN   
 
                                  sign  \
 0                              gemini   
 1                              cancer   
 2  pisces but it doesn&rsquo;t matter   
 3                              pisces   
 4                            aquarius   
 
                                               speaks  
 0                                            english  
 1  english (fluently), spanish (poorly), french (...  
 2                               english, french, c++  
 3                           english, german (poorly)  
 4                                            english  )

!pip install numpy

Requirement already satisfied: numpy in /opt/anaconda3/lib/python3.12/site-packages (1.26.4)

import numpy as np

#for prevent loosing data start copying original data
df_clean=df.copy()

missing_counts = df_clean.isnull().sum()
missing_percentage = (missing_counts / len(df_clean)) * 100

# Combine into a DataFrame for better readability
missing_df = pd.DataFrame({
    "Missing Count": missing_counts,
    "Missing %": missing_percentage.round(2)
}).sort_values(by="Missing %", ascending=False)
missing_percentage

age                        0.000000
status                     0.000000
gender                     0.000000
body_type                  9.061868
diet                      40.714514
drinks                     5.024314
education                 11.085081
ethnicity                  9.583644
height                     0.000000
income                    81.940155
job                       13.926454
last_online                0.000000
location                   0.000000
pets                      33.498385
religion                  34.533064
sign                      18.331381
speaks                     0.081638
days_since_last_online     0.000000
age_group                  0.000000
age_scaled                 0.000000
height_scaled              0.000000
income_scaled              0.000000
dtype: float64

df_clean['last_online']=pd.to_datetime(df_clean['last_online'],format="%Y-%m-%d-%H-%M",errors='coerce')
#replace -1 with NaN in income
df_clean['income']=df_clean['income'].replace(-1,np.nan)

#drop duplicate row if its present
df_clean.drop_duplicates(inplace=True)

# handle age and height
df.clean=df_clean[(df_clean['age'] >= 18) &(df_clean['age']<=100)]
df.clean=df_clean[(df_clean['height'] >=48) & (df_clean['height']<=84)]



#checking missing data and clean that up
missing_summary=df.clean.isnull().sum().sort_values(ascending=False)
missing_summary.head(10)

<class 'pandas.core.frame.DataFrame'>
Index: 56346 entries, 0 to 59945
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   age                     56346 non-null  int64         
 1   status                  56346 non-null  object        
 2   gender                  56346 non-null  object        
 3   body_type               51240 non-null  object        
 4   diet                    33405 non-null  object        
 5   drinks                  53515 non-null  object        
 6   education               50100 non-null  object        
 7   ethnicity               50946 non-null  object        
 8   height                  56346 non-null  float64       
 9   income                  10176 non-null  float64       
 10  job                     48499 non-null  object        
 11  last_online             56346 non-null  datetime64[ns]
 12  location                56346 non-null  object        
 13  pets                    37471 non-null  object        
 14  religion                36888 non-null  object        
 15  sign                    46017 non-null  object        
 16  speaks                  56300 non-null  object        
 17  days_since_last_online  56346 non-null  int64         
 18  age_group               56346 non-null  category      
 19  age_scaled              56346 non-null  float64       
 20  height_scaled           56346 non-null  float64       
 21  income_scaled           56346 non-null  float64       
dtypes: category(1), datetime64[ns](1), float64(5), int64(2), object(13)
memory usage: 9.5+ MB

def detect_outliers_iqr(series):
    Q1=series.quantile(0.25)
    Q3=series.quantile(0.75)
    IQR=Q3-Q1
    lower_bound=Q1-1.5*IQR
    upper_bound=Q3+1.5*IQR
    
    outliers=series[(series <lower_bound) | (series > upper_bound)]

    return len(outliers),lower_bound,upper_bound

#detect outliers for age,height and income

outliers_age=detect_outliers_iqr(df_clean['age'])
outliers_height=detect_outliers_iqr(df_clean['height'])
outliers_income=detect_outliers_iqr(df_clean['income'])


outliers_summary={
    "age":{
        "outliers":outliers_age[0],
        "lower_bound":outliers_age[1],
        "upper_bound":outliers_age[2]
    },
    "height":{
        "outliers":outliers_height[0],
        "lower_bound":outliers_height[1],
        "upper_bound":outliers_height[2]
    },
    "income":outliers_income[0],
        "lower_bound":outliers_income[1],
        "upper_bound":outliers_income[2]
}

outliers_summary

{'age': {'outliers': 2638, 'lower_bound': 9.5, 'upper_bound': 53.5},
 'height': {'outliers': 285, 'lower_bound': 58.5, 'upper_bound': 78.5},
 'income': 718,
 'lower_bound': -100000.0,
 'upper_bound': 220000.0}

#remove age outliers
df_clean=df_clean[(df_clean['age']>9.5)&(df_clean['age']<=53.5)]

#remove height outliers
df_clean=df_clean[(df_clean['height']>58.5) & (df_clean['height'] <=78.5)]

#remove income outliers
df_clean=df_clean[(df_clean['income'].isna()) | (df_clean['income'] <=220000)]

#new shape after outliers removal
df_clean.shape

(56346, 17)

from datetime import datetime
# !pip install scikit-learn
import sklearn
from sklearn.preprocessing import StandardScaler

#days since last online
today=pd.Timestamp(datetime.today().date())
df_clean['days_since_last_online']=(today-df_clean['last_online']).dt.days

#age group
age_bins=[18,25,35,45,53.5]
age_labels=['18-25','26-35','36-45','46-53']
df_clean['age_group']=pd.cut(df_clean['age'],bins=age_bins,labels=age_labels,right=True,include_lowest=True)

#standardize numerical feature:AGE, HEIGHT, INCOME
scaler=StandardScaler()
df_clean[['age_scaled','height_scaled']]=scaler.fit_transform(df_clean[['age','height']])

#for income, fill NaN with median temporarly just for scaling
income_median=df_clean['income'].median()
df_clean['income_scaled']=scaler.fit_transform(df_clean[['income']].fillna(income_median))

#show new feature columns
df_clean[['age','age_group','days_since_last_online','age_scaled','height_scaled','income_scaled']].head()

#visualization
sns.set(style="whitegrid")

#distribution of users by gender
plt.figure(figsize=(4,6))
sns.countplot(data=df_clean, x='gender',palette='muted')
plt.title("User distribution by Gender")
plt.xlabel("Gender")
plt.ylabel("Count")
plt.tight_layout()
plt.show()

/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_32497/4264358472.py:6: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(data=df_clean, x='gender',palette='muted')

# Distribution by age group
plt.figure(figsize=(6,4))
sns.countplot(data=df_clean, x='age_group',order=['18-25','26-35','36-45','46-53'],palette='')
plt.title("User distribution by Age Group")
plt.xlabel("Age Group")
plt.ylabel("Count")
plt.tight_layout()
plt.show()

/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/3996538591.py:3: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(data=df_clean, x='age_group',order=['18-25','26-35','36-45','46-53'],palette='muted')

# top 10 location by user count
top_locations=df_clean['location'].value_counts().head(10)
plt.figure(figsize=(8,5))
sns.barplot(x=top_locations.values,y=top_locations.index,palette='muted')
plt.title("Top 10 Locations by User Count")
plt.xlabel("Number of user")
plt.ylabel("Locations")
plt.tight_layout()
plt.show()

/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/615294961.py:4: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_locations.values,y=top_locations.index,palette='muted')

# lifestyle analysis: Diet vs Drinks
plt.figure(figsize=(10,6))
sns.countplot(data=df_clean, x='diet',hue='drinks',order=df_clean['diet'].value_counts().index,palette='Set2')
plt.title("Diet vs Drinking Habits")
plt.xlabel("Count")
plt.ylabel("Diet Type")
plt.legend(title='Drinks')
plt.tight_layout()
plt.xticks(rotation=90)
plt.show()

#correlation heatmap for numerical columns
plt.figure(figsize=(8,6))
numeric_cols=['age','height','income','days_since_last_online']
corr=df_clean[numeric_cols].corr()
sns.heatmap(corr,annot=True,cmap='coolwarm',fmt='.2f')
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.show()

#Scatterplot: age vs Income
plt.figure(figsize=(7,5))

sns.scatterplot(data=df_clean,x='age',y='income',alpha=0.5,marker='x',color='teal')
plt.title("Age vs Income")
plt.xlabel("Age")
plt.ylabel("Income")
plt.tight_layout()
plt.show()

#part 1:Data Cleaning
# Inspecting Missing Data

missing_counts=df_clean.isnull().sum()
missing_percentage=(missing_counts/len(df_clean))*100

#combine into a dataframe for better readibility
missing_df=pd.DataFrame({
    "Missing Count":missing_counts,
    "Missing %":missing_percentage
    
}).sort_values(by='Missing %', ascending=False)
print(missing_df)

#also identify columns where more than 50% of the data is missing
high_missing_cols=missing_df[missing_df['Missing %']>50]
high_missing_cols

                        Missing Count  Missing %
income                          46170  81.940155
diet                            22941  40.714514
religion                        19458  34.533064
pets                            18875  33.498385
sign                            10329  18.331381
job                              7847  13.926454
education                        6246  11.085081
ethnicity                        5400   9.583644
body_type                        5106   9.061868
drinks                           2831   5.024314
speaks                             46   0.081638
age                                 0   0.000000
height_scaled                       0   0.000000
age_scaled                          0   0.000000
age_group                           0   0.000000
days_since_last_online              0   0.000000
last_online                         0   0.000000
location                            0   0.000000
status                              0   0.000000
height                              0   0.000000
gender                              0   0.000000
income_scaled                       0   0.000000

missing_before=df_clean[['height','income']].isnull().sum()

#apply imputation using median by gender and age_group
def impute_group_median(df,column,group_cols):
    df[column + '_imputed']=df[column]
    for name,group in df.groupby(group_cols):
        median_val=group[column].median()
        mask=group[column].isnull()
        df.loc[mask.index[mask],column+'_imputed']=median_val
    return df

#impute height and income
df_clean=impute_group_median(df_clean,'height',['gender','age_group'])
df_clean=impute_group_median(df_clean,'income',['gender','age_group'])

#display how many missing value remain after imputation
missing_after=df_clean[['height_imputed','income_imputed']].isnull().sum()

#compare before and after

missing_comparison=pd.DataFrame({
    'Missing Before':missing_before,
    'Missing After':missing_after
})
missing_comparison

/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/1956053079.py:6: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  for name,group in df.groupby(group_cols):
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/1956053079.py:6: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  for name,group in df.groupby(group_cols):

df_clean.dtypes

age                                int64
status                            object
gender                            object
body_type                         object
diet                              object
drinks                            object
education                         object
ethnicity                         object
height                           float64
income                           float64
job                               object
last_online               datetime64[ns]
location                          object
pets                              object
religion                          object
sign                              object
speaks                            object
days_since_last_online             int64
age_group                       category
age_scaled                       float64
height_scaled                    float64
income_scaled                    float64
height_imputed                   float64
income_imputed                   float64
dtype: object

# no need of change data type
df_clean['last_active_year']=df_clean['last_online'].dt.year
df_clean['last_active_month']=df_clean['last_online'].dt.month
df_clean['weekdays_last_online']=df_clean['last_online'].dt.day_name()

q10,q90=df_clean['income'].quantile([0.10,0.90])
trimmed_income=df_clean[(df_clean['income'] >=q10) & (df_clean['income'] <=q90)]
trimmed_mean=trimmed_income['income'].mean()

q10_age, q90_age = df_clean['age'].quantile([0.10, 0.90])
trimmed_age = df_clean[(df_clean['age'] >= q10_age) & (df_clean['age'] <= q90_age)]

summary_stats = {
    "Full Income Mean": df_clean['income_imputed'].mean(),
    "Trimmed Income Mean (10th-90th pct)": trimmed_income['income_imputed'].mean(),
    "Full Age Mean": df_clean['age'].mean(),
    "Trimmed Age Mean (10th-90th pct)": trimmed_age['age'].mean()
}

summary_stats

{'Full Income Mean': 50047.38579491002,
 'Trimmed Income Mean (10th-90th pct)': 51009.795748228426,
 'Full Age Mean': 31.107886983991765,
 'Trimmed Age Mean (10th-90th pct)': 30.29911172221131}

# In the time of data cleaning process we can see the percentage of missing data for column income is more than 80%
#they reason behind may people do not want to share their data ,its kind sensitive data. like -income ,job,body_type,diet etc.

plt.figure(figsize=(14,6))
sns.heatmap(df_clean.isnull(),cbar=False,cmap='magma',yticklabels=False)
plt.title("Heatmap of Missing Value in Bumble Dataset")
plt.tight_layout()
plt.show()

#this is bar chart for missing data % column wise
missing_percent = (df_clean.isnull().sum() / len(df_clean)) * 100
missing_percent = missing_percent[missing_percent > 0].sort_values(ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x=missing_percent.values, hue=missing_percent.index, palette='flare')
plt.title("Percentage of Missing Values by Column")
plt.xlabel("% Missing")
plt.ylabel("Column")
plt.tight_layout()
plt.legend(title='column')
plt.show()

# initially we already did grouping but it is okay let's do it again 
# creating bin
age_bins = [18, 25, 35, 45, 53.5]
age_labels = ['18-25', '26-35', '36-45', '46-53']
df_clean['age_group'] = pd.cut(df_clean['age'], bins=age_bins, labels=age_labels, include_lowest=True)

# we use imputed income for 
q1, q3 = df_clean['income_imputed'].quantile([0.25, 0.75])

#classifying category wise income
q1, q3 = df_clean['income_imputed'].quantile([0.25, 0.75])

def income_category(val):
    if pd.isna(val):
        return np.nan
    elif val < q1:
        return 'Low Income'`
    elif val <= q3:
        return 'Medium Income'
    else:
        return 'High Income'

df_clean['income_group'] = df_clean['income_imputed'].apply(income_category)

#2. Derived Features
#creating new feature name -profile_completeness--- how much % user profile is completed base on different metric

user_input_columns = ['age', 'height', 'diet', 'drinks', 'education', 'ethnicity', 'body_type',
                      'job', 'pets', 'sign', 'income']  # Add/remove based on what's meaningful

df_clean['profile_completeness'] = df_clean[user_input_columns].notnull().sum(axis=1) / len(user_input_columns) * 100


completeness_bins = [0, 40, 60, 80, 90, 100]
completeness_labels = ['0-40%', '41-60%', '61-80%', '81-90%', '91-100%']
df_clean['completeness_group'] = pd.cut(df_clean['profile_completeness'], bins=completeness_bins, labels=completeness_labels)

#over all completeness by user (percentage group  )
completeness_distribution=df_clean['completeness_group'].value_counts().sort_index()
print(" Completeness Group Distribution:\n", completeness_distribution)

#let use chart for better understanding
plt.figure(figsize=(8,5))
sns.countplot(data=df_clean, x='completeness_group',order=completeness_labels,palette='Blues')
plt.title("User Profile Completeness Distribution")
plt.xlabel("Profile Completeness (%)")
plt.ylabel("User Count")
plt.tight_layout()
plt.show()

 Completeness Group Distribution:
 completeness_group
0-40%       1096
41-60%      3612
61-80%     15118
81-90%     15797
91-100%    20723
Name: count, dtype: int64

/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/4169348503.py:20: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(data=df_clean, x='completeness_group',order=completeness_labels,palette='Blues')

#average profile completeness by age group 

completeness_by_age=df_clean.groupby('age_group')['profile_completeness'].median()
print(" Completeness by Age Group :\n", completeness_by_age)
#let use chart for better understanding
plt.figure(figsize=(8,5))
sns.barplot(x=completeness_by_age.index,y=completeness_by_age.values,palette='Greens')
plt.title("User Profile Completeness by Age Group")
plt.xlabel("Age Group")
plt.ylabel("Average Completeness (%)")
# plt.ylim(85,90)
plt.tight_layout()
plt.show()
#now we can see not much differnce

 Completeness by Age Group :
 age_group
18-25    81.818182
26-35    81.818182
36-45    81.818182
46-53    81.818182
Name: profile_completeness, dtype: float64

/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/3015216240.py:3: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  completeness_by_age=df_clean.groupby('age_group')['profile_completeness'].median()
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/3015216240.py:7: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=completeness_by_age.index,y=completeness_by_age.values,palette='Greens')

#average profile completeness by gender
completeness_by_gender=df_clean.groupby('gender')['profile_completeness'].median()
print(" Completeness by Gender :\n", completeness_by_gender)

#let use chart for better understanding
plt.figure(figsize=(6,5))
sns.barplot(x=completeness_by_gender.index,y=completeness_by_gender.values,palette='Oranges')
plt.title("User Profile Completeness by Gender")
plt.xlabel("Gender")
plt.ylabel("Average Completeness (%)")
# plt.ylim(85,90)
plt.tight_layout()
plt.show()
#now we can see not much differnce

 Completeness by Gender :
 gender
f    81.818182
m    81.818182
Name: profile_completeness, dtype: float64

/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/1304463441.py:7: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=completeness_by_gender.index,y=completeness_by_gender.values,palette='Oranges')

#unit conversion 
#change height value from inch to cm
df_clean['height_cm']=df_clean['height']*2.54

#Part 3: Data Analysis
#1. Demographic Analysis

#gender distribution
gender_distribution=df_clean['gender'].value_counts(normalize=True)*100
plt.figure(figsize=(5, 4))
gender_distribution.plot(kind='pie', autopct='%1.1f%%', startangle=90, colors=['lightblue', 'lightcoral'])
plt.title("Gender Distribution")
plt.ylabel("")
plt.tight_layout()
plt.show()

#yes you can significant imbalance betweens gender , as number of male is higher

#relationship status
status_distribution=df_clean['status'].value_counts(normalize=True)*100
plt.figure(figsize=(7, 4))
sns.barplot(x=status_distribution.index, y=status_distribution.values, palette='pastel')
plt.title("Relationship Status Distribution (Overall)")
plt.ylabel("Percentage of Users")
plt.xlabel("Status")
plt.tight_layout()
plt.show()

#a huge number of single user

/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/932796976.py:4: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=status_distribution.index, y=status_distribution.values, palette='pastel')

#status by gender
status_by_gender=df_clean.groupby('gender')['status'].value_counts(normalize=True).unstack().fillna(0)*100
plt.figure(figsize=(8, 5))
status_by_gender.T.plot(kind='bar', figsize=(9, 5), colormap='viridis')
plt.title("Relationship Status Distribution by Gender")
plt.ylabel("Percentage of Users")
plt.xlabel("Status")
plt.xticks(rotation=0)
plt.legend(title="Gender")
plt.tight_layout()
plt.show()

#and so target audience should be single people

<Figure size 800x500 with 0 Axes>

#2. Correlation Analysis
#convert gender to numeric data type
df_clean['gender_numeric']=df_clean['gender'].map({'m':1,'f':0})

#correlation matrix

correlation_column=['age','income_imputed','profile_completeness','gender_numeric']
correlation_matrix=df_clean[correlation_column].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title("Correlation Matrix of Numerical Features")
plt.tight_layout()
plt.show()

# Extract correlation between age and income
age_income_corr = correlation_matrix.loc['age', 'income_imputed']
age_income_corr

#correlation between age and income is 0.63
#This is a weak positive correlation, meaning:As age increases, income tends to increase slightly — but the relationship is not very strong.

0.6284495732633397

# cleaning column diet and drink
lifestyle_df=df_clean[['diet','drinks']].dropna(how='all')

#counting number of particular diet 
diet_distribution=lifestyle_df['diet'].value_counts(normalize=True).sort_values(ascending=False) *100

#count of combination table of (drink and diet) 
diet_and_drink_table=lifestyle_df.groupby(['diet','drinks']).size().unstack(fill_value=0)


percentage_of_diet_and_drink=diet_and_drink_table.div(diet_and_drink_table.sum(axis=1), axis=0) * 100

plt.figure(figsize=(10, 5))
sns.barplot(x=diet_distribution.values, y=diet_distribution.index, palette='viridis')
plt.title("Distribution of Dietary Preferences")
plt.xlabel("Percentage of Users (%)")
plt.ylabel("Diet Type")
plt.tight_layout()
plt.show()
# perfered diet is mostly anything

/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/3661622710.py:14: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=diet_distribution.values, y=diet_distribution.index, palette='viridis')

selected_drinks = ['not at all', 'rarely', 'socially', 'often']
percentage_of_diet_and_drink[selected_drinks].plot(kind='barh', stacked=True,figsize=(12, 7),colormap='tab20c')
plt.title("Drinking Habits by Dietary Preference")
plt.xlabel("Percentage Within Diet Group (%)")
plt.ylabel("Diet Type")
plt.legend(title="Drinking Habit", bbox_to_anchor=(1.05, 1))
plt.tight_layout()
plt.show()                                                                      
                                                                        
# strictly halal and halal people are not likely to drink

df_clean['location'].dropna().head(10)
df_clean[['city', 'state']] = df_clean['location'].str.lower().str.extract(r'^(.*),\s*(.*)$')


#top 5 cities
top_5_cities=df_clean['city'].value_counts().head(5)
#top 5 state
top_5_states=df_clean['state'].value_counts().head(5)

#collect all top city
all_top_cities=df_clean[df_clean['city'].isin(top_5_cities.index)]

#collect all top state
all_top_states=df_clean[df_clean['state'].isin(top_5_states.index)]

#average age by city
avg_age_by_city=all_top_cities.groupby('city')['age'].mean().round(1).sort_values()


avg_age_df = avg_age_by_city.reset_index()
avg_age_df.columns = ['city', 'avg_age']

avg_income_by_city=all_top_cities.groupby('city')['income'].mean().round(0).sort_values(ascending=False)
avg_income_by_state=all_top_states.groupby('city')['income'].mean().round(0).sort_values(ascending=False)

plt.figure(figsize=(10,5))
sns.barplot(x=top_5_cities.values,y=top_5_cities.index, palette="Blues")
plt.title("Top 5 Cities by User Count")
plt.xlabel("Number of Users")
plt.ylabel("City")
plt.tight_layout()
plt.show()

/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/606072808.py:2: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_5_cities.values,y=top_5_cities.index, palette="Blues")

avg_age_df = avg_age_by_city.reset_index()
avg_age_df.columns = ['city', 'avg_age']
plt.figure(figsize=(10, 4))
sns.barplot(data=avg_age_df, x='avg_age', y='city', palette='magma')
plt.title("Average Age by Top Cities")
plt.xlabel("Average Age")
plt.ylabel("City")
plt.tight_layout()
plt.show()

/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/604349013.py:4: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=avg_age_df, x='avg_age', y='city', palette='magma')

avg_income_by_city=all_top_cities.groupby('city')['income'].mean().round(0).sort_values(ascending=False)

avg_income_df=avg_income_by_city.reset_index()
avg_income_df.columns=['city','avg_income']

plt.figure(figsize=(10, 6))
sns.barplot(data=avg_income_df, x='avg_income', y='city', palette='magma')
plt.title("Average Income by Top Cities")
plt.xlabel("Average Income")
plt.ylabel("City")
plt.tight_layout()
plt.show()

/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/2790284345.py:7: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=avg_income_df, x='avg_income', y='city', palette='magma')

avg_income_by_state=all_top_states.groupby('state')['income'].mean().round(0).sort_values(ascending=False)

avg_income_df=avg_income_by_state.reset_index()
avg_income_df.columns=['state','avg_income']

plt.figure(figsize=(10, 6))
sns.barplot(data=avg_income_df, x='avg_income', y='state', palette='magma')
plt.title("Average Income by Top States")
plt.xlabel("Average Income")
plt.ylabel("State")
plt.tight_layout()
plt.show()

/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/1354353335.py:7: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=avg_income_df, x='avg_income', y='state', palette='magma')

# avg height by gender
avg_height_by_gender=df_clean.groupby('gender')['height'].mean().round(1).sort_values()

plt.figure(figsize=(8,4))
sns.barplot(x=avg_height_by_gender.index,y=avg_height_by_gender.values, palette='muted')
plt.title("Average Height by Gender")
plt.xlabel("Gender")
plt.ylabel("Average Height (inches)")
plt.tight_layout()
plt.show()

/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/2515973691.py:5: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=avg_height_by_gender.index,y=avg_height_by_gender.values, palette='muted')

avg_height_by_age_group = df_clean.groupby('age_group')['height'].mean().round(1).sort_values(ascending=False)

avg_height_age=avg_height_by_age_group.reset_index()
avg_height_age.columns=['age_group','avg_height']


plt.figure(figsize=(6, 6))
sns.barplot(data=avg_height_age, x='age_group', y='avg_height', palette='Blues')
plt.title("Average Income by Top States")
plt.xlabel("Average Income")
plt.ylabel("State")
plt.tight_layout()
plt.show()


avg_height_by_age_group

/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/1742119313.py:1: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  avg_height_by_age_group = df_clean.groupby('age_group')['height'].mean().round(1).sort_values(ascending=False)
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/1742119313.py:8: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=avg_height_age, x='age_group', y='avg_height', palette='Blues')

age_group
26-35    68.4
36-45    68.3
18-25    68.2
46-53    68.2
Name: height, dtype: float64

#How does the age distribution differ by gender
plt.figure(figsize=(14, 6))
sns.boxplot(data=df_clean, x='body_type', y='height', palette='Set3')
plt.title("Height Distribution Across Body Types")
plt.xlabel("Body Type")
plt.ylabel("Height (inches)")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

height_stats_by_body_type = df_clean.groupby('body_type')['height'].mean().round(1).sort_values(ascending=False)
height_stats_by_body_type

# athletic are taller-- 69.7 and  curvy people are shorter

/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/3480227085.py:3: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=df_clean, x='body_type', y='height', palette='Set3')

body_type
athletic          69.7
jacked            69.1
a little extra    68.9
used up           68.9
overweight        68.7
fit               68.6
skinny            68.5
average           68.1
thin              67.9
rather not say    67.0
full figured      66.5
curvy             65.3
Name: height, dtype: float64

#6. Income Analysis
income_by_gender=df_clean.groupby('gender')['income'].median().round(1).sort_values(ascending=False)

avg_income_df=income_by_gender.reset_index()
avg_income_df.columns=['gender','avg_income']

plt.figure(figsize=(6, 6))
sns.barplot(data=avg_income_df, x='gender', y='avg_income', palette='Blues')
plt.title("Average Income by Gender")
plt.ylabel("Average Income")
plt.xlabel("Gender")
plt.tight_layout()
plt.show()

/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/1120282396.py:8: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=avg_income_df, x='gender', y='avg_income', palette='Blues')

avg_height_by_age_group = df_clean.groupby('age_group')['income'].mean().round(1).sort_values(ascending=False)

avg_income_df=avg_height_by_age_group.reset_index()
avg_income_df.columns=['age_group','avg_income']


plt.figure(figsize=(6, 6))
sns.barplot(data=avg_income_df, x='age_group', y='avg_income', palette='Blues')
plt.title("Average Income by Age Group")
plt.xlabel("Age Group")
plt.ylabel("Average Height")
plt.tight_layout()
plt.show()


#YES! older users more likely to report higher incomes.

/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/1242257768.py:1: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  avg_height_by_age_group = df_clean.groupby('age_group')['income'].mean().round(1).sort_values(ascending=False)
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/1242257768.py:8: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=avg_income_df, x='age_group', y='avg_income', palette='Blues')

#part 4 Part 4: Data Visualization


plt.figure(figsize=(6, 6))
sns.countplot(data=df_clean, x='age_group', palette='Blues')
plt.title("User distribution by Age Group")
plt.xlabel("Age Group")
plt.ylabel("User")
plt.tight_layout()
plt.show()

#age of between 26 to 35 user are dominating

/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/1577851910.py:5: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(data=df_clean, x='age_group', palette='Blues')

sns.countplot(data=df_clean, x='age_group', hue='gender', palette='muted')
plt.title("User Distribution by Age Group and Gender")
plt.xlabel("Age Group")
plt.ylabel("Number of Users")
plt.legend(title="Gender")
plt.tight_layout()
plt.show()

#2. Income and Age
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df_clean, x='age', y='income', alpha=0.4, color='teal')
sns.regplot(data=df_clean, x='age', y='income', scatter=False, color='red', label='Trend Line')
plt.title("Relationship Between Age and Income")
plt.xlabel("Age")
plt.ylabel("Income")
plt.legend()
plt.tight_layout()
plt.show()

plt.figure(figsize=(14, 6))
sns.boxplot(data=df_clean, x='age_group', y='income', palette='Set3')
plt.title("Income by group")
plt.xlabel("Age Group")
plt.ylabel("Income")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

median_income_by_age_group= df_clean.groupby('age_group')['income'].median().round(1).sort_values(ascending=False)
median_income_by_age_group

/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/1119495270.py:2: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=df_clean, x='age_group', y='income', palette='Set3')

/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/1119495270.py:10: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  median_income_by_age_group= df_clean.groupby('age_group')['income'].median().round(1).sort_values(ascending=False)

age_group
36-45    70000.0
46-53    70000.0
26-35    50000.0
18-25    20000.0
Name: income, dtype: float64

income_by_gender_status = df_clean.groupby(['gender', 'status'])['income'].median().unstack().round(0)

income_by_gender_status

income_plot_df = df_clean.groupby(['gender', 'status'])['income'].median().reset_index()
income_plot_df.columns = ['Gender', 'Status', 'Median Income']

# Create barplot
plt.figure(figsize=(12, 6))
sns.barplot(data=income_plot_df, x='Status', y='Median Income', hue='Gender', palette='muted')
plt.title("Median Income by Gender and Relationship Status")
plt.xlabel("Relationship Status")
plt.ylabel("Median Income")
plt.legend(title="Gender")
plt.tight_layout()
plt.show()

pet_df=df_clean['pets'].value_counts()

plt.figure(figsize=(8,4))
sns.countplot(data=df_clean, x='pets',order=pet_df.index, palette='Set2')
plt.title("Distribution of Pets Preferences")
plt.xlabel("Pets Preference")
plt.ylabel("Number of Users")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

#likes dogs and likes cat is more prefered

/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/1505912911.py:4: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(data=df_clean, x='pets',order=pet_df.index, palette='Set2')

# Grouped bar chart: Pets preference by gender and age group
plt.figure(figsize=(14, 6))
sns.countplot(data=df_clean, x='pets', hue='gender', order=pet_df.index, palette='Set2')
plt.title("Pets Preferences by Gender")
plt.xlabel("Pets Preference")
plt.ylabel("Number of Users")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

most_liked_pet=df_clean['pets'].value_counts().head(5).index

df_top_pets=df_clean[df_clean['pets'].isin(most_liked_pet)]
top_pet_by_age_group=df_top_pets.groupby(['age_group', 'pets']).size().reset_index(name='count')

top_pet_by_age_group['percentage'] = top_pet_by_age_group.groupby('age_group')['count'].transform(lambda x: (x / x.sum()) * 100)

plt.figure(figsize=(14, 7))
sns.barplot(data=top_pet_by_age_group, x='age_group', y='percentage', hue='pets', palette='Set2')
plt.title("Pet preference by Age Group")
plt.xlabel("Age Group")
plt.ylabel("Percentage")
plt.legend(title="Pets")
plt.tight_layout()
plt.show()

/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/4199129173.py:4: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  top_pet_by_age_group=df_top_pets.groupby(['age_group', 'pets']).size().reset_index(name='count')
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/4199129173.py:6: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  top_pet_by_age_group['percentage'] = top_pet_by_age_group.groupby('age_group')['count'].transform(lambda x: (x / x.sum()) * 100)

df_clean['sign'] = df_clean['sign'].str.extract(r'([a-zA-Z]+)')  # extract only the zodiac sign word
df_clean['sign'] = df_clean['sign'].str.lower()

sign_distribution=df_clean['sign'].value_counts()

plt.figure(figsize=(8, 8))
plt.pie(sign_distribution, labels=sign_distribution.index, autopct='%1.1f%%', startangle=140, colors=sns.color_palette('pastel'))
plt.title("Zodiac Sign Distribution")
plt.axis('equal')
plt.tight_layout()
plt.show()

plt.figure(figsize=(8, 6))

sns.barplot(x=sign_distribution.index, y=sign_distribution.values, palette='Set3')
plt.title("Zodiac Sign Distribution")
plt.axis('equal')
plt.tight_layout()
plt.xticks(rotation=90)
plt.show()

/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/2636164624.py:3: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=sign_distribution.index, y=sign_distribution.values, palette='Set3')

	age	age_group	days_since_last_online	age_scaled	height_scaled	income_scaled
0	22	18-25	4655	-1.202994	1.765774	-0.076767
1	35	26-35	4654	0.514081	0.446840	1.840292
2	38	36-45	4656	0.910329	-0.080733	-0.076767
3	23	18-25	4655	-1.070911	0.710627	-1.993827
4	29	26-35	4656	-0.278415	-0.608307	-0.076767

	Missing Before	Missing After
height	0.0	NaN
height_imputed	NaN	0.0
income	46170.0	NaN
income_imputed	NaN	0.0

status	available	married	seeing someone	single	unknown
gender
f	20000.0	30000.0	20000.0	40000.0	20000.0
m	50000.0	60000.0	40000.0	50000.0	NaN