In [3]:
%pip install pandas
import matplotlib.pyplot as plt
import seaborn as sns
Requirement already satisfied: pandas in /opt/anaconda3/lib/python3.12/site-packages (2.2.2) Requirement already satisfied: numpy>=1.26.0 in /opt/anaconda3/lib/python3.12/site-packages (from pandas) (1.26.4) Requirement already satisfied: python-dateutil>=2.8.2 in /opt/anaconda3/lib/python3.12/site-packages (from pandas) (2.9.0.post0) Requirement already satisfied: pytz>=2020.1 in /opt/anaconda3/lib/python3.12/site-packages (from pandas) (2024.1) Requirement already satisfied: tzdata>=2022.7 in /opt/anaconda3/lib/python3.12/site-packages (from pandas) (2023.3) Requirement already satisfied: six>=1.5 in /opt/anaconda3/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0) Note: you may need to restart the kernel to use updated packages.
In [4]:
import pandas as pd
In [5]:
file_path="./bumble.csv"
In [6]:
df=pd.read_csv(file_path)
In [7]:
df_info=df.info()
df_head=df.head()
df_info,df_head
<class 'pandas.core.frame.DataFrame'> RangeIndex: 59946 entries, 0 to 59945 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 59946 non-null int64 1 status 59946 non-null object 2 gender 59946 non-null object 3 body_type 54650 non-null object 4 diet 35551 non-null object 5 drinks 56961 non-null object 6 education 53318 non-null object 7 ethnicity 54266 non-null object 8 height 59943 non-null float64 9 income 59946 non-null int64 10 job 51748 non-null object 11 last_online 59946 non-null object 12 location 59946 non-null object 13 pets 40025 non-null object 14 religion 39720 non-null object 15 sign 48890 non-null object 16 speaks 59896 non-null object dtypes: float64(1), int64(2), object(14) memory usage: 7.8+ MB
Out[7]:
(None,
age status gender body_type diet drinks \
0 22 single m a little extra strictly anything socially
1 35 single m average mostly other often
2 38 available m thin anything socially
3 23 single m thin vegetarian socially
4 29 single m athletic NaN socially
education ethnicity height income \
0 working on college/university asian, white 75.0 -1
1 working on space camp white 70.0 80000
2 graduated from masters program NaN 68.0 -1
3 working on college/university white 71.0 20000
4 graduated from college/university asian, black, other 66.0 -1
job last_online \
0 transportation 2012-06-28-20-30
1 hospitality / travel 2012-06-29-21-41
2 NaN 2012-06-27-09-10
3 student 2012-06-28-14-22
4 artistic / musical / writer 2012-06-27-21-26
location pets \
0 south san francisco, california likes dogs and likes cats
1 oakland, california likes dogs and likes cats
2 san francisco, california has cats
3 berkeley, california likes cats
4 san francisco, california likes dogs and likes cats
religion \
0 agnosticism and very serious about it
1 agnosticism but not too serious about it
2 NaN
3 NaN
4 NaN
sign \
0 gemini
1 cancer
2 pisces but it doesn’t matter
3 pisces
4 aquarius
speaks
0 english
1 english (fluently), spanish (poorly), french (...
2 english, french, c++
3 english, german (poorly)
4 english )
In [8]:
!pip install numpy
Requirement already satisfied: numpy in /opt/anaconda3/lib/python3.12/site-packages (1.26.4)
In [9]:
import numpy as np
In [50]:
#for prevent loosing data start copying original data
df_clean=df.copy()
In [48]:
missing_counts = df_clean.isnull().sum()
missing_percentage = (missing_counts / len(df_clean)) * 100
# Combine into a DataFrame for better readability
missing_df = pd.DataFrame({
"Missing Count": missing_counts,
"Missing %": missing_percentage.round(2)
}).sort_values(by="Missing %", ascending=False)
missing_percentage
Out[48]:
age 0.000000 status 0.000000 gender 0.000000 body_type 9.061868 diet 40.714514 drinks 5.024314 education 11.085081 ethnicity 9.583644 height 0.000000 income 81.940155 job 13.926454 last_online 0.000000 location 0.000000 pets 33.498385 religion 34.533064 sign 18.331381 speaks 0.081638 days_since_last_online 0.000000 age_group 0.000000 age_scaled 0.000000 height_scaled 0.000000 income_scaled 0.000000 dtype: float64
In [35]:
df_clean['last_online']=pd.to_datetime(df_clean['last_online'],format="%Y-%m-%d-%H-%M",errors='coerce')
#replace -1 with NaN in income
df_clean['income']=df_clean['income'].replace(-1,np.nan)
#drop duplicate row if its present
df_clean.drop_duplicates(inplace=True)
# handle age and height
df.clean=df_clean[(df_clean['age'] >= 18) &(df_clean['age']<=100)]
df.clean=df_clean[(df_clean['height'] >=48) & (df_clean['height']<=84)]
#checking missing data and clean that up
missing_summary=df.clean.isnull().sum().sort_values(ascending=False)
missing_summary.head(10)
<class 'pandas.core.frame.DataFrame'> Index: 56346 entries, 0 to 59945 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 56346 non-null int64 1 status 56346 non-null object 2 gender 56346 non-null object 3 body_type 51240 non-null object 4 diet 33405 non-null object 5 drinks 53515 non-null object 6 education 50100 non-null object 7 ethnicity 50946 non-null object 8 height 56346 non-null float64 9 income 10176 non-null float64 10 job 48499 non-null object 11 last_online 56346 non-null datetime64[ns] 12 location 56346 non-null object 13 pets 37471 non-null object 14 religion 36888 non-null object 15 sign 46017 non-null object 16 speaks 56300 non-null object 17 days_since_last_online 56346 non-null int64 18 age_group 56346 non-null category 19 age_scaled 56346 non-null float64 20 height_scaled 56346 non-null float64 21 income_scaled 56346 non-null float64 dtypes: category(1), datetime64[ns](1), float64(5), int64(2), object(13) memory usage: 9.5+ MB
In [15]:
def detect_outliers_iqr(series):
Q1=series.quantile(0.25)
Q3=series.quantile(0.75)
IQR=Q3-Q1
lower_bound=Q1-1.5*IQR
upper_bound=Q3+1.5*IQR
outliers=series[(series <lower_bound) | (series > upper_bound)]
return len(outliers),lower_bound,upper_bound
#detect outliers for age,height and income
outliers_age=detect_outliers_iqr(df_clean['age'])
outliers_height=detect_outliers_iqr(df_clean['height'])
outliers_income=detect_outliers_iqr(df_clean['income'])
outliers_summary={
"age":{
"outliers":outliers_age[0],
"lower_bound":outliers_age[1],
"upper_bound":outliers_age[2]
},
"height":{
"outliers":outliers_height[0],
"lower_bound":outliers_height[1],
"upper_bound":outliers_height[2]
},
"income":outliers_income[0],
"lower_bound":outliers_income[1],
"upper_bound":outliers_income[2]
}
outliers_summary
Out[15]:
{'age': {'outliers': 2638, 'lower_bound': 9.5, 'upper_bound': 53.5},
'height': {'outliers': 285, 'lower_bound': 58.5, 'upper_bound': 78.5},
'income': 718,
'lower_bound': -100000.0,
'upper_bound': 220000.0}
In [23]:
#remove age outliers
df_clean=df_clean[(df_clean['age']>9.5)&(df_clean['age']<=53.5)]
#remove height outliers
df_clean=df_clean[(df_clean['height']>58.5) & (df_clean['height'] <=78.5)]
#remove income outliers
df_clean=df_clean[(df_clean['income'].isna()) | (df_clean['income'] <=220000)]
#new shape after outliers removal
df_clean.shape
Out[23]:
(56346, 17)
In [25]:
from datetime import datetime
# !pip install scikit-learn
import sklearn
from sklearn.preprocessing import StandardScaler
In [27]:
#days since last online
today=pd.Timestamp(datetime.today().date())
df_clean['days_since_last_online']=(today-df_clean['last_online']).dt.days
#age group
age_bins=[18,25,35,45,53.5]
age_labels=['18-25','26-35','36-45','46-53']
df_clean['age_group']=pd.cut(df_clean['age'],bins=age_bins,labels=age_labels,right=True,include_lowest=True)
#standardize numerical feature:AGE, HEIGHT, INCOME
scaler=StandardScaler()
df_clean[['age_scaled','height_scaled']]=scaler.fit_transform(df_clean[['age','height']])
#for income, fill NaN with median temporarly just for scaling
income_median=df_clean['income'].median()
df_clean['income_scaled']=scaler.fit_transform(df_clean[['income']].fillna(income_median))
#show new feature columns
df_clean[['age','age_group','days_since_last_online','age_scaled','height_scaled','income_scaled']].head()
Out[27]:
| age | age_group | days_since_last_online | age_scaled | height_scaled | income_scaled | |
|---|---|---|---|---|---|---|
| 0 | 22 | 18-25 | 4655 | -1.202994 | 1.765774 | -0.076767 |
| 1 | 35 | 26-35 | 4654 | 0.514081 | 0.446840 | 1.840292 |
| 2 | 38 | 36-45 | 4656 | 0.910329 | -0.080733 | -0.076767 |
| 3 | 23 | 18-25 | 4655 | -1.070911 | 0.710627 | -1.993827 |
| 4 | 29 | 26-35 | 4656 | -0.278415 | -0.608307 | -0.076767 |
In [29]:
#visualization
sns.set(style="whitegrid")
#distribution of users by gender
plt.figure(figsize=(4,6))
sns.countplot(data=df_clean, x='gender',palette='muted')
plt.title("User distribution by Gender")
plt.xlabel("Gender")
plt.ylabel("Count")
plt.tight_layout()
plt.show()
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_32497/4264358472.py:6: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(data=df_clean, x='gender',palette='muted')
In [120]:
# Distribution by age group
plt.figure(figsize=(6,4))
sns.countplot(data=df_clean, x='age_group',order=['18-25','26-35','36-45','46-53'],palette='')
plt.title("User distribution by Age Group")
plt.xlabel("Age Group")
plt.ylabel("Count")
plt.tight_layout()
plt.show()
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/3996538591.py:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(data=df_clean, x='age_group',order=['18-25','26-35','36-45','46-53'],palette='muted')
In [130]:
# top 10 location by user count
top_locations=df_clean['location'].value_counts().head(10)
plt.figure(figsize=(8,5))
sns.barplot(x=top_locations.values,y=top_locations.index,palette='muted')
plt.title("Top 10 Locations by User Count")
plt.xlabel("Number of user")
plt.ylabel("Locations")
plt.tight_layout()
plt.show()
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/615294961.py:4: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=top_locations.values,y=top_locations.index,palette='muted')
In [148]:
# lifestyle analysis: Diet vs Drinks
plt.figure(figsize=(10,6))
sns.countplot(data=df_clean, x='diet',hue='drinks',order=df_clean['diet'].value_counts().index,palette='Set2')
plt.title("Diet vs Drinking Habits")
plt.xlabel("Count")
plt.ylabel("Diet Type")
plt.legend(title='Drinks')
plt.tight_layout()
plt.xticks(rotation=90)
plt.show()
In [152]:
#correlation heatmap for numerical columns
plt.figure(figsize=(8,6))
numeric_cols=['age','height','income','days_since_last_online']
corr=df_clean[numeric_cols].corr()
sns.heatmap(corr,annot=True,cmap='coolwarm',fmt='.2f')
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.show()
In [158]:
#Scatterplot: age vs Income
plt.figure(figsize=(7,5))
sns.scatterplot(data=df_clean,x='age',y='income',alpha=0.5,marker='x',color='teal')
plt.title("Age vs Income")
plt.xlabel("Age")
plt.ylabel("Income")
plt.tight_layout()
plt.show()
In [174]:
#part 1:Data Cleaning
# Inspecting Missing Data
missing_counts=df_clean.isnull().sum()
missing_percentage=(missing_counts/len(df_clean))*100
#combine into a dataframe for better readibility
missing_df=pd.DataFrame({
"Missing Count":missing_counts,
"Missing %":missing_percentage
}).sort_values(by='Missing %', ascending=False)
print(missing_df)
#also identify columns where more than 50% of the data is missing
high_missing_cols=missing_df[missing_df['Missing %']>50]
high_missing_cols
Missing Count Missing % income 46170 81.940155 diet 22941 40.714514 religion 19458 34.533064 pets 18875 33.498385 sign 10329 18.331381 job 7847 13.926454 education 6246 11.085081 ethnicity 5400 9.583644 body_type 5106 9.061868 drinks 2831 5.024314 speaks 46 0.081638 age 0 0.000000 height_scaled 0 0.000000 age_scaled 0 0.000000 age_group 0 0.000000 days_since_last_online 0 0.000000 last_online 0 0.000000 location 0 0.000000 status 0 0.000000 height 0 0.000000 gender 0 0.000000 income_scaled 0 0.000000
Out[174]:
| Missing Count | Missing % | |
|---|---|---|
| income | 46170 | 81.940155 |
In [184]:
missing_before=df_clean[['height','income']].isnull().sum()
#apply imputation using median by gender and age_group
def impute_group_median(df,column,group_cols):
df[column + '_imputed']=df[column]
for name,group in df.groupby(group_cols):
median_val=group[column].median()
mask=group[column].isnull()
df.loc[mask.index[mask],column+'_imputed']=median_val
return df
#impute height and income
df_clean=impute_group_median(df_clean,'height',['gender','age_group'])
df_clean=impute_group_median(df_clean,'income',['gender','age_group'])
#display how many missing value remain after imputation
missing_after=df_clean[['height_imputed','income_imputed']].isnull().sum()
#compare before and after
missing_comparison=pd.DataFrame({
'Missing Before':missing_before,
'Missing After':missing_after
})
missing_comparison
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/1956053079.py:6: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. for name,group in df.groupby(group_cols): /var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/1956053079.py:6: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. for name,group in df.groupby(group_cols):
Out[184]:
| Missing Before | Missing After | |
|---|---|---|
| height | 0.0 | NaN |
| height_imputed | NaN | 0.0 |
| income | 46170.0 | NaN |
| income_imputed | NaN | 0.0 |
In [186]:
df_clean.dtypes
Out[186]:
age int64 status object gender object body_type object diet object drinks object education object ethnicity object height float64 income float64 job object last_online datetime64[ns] location object pets object religion object sign object speaks object days_since_last_online int64 age_group category age_scaled float64 height_scaled float64 income_scaled float64 height_imputed float64 income_imputed float64 dtype: object
In [188]:
# no need of change data type
df_clean['last_active_year']=df_clean['last_online'].dt.year
df_clean['last_active_month']=df_clean['last_online'].dt.month
df_clean['weekdays_last_online']=df_clean['last_online'].dt.day_name()
In [192]:
q10,q90=df_clean['income'].quantile([0.10,0.90])
trimmed_income=df_clean[(df_clean['income'] >=q10) & (df_clean['income'] <=q90)]
trimmed_mean=trimmed_income['income'].mean()
q10_age, q90_age = df_clean['age'].quantile([0.10, 0.90])
trimmed_age = df_clean[(df_clean['age'] >= q10_age) & (df_clean['age'] <= q90_age)]
summary_stats = {
"Full Income Mean": df_clean['income_imputed'].mean(),
"Trimmed Income Mean (10th-90th pct)": trimmed_income['income_imputed'].mean(),
"Full Age Mean": df_clean['age'].mean(),
"Trimmed Age Mean (10th-90th pct)": trimmed_age['age'].mean()
}
summary_stats
Out[192]:
{'Full Income Mean': 50047.38579491002,
'Trimmed Income Mean (10th-90th pct)': 51009.795748228426,
'Full Age Mean': 31.107886983991765,
'Trimmed Age Mean (10th-90th pct)': 30.29911172221131}
In [212]:
# In the time of data cleaning process we can see the percentage of missing data for column income is more than 80%
#they reason behind may people do not want to share their data ,its kind sensitive data. like -income ,job,body_type,diet etc.
plt.figure(figsize=(14,6))
sns.heatmap(df_clean.isnull(),cbar=False,cmap='magma',yticklabels=False)
plt.title("Heatmap of Missing Value in Bumble Dataset")
plt.tight_layout()
plt.show()
In [214]:
#this is bar chart for missing data % column wise
missing_percent = (df_clean.isnull().sum() / len(df_clean)) * 100
missing_percent = missing_percent[missing_percent > 0].sort_values(ascending=False)
plt.figure(figsize=(10, 6))
sns.barplot(x=missing_percent.values, hue=missing_percent.index, palette='flare')
plt.title("Percentage of Missing Values by Column")
plt.xlabel("% Missing")
plt.ylabel("Column")
plt.tight_layout()
plt.legend(title='column')
plt.show()
In [ ]:
# initially we already did grouping but it is okay let's do it again
# creating bin
age_bins = [18, 25, 35, 45, 53.5]
age_labels = ['18-25', '26-35', '36-45', '46-53']
df_clean['age_group'] = pd.cut(df_clean['age'], bins=age_bins, labels=age_labels, include_lowest=True)
# we use imputed income for
q1, q3 = df_clean['income_imputed'].quantile([0.25, 0.75])
#classifying category wise income
q1, q3 = df_clean['income_imputed'].quantile([0.25, 0.75])
def income_category(val):
if pd.isna(val):
return np.nan
elif val < q1:
return 'Low Income'`
elif val <= q3:
return 'Medium Income'
else:
return 'High Income'
df_clean['income_group'] = df_clean['income_imputed'].apply(income_category)
In [282]:
#2. Derived Features
#creating new feature name -profile_completeness--- how much % user profile is completed base on different metric
user_input_columns = ['age', 'height', 'diet', 'drinks', 'education', 'ethnicity', 'body_type',
'job', 'pets', 'sign', 'income'] # Add/remove based on what's meaningful
df_clean['profile_completeness'] = df_clean[user_input_columns].notnull().sum(axis=1) / len(user_input_columns) * 100
completeness_bins = [0, 40, 60, 80, 90, 100]
completeness_labels = ['0-40%', '41-60%', '61-80%', '81-90%', '91-100%']
df_clean['completeness_group'] = pd.cut(df_clean['profile_completeness'], bins=completeness_bins, labels=completeness_labels)
#over all completeness by user (percentage group )
completeness_distribution=df_clean['completeness_group'].value_counts().sort_index()
print(" Completeness Group Distribution:\n", completeness_distribution)
#let use chart for better understanding
plt.figure(figsize=(8,5))
sns.countplot(data=df_clean, x='completeness_group',order=completeness_labels,palette='Blues')
plt.title("User Profile Completeness Distribution")
plt.xlabel("Profile Completeness (%)")
plt.ylabel("User Count")
plt.tight_layout()
plt.show()
Completeness Group Distribution: completeness_group 0-40% 1096 41-60% 3612 61-80% 15118 81-90% 15797 91-100% 20723 Name: count, dtype: int64
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/4169348503.py:20: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(data=df_clean, x='completeness_group',order=completeness_labels,palette='Blues')
In [296]:
#average profile completeness by age group
completeness_by_age=df_clean.groupby('age_group')['profile_completeness'].median()
print(" Completeness by Age Group :\n", completeness_by_age)
#let use chart for better understanding
plt.figure(figsize=(8,5))
sns.barplot(x=completeness_by_age.index,y=completeness_by_age.values,palette='Greens')
plt.title("User Profile Completeness by Age Group")
plt.xlabel("Age Group")
plt.ylabel("Average Completeness (%)")
# plt.ylim(85,90)
plt.tight_layout()
plt.show()
#now we can see not much differnce
Completeness by Age Group : age_group 18-25 81.818182 26-35 81.818182 36-45 81.818182 46-53 81.818182 Name: profile_completeness, dtype: float64
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/3015216240.py:3: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
completeness_by_age=df_clean.groupby('age_group')['profile_completeness'].median()
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/3015216240.py:7: FutureWarning:
Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.
sns.barplot(x=completeness_by_age.index,y=completeness_by_age.values,palette='Greens')
In [292]:
#average profile completeness by gender
completeness_by_gender=df_clean.groupby('gender')['profile_completeness'].median()
print(" Completeness by Gender :\n", completeness_by_gender)
#let use chart for better understanding
plt.figure(figsize=(6,5))
sns.barplot(x=completeness_by_gender.index,y=completeness_by_gender.values,palette='Oranges')
plt.title("User Profile Completeness by Gender")
plt.xlabel("Gender")
plt.ylabel("Average Completeness (%)")
# plt.ylim(85,90)
plt.tight_layout()
plt.show()
#now we can see not much differnce
Completeness by Gender : gender f 81.818182 m 81.818182 Name: profile_completeness, dtype: float64
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/1304463441.py:7: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=completeness_by_gender.index,y=completeness_by_gender.values,palette='Oranges')
In [ ]:
#unit conversion
#change height value from inch to cm
df_clean['height_cm']=df_clean['height']*2.54
In [670]:
#Part 3: Data Analysis
#1. Demographic Analysis
#gender distribution
gender_distribution=df_clean['gender'].value_counts(normalize=True)*100
plt.figure(figsize=(5, 4))
gender_distribution.plot(kind='pie', autopct='%1.1f%%', startangle=90, colors=['lightblue', 'lightcoral'])
plt.title("Gender Distribution")
plt.ylabel("")
plt.tight_layout()
plt.show()
#yes you can significant imbalance betweens gender , as number of male is higher
In [328]:
#relationship status
status_distribution=df_clean['status'].value_counts(normalize=True)*100
plt.figure(figsize=(7, 4))
sns.barplot(x=status_distribution.index, y=status_distribution.values, palette='pastel')
plt.title("Relationship Status Distribution (Overall)")
plt.ylabel("Percentage of Users")
plt.xlabel("Status")
plt.tight_layout()
plt.show()
#a huge number of single user
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/932796976.py:4: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=status_distribution.index, y=status_distribution.values, palette='pastel')
In [334]:
#status by gender
status_by_gender=df_clean.groupby('gender')['status'].value_counts(normalize=True).unstack().fillna(0)*100
plt.figure(figsize=(8, 5))
status_by_gender.T.plot(kind='bar', figsize=(9, 5), colormap='viridis')
plt.title("Relationship Status Distribution by Gender")
plt.ylabel("Percentage of Users")
plt.xlabel("Status")
plt.xticks(rotation=0)
plt.legend(title="Gender")
plt.tight_layout()
plt.show()
#and so target audience should be single people
<Figure size 800x500 with 0 Axes>
In [340]:
#2. Correlation Analysis
#convert gender to numeric data type
df_clean['gender_numeric']=df_clean['gender'].map({'m':1,'f':0})
#correlation matrix
correlation_column=['age','income_imputed','profile_completeness','gender_numeric']
correlation_matrix=df_clean[correlation_column].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title("Correlation Matrix of Numerical Features")
plt.tight_layout()
plt.show()
# Extract correlation between age and income
age_income_corr = correlation_matrix.loc['age', 'income_imputed']
age_income_corr
#correlation between age and income is 0.63
#This is a weak positive correlation, meaning:As age increases, income tends to increase slightly — but the relationship is not very strong.
Out[340]:
0.6284495732633397
In [344]:
# cleaning column diet and drink
lifestyle_df=df_clean[['diet','drinks']].dropna(how='all')
#counting number of particular diet
diet_distribution=lifestyle_df['diet'].value_counts(normalize=True).sort_values(ascending=False) *100
#count of combination table of (drink and diet)
diet_and_drink_table=lifestyle_df.groupby(['diet','drinks']).size().unstack(fill_value=0)
percentage_of_diet_and_drink=diet_and_drink_table.div(diet_and_drink_table.sum(axis=1), axis=0) * 100
plt.figure(figsize=(10, 5))
sns.barplot(x=diet_distribution.values, y=diet_distribution.index, palette='viridis')
plt.title("Distribution of Dietary Preferences")
plt.xlabel("Percentage of Users (%)")
plt.ylabel("Diet Type")
plt.tight_layout()
plt.show()
# perfered diet is mostly anything
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/3661622710.py:14: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=diet_distribution.values, y=diet_distribution.index, palette='viridis')
In [365]:
selected_drinks = ['not at all', 'rarely', 'socially', 'often']
percentage_of_diet_and_drink[selected_drinks].plot(kind='barh', stacked=True,figsize=(12, 7),colormap='tab20c')
plt.title("Drinking Habits by Dietary Preference")
plt.xlabel("Percentage Within Diet Group (%)")
plt.ylabel("Diet Type")
plt.legend(title="Drinking Habit", bbox_to_anchor=(1.05, 1))
plt.tight_layout()
plt.show()
# strictly halal and halal people are not likely to drink
In [387]:
df_clean['location'].dropna().head(10)
df_clean[['city', 'state']] = df_clean['location'].str.lower().str.extract(r'^(.*),\s*(.*)$')
#top 5 cities
top_5_cities=df_clean['city'].value_counts().head(5)
#top 5 state
top_5_states=df_clean['state'].value_counts().head(5)
#collect all top city
all_top_cities=df_clean[df_clean['city'].isin(top_5_cities.index)]
#collect all top state
all_top_states=df_clean[df_clean['state'].isin(top_5_states.index)]
#average age by city
avg_age_by_city=all_top_cities.groupby('city')['age'].mean().round(1).sort_values()
avg_age_df = avg_age_by_city.reset_index()
avg_age_df.columns = ['city', 'avg_age']
avg_income_by_city=all_top_cities.groupby('city')['income'].mean().round(0).sort_values(ascending=False)
avg_income_by_state=all_top_states.groupby('city')['income'].mean().round(0).sort_values(ascending=False)
In [427]:
plt.figure(figsize=(10,5))
sns.barplot(x=top_5_cities.values,y=top_5_cities.index, palette="Blues")
plt.title("Top 5 Cities by User Count")
plt.xlabel("Number of Users")
plt.ylabel("City")
plt.tight_layout()
plt.show()
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/606072808.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=top_5_cities.values,y=top_5_cities.index, palette="Blues")
In [459]:
avg_age_df = avg_age_by_city.reset_index()
avg_age_df.columns = ['city', 'avg_age']
plt.figure(figsize=(10, 4))
sns.barplot(data=avg_age_df, x='avg_age', y='city', palette='magma')
plt.title("Average Age by Top Cities")
plt.xlabel("Average Age")
plt.ylabel("City")
plt.tight_layout()
plt.show()
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/604349013.py:4: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(data=avg_age_df, x='avg_age', y='city', palette='magma')
In [451]:
avg_income_by_city=all_top_cities.groupby('city')['income'].mean().round(0).sort_values(ascending=False)
avg_income_df=avg_income_by_city.reset_index()
avg_income_df.columns=['city','avg_income']
plt.figure(figsize=(10, 6))
sns.barplot(data=avg_income_df, x='avg_income', y='city', palette='magma')
plt.title("Average Income by Top Cities")
plt.xlabel("Average Income")
plt.ylabel("City")
plt.tight_layout()
plt.show()
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/2790284345.py:7: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(data=avg_income_df, x='avg_income', y='city', palette='magma')
In [455]:
avg_income_by_state=all_top_states.groupby('state')['income'].mean().round(0).sort_values(ascending=False)
avg_income_df=avg_income_by_state.reset_index()
avg_income_df.columns=['state','avg_income']
plt.figure(figsize=(10, 6))
sns.barplot(data=avg_income_df, x='avg_income', y='state', palette='magma')
plt.title("Average Income by Top States")
plt.xlabel("Average Income")
plt.ylabel("State")
plt.tight_layout()
plt.show()
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/1354353335.py:7: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(data=avg_income_df, x='avg_income', y='state', palette='magma')
In [533]:
# avg height by gender
avg_height_by_gender=df_clean.groupby('gender')['height'].mean().round(1).sort_values()
plt.figure(figsize=(8,4))
sns.barplot(x=avg_height_by_gender.index,y=avg_height_by_gender.values, palette='muted')
plt.title("Average Height by Gender")
plt.xlabel("Gender")
plt.ylabel("Average Height (inches)")
plt.tight_layout()
plt.show()
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/2515973691.py:5: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=avg_height_by_gender.index,y=avg_height_by_gender.values, palette='muted')
In [535]:
avg_height_by_age_group = df_clean.groupby('age_group')['height'].mean().round(1).sort_values(ascending=False)
avg_height_age=avg_height_by_age_group.reset_index()
avg_height_age.columns=['age_group','avg_height']
plt.figure(figsize=(6, 6))
sns.barplot(data=avg_height_age, x='age_group', y='avg_height', palette='Blues')
plt.title("Average Income by Top States")
plt.xlabel("Average Income")
plt.ylabel("State")
plt.tight_layout()
plt.show()
avg_height_by_age_group
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/1742119313.py:1: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
avg_height_by_age_group = df_clean.groupby('age_group')['height'].mean().round(1).sort_values(ascending=False)
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/1742119313.py:8: FutureWarning:
Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.
sns.barplot(data=avg_height_age, x='age_group', y='avg_height', palette='Blues')
Out[535]:
age_group 26-35 68.4 36-45 68.3 18-25 68.2 46-53 68.2 Name: height, dtype: float64
In [547]:
#How does the age distribution differ by gender
plt.figure(figsize=(14, 6))
sns.boxplot(data=df_clean, x='body_type', y='height', palette='Set3')
plt.title("Height Distribution Across Body Types")
plt.xlabel("Body Type")
plt.ylabel("Height (inches)")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
height_stats_by_body_type = df_clean.groupby('body_type')['height'].mean().round(1).sort_values(ascending=False)
height_stats_by_body_type
# athletic are taller-- 69.7 and curvy people are shorter
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/3480227085.py:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(data=df_clean, x='body_type', y='height', palette='Set3')
Out[547]:
body_type athletic 69.7 jacked 69.1 a little extra 68.9 used up 68.9 overweight 68.7 fit 68.6 skinny 68.5 average 68.1 thin 67.9 rather not say 67.0 full figured 66.5 curvy 65.3 Name: height, dtype: float64
In [553]:
#6. Income Analysis
income_by_gender=df_clean.groupby('gender')['income'].median().round(1).sort_values(ascending=False)
avg_income_df=income_by_gender.reset_index()
avg_income_df.columns=['gender','avg_income']
plt.figure(figsize=(6, 6))
sns.barplot(data=avg_income_df, x='gender', y='avg_income', palette='Blues')
plt.title("Average Income by Gender")
plt.ylabel("Average Income")
plt.xlabel("Gender")
plt.tight_layout()
plt.show()
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/1120282396.py:8: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(data=avg_income_df, x='gender', y='avg_income', palette='Blues')
In [557]:
avg_height_by_age_group = df_clean.groupby('age_group')['income'].mean().round(1).sort_values(ascending=False)
avg_income_df=avg_height_by_age_group.reset_index()
avg_income_df.columns=['age_group','avg_income']
plt.figure(figsize=(6, 6))
sns.barplot(data=avg_income_df, x='age_group', y='avg_income', palette='Blues')
plt.title("Average Income by Age Group")
plt.xlabel("Age Group")
plt.ylabel("Average Height")
plt.tight_layout()
plt.show()
#YES! older users more likely to report higher incomes.
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/1242257768.py:1: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
avg_height_by_age_group = df_clean.groupby('age_group')['income'].mean().round(1).sort_values(ascending=False)
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/1242257768.py:8: FutureWarning:
Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.
sns.barplot(data=avg_income_df, x='age_group', y='avg_income', palette='Blues')
In [573]:
#part 4 Part 4: Data Visualization
plt.figure(figsize=(6, 6))
sns.countplot(data=df_clean, x='age_group', palette='Blues')
plt.title("User distribution by Age Group")
plt.xlabel("Age Group")
plt.ylabel("User")
plt.tight_layout()
plt.show()
#age of between 26 to 35 user are dominating
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/1577851910.py:5: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(data=df_clean, x='age_group', palette='Blues')
In [577]:
sns.countplot(data=df_clean, x='age_group', hue='gender', palette='muted')
plt.title("User Distribution by Age Group and Gender")
plt.xlabel("Age Group")
plt.ylabel("Number of Users")
plt.legend(title="Gender")
plt.tight_layout()
plt.show()
In [596]:
#2. Income and Age
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df_clean, x='age', y='income', alpha=0.4, color='teal')
sns.regplot(data=df_clean, x='age', y='income', scatter=False, color='red', label='Trend Line')
plt.title("Relationship Between Age and Income")
plt.xlabel("Age")
plt.ylabel("Income")
plt.legend()
plt.tight_layout()
plt.show()
In [598]:
plt.figure(figsize=(14, 6))
sns.boxplot(data=df_clean, x='age_group', y='income', palette='Set3')
plt.title("Income by group")
plt.xlabel("Age Group")
plt.ylabel("Income")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
median_income_by_age_group= df_clean.groupby('age_group')['income'].median().round(1).sort_values(ascending=False)
median_income_by_age_group
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/1119495270.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(data=df_clean, x='age_group', y='income', palette='Set3')
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/1119495270.py:10: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
median_income_by_age_group= df_clean.groupby('age_group')['income'].median().round(1).sort_values(ascending=False)
Out[598]:
age_group 36-45 70000.0 46-53 70000.0 26-35 50000.0 18-25 20000.0 Name: income, dtype: float64
In [606]:
income_by_gender_status = df_clean.groupby(['gender', 'status'])['income'].median().unstack().round(0)
income_by_gender_status
Out[606]:
| status | available | married | seeing someone | single | unknown |
|---|---|---|---|---|---|
| gender | |||||
| f | 20000.0 | 30000.0 | 20000.0 | 40000.0 | 20000.0 |
| m | 50000.0 | 60000.0 | 40000.0 | 50000.0 | NaN |
In [614]:
income_plot_df = df_clean.groupby(['gender', 'status'])['income'].median().reset_index()
income_plot_df.columns = ['Gender', 'Status', 'Median Income']
# Create barplot
plt.figure(figsize=(12, 6))
sns.barplot(data=income_plot_df, x='Status', y='Median Income', hue='Gender', palette='muted')
plt.title("Median Income by Gender and Relationship Status")
plt.xlabel("Relationship Status")
plt.ylabel("Median Income")
plt.legend(title="Gender")
plt.tight_layout()
plt.show()
In [632]:
pet_df=df_clean['pets'].value_counts()
plt.figure(figsize=(8,4))
sns.countplot(data=df_clean, x='pets',order=pet_df.index, palette='Set2')
plt.title("Distribution of Pets Preferences")
plt.xlabel("Pets Preference")
plt.ylabel("Number of Users")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()
#likes dogs and likes cat is more prefered
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/1505912911.py:4: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(data=df_clean, x='pets',order=pet_df.index, palette='Set2')
In [636]:
# Grouped bar chart: Pets preference by gender and age group
plt.figure(figsize=(14, 6))
sns.countplot(data=df_clean, x='pets', hue='gender', order=pet_df.index, palette='Set2')
plt.title("Pets Preferences by Gender")
plt.xlabel("Pets Preference")
plt.ylabel("Number of Users")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()
In [666]:
most_liked_pet=df_clean['pets'].value_counts().head(5).index
df_top_pets=df_clean[df_clean['pets'].isin(most_liked_pet)]
top_pet_by_age_group=df_top_pets.groupby(['age_group', 'pets']).size().reset_index(name='count')
top_pet_by_age_group['percentage'] = top_pet_by_age_group.groupby('age_group')['count'].transform(lambda x: (x / x.sum()) * 100)
plt.figure(figsize=(14, 7))
sns.barplot(data=top_pet_by_age_group, x='age_group', y='percentage', hue='pets', palette='Set2')
plt.title("Pet preference by Age Group")
plt.xlabel("Age Group")
plt.ylabel("Percentage")
plt.legend(title="Pets")
plt.tight_layout()
plt.show()
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/4199129173.py:4: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
top_pet_by_age_group=df_top_pets.groupby(['age_group', 'pets']).size().reset_index(name='count')
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/4199129173.py:6: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
top_pet_by_age_group['percentage'] = top_pet_by_age_group.groupby('age_group')['count'].transform(lambda x: (x / x.sum()) * 100)
In [690]:
df_clean['sign'] = df_clean['sign'].str.extract(r'([a-zA-Z]+)') # extract only the zodiac sign word
df_clean['sign'] = df_clean['sign'].str.lower()
sign_distribution=df_clean['sign'].value_counts()
plt.figure(figsize=(8, 8))
plt.pie(sign_distribution, labels=sign_distribution.index, autopct='%1.1f%%', startangle=140, colors=sns.color_palette('pastel'))
plt.title("Zodiac Sign Distribution")
plt.axis('equal')
plt.tight_layout()
plt.show()
In [688]:
plt.figure(figsize=(8, 6))
sns.barplot(x=sign_distribution.index, y=sign_distribution.values, palette='Set3')
plt.title("Zodiac Sign Distribution")
plt.axis('equal')
plt.tight_layout()
plt.xticks(rotation=90)
plt.show()
/var/folders/ld/38gknt89237bvrzf52v0pqr80000gn/T/ipykernel_21241/2636164624.py:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=sign_distribution.index, y=sign_distribution.values, palette='Set3')
In [ ]: