import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

bumble_path = r"C:\Users\HP\OneDrive\Documents\NextLeap Projects\Python\bumble.csv"
df = pd.read_csv(bumble_path)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59946 entries, 0 to 59945
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          59946 non-null  int64  
 1   status       59946 non-null  object 
 2   gender       59946 non-null  object 
 3   body_type    54650 non-null  object 
 4   diet         35551 non-null  object 
 5   drinks       56961 non-null  object 
 6   education    53318 non-null  object 
 7   ethnicity    54266 non-null  object 
 8   height       59943 non-null  float64
 9   income       59946 non-null  int64  
 10  job          51748 non-null  object 
 11  last_online  59946 non-null  object 
 12  location     59946 non-null  object 
 13  pets         40025 non-null  object 
 14  religion     39720 non-null  object 
 15  sign         48890 non-null  object 
 16  speaks       59896 non-null  object 
dtypes: float64(1), int64(2), object(14)
memory usage: 7.8+ MB

df.head()

import pandas as pd  
bumble_path = r"C:\Users\HP\OneDrive\Documents\NextLeap Projects\Python\bumble.csv" 
df = pd.read_csv(bumble_path)
df.head()
missing_values = df.isnull().sum()
print("Missing Values in Each Column:")
print(missing_values)

Missing Values in Each Column:
age                0
status             0
gender             0
body_type       5296
diet           24395
drinks          2985
education       6628
ethnicity       5680
height             3
income             0
job             8198
last_online        0
location           0
pets           19921
religion       20226
sign           11056
speaks            50
dtype: int64

missing_values = df.isnull().sum()
missing_percentage = (df.isnull().sum() / len(df)) * 100
missing_data = pd.DataFrame({'Total Missing Values': missing_values, 'Percentage Missing': missing_percentage})
missing_data = missing_data.sort_values(by='Percentage Missing', ascending=False)
print("Missing Values Summary:")
print(missing_data)

Missing Values Summary:
             Total Missing Values  Percentage Missing
diet                        24395           40.694959
religion                    20226           33.740366
pets                        19921           33.231575
sign                        11056           18.443266
job                          8198           13.675641
education                    6628           11.056618
ethnicity                    5680            9.475194
body_type                    5296            8.834618
drinks                       2985            4.979482
speaks                         50            0.083408
height                          3            0.005005
last_online                     0            0.000000
location                        0            0.000000
income                          0            0.000000
status                          0            0.000000
gender                          0            0.000000
age                             0            0.000000

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59946 entries, 0 to 59945
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          59946 non-null  int64  
 1   status       59946 non-null  object 
 2   gender       59946 non-null  object 
 3   body_type    54650 non-null  object 
 4   diet         35551 non-null  object 
 5   drinks       56961 non-null  object 
 6   education    53318 non-null  object 
 7   ethnicity    54266 non-null  object 
 8   height       59943 non-null  float64
 9   income       59946 non-null  int64  
 10  job          51748 non-null  object 
 11  last_online  59946 non-null  object 
 12  location     59946 non-null  object 
 13  pets         40025 non-null  object 
 14  religion     39720 non-null  object 
 15  sign         48890 non-null  object 
 16  speaks       59896 non-null  object 
dtypes: float64(1), int64(2), object(14)
memory usage: 7.8+ MB

# Impute 'height' based on gender
calculate_median_height = df.groupby(["gender"])["height"].transform("median")
df["height"] = df["height"].fillna(calculate_median_height)

# Impute 'income' based on gender and age group
calculate_median_income = df.groupby(["gender", "age"])["income"].transform("median")
df["income"] = df["income"].fillna(calculate_median_income)

# Print the calculated median values for verification
print("Calculated Median Heights:\n", calculate_median_height)
print("\nCalculated Median Income:\n", calculate_median_income)

Calculated Median Heights:
 0        70.0
1        70.0
2        70.0
3        70.0
4        70.0
         ... 
59941    65.0
59942    70.0
59943    70.0
59944    70.0
59945    70.0
Name: height, Length: 59946, dtype: float64

Calculated Median Income:
 0       -1.0
1       -1.0
2       -1.0
3       -1.0
4       -1.0
        ... 
59941   -1.0
59942   -1.0
59943   -1.0
59944   -1.0
59945   -1.0
Name: income, Length: 59946, dtype: float64

df.dtypes

age              int64
status          object
gender          object
body_type       object
diet            object
drinks          object
education       object
ethnicity       object
height         float64
income           int64
job             object
last_online     object
location        object
pets            object
religion        object
sign            object
speaks          object
dtype: object

df["last_online"] = pd.to_datetime(df["last_online"], format="%Y-%m-%d-%H-%M", errors="coerce")
df["last_online"]

0       2012-06-28 20:30:00
1       2012-06-29 21:41:00
2       2012-06-27 09:10:00
3       2012-06-28 14:22:00
4       2012-06-27 21:26:00
                ...        
59941   2012-06-12 21:47:00
59942   2012-06-29 11:01:00
59943   2012-06-27 23:37:00
59944   2012-06-23 13:01:00
59945   2012-06-29 00:42:00
Name: last_online, Length: 59946, dtype: datetime64[ns]

df.describe()

age_range = (df['age'].min(), df['age'].max())
height_range = (df['height'].min(), df['height'].max())
income_range = (df['income'].min(), df['income'].max())

print(f"The age range is: {age_range}")
print(f"The height range is: {height_range}")
print(f"The income range is: {income_range}")

The age range is: (18, 110)
The height range is: (1.0, 95.0)
The income range is: (-1, 1000000)

import matplotlib.pyplot as plt
import seaborn as sns

def plot_outliers(data, column):
    plt.figure(figsize=(8,5))
    sns.boxplot(y=data[column], palette='coolwarm')
    plt.title(f'Boxplot of {column}')
    plt.show()

plot_outliers(df, 'age')
plot_outliers(df, 'height')
plot_outliers(df, 'income')

C:\Program Files\KMSpico\temp\ipykernel_10508\4067286754.py:6: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(y=data[column], palette='coolwarm')

C:\Program Files\KMSpico\temp\ipykernel_10508\4067286754.py:6: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(y=data[column], palette='coolwarm')

C:\Program Files\KMSpico\temp\ipykernel_10508\4067286754.py:6: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(y=data[column], palette='coolwarm')

(df['income'] == -1).value_counts() 
df.loc[df['income'] == -1, 'income'] = 0 
(df['income'] == -1).value_counts()

income
False    59946
Name: count, dtype: int64

lower_bound_income = df['income'].quantile(0.10)
upper_bound_income = df['income'].quantile(0.90)

lower_outliers = df.loc[df['income'] < lower_bound_income, 'income']
upper_outliers = df.loc[df['income'] > upper_bound_income, 'income']

middle_80_income = df.loc[(df['income'] >= lower_bound_income) & (df['income'] <= upper_bound_income), 'income']

print(f"Lower bound income: {lower_bound_income}, Upper bound income: {upper_bound_income}")
print(f"Lower bound outliers range: {lower_outliers.min()} - {lower_outliers.max()}")
print(f"Upper bound outliers range: {upper_outliers.min()} - {upper_outliers.max()}")
print(f"Middle 80% mean: {middle_80_income.mean()}, Median: {middle_80_income.median()}")

Lower bound income: 0.0, Upper bound income: 50000.0
Lower bound outliers range: nan - nan
Upper bound outliers range: 60000 - 1000000
Middle 80% mean: 3297.01223769799, Median: 0.0

lower_bound_age = df['age'].quantile(0.10)
upper_bound_age = df['age'].quantile(0.90)

lower_outliers = df.loc[df['age'] < lower_bound_age, 'age']
upper_outliers = df.loc[df['age'] > upper_bound_age, 'age']

middle_80_age = df.loc[(df['age'] >= lower_bound_age) & (df['age'] <= upper_bound_age), 'age']

print(f"Lower bound outliers range: {lower_outliers.min()} - {lower_outliers.max()}")
print(f"Upper bound outliers range: {upper_outliers.min()} - {upper_outliers.max()}")
print(f"Lower bound age: {lower_bound_age}, Upper bound age: {upper_bound_age}")

Lower bound outliers range: 18 - 22
Upper bound outliers range: 47 - 110
Lower bound age: 23.0, Upper bound age: 46.0

lower_bound_height = df['height'].quantile(0.10)
upper_bound_height = df['height'].quantile(0.90)

lower_outliers = df.loc[df['height'] < lower_bound_height, 'height']
upper_outliers = df.loc[df['height'] > upper_bound_height, 'height']

print(f"Height lower bound: {lower_bound_height}, upper bound: {upper_bound_height}")
print(f"Lower outliers range: {lower_outliers.min()} - {lower_outliers.max()}")
print(f"Upper outliers range: {upper_outliers.min()} - {upper_outliers.max()}")

# Capping height values within a reasonable range
lower_bound = min(lower_bound_height, 48)  # Ensure no extreme low values
upper_bound = max(upper_bound_height, 90)  # Ensure no extreme tall values

df['height'] = df['height'].apply(lambda x: lower_bound if x < lower_bound else (upper_bound if x > upper_bound else x))

Height lower bound: 63.0, upper bound: 73.0
Lower outliers range: 1.0 - 62.0
Upper outliers range: 74.0 - 95.0

import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(12, 8))
sns.heatmap(df.isnull(), cbar=False, cmap="coolwarm")
plt.title('Heatmap of Missing Data')
plt.xlabel('Columns', fontsize=14)
plt.ylabel('Rows', fontsize=14)
plt.show()

for col in ['sign', 'job', 'education', 'ethnicity', 'body_type', 'drinks']:
    df[col] = df[col].fillna(df[col].mode()[0])

age_bins = [18, 25, 35, 45, float('inf')]
age_labels = ["18-25", "26-35", "36-45", "46+"]

df["age_group"] = pd.cut(df["age"], bins=age_bins, labels=age_labels)

df[["age", "age_group"]].head(10)

age_distribution = df["age_group"].value_counts()
print(age_distribution)

age_group
26-35    28621
18-25    14145
36-45    10803
46+       6068
Name: count, dtype: int64

Q1 = df["income"].quantile(0.25)
Q2 = df["income"].quantile(0.50)
Q3 = df["income"].quantile(0.90)

df["income_category"] = np.where(df["income"] <= Q1, "Low Income",
                          np.where(df["income"] <= Q2, "Medium Income", "High Income"))

income_distribution = df["income_category"].value_counts()
print(income_distribution)

income_category
Low Income     48442
High Income    11504
Name: count, dtype: int64

df["profile_completeness"] = df.notnull().sum(axis=1) / df.shape[1] * 100
print(df[["profile_completeness"]].head())

   profile_completeness
0            100.000000
1            100.000000
2             94.736842
3             94.736842
4             89.473684

print(df.groupby("status")["profile_completeness"].mean())

status
available         94.973896
married           94.685908
seeing someone    94.777642
single            94.260014
unknown           91.578947
Name: profile_completeness, dtype: float64

df['height_cm'] = df['height'] * 2.54
print(df[["height", "height_cm"]].head())

   height  height_cm
0    75.0     190.50
1    70.0     177.80
2    68.0     172.72
3    71.0     180.34
4    66.0     167.64

gender_distribution = df['gender'].value_counts()
print("Gender distribution:")
print(gender_distribution)

Gender distribution:
gender
m    35829
f    24117
Name: count, dtype: int64

status_distribution = df['status'].value_counts(normalize=True) * 100
print("Status distribution (Percentage):")
print(status_distribution)

Status distribution (Percentage):
status
single            92.911954
seeing someone     3.443099
available          3.111133
married            0.517132
unknown            0.016682
Name: proportion, dtype: float64

status_gender_distribution = df.groupby(['gender', 'status']).size() / len(df) * 100
print("Status distribution by gender (Percentage):")
print(status_gender_distribution)

Status distribution by gender (Percentage):
gender  status        
f       available          1.094318
        married            0.225203
        seeing someone     1.673173
        single            37.231842
        unknown            0.006673
m       available          2.016815
        married            0.291929
        seeing someone     1.769926
        single            55.680112
        unknown            0.010009
dtype: float64

correlation_matrix = df[['age', 'income', 'height', 'profile_completeness']].corr()
print("Correlation Matrix:")
print(correlation_matrix)

Correlation Matrix:
                           age    income    height  profile_completeness
age                   1.000000 -0.001004 -0.022958              0.038296
income               -0.001004  1.000000  0.067056              0.060893
height               -0.022958  0.067056  1.000000             -0.011650
profile_completeness  0.038296  0.060893 -0.011650              1.000000

age_income_corr = df['age'].corr(df['income'])
print(f"Correlation between age and income: {age_income_corr}")

Correlation between age and income: -0.0010038681910054018

diet_distribution = df['diet'].value_counts(normalize=True) * 100
print("Dietary Preference Distribution:")
print(diet_distribution)

Dietary Preference Distribution:
diet
mostly anything        46.651290
anything               17.391916
strictly anything      14.382155
mostly vegetarian       9.687491
mostly other            2.832550
strictly vegetarian     2.461253
vegetarian              1.876178
strictly other          1.271413
mostly vegan            0.950747
other                   0.931057
strictly vegan          0.641332
vegan                   0.382549
mostly kosher           0.241906
mostly halal            0.135017
strictly halal          0.050631
strictly kosher         0.050631
halal                   0.030941
kosher                  0.030941
Name: proportion, dtype: float64

drinking_by_diet = df.groupby('diet')['drinks'].value_counts(normalize=True) * 100
print("Drinking habits by diet:")
print(drinking_by_diet)

Drinking habits by diet:
diet        drinks     
anything    socially       76.419214
            often           9.283519
            rarely          8.248423
            not at all      4.771147
            very often      0.938056
                             ...    
vegetarian  rarely         10.044978
            often           8.845577
            not at all      5.547226
            very often      1.049475
            desperately     0.899550
Name: proportion, Length: 103, dtype: float64

df[['city', 'state']] = df['location'].str.split(', ', expand=True, n=1)
df['state'] = df['state'].fillna('Unknown')
print(df[['location', 'city', 'state']].head(10))

                          location                 city       state
0  south san francisco, california  south san francisco  california
1              oakland, california              oakland  california
2        san francisco, california        san francisco  california
3             berkeley, california             berkeley  california
4        san francisco, california        san francisco  california
5        san francisco, california        san francisco  california
6        san francisco, california        san francisco  california
7        san francisco, california        san francisco  california
8    belvedere tiburon, california    belvedere tiburon  california
9            san mateo, california            san mateo  california

df[df['state'].isnull()]['location'].unique()

array([], dtype=object)

top_cities = df['city'].value_counts().head(5)
top_states = df['state'].value_counts().head(5)
print("Top 5 cities:")
print(top_cities)
print("Top 5 states:")
print(top_states)

Top 5 cities:
city
san francisco    31064
oakland           7214
berkeley          4212
san mateo         1331
palo alto         1064
Name: count, dtype: int64
Top 5 states:
state
california       59855
new york            17
illinois             8
massachusetts        5
texas                4
Name: count, dtype: int64

height_by_gender = df.groupby('gender')['height_cm'].mean()
print("Average height by gender:")
print(height_by_gender)

Average height by gender:
gender
f    165.378255
m    178.940090
Name: height_cm, dtype: float64

height_by_age_group = df.groupby('age_group')['height_cm'].mean()
print("Average height by age group:")
print(height_by_age_group)

Average height by age group:
age_group
18-25    173.309345
26-35    173.763476
36-45    173.571604
46+      172.571401
Name: height_cm, dtype: float64

C:\Program Files\KMSpico\temp\ipykernel_10508\4240767852.py:1: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  height_by_age_group = df.groupby('age_group')['height_cm'].mean()

income_distribution = df[df['income'] > 0]['income'].value_counts().sort_values(ascending=False)
print("Income distribution:")
print(income_distribution)

Income distribution:
income
20000      2952
100000     1621
80000      1111
30000      1048
40000      1005
50000       975
60000       736
70000       707
150000      631
1000000     521
250000      149
500000       48
Name: count, dtype: int64

income_by_age_group = df[df['income'] > 0].groupby('age_group')['income'].mean()
income_by_gender = df[df['income'] > 0].groupby('gender')['income'].mean()
print("Income by age group:")
print(income_by_age_group)
print("Income by gender:")
print(income_by_gender)

Income by age group:
age_group
18-25    101760.808926
26-35    108869.783617
36-45    105937.645416
46+       91844.426624
Name: income, dtype: float64
Income by gender:
gender
f     86633.472535
m    110984.388035
Name: income, dtype: float64

C:\Program Files\KMSpico\temp\ipykernel_10508\1608165714.py:1: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  income_by_age_group = df[df['income'] > 0].groupby('age_group')['income'].mean()

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")

title_font = {'fontsize': 14, 'fontweight': 'bold'}
label_font = {'fontsize': 12}

plt.figure(figsize=(10, 6))
sns.histplot(df['age'], bins=20, kde=True, color='#3498db', edgecolor='black')

mean_age = df['age'].mean()
plt.axvline(mean_age, color='red', linestyle='--', label=f'Mean Age: {mean_age:.2f}')

plt.xlabel('Age', **label_font)
plt.ylabel('Frequency', **label_font)
plt.title('Distribution of Age with Mean Age Indicated', **title_font)
plt.legend()
plt.show()

plt.figure(figsize=(10, 6))
ax = sns.histplot(df, x='age', hue='gender', palette='viridis', kde=True, bins=20, hue_order=df['gender'].unique())
if ax.get_legend() is not None:
    ax.legend(title="Gender", loc="upper right")

plt.xlabel('Age', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.title('Age Distribution by Gender', fontsize=14)
plt.show()

C:\Program Files\KMSpico\temp\ipykernel_10508\2309023062.py:4: UserWarning: No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
  ax.legend(title="Gender", loc="upper right")

plt.figure(figsize=(10, 6))
sns.regplot(data=df[df['income'] > 0], x='age', y='income', scatter_kws={'alpha': 0.5}, line_kws={'color': 'red'})

plt.xlabel('Age', **label_font)
plt.ylabel('Income', **label_font)
plt.title('Income vs Age with Trend Line', **title_font)
plt.show()

plt.figure(figsize=(12, 6))
sns.boxplot(data=df[df['income'] > 0], x='age_group', y='income', hue='age_group', palette='viridis', showfliers=False, legend=False)

plt.xlabel('Age Group', **label_font)
plt.ylabel('Income', **label_font)
plt.title('Income Distribution Across Age Groups', **title_font)
plt.show()

plt.figure(figsize=(12, 6))
sns.barplot(data=df[df['income'] > 0], x='status', y='income', hue='gender', palette='coolwarm')

plt.xlabel('Status', **label_font)
plt.ylabel('Income', **label_font)
plt.title('Income Variation by Gender and Status', **title_font)
plt.legend(title='Gender')
plt.show()

import matplotlib.pyplot as plt
import seaborn as sns

pet_distribution = df['pets'].value_counts()

plt.figure(figsize=(10, 6))
pet_distribution = df['pets'].value_counts()
sns.barplot(x=pet_distribution.index, y=pet_distribution.values, hue=pet_distribution.index, 
            palette='pastel', dodge=False, legend=False, edgecolor='black')
plt.xlabel('Pet Categories', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.title('Distribution of Pet Categories', fontsize=14)
plt.xticks(rotation=45)
plt.show()

pet_distribution_by_group = df.groupby(['age_group', 'gender', 'pets']).size().reset_index(name='count')

plt.figure(figsize=(12, 6))
sns.barplot(data=pet_distribution_by_group, x="age_group", y="count", hue="pets", 
            palette="muted", dodge=True, errorbar=None)  # `ci=None` changed to `errorbar=None`

plt.xlabel("Age Group", fontsize=12)
plt.ylabel("Number of Users", fontsize=12)
plt.title("Pet Preferences by Age Group and Gender", fontsize=14)
plt.legend(title="Pet Preference", fontsize=10)
plt.xticks(rotation=45)
plt.show()

C:\Program Files\KMSpico\temp\ipykernel_10508\2670204561.py:1: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  pet_distribution_by_group = df.groupby(['age_group', 'gender', 'pets']).size().reset_index(name='count')

sign_distribution = df['sign'].value_counts(normalize=True) * 100

plt.figure(figsize=(12, 8))
sns.barplot(x=sign_distribution.index, y=sign_distribution.values, palette="viridis")
plt.xlabel('Zodiac Signs', fontsize=12)
plt.ylabel('Percentage (%)', fontsize=12)
plt.title('Distribution of Zodiac Signs', fontsize=14)
plt.xticks(rotation=90)
plt.show()

C:\Program Files\KMSpico\temp\ipykernel_10508\1950768638.py:4: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=sign_distribution.index, y=sign_distribution.values, palette="viridis")

plt.figure(figsize=(14, 8))
sign_across_gender = df.groupby(['sign', 'gender']).size().reset_index(name='count')
sns.barplot(data=sign_across_gender, x='sign', y='count', hue='gender', palette='coolwarm')

plt.title('Distribution of Zodiac Signs across Gender', fontsize=14)
plt.xlabel('Zodiac Sign', fontsize=12)
plt.ylabel('Number of Users', fontsize=12)
plt.xticks(rotation=90)
plt.legend(title='Gender', fontsize=10)
plt.show()

plt.figure(figsize=(14, 8))
sign_across_status = df.groupby(['sign', 'status']).size().reset_index(name='count')
sns.barplot(data=sign_across_status, x='sign', y='count', hue='status', palette='crest')

plt.title('Distribution of Zodiac Signs across Status', fontsize=14)
plt.xlabel('Zodiac Sign', fontsize=12)
plt.ylabel('Number of Users', fontsize=12)
plt.xticks(rotation=90)
plt.legend(title='Status', fontsize=10)
plt.grid(True, axis='y', linestyle='dashed', alpha=0.5)
plt.show()

	age	height	income	last_online
count	59946.000000	59946.000000	59946.000000	59946
mean	32.340290	68.295282	20033.222534	2012-05-22 06:43:35.300770560
min	18.000000	1.000000	-1.000000	2011-06-27 01:52:00
25%	26.000000	66.000000	-1.000000	2012-05-29 20:37:15
50%	30.000000	68.000000	-1.000000	2012-06-27 14:30:00
75%	37.000000	71.000000	-1.000000	2012-06-30 01:09:00
max	110.000000	95.000000	1000000.000000	2012-07-01 08:57:00
std	9.452779	3.994738	97346.192104	NaN

	age	status	gender	body_type	diet	drinks	education	ethnicity	height	income	job	last_online	location	pets	religion	sign	speaks
0	22	single	m	a little extra	strictly anything	socially	working on college/university	asian, white	75.0	-1	transportation	2012-06-28-20-30	south san francisco, california	likes dogs and likes cats	agnosticism and very serious about it	gemini	english
1	35	single	m	average	mostly other	often	working on space camp	white	70.0	80000	hospitality / travel	2012-06-29-21-41	oakland, california	likes dogs and likes cats	agnosticism but not too serious about it	cancer	english (fluently), spanish (poorly), french (...
2	38	available	m	thin	anything	socially	graduated from masters program	NaN	68.0	-1	NaN	2012-06-27-09-10	san francisco, california	has cats	NaN	pisces but it doesn’t matter	english, french, c++
3	23	single	m	thin	vegetarian	socially	working on college/university	white	71.0	20000	student	2012-06-28-14-22	berkeley, california	likes cats	NaN	pisces	english, german (poorly)
4	29	single	m	athletic	NaN	socially	graduated from college/university	asian, black, other	66.0	-1	artistic / musical / writer	2012-06-27-21-26	san francisco, california	likes dogs and likes cats	NaN	aquarius	english

	age	age_group
0	22	18-25
1	35	26-35
2	38	36-45
3	23	18-25
4	29	26-35
5	29	26-35
6	32	26-35
7	31	26-35
8	24	18-25
9	37	36-45

1. Data Cleaning¶

Importing, Analyzing the Data and set visualization styles¶

Analyzing the data¶

Load the Bumble Dataset¶

Display dataset structure¶

Display first five rows¶

Count of missing values per column¶

Percentage of Missing Values for All Columns¶

Findings: No columns should be dropped since the missing percentage is less than 50¶

Handling missing numerical data (e.g., height, income) should be handled by imputing the median value of height and income for the corresponding category, such as gender, age group, or location.¶

Explanation :¶

groupby(["gender"])["height"].transform("median")¶

groupby(["gender", "age"])["income"].transform("median")¶

Reason for doing:¶

To check are there any inconsistencies in the data types across columns (e.g., numerical data stored as strings)?¶

To find which columns require conversion to numerical data types for proper analysis (e.g., income)?¶

Does the last_online column need to be converted into a datetime format? What additional insights can be gained by analyzing this as a date field?¶

Outliers¶

Display summary statistics to detect outliers¶

Findings:¶

Identify the range of numerical columns¶

Findings:¶

Plotting Outliers Using Boxplots for all numerical columns¶

Findings:¶

Any -1 values in numerical columns like income should be replaced with 0, as they may represent missing or invalid data.¶

Findings:¶

Treating Outliers for Each Column Using the Middle 80% Approach¶

Findings:¶

Findings:¶

Findings:¶

Conclusion¶

Missing Data Visualization¶

Create a heatmap to visualize missing values across the dataset. Which columns show consistent missing data patterns?¶

Findings:¶

Mode Imputation for Categorical Columns with Less Than 20% Missing Values¶

Findings:¶

2. Data Processing¶

Binning and Grouping¶

Bin the age column into categories such as "18-25", "26-35", "36-45", and "46+" to create a new column, age_group.¶

Findings:¶

Count the number of users in each age group¶

Findings:¶

Group income into categories like "Low Income," "Medium Income," and "High Income" based on meaningful thresholds (e.g., quartiles).¶

Findings:¶

Derived Features¶

Create a new feature, profile_completeness, by calculating the percentage of non-missing values for each user profile.¶

Findings:¶

Analyze profile completeness by relationship status¶

Findings:¶

Unit Conversion¶

Convert the height column from inches to centimeters using the conversion factor (1 inch = 2.54 cm).¶

Findings:¶

3. Data Analysis¶

Demographic Analysis¶

What is the gender distribution across the platform? Are there any significant imbalances?¶

Findings:¶

What are the proportions of users in different status categories?¶

Findings:¶

How does status vary by gender?¶

Findings:¶

Correlation Analysis¶

What are the correlations between numerical columns (age, income, height, profile completeness)?¶

Findings:¶

How does age correlate with income?¶

Findings:¶

Diet and Lifestyle Analysis¶

How do dietary preferences distribute across the platform?¶

Findings:¶

How do drinking habits vary across different diet categories?¶

Findings:¶

Geographical Insights¶

Extract city and state information from location¶

Steps:¶

What are the top 5 cities and states with the highest number of users?¶

Findings:¶

Height Analysis¶

What is the average height of users across different gender categories?¶

Findings:¶

How does height vary by age group?¶

Findings:¶

Bin the age column into categories such as "18-25", "26-35", "36-45", and "46+" to create a new column, `age_group`.¶

Create a new feature, `profile_completeness`, by calculating the percentage of non-missing values for each user profile.¶

Convert the `height` column from inches to centimeters using the conversion factor (1 inch = 2.54 cm).¶