Introduction¶

Bumble is a widely used dating platform that allows individuals to connect based on mutual interests and compatibility. Unlike traditional dating apps, Bumble empowers women to initiate conversations, fostering meaningful interactions. Users create profiles that include demographic details, lifestyle choices, and personal preferences, providing a rich dataset for analyzing behavioral patterns and trends.

Purpose¶

The goal of this analysis is to explore and interpret the Bumble dataset to extract meaningful insights into user behavior and preferences.

In [1]:
import numpy as np
import pandas as pd
In [2]:
data = pd.read_csv(r"C:\Users\msrav\OneDrive\Desktop\nextleap course documents\python\bumble.csv")
In [3]:
data.head()
Out[3]:
age status gender body_type diet drinks education ethnicity height income job last_online location pets religion sign speaks
0 22 single m a little extra strictly anything socially working on college/university asian, white 75.0 -1 transportation 2012-06-28-20-30 south san francisco, california likes dogs and likes cats agnosticism and very serious about it gemini english
1 35 single m average mostly other often working on space camp white 70.0 80000 hospitality / travel 2012-06-29-21-41 oakland, california likes dogs and likes cats agnosticism but not too serious about it cancer english (fluently), spanish (poorly), french (...
2 38 available m thin anything socially graduated from masters program NaN 68.0 -1 NaN 2012-06-27-09-10 san francisco, california has cats NaN pisces but it doesn’t matter english, french, c++
3 23 single m thin vegetarian socially working on college/university white 71.0 20000 student 2012-06-28-14-22 berkeley, california likes cats NaN pisces english, german (poorly)
4 29 single m athletic NaN socially graduated from college/university asian, black, other 66.0 -1 artistic / musical / writer 2012-06-27-21-26 san francisco, california likes dogs and likes cats NaN aquarius english
In [4]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59946 entries, 0 to 59945
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          59946 non-null  int64  
 1   status       59946 non-null  object 
 2   gender       59946 non-null  object 
 3   body_type    54650 non-null  object 
 4   diet         35551 non-null  object 
 5   drinks       56961 non-null  object 
 6   education    53318 non-null  object 
 7   ethnicity    54266 non-null  object 
 8   height       59943 non-null  float64
 9   income       59946 non-null  int64  
 10  job          51748 non-null  object 
 11  last_online  59946 non-null  object 
 12  location     59946 non-null  object 
 13  pets         40025 non-null  object 
 14  religion     39720 non-null  object 
 15  sign         48890 non-null  object 
 16  speaks       59896 non-null  object 
dtypes: float64(1), int64(2), object(14)
memory usage: 7.8+ MB
In [5]:
data.describe()
Out[5]:
age height income
count 59946.000000 59943.000000 59946.000000
mean 32.340290 68.295281 20033.222534
std 9.452779 3.994803 97346.192104
min 18.000000 1.000000 -1.000000
25% 26.000000 66.000000 -1.000000
50% 30.000000 68.000000 -1.000000
75% 37.000000 71.000000 -1.000000
max 110.000000 95.000000 1000000.000000
In [6]:
data.shape
Out[6]:
(59946, 17)

Part 1: Data Cleaning¶

1. Which columns in the dataset have missing values, and what percentage of data is missing in each column?

In [7]:
missing_values = data.isnull().sum()
missing_percentage = (missing_values / len(data)) * 100

# Create a DataFrame to display the results
missing_data_df = pd.DataFrame({'Missing Values': missing_values, 'Percentage Missing': missing_percentage})
In [8]:
missing_data_df
Out[8]:
Missing Values Percentage Missing
age 0 0.000000
status 0 0.000000
gender 0 0.000000
body_type 5296 8.834618
diet 24395 40.694959
drinks 2985 4.979482
education 6628 11.056618
ethnicity 5680 9.475194
height 3 0.005005
income 0 0.000000
job 8198 13.675641
last_online 0 0.000000
location 0 0.000000
pets 19921 33.231575
religion 20226 33.740366
sign 11056 18.443266
speaks 50 0.083408

2. Are there columns where more than 50% of the data is missing? Would you drop the columns where missing values are >50%? If yes, why?

Solution: There are no columns in the dataset where more than 50% of the data is missing; the highest is diet at roughly 40.7%. Therefore, no columns need to be dropped.

3. How would you handle the missing numerical data (e.g., height, income)? Would you impute the missing data with the median or average value of height and income for the corresponding category, such as gender, age group, or location? If yes, why?

Solution: Yes — imputing with the group median (here, the median height per gender) is preferable, because the median is robust to outliers and height differs systematically between genders.
In [9]:
calculate_median = data.groupby(["gender"])["height"].transform("median")
data["height"] = data["height"].fillna(calculate_median)
print(calculate_median)
0        70.0
1        70.0
2        70.0
3        70.0
4        70.0
         ... 
59941    65.0
59942    70.0
59943    70.0
59944    70.0
59945    70.0
Name: height, Length: 59946, dtype: float64
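
The same grouped-median idea extends to income, where -1 acts as a missing-value sentinel rather than NaN. A minimal sketch on hypothetical toy data (not the Bumble file):

```python
import numpy as np
import pandas as pd

# Toy frame standing in for the Bumble data (hypothetical values)
df = pd.DataFrame({
    "gender": ["m", "m", "f", "f", "f"],
    "height": [70.0, np.nan, 65.0, 63.0, np.nan],
    "income": [80000, -1, 20000, -1, 40000],
})

# Height: fill NaN with the median height of the same gender
df["height"] = df["height"].fillna(
    df.groupby("gender")["height"].transform("median"))

# Income: treat -1 as missing, then impute the same way
df["income"] = df["income"].replace(-1, np.nan)
df["income"] = df["income"].fillna(
    df.groupby("gender")["income"].transform("median"))
```

This keeps the imputation consistent across both numeric columns while respecting gender-level differences.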

Data Types¶

  1. Are there any inconsistencies in the data types across columns (e.g., numerical data stored as strings)?
In [10]:
data.dtypes
Out[10]:
age              int64
status          object
gender          object
body_type       object
diet            object
drinks          object
education       object
ethnicity       object
height         float64
income           int64
job             object
last_online     object
location        object
pets            object
religion        object
sign            object
speaks          object
dtype: object

There are no major inconsistencies in the data types: numerical columns are correctly stored as int64 or float64, and categorical/text columns as object. The one exception is last_online, which is stored as a string and should be converted to datetime.

2. Which columns require conversion to numerical data types for proper analysis (e.g., income)?

Solution: No columns require conversion to numerical types; age, income, and height are already numeric.

3. Does the last_online column need to be converted into a datetime format? What additional insights can be gained by analyzing this as a date field?

In [11]:
data["last_online"]= pd.to_datetime(data["last_online"] , format= "%Y-%m-%d-%H-%M")
data["last_online"]
Out[11]:
0       2012-06-28 20:30:00
1       2012-06-29 21:41:00
2       2012-06-27 09:10:00
3       2012-06-28 14:22:00
4       2012-06-27 21:26:00
                ...        
59941   2012-06-12 21:47:00
59942   2012-06-29 11:01:00
59943   2012-06-27 23:37:00
59944   2012-06-23 13:01:00
59945   2012-06-29 00:42:00
Name: last_online, Length: 59946, dtype: datetime64[ns]
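
Once parsed as datetime, last_online supports recency analysis: days since last login, share of recently active users, activity by day of week, and so on. A hedged sketch on a hypothetical sample (column values invented for illustration):

```python
import pandas as pd

# Hypothetical sample of last_online strings in the dataset's format
last_online = pd.to_datetime(pd.Series([
    "2012-06-28-20-30", "2012-06-29-21-41", "2012-06-12-21-47",
]), format="%Y-%m-%d-%H-%M")

# Days of inactivity, measured against the newest login in the sample
snapshot = last_online.max()
days_inactive = (snapshot - last_online).dt.days

# Share of users active within 7 days of the snapshot
active_week = (days_inactive <= 7).mean()
```

Such recency metrics help distinguish engaged users from dormant profiles.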

3. Outliers¶

1. Are there any apparent outliers in numerical columns such as age, height, or income? What are the ranges of values in these columns?

In [12]:
data.describe()
Out[12]:
age height income
count 59946.000000 59946.000000 59946.000000
mean 32.340290 68.295282 20033.222534
std 9.452779 3.994738 97346.192104
min 18.000000 1.000000 -1.000000
25% 26.000000 66.000000 -1.000000
50% 30.000000 68.000000 -1.000000
75% 37.000000 71.000000 -1.000000
max 110.000000 95.000000 1000000.000000

Yes, there are outliers in the numerical columns age, height, and income.

In [13]:
age_range = (data['age'].min() , data['age'].max())
height_range = (data['height'].min() , data['height'].max())
income_range = (data['income'].min() , data['income'].max())

print(f"The age range is : {age_range} ")
print(f"The height range is : {height_range} ")
print(f"The income range is : {income_range} ") 
The age range is : (18, 110) 
The height range is : (1.0, 95.0) 
The income range is : (-1, 1000000) 

1. I used the .describe() method to examine the distribution of the numerical columns; comparing the min and max against the quartiles flags potential outliers (e.g., a minimum height of 1 inch and a maximum age of 110).

2. Then I used .min() and .max() to state the exact range of each column, which makes the anomalies explicit.

2. Any -1 values in numerical columns like income should be replaced with 0, as they may represent missing or invalid data.

In [14]:
data['income'] = data['income'].replace(-1, 0)
In [15]:
data['income'].min()
Out[15]:
0

As shown above, the -1 values in the income column have been replaced with 0.

3. For other outliers, how would you ensure that they do not disproportionately impact the analysis while retaining as much meaningful data as possible? Would you delete the data, or rather than deleting it, calculate the mean and median using only the middle 80% of the data (removing extreme high and low values)? Provide appropriate reasons for every step.

In [16]:
numerical_cols = ['age', 'height', 'income']

lower_bound = data[numerical_cols].quantile(0.10)
upper_bound = data[numerical_cols].quantile(0.90)

# Keep only values strictly inside the 10th-90th percentile band;
# values outside the band become NaN and are skipped by mean/median
mask = (data[numerical_cols] > lower_bound) & (data[numerical_cols] < upper_bound)
data_filtered = data[numerical_cols][mask]

# Calculate mean and median on the middle 80% of data
trimmed_mean = data_filtered[numerical_cols].mean()
trimmed_median = data_filtered[numerical_cols].median()

# Display results
print("Trimmed Mean:\n", trimmed_mean)
print("Trimmed Median:\n", trimmed_median)
Trimmed Mean:
 age          31.357686
height       68.254426
income    26109.890110
dtype: float64
Trimmed Median:
 age          30.0
height       68.0
income    20000.0
dtype: float64

steps:

  1. Calculate the 10th percentile (lower bound) and 90th percentile (upper bound) for each numerical column; these mark the extreme low and high values.

  2. Keep only values between the two bounds, i.e., the middle 80% of the data.

  3. Compute the mean and median on this trimmed subset, so extreme values cannot drag the estimates.
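
The trimming logic can be sanity-checked on a single column with hypothetical values:

```python
import pandas as pd

s = pd.Series([1, 18, 22, 25, 26, 28, 30, 33, 37, 110])  # hypothetical ages

# Keep only values strictly inside the 10th-90th percentile band
lo, hi = s.quantile(0.10), s.quantile(0.90)
trimmed = s[(s > lo) & (s < hi)]

trimmed_mean = trimmed.mean()
trimmed_median = trimmed.median()
```

The extreme values 1 and 110 fall outside the band and no longer influence the mean, while the central observations are all retained.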

4. Missing Data Visualization¶

1. Create a heatmap to visualize missing values across the dataset. Which columns show consistent missing data patterns?

In [17]:
import seaborn as sns
import matplotlib.pyplot as plt

# Set figure size
plt.figure(figsize=(12, 6))

# Create a heatmap to visualize missing values
sns.heatmap(data.isnull(), cmap='viridis', cbar=True)

# Add title
plt.title("Missing Values Heatmap", fontsize=14)
plt.xlabel('Columns', fontsize=14)
plt.ylabel('Rows', fontsize=14)

# Show plot
plt.show()
[Figure: Missing Values Heatmap]

Analysis: Solid vertical bands in the heatmap mark columns with heavy missingness; diet, pets, and religion show the most consistent missing-data patterns.

Part 2: Data Processing¶

Binning and Grouping¶

1. How would you bin the age column into categories (e.g. "18-25", "26-35", "36-45", and "46+" ) to create a new column, age_group. How does the distribution of users vary across these age ranges?

In [18]:
import numpy as np 

data['age_group'] = np.where(data['age'] <= 25,"18-25",
                    np.where(data['age'] <=35,"26-35",
                    np.where(data['age'] <=45,"36-45","46+")))

                          
age_distribution_count = data['age_group'].value_counts()
age_distribution_count
Out[18]:
26-35    28621
18-25    14454
36-45    10803
46+       6068
Name: age_group, dtype: int64

steps:

  1. Create a new column, age_group, based on the values in the age column, categorizing ages into four groups: "18-25" for ages ≤ 25, "26-35" for ages 26-35, "36-45" for ages 36-45, and "46+" for ages above 45.
  2. Nested np.where calls assign the label, e.g. np.where(data['age'] <= 25, "18-25", ...).
  3. Count the occurrences of each age group using value_counts().
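
The nested np.where calls can equivalently be written with pd.cut, which states the bin edges declaratively; a sketch under the same boundaries (toy ages, not the full dataset):

```python
import pandas as pd

ages = pd.Series([22, 35, 38, 23, 29, 50])  # hypothetical ages

# Right-closed bins: (17, 25], (25, 35], (35, 45], (45, 120]
age_group = pd.cut(
    ages,
    bins=[17, 25, 35, 45, 120],
    labels=["18-25", "26-35", "36-45", "46+"],
)
```

pd.cut also returns an ordered categorical, so value_counts() and plots keep the age groups in their natural order.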

2. Group income into categories like "Low Income," "Medium Income," and "High Income" based on meaningful thresholds (e.g., quartiles). What insights can be derived from these groups?

In [19]:
lower_quantile = data['income'].quantile(0.25)
middle_quantile = data['income'].quantile(0.50)
higher_quantile = data['income'].quantile(0.90)

data['income_category'] = np.where(data['income'] <= lower_quantile,"Low Income",
                          np.where(data['income'] <=middle_quantile,"Medium income","High income"))

                          
income_distribution = data['income_category'].value_counts()
income_distribution
Out[19]:
Low Income     48442
High income    11504
Name: income_category, dtype: int64
In [20]:
data[['income','income_category']]
Out[20]:
income income_category
0 0 Low Income
1 80000 High income
2 0 Low Income
3 20000 High income
4 0 Low Income
... ... ...
59941 0 Low Income
59942 0 Low Income
59943 100000 High income
59944 0 Low Income
59945 0 Low Income

59946 rows × 2 columns

steps:

  1. First calculate the income thresholds: the 25th percentile (lower_quantile), the 50th percentile (middle_quantile), and the 90th percentile (higher_quantile) — the latter is computed but never used in the categorization.
  2. Then assign categories: values at or below the 25th percentile are "Low Income", values between the 25th and 50th percentiles are "Medium income", and the rest are "High income".
  3. value_counts() shows the distribution. Note that no "Medium income" group appears: more than half of the incomes are 0 after the -1 replacement, so the 25th and 50th percentiles both equal 0 and no value can fall strictly between them.
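
One way to avoid the collapsed middle bracket is to compute the quantile thresholds only over reported (non-zero) incomes; a hypothetical sketch:

```python
import numpy as np
import pandas as pd

income = pd.Series([0, 0, 0, 20000, 30000, 50000, 80000, 100000])  # hypothetical

# Quantiles over reported (non-zero) incomes only
reported = income[income > 0]
q25, q50 = reported.quantile(0.25), reported.quantile(0.50)

category = np.where(income <= q25, "Low Income",
           np.where(income <= q50, "Medium Income", "High Income"))
```

With thresholds computed this way, the non-reporters still land in "Low Income", but reported incomes spread across all three brackets.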

2. Derived Features:¶

1. Create a new feature, profile_completeness, by calculating the percentage of non-missing values for each user profile. How complete are most user profiles, and how does completeness vary across demographics?

In [21]:
non_missing_counts = data.notnull().sum(axis=1)  
total_columns = data.shape[1]  
data['profile_completeness'] = (non_missing_counts / total_columns) * 100
In [22]:
data['profile_completeness']
Out[22]:
0        100.000000
1        100.000000
2         84.210526
3         94.736842
4         89.473684
            ...    
59941     84.210526
59942    100.000000
59943     94.736842
59944    100.000000
59945     94.736842
Name: profile_completeness, Length: 59946, dtype: float64
In [23]:
data.groupby('profile_completeness')['gender'].value_counts()
Out[23]:
profile_completeness  gender
52.631579             m           52
                      f           21
57.894737             m          165
                      f           81
63.157895             m          488
                      f          262
68.421053             m          639
                      f          394
73.684211             m         1234
                      f          780
78.947368             m         2381
                      f         1600
84.210526             m         4541
                      f         2942
89.473684             m         7507
                      f         5021
94.736842             m         9998
                      f         6865
100.000000            m         8824
                      f         6151
Name: gender, dtype: int64

steps:

  1. Use data.notnull().sum(axis=1) to count the number of filled (non-null) fields for each user.
  2. Use data.shape[1] to get the total number of columns in the dataset.
  3. Divide the two and multiply by 100 to get the completeness percentage.
  4. Group by completeness level and count genders to compare completeness between men and women.
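
Rather than listing counts for every completeness level, the average completeness per demographic group is often easier to read; a toy sketch (hypothetical profiles):

```python
import pandas as pd

df = pd.DataFrame({  # hypothetical profiles
    "gender": ["m", "f", "m", "f"],
    "diet": ["anything", None, None, "vegan"],
    "pets": [None, "likes dogs", None, "has cats"],
})

# Percentage of filled fields per row, then averaged per gender
df["profile_completeness"] = df.notnull().mean(axis=1) * 100
avg_by_gender = df.groupby("gender")["profile_completeness"].mean()
```

notnull().mean(axis=1) collapses the count-and-divide steps into one expression.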

3.Unit Conversion¶

1. Convert the height column from inches to centimeters using the conversion factor (1 inch = 2.54 cm). Store the converted values in a new column, height_cm.

In [24]:
# Convert height from inches to centimeters (1 inch = 2.54 cm)
data['height_cm'] = data['height'] * 2.54

# Display first few rows to verify the new column
data[['height', 'height_cm']].head()
Out[24]:
height height_cm
0 75.0 190.50
1 70.0 177.80
2 68.0 172.72
3 71.0 180.34
4 66.0 167.64

steps:

1. Multiply data['height'] (inches) by 2.54 to convert to centimeters, storing the result in the new height_cm column.

Part 3: Data Analysis¶

1. Demographic Analysis¶

1. What is the gender distribution (gender) across the platform? Are there any significant imbalances?

In [25]:
# Analyze gender distribution
gender_distribution = data['gender'].value_counts(normalize=True) * 100  

# Display results
gender_distribution
Out[25]:
m    59.768792
f    40.231208
Name: gender, dtype: float64

steps:

  1. First, count how many users belong to each gender category.
  2. normalize=True inside value_counts() converts the raw counts into proportions; multiplying by 100 turns these proportions into percentages.

2. What are the proportions of users in different status categories (e.g., single, married, seeing someone)? What does this suggest about the platform’s target audience?

In [26]:
# Analyze relationship status distribution
status_distribution = data['status'].value_counts(normalize=True) * 100 

# Display results
status_distribution
Out[26]:
single            92.911954
seeing someone     3.443099
available          3.111133
married            0.517132
unknown            0.016682
Name: status, dtype: float64

steps:

  1. value_counts() calculates the number of users for each unique status (e.g., how many are "single," "married," etc.).
  2. Adding normalize=True inside value_counts() converts the raw counts into proportions

3. How does status vary by gender? For example, what proportion of men and women identify as single?

In [27]:
status_gender_distribution = data.groupby(["status", "gender"]).size() / len(data) * 100
status_gender_data = status_gender_distribution.reset_index(name="Percentage")
status_gender_data_sorted = status_gender_data.sort_values(by="Percentage", ascending=False)
print(status_gender_data_sorted)
           status gender  Percentage
7          single      m   55.680112
6          single      f   37.231842
1       available      m    2.016815
5  seeing someone      m    1.769926
4  seeing someone      f    1.673173
0       available      f    1.094318
3         married      m    0.291929
2         married      f    0.225203
9         unknown      m    0.010009
8         unknown      f    0.006673

steps:

  1. Count the number of users for each (status, gender) combination with groupby(...).size().
  2. Dividing by the total number of users and multiplying by 100 turns the counts into percentages of the whole dataset.
  3. Then, convert the grouped result into a structured DataFrame with a column named "Percentage".
  4. Sort so that the most common status-gender combinations appear at the top for easy interpretation.
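
The percentages above are shares of the whole dataset; to answer "what proportion of men vs. women identify as single" directly, normalize within each gender, e.g. with pd.crosstab (toy data, not the full dataset):

```python
import pandas as pd

df = pd.DataFrame({  # hypothetical users
    "gender": ["m", "m", "m", "f", "f"],
    "status": ["single", "single", "married", "single", "seeing someone"],
})

# Row-normalized: each gender's statuses sum to 100%
status_within_gender = pd.crosstab(
    df["gender"], df["status"], normalize="index") * 100
```

normalize="index" is what switches the denominator from "all users" to "users of that gender".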

Analysis/Recommendations:

  1. Relative to their overall share of users, a higher proportion of women than men identify as "seeing someone", which may suggest they update their status more frequently.
  2. The proportion of married users is very low, reinforcing that Bumble is used overwhelmingly by singles seeking new relationships.
  3. If there is a gender imbalance (more single men than women), Bumble should target female users through marketing campaigns, safety features, and community-building initiatives to increase engagement and retention.
  4. Some users are "seeing someone" or in other relationship stages. Bumble could introduce optional status updates or new features to help users indicate evolving relationship stages.

2. Correlation Analysis¶

1. What are the correlations between numerical columns such as age, income, gender Are there any strong positive or negative relationships?

In [28]:
# Perform correlation analysis on numerical columns
correlation_matrix = data[['age', 'income', 'height']].corr()

# Display correlation matrix
correlation_matrix
Out[28]:
age income height
age 1.000000 -0.001004 -0.022253
income -0.001004 1.000000 0.065048
height -0.022253 0.065048 1.000000

steps:

  1. We are creating an array composed of aage, income and height.
  2. Then we find the correlation between them using .corr()

Analysis/Recommendations:

  1. There is almost no correlation between age and income, meaning income does not significantly increase or decrease with age in this dataset.
  2. This suggests that users across different age groups have similar income distributions or that income data is inconsistent.

2. How does age correlate with income? Are older users more likely to report higher income levels?

In [29]:
age_income_correlation = data['age'].corr(data['income'])
age_income_correlation
Out[29]:
-0.0010038681910053916

Analysis: It is extremely close to zero, indicating no meaningful relationship between age and income.

3. Diet and Lifestyle Analysis¶

1. How do dietary preferences (diet) distribute across the platform? For example, what percentage of users identify as vegetarian, vegan, or follow "anything" diets?

In [30]:
diet_percentage = data['diet'].value_counts() * 100 / len(data['diet'])
diet_percentage
Out[30]:
mostly anything        27.666567
anything               10.314283
strictly anything       8.529343
mostly vegetarian       5.745171
mostly other            1.679845
strictly vegetarian     1.459647
vegetarian              1.112668
strictly other          0.754012
mostly vegan            0.563841
other                   0.552164
strictly vegan          0.380342
vegan                   0.226871
mostly kosher           0.143462
mostly halal            0.080072
strictly halal          0.030027
strictly kosher         0.030027
halal                   0.018350
kosher                  0.018350
Name: diet, dtype: float64

steps:

  1. Count how many users fall into each dietary preference.
  2. Multiplying by 100 / len(data['diet']) turns the counts into percentages of all users — including the roughly 41% with a missing diet value, which is why the shares do not sum to 100.
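
Since entries like "mostly vegetarian" and "strictly vegetarian" describe the same base diet, stripping the modifier gives a cleaner headline split; a sketch on toy values (the modifier list is an assumption based on the categories seen above):

```python
import pandas as pd

diet = pd.Series(["mostly anything", "strictly vegetarian", "vegan",
                  "mostly vegan", "anything", None])  # hypothetical answers

# Collapse "mostly "/"strictly " modifiers into the base category
base_diet = diet.str.replace(r"^(mostly|strictly) ", "", regex=True)
share = base_diet.value_counts(normalize=True) * 100  # % of non-null answers
```

value_counts(normalize=True) excludes missing answers, so these shares sum to 100 over users who reported a diet.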

Analysis/Recommendations:

  1. The most common categories are "mostly anything" (27.67%), "anything" (10.31%), and "strictly anything" (8.53%), indicating that most users do not have strict dietary restrictions.
  2. Allow users to filter potential matches by dietary preference to improve compatibility.
  3. Consider adding dietary preference badges or profile highlights to make it easier for users to identify shared food habits.

2. How do drinking habits (drinks) vary across different diet categories? Are users with stricter diets (e.g., vegan) less likely to drink?

In [31]:
# Analyze drinking habits across different diet categories
drink_diet_distribution = data.groupby("diet")["drinks"].value_counts(normalize=True) * 100

# Display results
drink_diet_distribution
Out[31]:
diet        drinks     
anything    socially       75.667557
            often           9.579439
            rarely          8.511348
            not at all      4.923231
            very often      0.967957
                             ...    
vegetarian  rarely         10.771704
            often           9.485531
            not at all      5.948553
            very often      1.125402
            desperately     0.964630
Name: drinks, Length: 103, dtype: float64

steps:

  1. First, group the data by diet category.
  2. Within each diet category, value_counts(normalize=True) gives the share of users in each drinking habit.
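
To test the "stricter diets drink less" hypothesis directly, the share of "not at all" answers per diet can be computed in one pass; a toy sketch:

```python
import pandas as pd

df = pd.DataFrame({  # hypothetical users
    "diet": ["vegan", "vegan", "anything", "anything", "anything"],
    "drinks": ["not at all", "socially", "socially", "often", "not at all"],
})

# Percentage of non-drinkers within each diet category
nondrinker_rate = (df["drinks"].eq("not at all")
                     .groupby(df["diet"]).mean() * 100)
```

Sorting nondrinker_rate would then rank diets from most to least abstinent, a sharper answer than scanning the full diet x drinks table.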

Analysis/Recommendations:

  1. Across all diet categories, "socially" is the most frequent drinking choice.
  2. Mostly anything (75.61%), mostly vegetarian (71.90%), and vegetarian (71.70%) users are more likely to drink socially.
  3. Allow users to filter matches based on drinking habits, especially for those with strict dietary restrictions.
  4. Create marketing campaigns tailored to sober communities, especially halal, kosher, and vegan users.

4. Geographical Insights¶

1. Extract city and state information from the location column. What are the top 5 cities and states with the highest number of users?

In [32]:
data[['city','state']] = data['location'].str.split(', ',expand = True, n=1)
data[['city','state']]
Out[32]:
city state
0 south san francisco california
1 oakland california
2 san francisco california
3 berkeley california
4 san francisco california
... ... ...
59941 oakland california
59942 san francisco california
59943 south san francisco california
59944 san francisco california
59945 san francisco california

59946 rows × 2 columns

In [33]:
# Ensure proper extraction by handling extra spaces
data[['city', 'state']] = data['location'].str.split(',', n=1, expand=True)

# Trim any leading/trailing spaces
data['city'] = data['city'].str.strip()
data['state'] = data['state'].str.strip()

# Count the number of users per city and state
top_cities = data['city'].value_counts().head(5)  # Top 5 cities
print(f"TOP 5 CITIES WITH HIGHEST NUMBER OF USERS: \n\n{top_cities}")
top_states = data['state'].value_counts().head(5)  # Top 5 states
print(f"TOP 5 STATES WITH HIGHEST NUMBER OF USERS: \n\n{top_states}")
TOP 5 CITIES WITH HIGHEST NUMBER OF USERS: 

san francisco    31064
oakland           7214
berkeley          4212
san mateo         1331
palo alto         1064
Name: city, dtype: int64
TOP 5 STATES WITH HIGHEST NUMBER OF USERS: 

california       59855
new york            17
illinois             8
massachusetts        5
texas                4
Name: state, dtype: int64

steps:

  1. Extract city and state from the location column by splitting on the first comma.
  2. Count users in each city and state with data['city'].value_counts().head(5) and the equivalent for state.
  3. Finally, display the most active cities and states based on user count.

Analysis/Recommendations:

  1. With 31,064 users, San Francisco is the most active city on Bumble, followed by Oakland (7,214 users) and Berkeley (4,212 users).
  2. 59,855 users are from California, vastly outnumbering users in other states. The next most active states are New York (17 users), Illinois (8 users), Massachusetts (5 users), and Texas (4 users)—showing a major imbalance.
  3. Given the dominance of California-based users, Bumble should invest in marketing campaigns for states like Texas, New York, and Illinois to attract a more diverse audience.
  4. Since San Francisco, Oakland, and Berkeley have the highest engagement, Bumble could introduce localized events, exclusive promotions, or city-based filters to enhance the user experience.

2. How does age vary across the top cities? Are certain cities dominated by younger or older users?

In [34]:
city_with_average_age = data.groupby('city')['age'].mean()

print(f" Cities with high average age is :\n {city_with_average_age.sort_values(ascending = False).head(5)}")
print(f" Cities with low average age is :\n {city_with_average_age.sort_values(ascending = False).tail(5)}")
 Cities with high average age is :
 city
forest knolls     62.5
bellingham        59.0
port costa        53.0
seaside           50.0
redwood shores    47.0
Name: age, dtype: float64
 Cities with low average age is :
 city
fayetteville      20.0
isla vista        19.0
canyon            19.0
canyon country    19.0
long beach        19.0
Name: age, dtype: float64

steps:

  1. Grouping the data: the dataset is grouped by the "city" column and the mean of the "age" column is calculated for each city.

  2. Cities with a high average age: sorting in descending order and taking the top 5 gives the oldest cities.

  3. Cities with a low average age: taking the tail of the same descending sort gives the 5 cities with the lowest average age.

Analysis/Recommendations:

  1. Identifying cities with older and younger populations based on the average age.
  2. In older cities, focus on features that appeal to more mature users, such as serious relationships, professional networking, or niche interests (e.g., travel, wellness).
  3. Introduce location-specific incentives, such as student discounts in younger cities or premium memberships in wealthier, older communities.
  4. Consider events and meetups in these cities to drive engagement based on the dominant age group.

3. What are the average income levels in the top states or cities? Are there regional patterns in reported income?

In [35]:
# Calculate average income per city and state
city_income = data.groupby('city')['income'].mean()
state_income = data.groupby('state')['income'].mean()

# Get average income for the top 5 cities and states
high_income_state = state_income.sort_values(ascending = False).head(5)
high_income_city = city_income.sort_values(ascending = False).head(5)

print("Average Income in Top 5 Cities:\n", high_income_city)
print("\nAverage Income in Top 5 States:\n", high_income_state)
Average Income in Top 5 Cities:
 city
petaluma        500000.000000
santa cruz      230000.000000
south orange    150000.000000
boulder         150000.000000
montara          85833.333333
Name: income, dtype: float64

Average Income in Top 5 States:
 state
new jersey                  150000.0
colorado                     75000.0
vietnam                      60000.0
british columbia, canada     60000.0
pennsylvania                 40000.0
Name: income, dtype: float64

steps:

  1. Calculate Average Income for Each City and State: The dataset is grouped by "city" and "state", and the mean income is calculated.This provides the average income level for each city and state.
  2. Identify the Top 5 Cities and States by Income: The sort_values(ascending=False).head(5) function is used to retrieve the top 5 cities and states with the highest average income.
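
Averages from places with only one or two reporting users can dominate such rankings. Requiring a minimum group size before ranking gives more trustworthy leaders; a sketch on hypothetical data (the threshold of 3 is arbitrary):

```python
import pandas as pd

df = pd.DataFrame({  # hypothetical city incomes
    "city": ["petaluma", "san francisco", "san francisco",
             "san francisco", "oakland", "oakland", "oakland"],
    "income": [500000, 80000, 100000, 90000, 40000, 50000, 60000],
})

stats = df.groupby("city")["income"].agg(["mean", "count"])

# Rank only cities with at least 3 reporting users
reliable = stats[stats["count"] >= 3].sort_values("mean", ascending=False)
```

The single $500,000 profile is excluded from the ranking, so the result reflects cities with enough data to trust.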

Analysis/Recommendations:

  1. The top averages come from locations with very few reporting users (e.g., Petaluma at $500,000), so these rankings reflect a handful of outlier profiles rather than regional wealth.
  2. Since the sample is overwhelmingly Bay Area-based, smaller affluent cities top the city list, and the state-level averages rest on too few out-of-state users to be reliable.
  3. Invest more in digital marketing in high-income areas where users are more likely to purchase subscriptions.
  4. Consider launching regional discounts or affordable subscription plans.

5. Height Analysis¶

1. What is the average height of users across different gender categories?

In [37]:
# Compute average height per gender category
avg_height_by_gender = data.groupby("gender")["height"].mean().dropna()
avg_height_by_gender
Out[37]:
gender
f    65.103869
m    70.443468
Name: height, dtype: float64

steps:

  1. We grouped the dataset by gender and calculated the mean height for each category.

Analysis/Recommendations:

  1. Conduct a location-based analysis to see if height preferences differ across different regions.
  2. Leverage height data to refine targeting strategies for different user demographics.

2. How does height vary by age_group? Are there noticeable trends among younger vs. older users?

In [40]:
avg_height_by_age_group = data.groupby("age_group")["height"].mean()
avg_height_by_age_group
Out[40]:
age_group
18-25    68.200913
26-35    68.406764
36-45    68.325095
46+      67.941167
Name: height, dtype: float64

steps:

  1. We grouped the dataset by age_group and calculated the mean height for each category.

Analysis/Recommendations:

  1. The average height is highest in the 25-34 age group (68.41 inches) and then gradually declines.
  2. Users aged 55+ are shorter on average (67.37 inches) compared to younger users.
  3. The app could introduce dynamic height filters based on age groups.
  4. Break down height trends by gender within age groups to see if trends differ.

3. What is the distribution of height within body_type categories (e.g., athletic, curvy, thin)? Do the distributions align with expectations?

In [41]:
# Compute average height per body type category
avg_height_by_body_type = data.groupby("body_type")["height"].mean().sort_values()
avg_height_by_body_type
Out[41]:
body_type
curvy             65.210245
full figured      66.464817
rather not say    67.272727
thin              67.866058
average           68.100805
skinny            68.544176
fit               68.546062
a little extra    68.820084
overweight        68.948198
used up           69.180282
jacked            69.292162
athletic          69.707336
Name: height, dtype: float64

steps:

  1. We grouped the dataset by age_group and calculated the mean height for each category.
  2. Then, use the sort_values method

Analysis/Recommendations:

  1. Users could be given height and body type-based preferences in their search filters.
  2. Compare these findings by gender to see if height-body type trends differ between men and women.
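Recommendation 2 could be sketched the same way as the earlier grouped means, adding gender to the key; again the DataFrame is a synthetic stand-in for the real data.

```python
import pandas as pd

# Synthetic stand-in for the Bumble data
data = pd.DataFrame({
    "body_type": ["athletic", "athletic", "curvy", "curvy"],
    "gender": ["m", "f", "f", "m"],
    "height": [71.0, 67.0, 65.0, 69.0],
})

# Mean height per body type, split by gender
height_by_body_gender = (
    data.groupby(["body_type", "gender"])["height"].mean().unstack()
)
print(height_by_body_gender)
```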

6. Income Analysis¶

1. What is the distribution of income across the platform? Are there specific income brackets that dominate? How would you handle cases where income is blank or 0?

In [42]:
income_without_zero = data[data['income'] != 0]

income_without_zero['income'].value_counts().sort_values(ascending = False)
Out[42]:
20000      2952
100000     1621
80000      1111
30000      1048
40000      1005
50000       975
60000       736
70000       707
150000      631
1000000     521
250000      149
500000       48
Name: income, dtype: int64

steps:

  1. We filter out rows where income equals zero, count the number of users in each income category with .value_counts(), and sort the counts in descending order with .sort_values(ascending=False).

Analysis/Recommendations:

  1. Offer an optional range selection instead of an exact number to encourage more responses
  2. Provide personalized recommendations based on income brackets.
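Recommendation 1 (ranges instead of exact numbers) can be sketched with pd.cut; the bin edges and labels below are illustrative assumptions, and the DataFrame is a synthetic stand-in for the Bumble data.

```python
import pandas as pd

# Synthetic stand-in; -1 marks missing income as in the Bumble data
data = pd.DataFrame({"income": [20000, 80000, 150000, 1000000, -1]})
valid = data[data["income"] > 0]

# Bucket exact incomes into coarse, more answer-friendly brackets
brackets = pd.cut(
    valid["income"],
    bins=[0, 50000, 100000, 250000, float("inf")],
    labels=["<50k", "50k-100k", "100k-250k", "250k+"],
)
print(brackets.value_counts())
```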

2. How does income vary by age_group and gender? Are older users more likely to report higher incomes?

In [43]:
df_valid_income = data[data["income"] > 0]

avg_income_by_age_gender = df_valid_income.groupby(["age_group", "gender"])["income"].mean().unstack()
avg_income_by_age_gender
Out[43]:
gender f m
age_group
18-25 86066.350711 106618.773946
26-35 90398.126464 114944.801027
36-45 87302.977233 112680.608365
46+ 75299.760192 100156.626506

steps:

  1. Filter out invalid income data, removing users with income ≤ 0.
  2. Group users by age group and gender, compute the average income for each group, and use .unstack() to pivot gender into columns with age groups as rows.

Analysis/Recommendations:

  1. Average reported income peaks in the 26-35 group and declines for both genders in the older groups.
  2. The lowest averages occur in the 46+ group, possibly reflecting retirement or reduced working hours.
  3. Allow users to filter by income range & age group to improve match quality.
  4. Provide financial compatibility insights for users prioritizing income in matches.

Part 4: Data Visualization¶

1. Age Distribution¶

1. Plot a histogram of age with a vertical line indicating the mean age. What does the distribution reveal about the most common age group on the platform?

In [47]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot histogram of age distribution
plt.figure(figsize=(10, 6))
sns.histplot(data["age"], bins=30, kde=True, color="red")

# Add a vertical line for the mean age
mean_age = data["age"].mean()
plt.axvline(mean_age, color="blue", linestyle="dashed", linewidth=2, label=f"Mean Age: {mean_age:.1f}")

# Labels and title
plt.xlabel("Age")
plt.ylabel("User Count")
plt.title("Age Distribution of Users")
plt.legend()
plt.show()

steps:

  1. Import the visualization libraries (matplotlib.pyplot and seaborn) and set the figure size to ensure a clear, readable plot.
  2. Plot a histogram of the age column with seaborn.histplot(), using 30 bins to group ages into intervals, then calculate the mean age with .mean().
  3. Add a vertical dashed blue line at the mean age using plt.axvline(), set the x-axis (Age) and y-axis (User Count) labels, and add the title ("Age Distribution of Users").
  4. Display the final plot using plt.show().

Analysis/Recommendations:

  1. The most common ages fall between the mid-20s and early 30s.
  2. The mean age (blue dashed line) suggests the platform leans toward a younger user base.
  3. Focus marketing efforts on millennials and Gen Z since they form the majority.
  4. Consider campaigns that attract older demographics to balance the user base.

2. How does the age distribution differ by gender? Are there age groups where one gender is more prevalent?

In [49]:
import seaborn as sns


sns.set_style("whitegrid")

plt.figure(figsize=(10, 6))
sns.histplot(data, x="age", hue="gender", bins=30, kde=True, element="step", common_norm=False)


plt.xlabel("Age")
plt.ylabel("Count")
plt.title("Age Distribution by Gender")
plt.legend(title="Gender", labels=["Male", "Female"])  # caution: verify these labels match seaborn's hue order, or the legend may be mislabeled

plt.show()

steps:

  1. Set the plot style: use Seaborn's whitegrid style to enhance readability.
  2. Use histplot to plot the distribution of ages, separated by gender, applying kernel density estimation (KDE) for a smoothed overlay and 30 bins for granularity.
  3. Set common_norm=False so each gender's distribution is plotted on its own scale rather than normalized jointly.
  4. Label the x-axis (Age) and y-axis (Count), add a title, customize the legend so the genders are clearly identified, and display the plot.

Analysis/Recommendations:

  1. The distribution appears to be roughly normal for both genders, with a concentration around young adulthood (20s and 30s).
  2. There may be slight differences in the frequency of males and females at certain age ranges.
  3. Focus on the most common age groups to tailor recommendations.
  4. Adjust strategies based on the dominant gender in key age brackets.

2. Income and Age¶

1. Use a scatterplot to visualize the relationship between income and age, with a trend line indicating overall patterns. Are older users more likely to report higher incomes?

In [56]:
# Filter out unrealistic income values (-1 likely represents missing data)
df_filtered = data[data["income"] >= 0]

# Create a scatterplot with a trend line
plt.figure(figsize=(10, 6))
sns.regplot(data=df_filtered, x="age", y="income", line_kws={"color": "orange"})

# Labels and title
plt.xlabel("Age")
plt.ylabel("Income")
plt.title("Relationship Between Income and Age")

# Show plot
plt.show()

steps:

  1. Remove missing income values (the placeholder -1 indicates missing data).
  2. Plot age on the x-axis and income on the y-axis with sns.regplot().
  3. The fitted regression line (drawn in orange) highlights the overall pattern in the relationship.
  4. Then label the x-axis as "Age" and the y-axis as "Income," and add a clear, descriptive title.

Analysis/Recommendations:

  1. The scatterplot shows a wide range of incomes across different ages, with many users reporting lower incomes.
  2. The trend line (orange) suggests a slight upward trajectory, indicating that older users tend to report higher incomes on average.
  3. Instead of raw values, consider categorizing incomes into low, middle, and high brackets for clearer insights.
  4. Investigate how education level, job type, and location impact income trends.
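One way to quantify the "slight upward trajectory" the trend line suggests is a Pearson correlation between age and income. This is a hedged sketch on a tiny synthetic stand-in for the Bumble data, not the notebook's own computation.

```python
import pandas as pd

# Synthetic stand-in; -1 marks missing income as in the Bumble data
data = pd.DataFrame({
    "age": [22, 30, 40, 50],
    "income": [-1, 40000, 60000, 80000],
})

# Drop missing incomes, then compute the Pearson correlation coefficient
valid = data[data["income"] >= 0]
corr = valid["age"].corr(valid["income"])
print(round(corr, 3))
```

A value near 0 would indicate almost no linear relationship; the real dataset likely sits far below the 1.0 this toy example produces.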

2. Create boxplots of income grouped by age_group. Which age group reports the highest median income?

In [62]:
plt.figure(figsize=(12, 6))
sns.boxplot(data=data[data['income'] > 0], x='age_group', y='income', showfliers=False)
plt.xlabel('Age Group', fontsize=14)
plt.ylabel('Income', fontsize=14)
plt.title('Distribution of Income by Age Group', fontsize=16)
plt.show()

steps:

  1. sns.boxplot creates a box plot showing the distribution of income for each age_group category.
  2. showfliers=False removes the outliers from the plot, so only the distribution of the main data points is shown.

Analysis/Recommendations:

  1. Targeted financial features: for younger users (18-25), tools for early-career budgeting, student-loan management, and savings planning may resonate.
  2. For mid-career users (26-45), where reported incomes peak, content around investment strategies (retirement planning, portfolio diversification) could be more relevant.
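The question asks which age group reports the highest median income, which the boxplot shows only visually. A direct computation could look like this sketch, using a synthetic stand-in DataFrame for illustration.

```python
import pandas as pd

# Synthetic stand-in for the Bumble data
data = pd.DataFrame({
    "age_group": ["18-25", "18-25", "26-35", "26-35"],
    "income": [20000, 40000, 80000, 100000],
})

# Median income per age group (excluding non-reported incomes)
valid = data[data["income"] > 0]
median_income = valid.groupby("age_group")["income"].median()
top_group = median_income.idxmax()  # age group with the highest median
print(median_income)
print(top_group)
```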

3. Analyze income levels within gender and status categories. For example, are single men more likely to report higher incomes than single women?

In [72]:
import matplotlib.pyplot as plt
import seaborn as sns


palette = {"m": "blue", "f": "red"}
plt.figure(figsize=(12, 6))
sns.barplot(data=data, x="status", y="income", hue="gender", errorbar=None, palette=palette)


plt.xlabel("Status", fontsize=14)
plt.ylabel("Income", fontsize=14)
plt.title("Income by Gender and Status", fontsize=16)
plt.legend(title="Gender", fontsize=12)
plt.show()

steps:

  1. data=data → Uses the DataFrame data as the source of the plot.
  2. x="status" → Sets the x-axis to relationship status (e.g., Single, In a relationship).
  3. y="income" → Sets the y-axis to income values.
  4. hue="gender" → Differentiates bars by gender (e.g., Male vs. Female).

Analysis/Recommendations:

  1. Men report higher average incomes than women across most relationship statuses.
  2. Users in some statuses (e.g., married) report relatively high incomes, possibly reflecting age and career stage.
  3. Income variation exists within each group, suggesting other influencing factors such as education and industry.

3. Pets and Preferences¶

1. Create a bar chart showing the distribution of pets categories (e.g., likes dogs, likes cats). Which preferences are most common?

In [75]:
pet_counts = data['pets'].value_counts()

plt.figure(figsize=(12, 6))
sns.barplot(x=pet_counts.index, y=pet_counts.values)
plt.xticks(rotation=90, ha='right')
plt.title('Distribution of Pet Preferences')
plt.xlabel('Pet Preference')
plt.ylabel('Count')
plt.tight_layout()
plt.show()

steps:

  1. Selected the pets column. Used value_counts() to count the occurrences of each unique pet preference category.
  2. Then,Create a Bar Plot: Initialized a figure with a specified size using plt.figure().Used sns.barplot() to create a bar chart with the counted pet preferences on the y-axis and categories on the x-axis.
  3. Then, Rotated the x-axis labels for better readability. Customize Plot Appearance and Added a title and axis labels.
  4. Used plt.tight_layout() to optimize the layout (avoiding overlapping elements).

Analysis/Recommendations:

  1. The most frequently reported pet preference is "likes dogs and likes cats" with approximately 24.7% of users indicating this, followed by "likes dogs" at about 12.1%.
  2. Some users distinguish between having pets and merely liking them, and fewer users report disliking either.
  3. Develop filters that allow users to choose combinations, such as both liking and having pets.
  4. Use this clustering of preferences to segment users. For example, those who prefer “likes dogs and likes cats” might receive different content or feature suggestions compared to those with more unique or limited preferences
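The percentages quoted in point 1 can be computed directly with value_counts(normalize=True) rather than read off the chart. A minimal sketch on a synthetic stand-in DataFrame:

```python
import pandas as pd

# Synthetic stand-in for the Bumble data
data = pd.DataFrame({
    "pets": ["likes dogs and likes cats", "likes dogs",
             "likes dogs and likes cats", "has cats"],
})

# Share of users in each pet-preference category, as percentages
pet_share = data["pets"].value_counts(normalize=True) * 100
print(pet_share.round(1))
```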

2. How do pets preferences vary across gender and age_group? Are younger users more likely to report liking pets compared to older users?

In [78]:
pet_distribution = data.groupby(['age_group', 'gender', 'pets']).size().reset_index(name='count')
pet_distribution

plt.figure(figsize=(12, 6))
sns.barplot(data=pet_distribution, x="age_group", y="count", hue="pets", errorbar=None)
# note: counts are averaged across gender here; facet by gender or sum to show totals

plt.xlabel("Age Group", fontsize=14)
plt.ylabel("Number of Users", fontsize=14)
plt.title("Pet Preferences by Age Group and Gender", fontsize=16)
plt.legend(title="Pet Preference", fontsize=8)
plt.show()

steps:

  1. Set the figure size using plt.figure(figsize=(12, 6)) for better visualization.
  2. Use sns.barplot() from Seaborn to create a bar plot
  3. x="age_group" to display different age groups on the x-axis. y="count" to show the number of users who own different pets.
  4. hue="pets" to categorize users based on their pet preferences.
  5. Label the x-axis as "Age Group" and the y-axis as "Number of Users" with font size 14, and set the title to "Pet Preferences by Age Group and Gender" with font size 16.

Analysis/Recommendations:

  1. Younger users (e.g., the 18-25 group) may show a higher affinity for both cats and dogs, whereas older groups may more often report having no pets or disliking them.
  2. Gender differences in pet preferences can also be observed; for instance, women may lean toward cats or smaller pets, while men may show a stronger preference for dogs.
  3. Pet brands can personalize advertisements based on age and gender
  4. Animal shelters can design campaigns that match pet types with the most interested age group.

4. Signs and Personality¶

1. Create a pie chart showing the distribution of zodiac signs (sign) across the platform. Which signs are most and least represented? Is this the right chart? If not, replace with right chart.

In [80]:
import matplotlib.pyplot as plt

zodiac_counts = data['sign'].value_counts()

plt.figure(figsize=(10, 6))
plt.pie(zodiac_counts, labels=zodiac_counts.index, autopct='%1.1f%%', startangle=140, colors=plt.cm.Paired.colors)

plt.title("Distribution of Zodiac Signs Among Users", fontsize=14)
plt.show()

steps:

  1. data['sign'].value_counts() calculates the frequency of each zodiac sign in the dataset.
  2. zodiac_counts as the values. labels=zodiac_counts.index to label each segment with its zodiac sign.
  3. autopct='%1.1f%%' to display percentages with one decimal point; startangle=140 to rotate the chart for better visualization.
  4. colors=plt.cm.Paired.colors for distinct colors from the Paired colormap

Analysis/Recommendations:

  1. If the data is skewed toward specific zodiac signs, consider collecting more data to balance representation.
  2. Businesses can use the insights to tailor marketing strategies based on the most represented zodiac signs.

As the chart above shows, a pie chart with twelve similar-sized slices is hard to read, so a bar chart was selected for better comparison.

In [84]:
plt.figure(figsize=(12, 8))
sns.barplot(x=zodiac_counts.index, y=zodiac_counts.values, palette="viridis")

plt.xlabel("Zodiac Sign", fontsize=14)
plt.ylabel("Number of Users", fontsize=14)
plt.title("Zodiac Sign Distribution Among Users", fontsize=16)
plt.xticks(rotation=45)
plt.show()

steps:

  1. plt.figure(figsize=(12, 8)): Creates a figure with a width of 12 inches and a height of 8 inches.
  2. Then, x=zodiac_counts.index: Zodiac signs as the x-axis labels.y=zodiac_counts.values: The count of each zodiac sign as the y-axis values
  3. plt.xticks(rotation=45): Rotates x-axis labels by 45 degrees for better readability.
  4. plt.show(): Renders and displays the bar chart.

Analysis/Recommendations:

  1. The tallest bars indicate the most common zodiac signs among users, and the shortest bars the least common.
  2. Businesses can use the most frequent zodiac signs to tailor products, services, or marketing strategies.
  3. Analyze correlations between zodiac signs and user behavior, preferences, or demographic
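The most and least represented signs can also be read off programmatically with idxmax/idxmin on the value counts. A minimal sketch on a synthetic stand-in DataFrame:

```python
import pandas as pd

# Synthetic stand-in for the Bumble data
data = pd.DataFrame({
    "sign": ["gemini", "cancer", "gemini", "pisces", "gemini", "cancer"],
})

zodiac_counts = data["sign"].value_counts()
most_common = zodiac_counts.idxmax()   # sign with the highest count
least_common = zodiac_counts.idxmin()  # sign with the lowest count
print(most_common, least_common)
```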

2. How does sign vary across gender and status? Are there noticeable patterns or imbalances?

In [87]:
zodiac_distribution = data.groupby(['sign', 'gender', 'status']).size().reset_index(name='count')

plt.figure(figsize=(14, 7))
sns.barplot(data=zodiac_distribution, x='sign', y='count', hue='gender', errorbar=None, palette='pastel')
# note: counts are averaged across status categories here; use estimator=sum for totals

plt.xlabel("Zodiac Sign", fontsize=14)
plt.ylabel("Number of Users", fontsize=14)
plt.title("Zodiac Sign Distribution Across Gender", fontsize=16)
plt.xticks(rotation=90)
plt.legend(title="Gender")

plt.show()

Steps:

  1. Groups data by zodiac sign, gender, and status.Computes the count of each unique combination. Then, Converts the grouped data into a DataFrame with a new column, 'count'.
  2. plt.figure(figsize=(14, 7)): Creates a figure with a width of 14 inches and a height of 7 inches.
  3. plt.xticks(rotation=90): Rotates x-axis labels by 90 degrees for readability and plt.legend(title="Gender"): Adds a legend with the title "Gender".

Analysis/Recommendations:

  1. Each zodiac sign has two or more bars depending on the number of genders in the dataset.
  2. The height of each bar indicates the number of users with that zodiac sign and gender.
  3. A significant gender difference for certain zodiac signs may indicate demographic trends or preferences.
  4. If a specific gender dominates certain zodiac signs, businesses can tailor content, products, or marketing strategies accordingly.
In [89]:
sign_across_status = data.groupby(['sign', 'status']).size().reset_index(name='count')


plt.figure(figsize=(14,8))
sns.barplot(data=sign_across_status, x='sign', y='count', hue='status')

plt.title('Distribution of Zodiac Signs across Status', fontsize=14)
plt.xlabel('Zodiac Sign', fontsize=14)
plt.ylabel('Number of Users', fontsize=14)
plt.xticks(rotation=90)  
plt.grid(True, axis= 'y', linestyle = 'dashed', alpha = 0.5)
plt.show()

steps:

  1. Group the data by sign and status, compute the count of each combination, and store the result in sign_across_status.
  2. The groupby function categorizes the data based on the two columns (sign and status).
  3. size() computes the count for each group, and reset_index(name='count') converts the grouped result into a structured DataFrame.
  4. plt.xticks(rotation=90) rotates x-axis labels for better readability.
  5. Dashed horizontal grid lines (plt.grid(True, axis='y', linestyle='dashed', alpha=0.5)) improve clarity.

Analysis/Recommendations:

  1. Some Zodiac signs might have significantly more users in certain status categories.
  2. The distribution might show biases or trends (e.g., certain signs having more active users or a specific relationship status).
  3. Check for external factors like cultural influences or demographics.
  4. Investigate why some signs have different distributions.
In [92]:
zodiac_pivot = data.groupby(['sign', 'status']).size().unstack().fillna(0)
zodiac_pivot.plot(kind='bar', stacked=True, figsize=(14, 7), colormap="viridis")

plt.xlabel("Zodiac Sign", fontsize=14)
plt.ylabel("Number of Users", fontsize=14)
plt.title("Zodiac Sign Distribution Across Relationship Status", fontsize=16)
plt.xticks(rotation=90)
plt.legend(title="Relationship Status")
plt.show()

steps:

  1. We first group by the two columns sign and status.
  2. size() calculates the count of users for each combination.unstack() reshapes the data so that status categories become separate columns.fillna(0) replaces missing values (if any) with zero.

Analysis/Recommendations:

  1. Some Zodiac signs may have more users in a particular status than others.
  2. It visually shows how many users fall into each status category for every sign.
  3. Identify which Zodiac signs have a higher single status and target them with matchmaking services.
  4. Personalize recommendations based on sign-related preferences.
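Recommendation 3 (finding signs with a higher share of single users) can be sketched with a row-normalized crosstab instead of raw counts; the DataFrame below is a synthetic stand-in for the Bumble data.

```python
import pandas as pd

# Synthetic stand-in for the Bumble data
data = pd.DataFrame({
    "sign": ["gemini", "gemini", "cancer", "cancer"],
    "status": ["single", "available", "single", "single"],
})

# Share of each status within each sign (rows sum to 1.0)
status_share = pd.crosstab(data["sign"], data["status"], normalize="index")
print(status_share)
```

Sorting the "single" column of this table would rank signs by their proportion of single users, which is more comparable across signs than absolute counts.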