%pip install pandas

Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (2.2.2)
Requirement already satisfied: numpy>=1.22.4 in /usr/local/lib/python3.10/dist-packages (from pandas) (1.26.4)
Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2024.2)
Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas) (2024.2)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

import pandas as pd

# Provide the full path to your dataset
dataset_path = '/content/drive/MyDrive/Grad Project Dataset/bumble (1).csv'
df = pd.read_csv(dataset_path)

# Display the first few rows of the dataframe
print(df.head())

   age     status gender       body_type               diet    drinks  \
0   22     single      m  a little extra  strictly anything  socially   
1   35     single      m         average       mostly other     often   
2   38  available      m            thin           anything  socially   
3   23     single      m            thin         vegetarian  socially   
4   29     single      m        athletic                NaN  socially   

                           education            ethnicity  height  income  \
0      working on college/university         asian, white    75.0      -1   
1              working on space camp                white    70.0   80000   
2     graduated from masters program                  NaN    68.0      -1   
3      working on college/university                white    71.0   20000   
4  graduated from college/university  asian, black, other    66.0      -1   

                           job       last_online  \
0               transportation  2012-06-28-20-30   
1         hospitality / travel  2012-06-29-21-41   
2                          NaN  2012-06-27-09-10   
3                      student  2012-06-28-14-22   
4  artistic / musical / writer  2012-06-27-21-26   

                          location                       pets  \
0  south san francisco, california  likes dogs and likes cats   
1              oakland, california  likes dogs and likes cats   
2        san francisco, california                   has cats   
3             berkeley, california                 likes cats   
4        san francisco, california  likes dogs and likes cats   

                                   religion  \
0     agnosticism and very serious about it   
1  agnosticism but not too serious about it   
2                                       NaN   
3                                       NaN   
4                                       NaN   

                                 sign  \
0                              gemini   
1                              cancer   
2  pisces but it doesn&rsquo;t matter   
3                              pisces   
4                            aquarius   

                                              speaks  
0                                            english  
1  english (fluently), spanish (poorly), french (...  
2                               english, french, c++  
3                           english, german (poorly)  
4                                            english

df= df.drop_duplicates()
df

missing_values = df.isnull().sum()
missing_values

# Get the index of null values in the 'height' column
null_indexes = df[df['height'].isnull()].index
print(null_indexes)

Index([36428, 54002, 58983], dtype='int64')

percentage_of_missing_values = ((missing_values)/ len(df)) * 100
percentage_of_missing_values

missing_data = pd.DataFrame({ 'Missing Values': missing_values,'Percentage Missing': percentage_of_missing_values})
missing_data

calculate_median = df.groupby(["gender"])["height"].transform("median")
df["height"] = df["height"].fillna(calculate_median)
print(calculate_median)

0        70.0
1        70.0
2        70.0
3        70.0
4        70.0
         ... 
59941    65.0
59942    70.0
59943    70.0
59944    70.0
59945    70.0
Name: height, Length: 59946, dtype: float64

df.dtypes

df["last_online"]= pd.to_datetime(df["last_online"] , format= "%Y-%m-%d-%H-%M")
df["last_online"]

df.dtypes

column_names = ["income", "age", "height"]
for columns in column_names:
     Q1 = df[columns].quantile(0.25)
     Q3 = df[columns].quantile(0.75)
     iqr = Q3-Q1
     lower_bound = Q1-1.5*iqr
     upper_bound = Q3+1.5*iqr
     outlier = (df[columns] <= lower_bound) | (df[columns] >= upper_bound)
     outlier_values = df[outlier]
     range_value = df[columns].max() - df[columns].min()
     print(f"\n Outliers in {columns}")
     print(f"Range of {columns}: Min = {df[columns].min()}, Max = {df[columns].max()}, Range = {range_value}")
     if columns == "age":
          outlier_values = outlier_values.sort_values(by="age", ascending=True)
     elif columns == "income":
          outlier_values = outlier_values.sort_values(by="income", ascending=False)
     print(outlier_values[columns])

 Outliers in income
Range of income: Min = -1, Max = 1000000, Range = 1000001
37173    1000000
37884    1000000
39222    1000000
55335    1000000
57466    1000000
          ...   
22164         -1
22165         -1
22166         -1
22167         -1
59945         -1
Name: income, Length: 59946, dtype: int64

 Outliers in age
Range of age: Min = 18, Max = 110, Range = 92
40999     54
5394      54
14705     54
52394     54
8675      54
        ... 
52644     69
28700     69
6448      69
25324    109
2512     110
Name: age, Length: 2638, dtype: int64

 Outliers in height
Range of height: Min = 1.0, Max = 95.0, Range = 94.0
189      79.0
243      80.0
280      80.0
402      91.0
433      79.0
         ... 
58147    94.0
58286    84.0
59038    79.0
59067    79.0
59697    79.0
Name: height, Length: 285, dtype: float64

columns = df[["income", "age", "height"]]
columns.replace(-1, 0, inplace= True)
print(columns)

       income  age  height
0           0   22    75.0
1       80000   35    70.0
2           0   38    68.0
3       20000   23    71.0
4           0   29    66.0
...       ...  ...     ...
59941       0   59    62.0
59942       0   24    72.0
59943  100000   42    71.0
59944       0   27    73.0
59945       0   39    68.0

[59946 rows x 3 columns]

<ipython-input-16-fa4e58afb01b>:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  columns.replace(-1, 0, inplace= True)

column_names = ["income", "age", "height"]
for columns in column_names:
    Q1 = df[columns].quantile(0.10)
    Q3 = df[columns].quantile(0.90)
    filtered_values = df[(df[columns] > Q1) & (df[columns] < Q3)]
    mean_values = filtered_values[columns].mean()
    median_values = filtered_values[columns].median()
    print(f"\n {columns}: filtered_median: {median_values}, filtered_mean: {mean_values}")

 income: filtered_median: 20000.0, filtered_mean: 26109.89010989011

 age: filtered_median: 30.0, filtered_mean: 31.357685563997663

 height: filtered_median: 68.0, filtered_mean: 68.25442646465552

missing_values = df.isnull().sum()
print(missing_values)
plt.figure(figsize=(12, 8))
sns.heatmap(df.isnull(), cbar = False, cmap = "inferno")
plt.title('Heatmap of Missing Data')

age                0
status             0
gender             0
body_type       5296
diet           24395
drinks          2985
education       6628
ethnicity       5680
height             0
income             0
job             8198
last_online        0
location           0
pets           19921
religion       20226
sign           11056
speaks            50
dtype: int64

Text(0.5, 1.0, 'Heatmap of Missing Data')

bins = [18,25, 35, 45, 110]
label = ["18-25", "26-35", "36-45","46+"]
df["age_group"] = pd.cut(df["age"], bins, labels= label)
df["age_group"]
sns.countplot(x='age_group', data=df, palette="mako")
plt.title("Distribution of Users Across Age Groups")

<ipython-input-19-bf52aafc422b>:5: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x='age_group', data=df, palette="mako")

Text(0.5, 1.0, 'Distribution of Users Across Age Groups')

# Group income into categories like "Low Income," "Medium Income," and "High Income" based on meaningful thresholds (e.g., quartiles).
# What insights can be derived from these groups?
Q1 = df["income"].quantile(0.25)
Q2 = df["income"].quantile(0.50)
Q3 = df["income"].quantile(0.90)
df["Income_Category"] = df["income"].apply(lambda x:"Low Income" if x <= Q1 else ("Medium Income" if Q1 < x < Q3 else "High Income"))
print(df[["income", "Income_Category"]].head(10))

   income Income_Category
0      -1      Low Income
1   80000     High Income
2      -1      Low Income
3   20000   Medium Income
4      -1      Low Income
5      -1      Low Income
6      -1      Low Income
7      -1      Low Income
8      -1      Low Income
9      -1      Low Income

df

df["profile_completeness"] = df.notnull().sum(axis=1) / df.shape[1] * 100
print(df["profile_completeness"])
print(df.groupby("status")["profile_completeness"].mean())

0        100.000000
1        100.000000
2         84.210526
3         94.736842
4         89.473684
            ...    
59941     84.210526
59942    100.000000
59943     94.736842
59944    100.000000
59945     94.736842
Name: profile_completeness, Length: 59946, dtype: float64
status
available         91.364470
married           90.747029
seeing someone    91.115871
single            90.774803
unknown           84.210526
Name: profile_completeness, dtype: float64

df["height_cm"] = df["height"] * 2.54
print(df[["height" , "height_cm"]])

       height  height_cm
0        75.0     190.50
1        70.0     177.80
2        68.0     172.72
3        71.0     180.34
4        66.0     167.64
...       ...        ...
59941    62.0     157.48
59942    72.0     182.88
59943    71.0     180.34
59944    73.0     185.42
59945    68.0     172.72

[59946 rows x 2 columns]

gender_distribution = df.groupby("gender")["gender"].count().reset_index(name = "count")
print(gender_distribution)
gender_distribution.plot.bar(x="gender", color = "lightblue")
plt.xticks(rotation = 0)
plt.title("Distribution of gender across the platform")

  gender  count
0      f  24117
1      m  35829

Text(0.5, 1.0, 'Distribution of gender across the platform')

status_proportions = df.groupby("status")["status"].count()/len(df["status"]) * 100
status_proportions_df = status_proportions.reset_index(name =  "Proportion in %")
status_proportions_df = status_proportions_df.sort_values(by = "Proportion in %", ascending = False)
print(status_proportions_df)
status_proportions.plot.bar(x="status", color = "blue")
plt.ylabel("Proportion in %")
plt.xticks(rotation = 0)
plt.title("Distribution of status across the platform")

           status  Proportion in %
3          single        92.911954
2  seeing someone         3.443099
0       available         3.111133
1         married         0.517132
4         unknown         0.016682

Text(0.5, 1.0, 'Distribution of status across the platform')

status_by_gender = df.groupby(["status", "gender"])["status"].count()/len(df["status"]) * 100
status_by_gender_df = status_by_gender.reset_index(name =  "Proportion in %")
status_by_gender_df = status_by_gender_df.sort_values(by = "Proportion in %", ascending = False)
print(status_by_gender_df)
plt.figure(figsize=(10, 6))
sns.relplot(status_by_gender_df, x= "status", y = "Proportion in %", col = "gender", kind = "scatter")

           status gender  Proportion in %
7          single      m        55.680112
6          single      f        37.231842
1       available      m         2.016815
5  seeing someone      m         1.769926
4  seeing someone      f         1.673173
0       available      f         1.094318
3         married      m         0.291929
2         married      f         0.225203
9         unknown      m         0.010009
8         unknown      f         0.006673

<seaborn.axisgrid.FacetGrid at 0x783e81f64640>

<Figure size 1000x600 with 0 Axes>

correlation_columns = df[["age", "income", "height"]]
correlation_data = correlation_columns.corr()
print(correlation_data)
plt.figure(figsize=(10, 6))
sns.heatmap(correlation_data, annot=True, fmt=".2f", cmap="Blues", cbar=True)
plt.title("Correlation Heatmap")
plt.show()

             age    income    height
age     1.000000 -0.001004 -0.022253
income -0.001004  1.000000  0.065048
height -0.022253  0.065048  1.000000

correlation = df["age"].corr(df["income"])
print(correlation)

-0.0010038398754361403

dietary_preferences =  df.groupby("diet")["diet"].count() / len(df["diet"]) * 100
print(dietary_preferences)

diet
anything               10.314283
halal                   0.018350
kosher                  0.018350
mostly anything        27.666567
mostly halal            0.080072
mostly kosher           0.143462
mostly other            1.679845
mostly vegan            0.563841
mostly vegetarian       5.745171
other                   0.552164
strictly anything       8.529343
strictly halal          0.030027
strictly kosher         0.030027
strictly other          0.754012
strictly vegan          0.380342
strictly vegetarian     1.459647
vegan                   0.226871
vegetarian              1.112668
Name: diet, dtype: float64

drinking_habits = df.groupby("diet")["drinks"].count()/len(df["drinks"]) * 100
drinking_habits_df = drinking_habits.reset_index(name =  "drinks in %")
drinking_habits_df = drinking_habits_df.sort_values(by = "drinks in %", ascending = False)
print(drinking_habits_df)

                   diet  drinks in %
3       mostly anything    27.086044
0              anything     9.995663
10    strictly anything     8.319154
8     mostly vegetarian     5.539986
6          mostly other     1.626464
15  strictly vegetarian     1.416275
17           vegetarian     1.037601
13       strictly other     0.720649
7          mostly vegan     0.537150
9                 other     0.520468
14       strictly vegan     0.365329
16                vegan     0.211857
5         mostly kosher     0.140126
4          mostly halal     0.071731
11       strictly halal     0.030027
12      strictly kosher     0.030027
2                kosher     0.018350
1                 halal     0.015014

df[["city", "state"]] = df["location"].str.split(",", expand = True, n= 1)
df["city"] = df["city"].str.strip()
df["state"] = df["state"].str.strip()
top_cities = df["city"].value_counts(ascending = False).head(5)
top_states = df["state"].value_counts(ascending = False).head(5)
print(f"Top 5 cities with highest number of users\n {top_cities}" )
print(f"Top 5 states with highest number of users\n {top_states}")

Top 5 cities with highest number of users
 city
san francisco    31064
oakland           7214
berkeley          4212
san mateo         1331
palo alto         1064
Name: count, dtype: int64
Top 5 states with highest number of users
 state
california       59855
new york            17
illinois             8
massachusetts        5
texas                4
Name: count, dtype: int64

top_cities = df["city"].value_counts(ascending = False).head(5).index
top_cities_df = df[df["city"].isin(top_cities)]
age_cities = top_cities_df.groupby("city")["age"].mean().sort_values(ascending=False)
print(age_cities)

city
san mateo        33.437265
oakland          33.178819
palo alto        31.980263
san francisco    31.614312
berkeley         31.391738
Name: age, dtype: float64

top_states = df["state"].value_counts(ascending = False).head(5).index
top_states_df = df[df["state"].isin(top_states)]
income_states = top_states_df.groupby("state")["income"].mean().sort_values(ascending=False)
print(income_states)

state
new york         31763.823529
california       20043.465609
massachusetts     5999.200000
texas             4999.250000
illinois            -1.000000
Name: income, dtype: float64

height_gender = df.groupby("gender")["height"].mean()
print(height_gender)

gender
f    65.103869
m    70.443468
Name: height, dtype: float64

height_age = df.groupby("age_group")["height"].mean()
print(height_age)

age_group
18-25    68.224532
26-35    68.406764
36-45    68.325095
46+      67.941167
Name: height, dtype: float64

<ipython-input-35-12c16651a4a6>:1: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  height_age = df.groupby("age_group")["height"].mean()

income_excluding_zero = df[df["income"] > 0]
income_excluding_zero["income"].value_counts().sort_values(ascending = False)
plt.figure(figsize=(12,6))
sns.countplot(income_excluding_zero , x = "income")

<Axes: xlabel='income', ylabel='count'>

plt.hist(df["age"], color = "Grey", edgecolor= "black")
mean_age = df["age"].mean()
plt.axvline(mean_age, color= "red", label= "mean age")
plt.xlabel("age_group")
plt.title("Distribution of age across the platform")
plt.show()

plt.figure(figsize=(10, 4))
sns.histplot(data=df, x="age", hue="gender", palette = "mako")
plt.title("Age distribution differ by gender")

Text(0.5, 1.0, 'Age distribution differ by gender')

sns.regplot(data= df[df["income"] > 0], x="age", y="income", scatter=True, line_kws={"color": "red"})
plt.ticklabel_format(style='plain')
plt.xlabel("Age")
plt.ylabel("Income")
plt.yticks(np.arange(20000,1000000,step=100000))
plt.title("Scatterplot of Income vs. Age with Trend Line")

Text(0.5, 1.0, 'Scatterplot of Income vs. Age with Trend Line')

data_to_plot = df[df["income"] >0]
a=sns.boxplot(data_to_plot, x="age_group", y="income")
a.set(ylim=(20000,1000000))
y_ticks = np.arange(20000, 1000000, 200000)  # Start, stop, step
plt.yticks(y_ticks)
plt.title("Income vs age_group")

Text(0.5, 1.0, 'Income vs age_group')

income_gender_status = df.groupby(["gender", "status"])["income"].mean().reset_index()
sns.barplot(data=income_gender_status, x="status", y="income", hue="gender")
plt.title("Income levels with gender and status categories.")

Text(0.5, 1.0, 'Income levels with gender and status categories.')

plt.figure(figsize=(12,6))
sns.countplot(data= df , x = "pets")
plt.xticks(rotation = 90)
plt.title("Distribution of pet categories.")

Text(0.5, 1.0, 'Distribution of pet categories.')

pets_gender_age = df.groupby(["gender", "age_group", "pets"]).size().reset_index(name="count")
sns.relplot(data=pets_gender_age, x="age_group", y="count", hue="pets", col="gender", kind="scatter", palette = "Paired")
plt.figure(figsize=(12, 6))

<ipython-input-43-7a4db60f4f76>:1: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  pets_gender_age = df.groupby(["gender", "age_group", "pets"]).size().reset_index(name="count")

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

zodiac_counts = df["sign"].value_counts()
zodiac_counts.plot(kind='pie')
plt.title("Distribution of Zodiac Signs")

Text(0.5, 1.0, 'Distribution of Zodiac Signs')

plt.figure(figsize=(12,6))
sns.countplot(data= df , x = "sign")
plt.xticks(rotation = 90)
plt.title("Distribution of Zodiac Signs")

Text(0.5, 1.0, 'Distribution of Zodiac Signs')

gender_status = df.groupby(["gender", "status", "sign"])["status"].count().reset_index(name="count")
stacked_data = gender_status.pivot_table(index="sign", columns=["gender", "status"], values="count", aggfunc="sum")
stacked_data.plot(kind="bar", stacked=True, figsize=(20, 8), colormap="Paired")
plt.title("Distribution of Signs by Gender and Status")
plt.ylabel("Count")
plt.xlabel("Sign")
plt.xticks(rotation=90)
plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')

<matplotlib.legend.Legend at 0x783e79356020>

	last_online
0	2012-06-28 20:30:00
1	2012-06-29 21:41:00
2	2012-06-27 09:10:00
3	2012-06-28 14:22:00
4	2012-06-27 21:26:00
...	...
59941	2012-06-12 21:47:00
59942	2012-06-29 11:01:00
59943	2012-06-27 23:37:00
59944	2012-06-23 13:01:00
59945	2012-06-29 00:42:00

Introduction¶

Part 1: Data Cleaning¶

(II) Data Types¶

III) Outliers¶

IV) Missing Data Visualization¶

Part 2: Data Processing¶

(II) Derived Features¶

(III) Unit Conversion¶

Part 3: Data Analysis¶

(II) Correlation Analysis¶

(III) Diet and Lifestyle Analysis¶

(IV) Geographical Insights¶

(V) Height Analysis¶

(VI) Income Analysis¶

Part 4 : Data Visualization¶

(III) Pets and Preferences¶

(IV) Signs and Personality¶

Summary of Analysis:¶

Recommendations:¶

	age	status	gender	body_type	diet	drinks	education	ethnicity	height	income	job	last_online	location	pets	religion	sign	speaks
0	22	single	m	a little extra	strictly anything	socially	working on college/university	asian, white	75.0	-1	transportation	2012-06-28-20-30	south san francisco, california	likes dogs and likes cats	agnosticism and very serious about it	gemini	english
1	35	single	m	average	mostly other	often	working on space camp	white	70.0	80000	hospitality / travel	2012-06-29-21-41	oakland, california	likes dogs and likes cats	agnosticism but not too serious about it	cancer	english (fluently), spanish (poorly), french (...
2	38	available	m	thin	anything	socially	graduated from masters program	NaN	68.0	-1	NaN	2012-06-27-09-10	san francisco, california	has cats	NaN	pisces but it doesn’t matter	english, french, c++
3	23	single	m	thin	vegetarian	socially	working on college/university	white	71.0	20000	student	2012-06-28-14-22	berkeley, california	likes cats	NaN	pisces	english, german (poorly)
4	29	single	m	athletic	NaN	socially	graduated from college/university	asian, black, other	66.0	-1	artistic / musical / writer	2012-06-27-21-26	san francisco, california	likes dogs and likes cats	NaN	aquarius	english
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
59941	59	single	f	NaN	NaN	socially	graduated from college/university	NaN	62.0	-1	sales / marketing / biz dev	2012-06-12-21-47	oakland, california	has dogs	catholicism but not too serious about it	cancer and it’s fun to think about	english
59942	24	single	m	fit	mostly anything	often	working on college/university	white, other	72.0	-1	entertainment / media	2012-06-29-11-01	san francisco, california	likes dogs and likes cats	agnosticism	leo but it doesn’t matter	english (fluently)
59943	42	single	m	average	mostly anything	not at all	graduated from masters program	asian	71.0	100000	construction / craftsmanship	2012-06-27-23-37	south san francisco, california	NaN	christianity but not too serious about it	sagittarius but it doesn’t matter	english (fluently)
59944	27	single	m	athletic	mostly anything	socially	working on college/university	asian, black	73.0	-1	medicine / health	2012-06-23-13-01	san francisco, california	likes dogs and likes cats	agnosticism but not too serious about it	leo and it’s fun to think about	english (fluently), spanish (poorly), chinese ...
59945	39	single	m	average	NaN	socially	graduated from masters program	white	68.0	-1	medicine / health	2012-06-29-00-42	san francisco, california	likes dogs and likes cats	catholicism and laughing about it	gemini and it’s fun to think about	english

	0
age	0
status	0
gender	0
body_type	5296
diet	24395
drinks	2985
education	6628
ethnicity	5680
height	3
income	0
job	8198
last_online	0
location	0
pets	19921
religion	20226
sign	11056
speaks	50

	0
age	0.000000
status	0.000000
gender	0.000000
body_type	8.834618
diet	40.694959
drinks	4.979482
education	11.056618
ethnicity	9.475194
height	0.005005
income	0.000000
job	13.675641
last_online	0.000000
location	0.000000
pets	33.231575
religion	33.740366
sign	18.443266
speaks	0.083408

	0
age	int64
status	object
gender	object
body_type	object
diet	object
drinks	object
education	object
ethnicity	object
height	float64
income	int64
job	object
last_online	object
location	object
pets	object
religion	object
sign	object
speaks	object