Understanding Bumble Users: Data-Backed Insights¶
Introduction¶
This project analyzes user data from Bumble, a well-known dating app, to uncover insights about its users. By exploring the information users share in their profiles (demographics, lifestyle habits, and preferences), we aim to better understand user behavior and trends. These insights can help Bumble's product and marketing teams make informed decisions to improve user engagement, fine-tune the matchmaking algorithms, and offer personalized features. The ultimate goal is a more satisfying experience for Bumble users and continued growth for the platform.
Goal
Analyze the Bumble dataset to answer key business and user-behavior questions.¶
Columns Overview:¶
Demographics: age, status, gender
Physical Attributes: body_type, height
Lifestyle Preferences: diet, drinks
Educational & Financial Details: education, income
Location & Activity: location, last_online
Additional Details: pets, religion, sign, speaks
Dataset Summary¶
Name of Dataset: Bumble User Profiles
Number of Rows: 59,946
Number of Columns: 17
Numerical Columns: age, height, income
Categorical Columns: status, gender, body_type, diet, drinks, education, ethnicity, job, last_online, location, pets, religion, sign, speaks
Description: This dataset contains user-generated profiles from Bumble, a dating platform. It includes demographic details, physical attributes, lifestyle choices, education, income, and activity status. The data helps analyze user behavior, preferences, and matchmaking trends.
Columns Information¶
age: User’s age (numeric)
status: Relationship status (e.g., single, married, seeing someone)
gender: Gender of the user (e.g., m, f)
body_type: Description of physical appearance (e.g., athletic, curvy, thin)
diet: Dietary preference (e.g., vegetarian, vegan, anything)
drinks: Drinking habits (e.g., socially, often)
education: Level of education (e.g., college, masters)
ethnicity: Ethnic background of the user
height: Height of the user (in inches, numeric)
income: User-reported annual income (numeric)
job: User’s occupation or field of work
last_online: Last active timestamp (to analyze user activity trends)
location: City and state where the user resides
pets: User’s preference or ownership of pets (e.g., likes dogs, has cats)
religion: Religious beliefs of the user
sign: User’s zodiac sign (e.g., Aries, Leo)
speaks: Languages spoken by the user
Key Metrics to Focus On¶
Demographic Analysis: Understanding user distribution by gender, location, and relationship status.
Behavioral Insights: Examining drinking, diet, and pet preferences to identify compatibility trends.
Income and Age Trends: Correlating income levels with age and education.
Activity Patterns: Analyzing when users were last online to gauge engagement.
Regional Trends: Identifying top cities with active users and their characteristics.
Importing the libraries¶
# Importing necessary libraries
import pandas as pd # For data manipulation and analysis
import numpy as np # For numerical computations
import seaborn as sns # For advanced data visualization
import matplotlib.pyplot as plt # For creating static and interactive plots
Loading the Dataset¶
# Step 1: Import necessary libraries
import gdown
import pandas as pd

# Step 2: Define the file ID and build the download URL
file_id = "1v2_R5eIcgmv02XjhKfj1581oAhzZdLnU"
download_url = f"https://drive.google.com/uc?id={file_id}"

# Step 3: Set the output file name
output_file = "spinny_dataset.csv"

# Step 4: Download the file
gdown.download(download_url, output_file, quiet=False)

# Step 5: Load the CSV file into a pandas DataFrame
data = pd.read_csv(output_file)
Display the first 5 rows
data.head(5)
age | status | gender | body_type | diet | drinks | education | ethnicity | height | income | job | last_online | location | pets | religion | sign | speaks | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | single | m | a little extra | strictly anything | socially | working on college/university | asian, white | 75.0 | -1 | transportation | 2012-06-28-20-30 | south san francisco, california | likes dogs and likes cats | agnosticism and very serious about it | gemini | english |
1 | 35 | single | m | average | mostly other | often | working on space camp | white | 70.0 | 80000 | hospitality / travel | 2012-06-29-21-41 | oakland, california | likes dogs and likes cats | agnosticism but not too serious about it | cancer | english (fluently), spanish (poorly), french (... |
2 | 38 | available | m | thin | anything | socially | graduated from masters program | NaN | 68.0 | -1 | NaN | 2012-06-27-09-10 | san francisco, california | has cats | NaN | pisces but it doesn’t matter | english, french, c++ |
3 | 23 | single | m | thin | vegetarian | socially | working on college/university | white | 71.0 | 20000 | student | 2012-06-28-14-22 | berkeley, california | likes cats | NaN | pisces | english, german (poorly) |
4 | 29 | single | m | athletic | NaN | socially | graduated from college/university | asian, black, other | 66.0 | -1 | artistic / musical / writer | 2012-06-27-21-26 | san francisco, california | likes dogs and likes cats | NaN | aquarius | english |
Data Cleaning¶
1.Inspecting Missing Data¶
1.Which columns in the dataset have missing values, and what percentage of data is missing in each column?
data = pd.read_csv(output_file)
missing_percentage = data.isnull().mean() * 100
missing_percentage
column | missing % |
---|---|
age | 0.000000 |
status | 0.000000 |
gender | 0.000000 |
body_type | 8.834618 |
diet | 40.694959 |
drinks | 4.979482 |
education | 11.056618 |
ethnicity | 9.475194 |
height | 0.005005 |
income | 0.000000 |
job | 13.675641 |
last_online | 0.000000 |
location | 0.000000 |
pets | 33.231575 |
religion | 33.740366 |
sign | 18.443266 |
speaks | 0.083408 |
- Conclusion: diet has the most missing data (40.7%), followed by religion (33.7%), pets (33.2%), sign (18.4%), job (13.7%), education (11.1%), ethnicity (9.5%), body_type (8.8%), and drinks (5.0%). height and speaks are missing in under 0.1% of rows, and the remaining columns are complete.
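The same percentage computation can be sketched on a toy frame (the column names and values here are hypothetical, chosen only to mirror the pattern above):

```python
import pandas as pd

# Toy stand-in for the profile data (hypothetical values)
demo = pd.DataFrame({
    "age":    [22, 35, 38],
    "diet":   ["vegan", None, None],
    "drinks": ["socially", "often", None],
})

# Share of missing values per column, worst first
missing_pct = demo.isnull().mean().mul(100).sort_values(ascending=False)
print(missing_pct)
```

Sorting descending puts the most incomplete fields first, which is how the table above is easiest to read.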
2.Are there columns where more than 50% of the data is missing? Drop those columns where missing values are >50%
data = data.dropna(thresh=len(data) * 0.5, axis=1)
data
age | status | gender | body_type | diet | drinks | education | ethnicity | height | income | job | last_online | location | pets | religion | sign | speaks | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | single | m | a little extra | strictly anything | socially | working on college/university | asian, white | 75.0 | -1 | transportation | 2012-06-28-20-30 | south san francisco, california | likes dogs and likes cats | agnosticism and very serious about it | gemini | english |
1 | 35 | single | m | average | mostly other | often | working on space camp | white | 70.0 | 80000 | hospitality / travel | 2012-06-29-21-41 | oakland, california | likes dogs and likes cats | agnosticism but not too serious about it | cancer | english (fluently), spanish (poorly), french (... |
2 | 38 | available | m | thin | anything | socially | graduated from masters program | NaN | 68.0 | -1 | NaN | 2012-06-27-09-10 | san francisco, california | has cats | NaN | pisces but it doesn’t matter | english, french, c++ |
3 | 23 | single | m | thin | vegetarian | socially | working on college/university | white | 71.0 | 20000 | student | 2012-06-28-14-22 | berkeley, california | likes cats | NaN | pisces | english, german (poorly) |
4 | 29 | single | m | athletic | NaN | socially | graduated from college/university | asian, black, other | 66.0 | -1 | artistic / musical / writer | 2012-06-27-21-26 | san francisco, california | likes dogs and likes cats | NaN | aquarius | english |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
59941 | 59 | single | f | NaN | NaN | socially | graduated from college/university | NaN | 62.0 | -1 | sales / marketing / biz dev | 2012-06-12-21-47 | oakland, california | has dogs | catholicism but not too serious about it | cancer and it’s fun to think about | english |
59942 | 24 | single | m | fit | mostly anything | often | working on college/university | white, other | 72.0 | -1 | entertainment / media | 2012-06-29-11-01 | san francisco, california | likes dogs and likes cats | agnosticism | leo but it doesn’t matter | english (fluently) |
59943 | 42 | single | m | average | mostly anything | not at all | graduated from masters program | asian | 71.0 | 100000 | construction / craftsmanship | 2012-06-27-23-37 | south san francisco, california | NaN | christianity but not too serious about it | sagittarius but it doesn’t matter | english (fluently) |
59944 | 27 | single | m | athletic | mostly anything | socially | working on college/university | asian, black | 73.0 | -1 | medicine / health | 2012-06-23-13-01 | san francisco, california | likes dogs and likes cats | agnosticism but not too serious about it | leo and it’s fun to think about | english (fluently), spanish (poorly), chinese ... |
59945 | 39 | single | m | average | NaN | socially | graduated from masters program | white | 68.0 | -1 | medicine / health | 2012-06-29-00-42 | san francisco, california | likes dogs and likes cats | catholicism and laughing about it | gemini and it’s fun to think about | english |
59946 rows × 17 columns
- Conclusion: no column exceeds 50% missing values, so nothing was dropped and all 17 columns remain.
Create a working copy of the dataset as 'df'
df = data.copy()
# Confirmation by printing the first few rows of the copied dataset
print("Copy of the dataset has been created with the name 'df'")
Copy of the dataset has been created with the name 'df'
3.Missing numerical data (e.g., height, income) should be handled by imputing the median value of height and income for the corresponding category, such as gender, age group, or location. This ensures that the imputed values are contextually relevant and reduce potential biases in the analysis.
for column in ['height', 'income']:
df[column] = df[column].replace(-1, np.nan)
df[column] = df.groupby(['gender'])[column].transform(lambda x: x.fillna(x.median()))
print(df)
(output truncated: the full 59,946 × 17 DataFrame, with the former -1 incomes now showing the per-gender medians, 60000.0 for male profiles and 40000.0 for female profiles)
Conclusion: the -1 sentinels in height and income were first converted to NaN with df[column].replace(-1, np.nan), and each missing value was then filled with the median of its gender group via df.groupby(['gender'])[column].transform(lambda x: x.fillna(x.median())).
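The two-step recipe (sentinel to NaN, then a per-group median fill) can be checked on a toy frame; the values below are hypothetical, chosen so each gender group has exactly one missing income:

```python
import pandas as pd
import numpy as np

# Small frame mimicking the -1 sentinel pattern (hypothetical values)
toy = pd.DataFrame({
    "gender": ["m", "m", "f", "f"],
    "income": [-1, 80000, 40000, -1],
})

# Step 1: treat the sentinel as missing
toy["income"] = toy["income"].replace(-1, np.nan)
# Step 2: fill each gap with the median of its own gender group
toy["income"] = toy.groupby("gender")["income"].transform(lambda x: x.fillna(x.median()))

print(toy["income"].tolist())  # [80000.0, 80000.0, 40000.0, 40000.0]
```

Each missing value picks up its own group's median rather than a global one, which is what keeps the imputation contextually relevant.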
2. Data Types¶
Accurate data types are critical for meaningful analysis and visualization. For example, numeric fields like income or height must be stored as numbers for statistical computations, while dates like last_online must be converted to datetime format for time-based calculations.
# Parse last_online strings (e.g. 2012-06-28-20-30) into datetimes
df['last_online'] = pd.to_datetime(df['last_online'], format='%Y-%m-%d-%H-%M')
print(type(df['last_online'][0]))
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
print(type(df['income'][0]))
<class 'numpy.float64'>
print(type(df['height'][0]))
<class 'numpy.float64'>
1.Are there any inconsistencies in the data types across columns (e.g., numerical data stored as strings)?
print(type(df['age'][0]))
print(type(df['status'][0]))
print(type(df['gender'][0]))
print(type(df['body_type'][0]))
print(type(df['diet'][0]))
print(type(df['drinks'][0]))
print(type(df['education'][0]))
print(type(df['ethnicity'][0]))
print(type(df['height'][0]))
print(type(df['income'][0]))
print(type(df['job'][0]))
print(type(df['location'][0]))
print(type(df['pets'][0]))
print(type(df['religion'][0]))
print(type(df['sign'][0]))
print(type(df['speaks'][0]))
<class 'numpy.int64'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
Conclusion: no numerical data is stored as strings; age is int64, height and income are float64, and the remaining columns are free-text strings.
2.Which columns require conversion to numerical data types for proper analysis (e.g., income)?
print("Column Data Types:")
print(df.dtypes)
Column Data Types:
age                     int64
status                 object
gender                 object
body_type              object
diet                   object
drinks                 object
education              object
ethnicity              object
height                float64
income                float64
job                    object
last_online    datetime64[ns]
location               object
pets                   object
religion               object
sign                   object
speaks                 object
dtype: object
numeric_columns = ['income', 'height']
for col in numeric_columns:
if df[col].dtype == 'object': # Check if the column is not already numeric
print(f"Column '{col}' requires conversion to numerical data type.")
for col in numeric_columns:
if df[col].dtype == 'object':
df[col] = pd.to_numeric(df[col], errors='coerce') # Convert to numeric, set invalid values to NaN
df[col]
height | |
---|---|
0 | 75.0 |
1 | 70.0 |
2 | 68.0 |
3 | 71.0 |
4 | 66.0 |
... | ... |
59941 | 62.0 |
59942 | 72.0 |
59943 | 71.0 |
59944 | 73.0 |
59945 | 68.0 |
59946 rows × 1 columns
print("\nAfter Conversion:")
print(df.dtypes)
After Conversion:
age                     int64
status                 object
gender                 object
body_type              object
diet                   object
drinks                 object
education              object
ethnicity              object
height                float64
income                float64
job                    object
last_online    datetime64[ns]
location               object
pets                   object
religion               object
sign                   object
speaks                 object
dtype: object
Conclusion: age, height, and income are already stored as numeric types (int64/float64), so no conversion was required; the pd.to_numeric pass above is a defensive no-op.
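For data where numbers do arrive as strings, the defensive pass matters. A minimal sketch with hypothetical raw values, showing how errors="coerce" turns unparseable entries into NaN instead of raising:

```python
import pandas as pd

# Hypothetical raw strings, including one unparseable value
raw = pd.Series(["80000", "20000", "-1", "n/a"])

# Coerce to numeric; anything that cannot be parsed becomes NaN
clean = pd.to_numeric(raw, errors="coerce")
print(clean.tolist())  # [80000.0, 20000.0, -1.0, nan]
```

The resulting NaNs can then flow through the same imputation logic used for the other missing values.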
3.Does the last_online column need to be converted into a datetime format? What additional insights can be gained by analyzing this as a date field?
df2 = pd.read_csv(output_file)
df2['last_online'] = pd.to_datetime(df2['last_online'], format='%Y-%m-%d-%H-%M')
df2['last_online']
last_online | |
---|---|
0 | 2012-06-28 20:30:00 |
1 | 2012-06-29 21:41:00 |
2 | 2012-06-27 09:10:00 |
3 | 2012-06-28 14:22:00 |
4 | 2012-06-27 21:26:00 |
... | ... |
59941 | 2012-06-12 21:47:00 |
59942 | 2012-06-29 11:01:00 |
59943 | 2012-06-27 23:37:00 |
59944 | 2012-06-23 13:01:00 |
59945 | 2012-06-29 00:42:00 |
59946 rows × 1 columns
# Calculate days since last online
df2['days_since_last_online'] = (pd.Timestamp.now() - df2['last_online']).dt.days
df2['days_since_last_online']
days_since_last_online | |
---|---|
0 | 4657 |
1 | 4656 |
2 | 4659 |
3 | 4658 |
4 | 4658 |
... | ... |
59941 | 4673 |
59942 | 4657 |
59943 | 4658 |
59944 | 4663 |
59945 | 4657 |
59946 rows × 1 columns
# Categorize users based on activity
def categorize_activity(days):
if days <= 7:
return 'Active'
elif days <= 30:
return 'Recently Active'
else:
return 'Inactive'
df2['activity_status'] = df2['days_since_last_online'].apply(categorize_activity)
df2['activity_status']
activity_status | |
---|---|
0 | Inactive |
1 | Inactive |
2 | Inactive |
3 | Inactive |
4 | Inactive |
... | ... |
59941 | Inactive |
59942 | Inactive |
59943 | Inactive |
59944 | Inactive |
59945 | Inactive |
59946 rows × 1 columns
Conclusion: last_online arrives as a string, so it was parsed into Timestamps, from which days_since_last_online and an activity_status category were derived. Because recency is measured against pd.Timestamp.now() while the snapshot dates from 2012, every user is classified as Inactive; measuring against the dataset's own latest login would give a more meaningful split.
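A sketch of that alternative, using the snapshot's own latest login as the reference point (the three timestamps below are sample values taken from the table above):

```python
import pandas as pd

# Sample last_online strings in the dataset's %Y-%m-%d-%H-%M format
ts = pd.to_datetime(
    pd.Series(["2012-06-28-20-30", "2012-06-29-21-41", "2012-06-12-21-47"]),
    format="%Y-%m-%d-%H-%M",
)

# Use the snapshot's most recent login as "now", not today's date
snapshot = ts.max()
days_since = (snapshot - ts).dt.days

# Same thresholds as categorize_activity above, expressed via pd.cut
status = pd.cut(days_since, bins=[-1, 7, 30, float("inf")],
                labels=["Active", "Recently Active", "Inactive"])
print(list(zip(days_since.tolist(), status.tolist())))
```

Relative to the snapshot, the first two users are Active and the third is Recently Active, which is far more informative than an all-Inactive column.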
3.Outliers¶
1.Are there any apparent outliers in numerical columns such as age, height, or income? What are the ranges of values in these columns?
# 1. View Summary Statistics
print("Summary Statistics:")
print(df.describe())
Summary Statistics:
                age        height          income
count  59946.000000  59946.000000    59946.000000
mean      32.340290     68.295282    61512.027491
min       18.000000      1.000000    20000.000000
25%       26.000000     66.000000    40000.000000
50%       30.000000     68.000000    60000.000000
75%       37.000000     71.000000    60000.000000
max      110.000000     95.000000  1000000.000000
std        9.452779      3.994738    91116.607782

                         last_online
count                          59946
mean   2012-05-22 06:43:35.300770560
min              2011-06-27 01:52:00
25%              2012-05-29 20:37:15
50%              2012-06-27 14:30:00
75%              2012-06-30 01:09:00
max              2012-07-01 08:57:00
std                              NaN
import pandas as pd
import matplotlib.pyplot as plt
# Create the boxplot
df2[['age', 'height', 'income']].boxplot(figsize=(10, 5))
plt.title("Boxplot of Age, Height, and Income")
plt.show()
df = pd.read_csv(output_file)
df
age | status | gender | body_type | diet | drinks | education | ethnicity | height | income | job | last_online | location | pets | religion | sign | speaks | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | single | m | a little extra | strictly anything | socially | working on college/university | asian, white | 75.0 | -1 | transportation | 2012-06-28-20-30 | south san francisco, california | likes dogs and likes cats | agnosticism and very serious about it | gemini | english |
1 | 35 | single | m | average | mostly other | often | working on space camp | white | 70.0 | 80000 | hospitality / travel | 2012-06-29-21-41 | oakland, california | likes dogs and likes cats | agnosticism but not too serious about it | cancer | english (fluently), spanish (poorly), french (... |
2 | 38 | available | m | thin | anything | socially | graduated from masters program | NaN | 68.0 | -1 | NaN | 2012-06-27-09-10 | san francisco, california | has cats | NaN | pisces but it doesn’t matter | english, french, c++ |
3 | 23 | single | m | thin | vegetarian | socially | working on college/university | white | 71.0 | 20000 | student | 2012-06-28-14-22 | berkeley, california | likes cats | NaN | pisces | english, german (poorly) |
4 | 29 | single | m | athletic | NaN | socially | graduated from college/university | asian, black, other | 66.0 | -1 | artistic / musical / writer | 2012-06-27-21-26 | san francisco, california | likes dogs and likes cats | NaN | aquarius | english |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
59941 | 59 | single | f | NaN | NaN | socially | graduated from college/university | NaN | 62.0 | -1 | sales / marketing / biz dev | 2012-06-12-21-47 | oakland, california | has dogs | catholicism but not too serious about it | cancer and it’s fun to think about | english |
59942 | 24 | single | m | fit | mostly anything | often | working on college/university | white, other | 72.0 | -1 | entertainment / media | 2012-06-29-11-01 | san francisco, california | likes dogs and likes cats | agnosticism | leo but it doesn’t matter | english (fluently) |
59943 | 42 | single | m | average | mostly anything | not at all | graduated from masters program | asian | 71.0 | 100000 | construction / craftsmanship | 2012-06-27-23-37 | south san francisco, california | NaN | christianity but not too serious about it | sagittarius but it doesn’t matter | english (fluently) |
59944 | 27 | single | m | athletic | mostly anything | socially | working on college/university | asian, black | 73.0 | -1 | medicine / health | 2012-06-23-13-01 | san francisco, california | likes dogs and likes cats | agnosticism but not too serious about it | leo and it’s fun to think about | english (fluently), spanish (poorly), chinese ... |
59945 | 39 | single | m | average | NaN | socially | graduated from masters program | white | 68.0 | -1 | medicine / health | 2012-06-29-00-42 | san francisco, california | likes dogs and likes cats | catholicism and laughing about it | gemini and it’s fun to think about | english |
59946 rows × 17 columns
# Function to find outliers using IQR for selected columns
def find_outliers_iqr(df, selected_columns):
outliers = {}
for col in selected_columns:
Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers[col] = df[(df[col] < lower_bound) | (df[col] > upper_bound)][col]
return outliers
selected_columns = ["age", "height", "income"]
outliers_iqr = find_outliers_iqr(df, selected_columns)
print("Outliers using IQR for selected columns:", outliers_iqr)
Outliers using IQR for selected columns:

age (Length: 2638, dtype: int64):
117      55
141      59
158      54
172      62
210      59
...
59740    61
59746    55
59794    55
59837    66
59941    59

height (Length: 285, dtype: float64):
189      79.0
243      80.0
280      80.0
402      91.0
433      79.0
...
58147    94.0
58286    84.0
59038    79.0
59067    79.0
59697    79.0

income (Length: 11504, dtype: int64):
1         80000
3         20000
11        40000
13        30000
14        50000
...
59917    100000
59927     50000
59930     70000
59934     80000
59943    100000
ranges = df[["age", "height", "income"]].agg(["min", "max"])
print(ranges)
      age  height   income
min    18     1.0       -1
max   110    95.0  1000000
Conclusion: the summary statistics and boxplots show clear outliers. age reaches 110 (2,638 values flagged by the IQR rule), height ranges from an implausible 1 to 95 inches (285 flagged), and income spans -1 (a missing-data sentinel) to 1,000,000 (11,504 flagged, largely because most users report no income at all).
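The IQR rule used by find_outliers_iqr is easiest to verify on a tiny example; the ages below are hypothetical, with one deliberately extreme value:

```python
import pandas as pd

# Toy ages (hypothetical), one of them clearly implausible
s = pd.Series([18, 22, 25, 26, 28, 30, 31, 33, 35, 110])

# Same fences as above: 1.5 * IQR beyond the quartiles
q1, q3 = s.quantile(0.25), s.quantile(0.75)
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr

outliers = s[(s < lower) | (s > upper)]
print(outliers.tolist())  # [110]
```

Only the 110 falls outside the fences, matching the intuition that the bulk of the distribution should define what counts as extreme.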
2.Any -1 values in numerical columns like income should be replaced with 0, as they may represent missing or invalid data.
# Replace -1 with 0 in the 'income' column
df['income'] = df['income'].replace(-1, 0)
# For multiple numerical columns
numerical_columns = ['age', 'height', 'income'] # Specify the columns to check
df[numerical_columns] = df[numerical_columns].replace(-1, 0)
df[numerical_columns]
age | height | income | |
---|---|---|---|
0 | 22 | 75.0 | 0 |
1 | 35 | 70.0 | 80000 |
2 | 38 | 68.0 | 0 |
3 | 23 | 71.0 | 20000 |
4 | 29 | 66.0 | 0 |
... | ... | ... | ... |
59941 | 59 | 62.0 | 0 |
59942 | 24 | 72.0 | 0 |
59943 | 42 | 71.0 | 100000 |
59944 | 27 | 73.0 | 0 |
59945 | 39 | 68.0 | 0 |
59946 rows × 3 columns
# Replace -1 with 0 in all numeric columns
numeric_cols = df.select_dtypes(include=['number']).columns
df[numeric_cols] = df[numeric_cols].replace(-1, 0)
df[numeric_cols]
age | height | income | |
---|---|---|---|
0 | 22 | 75.0 | 0 |
1 | 35 | 70.0 | 80000 |
2 | 38 | 68.0 | 0 |
3 | 23 | 71.0 | 20000 |
4 | 29 | 66.0 | 0 |
... | ... | ... | ... |
59941 | 59 | 62.0 | 0 |
59942 | 24 | 72.0 | 0 |
59943 | 42 | 71.0 | 100000 |
59944 | 27 | 73.0 | 0 |
59945 | 39 | 68.0 | 0 |
59946 rows × 3 columns
Conclusion: every -1 sentinel in the numeric columns (income, plus a defensive pass over age and height) has been replaced with 0.
3.For other outliers, rather than deleting them, calculate the mean and median values using only the middle 80% of the data (removing extreme high and low values). This approach ensures that outliers do not disproportionately impact the analysis while retaining as much meaningful data as possible.
def calculate_trimmed_mean_median(column):
# Calculate 10th and 90th percentiles
lower_bound = column.quantile(0.1)
upper_bound = column.quantile(0.9)
# Filter data to keep only middle 80%
trimmed_data = column[(column >= lower_bound) & (column <= upper_bound)]
# Calculate mean and median of trimmed data
mean_value = trimmed_data.mean()
median_value = trimmed_data.median()
return mean_value, median_value
# Numerical columns to analyze
numerical_columns = ['age', 'height', 'income']
# Calculate trimmed mean and median for each column
trimmed_stats = {}
for col in numerical_columns:
mean, median = calculate_trimmed_mean_median(df[col])
trimmed_stats[col] = {'mean': mean, 'median': median}
# Print results
for col, stats in trimmed_stats.items():
print(f"{col}:")
print(f" Trimmed Mean: {stats['mean']}")
print(f" Trimmed Median: {stats['median']}")
print()
age:
  Trimmed Mean: 31.090303239005813
  Trimmed Median: 30.0

height:
  Trimmed Mean: 68.2309102137578
  Trimmed Median: 68.0

income:
  Trimmed Mean: 3297.01223769799
  Trimmed Median: 0.0
Conclusion: for each numeric column, the 10th and 90th percentiles define the middle 80% of the data, and the mean and median of that trimmed slice are far less sensitive to extremes than the raw statistics. Note that the trimmed income median is 0.0 because, after the sentinel replacement above, most users carry a "no income reported" value of 0.
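That zero-dominated income behavior is worth seeing in miniature. In this sketch (hypothetical incomes, with unreported values already recoded to 0 as above), trimming removes the 1,000,000 outlier but the median is still 0 because zeros are the majority:

```python
import pandas as pd

# Hypothetical incomes: six "not reported" zeros, three real values, one extreme
s = pd.Series([0, 0, 0, 0, 0, 0, 20000, 40000, 60000, 1000000])

# Keep the middle 80%, as in calculate_trimmed_mean_median above
lo, hi = s.quantile(0.1), s.quantile(0.9)
trimmed = s[(s >= lo) & (s <= hi)]

print(trimmed.mean(), trimmed.median())
```

Trimming tames the extreme high value, but it cannot fix a column where "missing" was encoded as a legitimate-looking number; that is a strong argument for keeping the NaN-based encoding when computing income statistics.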
4. Missing Data Visualization¶
1.Create a heatmap to visualize missing values across the dataset. Which columns show consistent missing data patterns?
%pip install seaborn
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
df=pd.read_csv(output_file)
# 1. Check for missing values
missing_values = df.isnull().sum()
missing_values
column | missing count |
---|---|
age | 0 |
status | 0 |
gender | 0 |
body_type | 5296 |
diet | 24395 |
drinks | 2985 |
education | 6628 |
ethnicity | 5680 |
height | 3 |
income | 0 |
job | 8198 |
last_online | 0 |
location | 0 |
pets | 19921 |
religion | 20226 |
sign | 11056 |
speaks | 50 |
# Visualize missing values
plt.figure(figsize=(10, 6))
sns.heatmap(df.isnull(), cbar=False, cmap="viridis", yticklabels=False)
plt.title("Heatmap of Missing Values")
plt.show()
sorted_missing = missing_values[missing_values > 0].sort_values(ascending=False)
print("Columns with Missing Values:\n", sorted_missing)
Columns with Missing Values: diet 24395 religion 20226 pets 19921 sign 11056 job 8198 education 6628 ethnicity 5680 body_type 5296 drinks 2985 speaks 50 height 3 dtype: int64
Conclusion: the per-column missing counts were computed, visualized as a heatmap, and sorted in descending order. diet, religion, pets, and sign show the heaviest missingness; these are all optional lifestyle fields that users frequently skip.
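Beyond the heatmap, correlating the missingness indicators quantifies which fields tend to be blank together. A sketch on a hypothetical frame where religion and sign are often skipped by the same users:

```python
import pandas as pd

# Hypothetical profiles: religion and sign are usually missing together
demo = pd.DataFrame({
    "religion": [None, "x", None, "y", None],
    "sign":     [None, "a", None, "b", "c"],
})

# Pearson correlation of the 0/1 missingness indicators
miss_corr = demo.isnull().astype(int).corr()
print(miss_corr.round(2))
```

A high off-diagonal value means the two fields share a missingness pattern, which supports treating them as a single "optional section skipped" behavior rather than two independent gaps.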
Part 2: Data Processing¶
1. Binning and Grouping¶
1.Bin the age column into categories such as "18-25", "26-35", "36-45", and "46+" to create a new column, age_group. How does the distribution of users vary across these age ranges?
# Define bins and labels for age ranges
bins = [0, 25, 35, 45, float("inf")]
labels = ["18-25", "26-35", "36-45", "46+"]
# Bin the age column into age ranges
df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels, right=False)
# Analyze the distribution of users across age ranges
age_distribution = df["age_group"].value_counts().sort_index()
# Output the results
print("Age Distribution:\n", age_distribution)
Age Distribution:
age_group
18-25    10923
26-35    30397
36-45    11915
46+       6711
Name: count, dtype: int64
Conclusion: after binning age with pd.cut, the 26-35 group dominates (30,397 users, about half the dataset), followed by 36-45 (11,915), 18-25 (10,923), and 46+ (6,711).
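One subtlety worth flagging: with right=False each bin is half-open on the right, so with these edges the boundary ages 25, 35, and 45 land one group above what the labels suggest. A small check on hypothetical boundary ages:

```python
import pandas as pd

# Hypothetical ages chosen to sit on and around the bin edges
ages = pd.Series([18, 25, 26, 35, 36, 45, 46, 70])
bins = [0, 25, 35, 45, float("inf")]
labels = ["18-25", "26-35", "36-45", "46+"]

# right=False makes each bin [lower, upper): 25 falls into "26-35",
# 35 into "36-45", and 45 into "46+", contradicting the label text
groups = pd.cut(ages, bins=bins, labels=labels, right=False)
print(groups.tolist())
```

If the labels are meant literally, the edges should be shifted (e.g. bins of [17, 26, 36, 46, inf] with right=False), so it is worth double-checking which convention the analysis intends.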
2.Group income into categories like "Low Income," "Medium Income," and "High Income" based on meaningful thresholds (e.g., quartiles). What insights can be derived from these groups?
# Replace invalid incomes (-1) with NaN for meaningful categorization
df["income"] = df["income"].replace(-1, pd.NA)
df["income"]
income | |
---|---|
0 | <NA> |
1 | 80000 |
2 | <NA> |
3 | 20000 |
4 | <NA> |
... | ... |
59941 | <NA> |
59942 | <NA> |
59943 | 100000 |
59944 | <NA> |
59945 | <NA> |
59946 rows × 1 columns
# Calculate quartile thresholds
q1 = df["income"].quantile(0.25)
q2 = df["income"].quantile(0.50)
q3 = df["income"].quantile(0.75)
q1,q2,q3
(20000.0, 50000.0, 100000.0)
# Define income categories based on thresholds
bins = [-float("inf"), q1, q2, q3, float("inf")]
labels = ["Low Income", "Medium Income", "High Income", "Very High Income"]
bins , labels
([-inf, 20000.0, 50000.0, 100000.0, inf], ['Low Income', 'Medium Income', 'High Income', 'Very High Income'])
# Add a category for missing income
df["income_group"] = pd.cut(df["income"].fillna(-1), bins=[-float("inf"), 0, q1, q2, q3, float("inf")],
labels=["Unknown Income", "Low Income", "Medium Income", "High Income", "Very High Income"], right=False)
print(df["income_group"])
0          Unknown Income
1             High Income
2          Unknown Income
3           Medium Income
4          Unknown Income
               ...
59941      Unknown Income
59942      Unknown Income
59943    Very High Income
59944      Unknown Income
59945      Unknown Income
Name: income_group, Length: 59946, dtype: category
Categories (5, object): ['Unknown Income' < 'Low Income' < 'Medium Income' < 'High Income' < 'Very High Income']
Conclusion: The quartiles split reported incomes at $20,000, $50,000, and $100,000. Low-income users may represent students or entry-level professionals, while the upper quartile likely holds senior or specialized roles. Users with missing income are kept in a separate "Unknown Income" bucket, so reporting gaps can be analyzed rather than silently dropped. These groups make it possible to compare behavior and preferences across income levels.
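An alternative sketch that skips the -1 sentinel entirely: cut only the real values, then add an explicit category for the missing ones. The incomes and thresholds below are toy values, not the dataset's actual quartiles:

```python
import pandas as pd

# Toy incomes with one missing value (illustrative thresholds)
income = pd.Series([15000.0, 40000.0, float("nan"), 120000.0])
groups = pd.cut(income,
                bins=[-float("inf"), 20000, 50000, 100000, float("inf")],
                labels=["Low", "Medium", "High", "Very High"])
# NaN rows fall outside every bin; label them explicitly
groups = groups.cat.add_categories("Unknown").fillna("Unknown")
print(groups.tolist())  # ['Low', 'Medium', 'Unknown', 'Very High']
```

This keeps the income column untouched and avoids mixing a sentinel value into the binning logic.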
2. Derived Features¶
Create a new feature, profile_completeness, by calculating the percentage of non-missing values for each user profile. How complete are most user profiles, and how does completeness vary across demographics?
# Calculate profile completeness for each user
df["profile_completeness"] = df.notna().sum(axis=1) / len(df.columns) * 100
df["profile_completeness"]
0         94.736842
1        100.000000
2         78.947368
3         94.736842
4         84.210526
            ...
59941     78.947368
59942     94.736842
59943     94.736842
59944     94.736842
59945     89.473684
Name: profile_completeness, Length: 59946, dtype: float64
# Analyze profile completeness distribution
completeness_distribution = df["profile_completeness"].value_counts(bins=[0, 25, 50, 75, 100], sort=False)
completeness_distribution
(-0.001, 25.0]        0
(25.0, 50.0]         71
(50.0, 75.0]       7669
(75.0, 100.0]     52206
Name: count, dtype: int64
completeness_by_age = df.groupby("age")["profile_completeness"].mean()
print(completeness_by_age)
age
18     85.641288
19     85.313119
20     86.878003
21     86.665572
22     86.488325
23     86.478639
24     86.269035
25     86.282401
26     86.314941
27     86.314361
28     86.105439
29     86.386071
30     86.216176
31     86.300394
32     86.360955
33     86.274276
34     86.258232
35     86.435747
36     86.557835
37     86.655848
38     86.909379
39     86.801689
40     86.882984
41     87.454350
42     87.043401
43     87.118145
44     87.265834
45     87.787509
46     87.297396
47     87.483832
48     87.722946
49     87.719298
50     87.450319
51     87.624060
52     87.683599
53     86.695906
54     88.310664
55     87.845084
56     87.803457
57     88.384046
58     88.004275
59     87.997142
60     87.854251
61     89.114833
62     88.150016
63     88.138825
64     86.539357
65     88.652825
66     88.571429
67     88.118022
68     90.098127
69     86.757216
109    78.947368
110    47.368421
Name: profile_completeness, dtype: float64
- Conclusion: df.notna() marks non-missing cells, .sum(axis=1) counts them per row, and dividing by len(df.columns) (times 100) turns that count into a percentage. Most profiles are quite complete: 52,206 of 59,946 users (about 87%) fall in the 75-100% band and none fall below 25%. Mean completeness also rises gently with age, from roughly 85-86% for users under 25 to 88-90% in the 55-68 range, suggesting older users fill in more of their profiles. (The 109- and 110-year-old rows are almost certainly data-entry errors.)
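The completeness-by-demographic comparison can be sketched on a toy frame (hypothetical columns, not the full Bumble schema). Note that .notna().mean(axis=1) is a one-step equivalent of the sum-and-divide used above:

```python
import pandas as pd

# Toy profiles: completeness is the row-wise share of non-missing cells
df = pd.DataFrame({
    "gender": ["m", "f", "m", "f"],
    "diet":   ["vegan", None, "anything", None],
    "drinks": ["socially", "often", None, None],
})
df["profile_completeness"] = df.notna().mean(axis=1) * 100
completeness_by_gender = df.groupby("gender")["profile_completeness"].mean()
print(completeness_by_gender)
```

The same groupby works against age_group, income_group, or any other derived category.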
3. Unit Conversion¶
1. Convert the height column from inches to centimeters using the conversion factor (1 inch = 2.54 cm). Store the converted values in a new column, height_cm.
# Conversion factor: 1 inch = 2.54 cm
conversion_factor = 2.54
# Convert height to centimeters and store in a new column
df["height_cm"] = df["height"] * conversion_factor
df["height_cm"]
0        190.50
1        177.80
2        172.72
3        180.34
4        167.64
          ...
59941    157.48
59942    182.88
59943    180.34
59944    185.42
59945    172.72
Name: height_cm, Length: 59946, dtype: float64
- Conclusion: Since 1 inch equals 2.54 cm, multiplying the height column by 2.54 yields height_cm; for example, the first user's 75 in becomes 190.50 cm.
3. Data Analysis¶
1. Demographic Analysis¶
1. What is the gender distribution (gender) across the platform? Are there any significant imbalances?¶
# Calculate gender distribution (counts and percentages)
gender_counts = df["gender"].value_counts()
gender_percentages = df["gender"].value_counts(normalize=True) * 100
# Combine counts and percentages into a single DataFrame for clarity
gender_distribution = pd.DataFrame({
"Count": gender_counts,
"Percentage (%)": gender_percentages
})
# Display the results
print(gender_distribution)
        Count  Percentage (%)
gender
m       35829       59.768792
f       24117       40.231208
- Conclusion: value_counts() gives raw counts and value_counts(normalize=True) * 100 gives percentages. Men noticeably outnumber women: 35,829 men (59.8%) versus 24,117 women (40.2%), a roughly 3:2 imbalance worth accounting for in matchmaking and engagement features.
2. What are the proportions of users in different status categories (e.g., single, married, seeing someone)? What does this suggest about the platform's target audience?¶
# Calculate status distribution (counts and percentages)
status_counts = df["status"].value_counts()
status_percentages = df["status"].value_counts(normalize=True) * 100
# Combine counts and percentages into a single DataFrame
status_distribution = pd.DataFrame({
"Count": status_counts,
"Percentage (%)": status_percentages
})
# Display the results
print(status_distribution)
                Count  Percentage (%)
status
single          55697       92.911954
seeing someone   2064        3.443099
available        1865        3.111133
married           310        0.517132
unknown            10        0.016682
- Conclusion: 92.9% of users describe themselves as single, with small "seeing someone" (3.4%) and "available" (3.1%) groups and a negligible married share (0.5%). The platform's audience is overwhelmingly people actively looking to date.
3. How does status vary by gender? For example, what proportion of men and women identify as single?¶
# Calculate the status distribution by gender
status_by_gender = df.groupby(["gender", "status"]).size().unstack(fill_value=0)
# Calculate the proportion of each status category by gender
status_percentage_by_gender = status_by_gender.divide(status_by_gender.sum(axis=1), axis=0) * 100
# Display the results
print(status_percentage_by_gender)
status  available   married  seeing someone     single   unknown
gender
f        2.720073  0.559771        4.158892  92.544678  0.016586
m        3.374362  0.488431        2.961288  93.159173  0.016746
- Conclusion: groupby(["gender", "status"]).size().unstack() builds a gender-by-status count table, and dividing each row by its total converts counts to percentages. The profiles are similar across genders: 92.5% of women and 93.2% of men are single, while women are slightly more likely to report "seeing someone" (4.2% vs 3.0%).
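The groupby/size/unstack/divide pipeline above can be collapsed into a single pd.crosstab call with normalize="index". A sketch on toy rows (hypothetical values, just to show the equivalence):

```python
import pandas as pd

# Toy gender/status rows
df = pd.DataFrame({
    "gender": ["m", "m", "f", "f", "f"],
    "status": ["single", "married", "single", "single", "married"],
})
# normalize="index" turns each row into percentages of that gender's total
pct = pd.crosstab(df["gender"], df["status"], normalize="index") * 100
print(pct)
```

crosstab handles the counting, pivoting, and row-normalization in one step, which makes the intent easier to read.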
2. Correlation Analysis¶
1. What are the correlations between numerical columns such as age, income, and gender? Are there any strong positive or negative relationships?¶
# Encode gender as numerical values (0 for male, 1 for female)
df["gender_encoded"] = df["gender"].map({"m": 0, "f": 1})
# Select numerical columns
numerical_cols = ["age", "income", "gender_encoded"]
# Coerce to numeric first so fillna runs on float columns (the earlier
# replace(-1, pd.NA) left income as object dtype, which triggers a
# downcasting FutureWarning); note that mean-imputing income pulls its
# correlations toward zero
df[numerical_cols] = df[numerical_cols].apply(pd.to_numeric, errors="coerce")
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())
# Calculate correlation matrix
correlation_matrix = df[numerical_cols].corr()
# Display correlation matrix
print(correlation_matrix)
                     age    income  gender_encoded
age             1.000000 -0.004171        0.041481
income         -0.004171  1.000000       -0.021317
gender_encoded  0.041481 -0.021317        1.000000
- Conclusion: No strong relationships emerge. Age versus income is essentially zero (-0.004), and gender shows only trivial correlations with age (0.041) and income (-0.021). Keep in mind that mean-imputing the many missing incomes flattens the income correlations further.
2. How does age correlate with income? Are older users more likely to report higher income levels?¶
# Calculate the Pearson correlation between age and income
correlation = df["age"].corr(df["income"])
# Display the correlation
print(f"Correlation between age and income: {correlation:.2f}")
Correlation between age and income: -0.00
- Conclusion: The Pearson correlation between age and income is effectively zero (-0.00), so older users do not report systematically higher incomes. With most incomes unreported (or mean-imputed), though, this estimate says little about true earnings.
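Series.corr already drops NaN pairs on its own, so the imputation step above is unnecessary and can mask a real relationship. A toy illustration (synthetic numbers, not Bumble data):

```python
import pandas as pd

# Income is exactly 1000 * age wherever it was reported, and missing twice
age = pd.Series([20, 30, 40, 50, 60])
income = pd.Series([20000.0, 30000.0, None, None, 60000.0])

# Pairwise deletion (corr's default): missing pairs are simply ignored
print(age.corr(income))  # ~ 1.0 (perfect on the reported pairs)

# Mean imputation flattens the slope and weakens the correlation
imputed = income.fillna(income.mean())
print(round(age.corr(imputed), 3))  # 0.931
```

With a higher share of missing values, as in this dataset, the attenuation is far more severe.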
3. Diet and Lifestyle Analysis¶
1. How do dietary preferences (diet) distribute across the platform? For example, what percentage of users identify as vegetarian, vegan, or follow "anything" diets?¶
# Calculate the distribution of dietary preferences
diet_distribution = df["diet"].value_counts(normalize=True) * 100
# Display the percentage distribution
print(diet_distribution)
diet
mostly anything        46.651290
anything               17.391916
strictly anything      14.382155
mostly vegetarian       9.687491
mostly other            2.832550
strictly vegetarian     2.461253
vegetarian              1.876178
strictly other          1.271413
mostly vegan            0.950747
other                   0.931057
strictly vegan          0.641332
vegan                   0.382549
mostly kosher           0.241906
mostly halal            0.135017
strictly halal          0.050631
strictly kosher         0.050631
halal                   0.030941
kosher                  0.030941
Name: proportion, dtype: float64
- Conclusion: "Anything" diets dominate: mostly/plain/strictly anything together cover about 78% of users who answered. Vegetarian variants account for roughly 14%, vegan variants about 2%, and kosher or halal diets are rare (under 1% combined).
2. How do drinking habits (drinks) vary across different diet categories? Are users with stricter diets (e.g., vegan) less likely to drink?¶
# Cross-tabulate diet and drinks to see how drinking habits vary across diet categories
diet_drinks_distribution = pd.crosstab(df['diet'], df['drinks'], normalize='index') * 100
# Display the percentage distribution
print(diet_drinks_distribution)
drinks               desperately  not at all      often     rarely   socially  very often
diet
anything                0.350467    4.923231   9.579439   8.511348  75.667557    0.967957
halal                   0.000000   44.444444  11.111111   0.000000  44.444444    0.000000
kosher                  0.000000    9.090909   0.000000  18.181818  63.636364    9.090909
mostly anything         0.406479    5.185687   8.536060   9.466034  75.611258    0.794482
mostly halal            6.976744   23.255814   4.651163  18.604651  37.209302    9.302326
mostly kosher           1.190476    8.333333   4.761905  20.238095  59.523810    5.952381
mostly other            0.820513    9.333333   5.025641  18.051282  66.256410    0.512821
mostly vegan            0.621118   12.422360   6.832298  19.254658  59.937888    0.931677
mostly vegetarian       0.572117    5.871725   7.015959  14.031918  71.906052    0.602228
other                   0.320513   11.217949   7.371795  19.230769  61.538462    0.320513
strictly anything       1.323441    3.649489  13.735713   6.376579  73.731702    1.183076
strictly halal          5.555556   38.888889  11.111111   5.555556  27.777778   11.111111
strictly kosher        33.333333   22.222222  11.111111  16.666667   5.555556   11.111111
strictly other          3.240741   15.509259   7.870370  14.583333  56.481481    2.314815
strictly vegan          3.652968   26.484018   8.675799  16.894977  42.922374    1.369863
strictly vegetarian     1.060071   10.011779   9.658422  14.252061  63.957597    1.060071
vegan                   1.574803   18.110236  14.960630  13.385827  51.181102    0.787402
vegetarian              0.964630    5.948553   9.485531  10.771704  71.704180    1.125402
- Conclusion: pd.crosstab(diet, drinks, normalize="index") gives the percentage of each drinking habit within every diet category. Stricter diets do drink less: 26.5% of "strictly vegan" users say "not at all" versus about 5% of "anything" eaters, and social drinking drops from roughly 76% among "anything" diets to about 43% among strict vegans.
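The 18 diet labels combine a base diet with a "mostly"/"strictly" modifier; stripping the modifier before cross-tabulating gives a coarser but much easier-to-read table. A sketch on toy labels:

```python
import pandas as pd

diet = pd.Series(["mostly vegan", "strictly vegan", "vegan",
                  "mostly anything", "anything"])
# The base diet is always the last word of the label
base = diet.str.split().str[-1]
print(base.tolist())  # ['vegan', 'vegan', 'vegan', 'anything', 'anything']
```

Cross-tabulating base against drinks would collapse the 18 rows above into 6 base diets while preserving the strict-vs-anything contrast.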
4. Geographical Insights¶
1. Extract city and state information from the location column. What are the top 5 cities and states with the highest number of users?¶
# Note: assigning df['location'].dropna() back to the column realigns on the
# index and changes nothing, so that step is dropped.
# Split 'location' into city and state at the first comma; rows without a
# comma get state = NaN
df[['city', 'state']] = df['location'].str.split(', ', n=1, expand=True)
# Count the number of users in each city and state
city_counts = df['city'].value_counts().head(5)
state_counts = df['state'].value_counts().head(5)
city_counts
state_counts
state
california       59855
new york            17
illinois             8
massachusetts        5
oregon               4
Name: count, dtype: int64
- Conclusion: Splitting location at the first comma yields city and state. The dataset is almost entirely California-based: 59,855 of 59,946 users list a California location, with New York a distant second at 17 users. Cross-state comparisons therefore rest on a handful of profiles. (In a notebook cell only the last expression is displayed, which is why city_counts does not appear above.)
2. How does age vary across the top cities? Are certain cities dominated by younger or older users?¶
# Split the 'location' column into city and state columns
df[['city', 'state']] = df['location'].str.split(', ', n=1, expand=True)
# Group by city and calculate the average age for each city
city_age_stats = df.groupby('city')['age'].mean().sort_values(ascending=False)
# Display the average age by city
print(city_age_stats)
city
forest knolls      62.5
bellingham         59.0
port costa         53.0
seaside            50.0
redwood shores     47.0
                   ...
san luis obispo    20.0
canyon             19.0
canyon country     19.0
isla vista         19.0
long beach         19.0
Name: age, Length: 198, dtype: float64
- Conclusion: Average age varies widely by city, but the extremes (forest knolls at 62.5, several cities at 19-20) come from cities with only a handful of users, so they reflect sample size more than genuine demographic clustering.
3. What are the average income levels in the top states or cities? Are there regional patterns in reported income?¶
# Split the 'location' column into city and state columns
df[['city', 'state']] = df['location'].str.split(', ', n=1, expand=True)
# Group by state or city and calculate the average income
city_income_stats = df.groupby('city')['income'].mean().sort_values(ascending=False)
state_income_stats = df.groupby('state')['income'].mean().sort_values(ascending=False)
# Display the average income by city and state
print("Average Income by City:")
print(city_income_stats)
print(state_income_stats)
Average Income by City:
city
petaluma           552197.496523
santa cruz         292636.995828
montara            172829.160872
south orange       150000.000000
boulder            150000.000000
                        ...
north hollywood     20000.000000
leander             20000.000000
pasadena            20000.000000
new orleans         20000.000000
rohnert park        20000.000000
Name: income, Length: 198, dtype: float64

state
new jersey                  150000.000000
colorado                    127197.496523
new york                    123877.935040
california                  104400.939498
germany                     104394.993046
georgia                     104394.993046
hawaii                      104394.993046
connecticut                 104394.993046
district of columbia        104394.993046
west virginia               104394.993046
wisconsin                   104394.993046
washington                  104394.993046
idaho                       104394.993046
mexico                      104394.993046
minnesota                   104394.993046
illinois                    104394.993046
ireland                     104394.993046
montana                     104394.993046
missouri                    104394.993046
nevada                      104394.993046
mississippi                 104394.993046
netherlands                 104394.993046
spain                       104394.993046
oregon                      104394.993046
north carolina              104394.993046
virginia                    104394.993046
tennessee                   104394.993046
utah                        104394.993046
rhode island                104394.993046
switzerland                 104394.993046
united kingdom              104394.993046
florida                     104394.993046
massachusetts                89515.994437
texas                        83296.244784
michigan                     83296.244784
ohio                         62197.496523
british columbia, canada     60000.000000
vietnam                      60000.000000
pennsylvania                 40000.000000
arizona                      33333.333333
louisiana                    20000.000000
Name: income, dtype: float64
- Conclusion: The long run of states tied at exactly 104,394.99 is an artifact of the earlier mean imputation: those states' users reported no income, so their "average" is just the global mean. The outliers (petaluma at about 552k, new jersey at 150k) rest on tiny samples, so no reliable regional income pattern can be read from this table.
5. Height Analysis¶
1. What is the average height of users across different gender categories?¶
gender_height_avg = df.groupby('gender')['height'].mean()
# Display the average height by gender
print(gender_height_avg)
gender
f    65.103873
m    70.443492
Name: height, dtype: float64
- Conclusion: Men average 70.4 inches (about 179 cm) and women 65.1 inches (about 165 cm), a gap of roughly 5.3 inches that matches population norms.
2. How does height vary by age_group? Are there noticeable trends among younger vs. older users?¶
# Define age bins and labels
bins = [0, 25, 35, 45, 100]
labels = ["18-25", "26-35", "36-45", "46+"]
df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)
# Group by age_group and calculate the average height
# (observed=False keeps empty categories and silences the pandas FutureWarning)
age_group_height_avg = df.groupby('age_group', observed=False)['height'].mean()
# Display the average height by age group
print(age_group_height_avg)
age_group
18-25    68.145931
26-35    68.410693
36-45    68.315317
46+      67.976148
Name: height, dtype: float64
- Conclusion: Average height is essentially flat across age groups (67.98 to 68.41 inches), with only a slight dip for the 46+ group; age tells us little about height here.
3. What is the distribution of height within body_type categories (e.g., athletic, curvy, thin)? Do the distributions align with expectations?¶
# Group by body type and describe the height distribution
body_type_height_distribution = df.groupby('body_type')['height'].describe()
# Display the distribution statistics
print(body_type_height_distribution)
                  count       mean       std   min   25%   50%    75%   max
body_type
a little extra   2629.0  68.820084  3.930905  55.0  66.0  69.0  72.00  85.0
athletic        11819.0  69.707336  3.593712   3.0  67.0  70.0  72.00  95.0
average         14652.0  68.100805  3.850883  36.0  65.0  68.0  71.00  95.0
curvy            3924.0  65.210245  3.021022  36.0  63.0  65.0  67.00  95.0
fit             12711.0  68.546062  3.767420   9.0  66.0  69.0  71.00  95.0
full figured     1009.0  66.464817  3.352475  58.0  64.0  66.0  68.00  80.0
jacked            421.0  69.292162  5.871453  37.0  66.0  70.0  72.00  95.0
overweight        444.0  68.948198  4.225529  59.0  66.0  69.0  72.00  95.0
rather not say    198.0  67.272727  4.756653  59.0  64.0  67.0  70.75  95.0
skinny           1777.0  68.544176  3.992023  59.0  66.0  69.0  72.00  84.0
thin             4711.0  67.866058  4.091067   6.0  65.0  68.0  71.00  80.0
used up           355.0  69.180282  5.789251  36.0  66.0  70.0  72.00  95.0
- Conclusion: The medians line up with intuition: "athletic" and "jacked" run tall (70 in), while "curvy" (65 in) and "full figured" (66 in) run shorter. However, the minima of 3, 6, and 9 inches are clearly invalid entries, so heights should be filtered to a plausible range before trusting the tails.
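Impossible minima like 3 in suggest restricting heights to a plausible adult range before describing them. A sketch on toy values (the 48-84 in bounds are an assumption; adjust as needed):

```python
import pandas as pd

heights = pd.Series([3.0, 62.0, 68.0, 70.0, 95.0, 110.0])
# Keep only plausible adult heights (assumed 48-84 inches, i.e. 4'0" to 7'0")
plausible = heights[heights.between(48, 84)]
print(plausible.tolist())  # [62.0, 68.0, 70.0]
```

Applying the same filter to df before the groupby would clean up the min/max columns without materially changing the medians.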
6. Income Analysis¶
1. What is the distribution of income across the platform? Are there specific income brackets that dominate? (don't count 0)¶
df_filtered = df[df['income'] > 0]
# Describe the income distribution (excluding zeros)
income_description = df_filtered['income'].describe()
# Display the income distribution summary
print(income_description)
count     59946.000000
mean     104394.993046
std       88239.052798
min       20000.000000
25%      104394.993046
50%      104394.993046
75%      104394.993046
max     1000000.000000
Name: income, dtype: float64
- Conclusion: All three quartiles equal the mean (104,394.99) because missing incomes were mean-imputed earlier, so the imputed value dominates the distribution (which is also why the count is still 59,946 despite the >0 filter). Genuinely reported incomes range from 20,000 to 1,000,000; to see real brackets, the describe should be run on the originally reported values only.
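A sketch of describing only genuinely reported incomes, on toy values (-1 is the dataset's sentinel for "not reported"):

```python
import pandas as pd

income = pd.Series([-1, 20000, -1, 80000, 100000])
reported = income[income > 0]   # drop the sentinel before describing
print(reported.describe())
```

Run before any imputation, this keeps the quartiles meaningful: they summarize real answers instead of the fill value.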
2. How does income vary by age_group and gender? Are older users more likely to report higher incomes?¶
# Create an 'age_group' column by categorizing ages
bins = [0, 18, 30, 40, 50, 100] # Define age groups
labels = [ '0-18','18-30', '31-40', '41-50', '51+']
df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)
# Group by 'age_group' and 'gender' and calculate average income
# (observed=False silences the categorical-groupby FutureWarning)
grouped_income = df.groupby(['age_group', 'gender'], observed=False)['income'].mean().reset_index()
# Display the grouped data
print(grouped_income)
  age_group gender         income
0      0-18      f            NaN
1      0-18      m            NaN
2     18-30      f  101740.061003
3     18-30      m  105071.928791
4     31-40      f  103352.298007
5     31-40      m  107689.944669
6     41-50      f  101633.033214
7     41-50      m  106586.780425
8       51+      f   99848.000669
9       51+      m  101792.747280
- Conclusion: In every age group men report slightly higher mean incomes than women (e.g., 105,072 vs 101,740 at 18-30), but the gaps are small and income does not rise with age: the 51+ group actually reports the lowest means. Given the heavy imputation, these differences should be treated cautiously.
Part 4: Data Visualization¶
1. Age Distribution¶
1. Plot a histogram of age with a vertical line indicating the mean age. What does the distribution reveal about the most common age group on the platform?¶
# Calculate the mean age
mean_age = df['age'].mean()
# Plot a histogram
plt.figure(figsize=(8, 6))
plt.hist(df['age'], bins=10, edgecolor='black', alpha=0.7, color='skyblue')
plt.axvline(mean_age, color='red', linestyle='dashed', linewidth=1.5, label=f"Mean Age: {mean_age:.1f}")
plt.title("Age Distribution")
plt.xlabel("Age")
plt.ylabel("Number of Users")
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
- Conclusion: The histogram with a dashed red line at the mean shows a right-skewed distribution: most users cluster in their late twenties to early thirties, near the mean, with a long tail of older users.
2. How does the age distribution differ by gender? Are there age groups where one gender is more prevalent?¶
# Plot age distribution by gender using Seaborn
plt.figure(figsize=(10, 6))
sns.histplot(
data=df,
x="age",
hue="gender",
bins=10,
kde=True,
palette="Set2",
alpha=0.7
)
plt.title("Age Distribution by Gender")
plt.xlabel("Age")
plt.ylabel("Count")
plt.grid(axis='y', linestyle='--', alpha=0.7)
# sns.histplot already draws a legend for `hue`; retitle that legend instead
# of calling plt.legend(), which finds no labelled artists here
plt.gca().get_legend().set_title("Gender")
plt.show()
- Conclusion: Overlaid histograms with KDE curves show both gender distributions peaking in the late twenties, with men outnumbering women across most age bins, consistent with the overall 60/40 split.
2. Income and Age¶
1. Use a scatterplot to visualize the relationship between income and age, with a trend line indicating overall patterns. Are older users more likely to report higher incomes?¶
# Filter out rows with income of 0 (if present)
df = df[df["income"] > 0]
# Create a scatterplot with a trend line
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x="age", y="income", color="blue", alpha=0.7)
sns.regplot(data=df, x="age", y="income", scatter=False, color="red", ci=None)
# Add titles and labels
plt.title("Relationship Between Age and Income", fontsize=14)
plt.xlabel("Age", fontsize=12)
plt.ylabel("Income", fontsize=12)
plt.grid(axis='both', linestyle='--', alpha=0.7)
plt.show()
- Conclusion: The scatterplot with a regression line shows a nearly flat trend: reported income barely changes with age, matching the near-zero correlation found earlier. The horizontal bands in the scatter come from income being reported in fixed brackets (plus the imputed mean).
2. Create boxplots of income grouped by age_group. Which age group reports the highest median income?¶
# Define age groups
bins = [20, 30, 40, 50, 60]
labels = ["20-29", "30-39", "40-49", "50-59"]
df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels, right=False)
# Filter out rows with income of 0 (if present)
df = df[df["income"] > 0]
# Create a boxplot of income grouped by age_group
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x="age_group", y="income", hue="age_group", palette="Set2", legend=False)
# Add titles and labels
plt.title("Income Distribution by Age Group", fontsize=14)
plt.xlabel("Age Group", fontsize=12)
plt.ylabel("Income", fontsize=12)
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()
# Identify the age group with the highest median income
# (observed=False silences the categorical-groupby FutureWarning)
median_income = df.groupby("age_group", observed=False)["income"].median()
print("Median income by age group:")
print(median_income)
print(f"The age group with the highest median income is: {median_income.idxmax()}")
Median income by age group:
age_group
20-29    104394.993046
30-39    104394.993046
40-49    104394.993046
50-59    104394.993046
Name: income, dtype: float64
The age group with the highest median income is: 20-29
- Conclusion: Every age group shows the same median (104,394.99) because the earlier mean imputation placed more than half of each group at exactly the global mean; idxmax then just returns the first group. To compare real medians, this should be recomputed on reported incomes only.
3. Analyze income levels within gender and status categories. For example, are single men more likely to report higher incomes than single women?¶
# Group by gender and status, calculate median income
grouped_income = df.groupby(["gender", "status"])["income"].median().reset_index()
# Create a bar plot for income comparison
plt.figure(figsize=(10, 6))
sns.barplot(data=grouped_income, x="status", y="income", hue="gender", palette="Set2")
# Add titles and labels
plt.title("Median Income by Gender and Status", fontsize=14)
plt.xlabel("Status", fontsize=12)
plt.ylabel("Median Income", fontsize=12)
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()
# Display the grouped income DataFrame
print("Median Income by Gender and Status:")
print(grouped_income)
Median Income by Gender and Status:
  gender          status         income
0      f       available  104394.993046
1      f         married  104394.993046
2      f  seeing someone  104394.993046
3      f          single  104394.993046
4      f         unknown   62197.496523
5      m       available  104394.993046
6      m         married  104394.993046
7      m  seeing someone  104394.993046
8      m          single  104394.993046
9      m         unknown  104394.993046
- Conclusion: As above, the imputed mean swamps the medians: every gender/status cell except female "unknown" sits at 104,394.99, so no real single-men-versus-single-women income gap can be read from this chart. The comparison needs to be rerun on reported incomes.
3. Pets and Preferences¶
1. Create a bar chart showing the distribution of pets categories (e.g., likes dogs, likes cats). Which preferences are most common?¶
# Count the distribution of pet preferences
pet_distribution = df["pets"].value_counts()
# Plot a bar chart
plt.figure(figsize=(10, 6))
sns.barplot(x=pet_distribution.index, y=pet_distribution.values, hue=pet_distribution.index, palette="pastel", legend=False)
# Add titles and labels
plt.title("Distribution of Pet Preferences", fontsize=14)
plt.xlabel("Pet Preference", fontsize=12)
plt.ylabel("Number of Users", fontsize=12)
plt.xticks(rotation=15)
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()
# Display the distribution as a table
print("Pet Preferences Distribution:")
print(pet_distribution)
Pet Preferences Distribution:
pets
likes dogs and likes cats          14814
likes dogs                          7224
likes dogs and has cats             4313
has dogs                            4134
has dogs and likes cats             2333
likes dogs and dislikes cats        2029
has dogs and has cats               1474
has cats                            1406
likes cats                          1063
has dogs and dislikes cats           552
dislikes dogs and likes cats         240
dislikes dogs and dislikes cats      196
dislikes cats                        122
dislikes dogs and has cats            81
dislikes dogs                         44
Name: count, dtype: int64
- Conclusion: "Likes dogs and likes cats" is by far the most common answer (14,814 users), and dog-friendly options fill the top of the list; open dislike of either animal is rare. Pet-friendliness, especially toward dogs, is close to universal among users who answered.
2. How do pets preferences vary across gender and age_group? Are younger users more likely to report liking pets compared to older users?¶
# Group the data by gender, age_group, and pets preferences
# (observed=False silences the categorical-groupby FutureWarning)
pets_analysis = (
    df.groupby(["gender", "age_group", "pets"], observed=False)
    .size()
    .reset_index(name="count")
)
# Create a pivot table for visualization
# (observed=False keeps empty age-group categories and silences the FutureWarning)
pivot_pets = pets_analysis.pivot_table(
    index=["age_group", "pets"],
    columns="gender",
    values="count",
    fill_value=0,
    observed=False
)
# Plot the pet preferences across gender and age groups
# (sns.catplot creates its own figure, so no plt.figure() call is needed --
# calling one here just leaves an empty figure behind)
sns.catplot(
data=pets_analysis,
x="age_group",
y="count",
hue="pets",
col="gender",
kind="bar",
height=5,
aspect=1.2,
palette="pastel"
)
# Add titles and labels
plt.subplots_adjust(top=0.85)
plt.suptitle("Pets Preferences by Gender and Age Group", fontsize=16)
plt.xlabel("Age Group")
plt.ylabel("Count of Users")
plt.show()
- Conclusion: The counts were grouped by gender, age_group, and pets, then plotted with sns.catplot faceted by gender. Within each facet, younger age groups contribute the bulk of every pet preference, largely because they dominate the user base rather than because they like pets more; normalizing within each age group would be needed to compare preference rates directly.
4. Signs and Personality¶
1. Create a pie chart showing the distribution of zodiac signs (sign) across the platform. Which signs are most and least represented? Is this the right chart? If not, replace with right chart.¶
# Calculate the distribution of zodiac signs
zodiac_distribution = df['sign'].value_counts()
# Plot a pie chart
plt.figure(figsize=(10, 8))
zodiac_distribution.plot.pie(
autopct='%1.1f%%',
startangle=140,
colors=plt.cm.Pastel1.colors,
wedgeprops={'edgecolor': 'black'}
)
plt.title("Zodiac Sign Distribution Across the Platform")
plt.ylabel("") # Remove y-axis label for better visualization
plt.show()
- Conclusion: With twelve similarly sized slices, a pie chart is hard to read; small differences between signs are nearly invisible. A sorted horizontal bar chart is the better choice here, since sign frequencies differ only modestly and bars make the ranking legible.
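A sketch of the replacement chart. The counts below are toy, hypothetical values; the real zodiac_distribution Series computed above would be passed in exactly the same way:

```python
import matplotlib
matplotlib.use("Agg")  # non-interactive backend for scripted use
import matplotlib.pyplot as plt
import pandas as pd

# Hypothetical sign counts standing in for df['sign'].value_counts()
zodiac_distribution = pd.Series(
    {"leo": 130, "gemini": 125, "libra": 118, "capricorn": 96},
    name="count",
)
fig, ax = plt.subplots(figsize=(8, 4))
# Sorted horizontal bars make the most/least represented signs obvious
zodiac_distribution.sort_values().plot.barh(ax=ax, color="skyblue", edgecolor="black")
ax.set_title("Zodiac Sign Distribution Across the Platform")
ax.set_xlabel("Number of Users")
plt.tight_layout()
plt.show()
```

Unlike the pie, the bar lengths can be compared directly, so a 1-2 percentage-point gap between signs is still visible.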
2. How does sign vary across gender and status? Are there noticeable patterns or imbalances?¶
# Group by gender and status, then calculate the distribution of signs
sign_gender_status = df.groupby(['gender', 'status', 'sign']).size().reset_index(name='count')
# Create a heatmap to visualize the distribution of signs by gender and status
pivot_table = sign_gender_status.pivot_table(
index='sign',
columns=['gender', 'status'],
values='count',
aggfunc='sum',
fill_value=0
)
# Plot the heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(
pivot_table,
annot=True,
fmt='d',
cmap='coolwarm',
cbar_kws={'label': 'Count'}
)
plt.title("Zodiac Sign Distribution by Gender and Status")
plt.xlabel("Gender and Status")
plt.ylabel("Zodiac Sign")
plt.xticks(rotation=45, ha='right')
plt.show()
- Conclusion: Grouping by gender, status, and sign and pivoting into a sign-by-(gender, status) matrix lets the heatmap expose the pattern: the "single" columns dwarf everything else for both genders, and within them the counts are fairly even across signs. Once the dominance of singles and the 60/40 gender split are accounted for, no sign is strikingly over- or under-represented.