Bumble Milestone¶
Bumble is a popular dating app where users connect based on mutual interests. This milestone focuses on analyzing Bumble’s user data to identify trends and improve matchmaking. The analysis involves cleaning the data by handling missing values and outliers, processing it to create new features, and exploring demographics and lifestyle habits. Visualizations will be used to highlight key insights, aiming to optimize user experience and inform business decisions.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df=pd.read_csv('bumble.csv')
df.head(5)
| | age | status | gender | body_type | diet | drinks | education | ethnicity | height | income | job | last_online | location | pets | religion | sign | speaks |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | single | m | a little extra | strictly anything | socially | working on college/university | asian, white | 75.0 | -1 | transportation | 2012-06-28-20-30 | south san francisco, california | likes dogs and likes cats | agnosticism and very serious about it | gemini | english |
1 | 35 | single | m | average | mostly other | often | working on space camp | white | 70.0 | 80000 | hospitality / travel | 2012-06-29-21-41 | oakland, california | likes dogs and likes cats | agnosticism but not too serious about it | cancer | english (fluently), spanish (poorly), french (... |
2 | 38 | available | m | thin | anything | socially | graduated from masters program | NaN | 68.0 | -1 | NaN | 2012-06-27-09-10 | san francisco, california | has cats | NaN | pisces but it doesn’t matter | english, french, c++ |
3 | 23 | single | m | thin | vegetarian | socially | working on college/university | white | 71.0 | 20000 | student | 2012-06-28-14-22 | berkeley, california | likes cats | NaN | pisces | english, german (poorly) |
4 | 29 | single | m | athletic | NaN | socially | graduated from college/university | asian, black, other | 66.0 | -1 | artistic / musical / writer | 2012-06-27-21-26 | san francisco, california | likes dogs and likes cats | NaN | aquarius | english |
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59946 entries, 0 to 59945
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   age          59946 non-null  int64
 1   status       59946 non-null  object
 2   gender       59946 non-null  object
 3   body_type    54650 non-null  object
 4   diet         35551 non-null  object
 5   drinks       56961 non-null  object
 6   education    53318 non-null  object
 7   ethnicity    54266 non-null  object
 8   height       59943 non-null  float64
 9   income       59946 non-null  int64
 10  job          51748 non-null  object
 11  last_online  59946 non-null  object
 12  location     59946 non-null  object
 13  pets         40025 non-null  object
 14  religion     39720 non-null  object
 15  sign         48890 non-null  object
 16  speaks       59896 non-null  object
dtypes: float64(1), int64(2), object(14)
memory usage: 7.8+ MB
df.columns
Index(['age', 'status', 'gender', 'body_type', 'diet', 'drinks', 'education', 'ethnicity', 'height', 'income', 'job', 'last_online', 'location', 'pets', 'religion', 'sign', 'speaks'], dtype='object')
df.index
RangeIndex(start=0, stop=59946, step=1)
missing_values = df.isnull().sum()
missing_values
We used isnull().sum() to determine the total number of missing values in each column. From the results, we can see that all columns except age, status, gender, income, last_online, and location have missing values in the dataset.
missing_values = (df.isnull().sum() / len(df)) * 100
missing_values[missing_values > 0]
body_type     8.834618
diet         40.694959
drinks        4.979482
education    11.056618
ethnicity     9.475194
height        0.005005
job          13.675641
pets         33.231575
religion     33.740366
sign         18.443266
speaks        0.083408
dtype: float64
This tells us what percentage of data is missing for each column.
2 . Are there columns where more than 50% of the data is missing? Drop those columns where missing values are >50%.
Conclusion : From the above output, no column has more than 50% missing values, so we do NOT need to drop any columns. We should fill the missing values instead of removing columns.
3. Missing numerical data (e.g., height, income) should be handled by imputing the median value of height and income for the corresponding category, such as gender, age group, or location. This ensures that the imputed values are contextually relevant and reduce potential biases in the analysis.
print(df.groupby("gender")["height"].median())
calculate_median = df.groupby("gender")["height"].transform("median")
df["height"] =df["height"].fillna(calculate_median)
print(calculate_median)
print("Missing values in height after filling:", df["height"].isnull().sum())
gender
f    65.0
m    70.0
Name: height, dtype: float64
0        70.0
1        70.0
2        70.0
3        70.0
4        70.0
         ...
59941    65.0
59942    70.0
59943    70.0
59944    70.0
59945    70.0
Name: height, Length: 59946, dtype: float64
Missing values in height after filling: 0
We used groupby("gender")["height"].transform("median") to calculate the median height separately for males and females, so that each row gets the median of its own gender. Then we used fillna(calculate_median) to replace the missing height values with these gender-specific medians. Finally, we printed the updated medians with groupby("gender")["height"].median() and used isnull().sum() to verify that all missing values were successfully filled.
2 . Data Types¶
Accurate data types are critical for meaningful analysis and visualization. For example, numeric fields like income or height must be stored as numbers for statistical computations, while dates like last_online must be converted to datetime format for time-based calculations.
1 . Are there any inconsistencies in the data types across columns (e.g., numerical data stored as strings)?¶
df.dtypes
age             int64
status         object
gender         object
body_type      object
diet           object
drinks         object
education      object
ethnicity      object
height        float64
income          int64
job            object
last_online    object
location       object
pets           object
religion       object
sign           object
speaks         object
dtype: object
Checked the data types of all columns and found no major inconsistencies apart from last_online, which is stored as an object (string). Since last_online represents date and time, it should be converted to datetime format to enable time-based calculations and analysis.
2 . Which columns require conversion to numerical datatypes for proper analysis (e.g., income)?
There are no columns that require conversion to numerical data type.
3 . Does the last_online column need to be converted into a datetime format? What additional insights can be gained by analyzing this as a date field?
df["last_online"]= pd.to_datetime(df["last_online"] , format= "%Y-%m-%d-%H-%M")
df["last_online"]
0                             NaT
1                             NaT
2       2012-06-27 19:00:00+00:00
3       2012-06-29 12:00:00+00:00
4                             NaT
                  ...
59941                         NaT
59942   2012-06-29 12:00:00+00:00
59943                         NaT
59944   2012-06-23 14:00:00+00:00
59945                         NaT
Name: last_online, Length: 59946, dtype: datetime64[ns, UTC]
The last_online column is stored as an object (string), so we convert it to datetime format using to_datetime() with a matching format string.
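Once last_online is a datetime, time-based features become possible. As a rough sketch (days_since_last_online is a hypothetical new column, and the latest timestamp in the dataset is used as the reference point), we could measure how recently each user was active, which helps separate active users from dormant profiles:
# Sketch: recency of activity relative to the most recent login in the dataset
reference_date = df["last_online"].max()
df["days_since_last_online"] = (reference_date - df["last_online"]).dt.days
print(df["days_since_last_online"].describe())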
3. Outliers
Outliers are extreme values in the dataset that can distort averages, correlations, and overall trends. In the context of Bumble, an outlier in age (e.g., 110 years old) or income (e.g., $1,000,000 or -1) could represent errors or rare, valid cases.
1. Are there any apparent outliers in numerical columns such as age, height, or income? What are the ranges of values in these columns?
df.describe()
| | age | height | income |
---|---|---|---|
count | 59946.000000 | 59946.000000 | 59946.000000 |
mean | 32.340290 | 68.295282 | 20033.222534 |
std | 9.452779 | 3.994738 | 97346.192104 |
min | 18.000000 | 1.000000 | -1.000000 |
25% | 26.000000 | 66.000000 | -1.000000 |
50% | 30.000000 | 68.000000 | -1.000000 |
75% | 37.000000 | 71.000000 | -1.000000 |
max | 110.000000 | 95.000000 | 1000000.000000 |
Yes, there are outliers in the numerical columns, specifically in age, height, and income.
Age: Some users have extremely high values (above 100).
Height: Contains impossibly low values (1 inch) and extremely high values (95 inches).
Income: Has placeholder values (-1) and unusually high values ($1,000,000).
print("Age Range:", df["age"].min(), "to", df["age"].max())
print("Height Range:", df["height"].min(), "to", df["height"].max())
print("Income Range:", df["income"].min(), "to", df["income"].max())
Age Range: 18 to 110
Height Range: 1.0 to 95.0
Income Range: -1 to 1000000
2 . Any -1 values in numerical columns like income should be replaced with 0, as they may represent missing or invalid data.¶
df["income"] = df["income"].replace(-1, 0)
print(df["income"].value_counts())
income
0          48442
20000       2952
100000      1621
80000       1111
30000       1048
40000       1005
50000        975
60000        736
70000        707
150000       631
1000000      521
250000       149
500000        48
Name: count, dtype: int64
Used replace(-1, 0) to replace all -1 values with 0 in the income column, as -1 likely represents missing or invalid data. Then, we used value_counts() to count the occurrences of each unique value in the column, helping us verify that all -1 values were successfully replaced with 0
3 . For other outliers, rather than deleting them, calculate the mean and median values using only the middle 80% of the data (removing extreme high and low values). This approach ensures that outliers do not disproportionately impact the analysis while retaining as much meaningful data as possible.¶
df[["income", "age", "height"]] = df[["income", "age", "height"]].replace(-1, 0)
column_names = ["income", "age", "height"]
for column in column_names:
    Q1 = df[column].quantile(0.10)  # 10th percentile
    Q3 = df[column].quantile(0.90)  # 90th percentile
    filtered_values = df[(df[column] > Q1) & (df[column] < Q3)]
    mean_values = filtered_values[column].mean()
    median_values = filtered_values[column].median()
    print(f"\n{column}: Filtered Mean = {mean_values}, Filtered Median = {median_values}")
income: Filtered Mean = 26109.89010989011, Filtered Median = 20000.0
age: Filtered Mean = 31.357685563997663, Filtered Median = 30.0
height: Filtered Mean = 68.25442646465552, Filtered Median = 68.0
Here we used quantile(0.10) and quantile(0.90) to keep only the middle 80% of values before computing mean() and median().
4 . Missing Data Visualization¶
Visualizing missing data helps identify patterns of incompleteness in the dataset, which can guide data cleaning strategies. Understanding which columns have high levels of missing data ensures decisions about imputation or removal are well-informed.
1 . Create a heatmap to visualize missing values across the dataset. Which columns show consistent missing data patterns?¶
plt.figure(figsize=(12, 6))
sns.heatmap(df.isnull(), cmap="viridis", cbar=False, yticklabels=False)
plt.title("Missing Values Heatmap", fontsize=14)
plt.show()
This heatmap visually represents missing data in the dataset:
- Yellow lines → Represent missing values in the respective column.
- Purple areas → Represent non-missing values.
- Columns like diet, job, religion and pets have a high proportion of missing values.
Part 2 : Data Processing¶
1 . Binning and Grouping¶
Grouping continuous variables, such as age or income, into bins helps simplify analysis and identify trends among specific groups. For instance, grouping users into age ranges can reveal distinct patterns in behavior or preferences across demographics.
1 .Bin the age column into categories such as "18-25", "26-35", "36-45", and "46+" to create a new column, age_group. How does the distribution of users vary across these age ranges?¶
bins = [18, 25, 35, 45, df["age"].max()]
labels = ["18-25", "26-35", "36-45", "46+"]
df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels, right=True, include_lowest=True)  # include_lowest keeps 18-year-olds in the "18-25" bin
df[["age", "age_group"]]
| | age | age_group |
---|---|---|
0 | 22 | 18-25 |
1 | 35 | 26-35 |
2 | 38 | 36-45 |
3 | 23 | 18-25 |
4 | 29 | 26-35 |
... | ... | ... |
59941 | 59 | 46+ |
59942 | 24 | 18-25 |
59943 | 42 | 36-45 |
59944 | 27 | 26-35 |
59945 | 39 | 36-45 |
59946 rows × 2 columns
Here we specify the bins and labels for age, then use pd.cut() to categorize the age column in the DataFrame into the defined bins.
plt.figure(figsize=(8,5))
sns.countplot(x=df["age_group"])
plt.title("Age Group Distribution", fontsize=14)
plt.xlabel("Age Group", fontsize=12)
plt.ylabel("Number of Users", fontsize=12)
plt.show()
- We categorized ages into four meaningful groups to analyze user demographics.
- A countplot was used to visualize the age distribution.
- The majority of users are between 26-35, suggesting Bumble is most popular among young adults.
2 . Group income into categories like "Low Income", "Medium Income" and "High Income" based on meaningful thresholds (e.g., quartiles). What insights can be derived from these groups?¶
import numpy as np
Q1 = df["income"].quantile(0.25)
Q2 = df["income"].quantile(0.50)
Q3 = df["income"].quantile(0.90)
df["income_category"] = np.where(df["income"] <= Q1, "Low Income",
np.where(df["income"] <= Q2, "Medium Income", "High Income"))
distribution_count = df["income_category"].value_counts()
print(distribution_count)
income_category
Low Income     48442
High Income    11504
Name: count, dtype: int64
df[['income','income_category']]
| | income | income_category |
---|---|---|
0 | 0 | Low Income |
1 | 80000 | High Income |
2 | 0 | Low Income |
3 | 20000 | High Income |
4 | 0 | Low Income |
... | ... | ... |
59941 | 0 | Low Income |
59942 | 0 | Low Income |
59943 | 100000 | High Income |
59944 | 0 | Low Income |
59945 | 0 | Low Income |
59946 rows × 2 columns
- Here we grouped income into "Low Income", "Medium Income" and "High Income" using the 25th, 50th and 90th percentiles, and created the new column with np.where().
- Because roughly 80% of users have an income of 0 (originally -1), the 25th and 50th percentiles are both 0, so no user falls into "Medium Income" with these thresholds; an alternative binning based only on reported incomes is sketched below.
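A minimal sketch of that alternative, computing quartile thresholds only from users who reported a non-zero income (income_bracket is a hypothetical new column; unreported incomes are left as NaN):
# Sketch: bin only reported (non-zero) incomes using their 25th/75th percentiles
reported = df.loc[df["income"] > 0, "income"]
low_cut = reported.quantile(0.25)
high_cut = reported.quantile(0.75)
df["income_bracket"] = pd.cut(reported, bins=[0, low_cut, high_cut, reported.max()],
                              labels=["Low Income", "Medium Income", "High Income"])
print(df["income_bracket"].value_counts(dropna=False))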
1 . Create a new feature, profile_completeness, by calculating the percentage of non-missing values for each user profile. How complete are most user profiles, and how does completeness vary across demographics?¶
df["profile_completeness"] = df.notnull().sum(axis=1) / df.shape[1] * 100
print(df["profile_completeness"])
print(df.groupby("status")["profile_completeness"].mean())
0         94.736842
1         94.736842
2         84.210526
3         94.736842
4         84.210526
            ...
59941     78.947368
59942    100.000000
59943     89.473684
59944    100.000000
59945     89.473684
Name: profile_completeness, Length: 59946, dtype: float64
status
available         88.175533
married           87.589134
seeing someone    88.012546
single            87.622975
unknown           80.000000
Name: profile_completeness, dtype: float64
Profile completeness was calculated by dividing non-missing values by total columns and converting it to a percentage.
Users marked as "available" had the highest completeness (88.18%), indicating they are more engaged.
Users with an "unknown" status had the lowest completeness (80.00%), suggesting missing details or less engagement; completeness by gender and age group is sketched below.
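As a short sketch of that demographic breakdown (reusing the age_group column created earlier), average completeness can also be grouped by gender and by age group:
# Sketch: average profile completeness by gender and by age group
print(df.groupby("gender")["profile_completeness"].mean())
print(df.groupby("age_group", observed=False)["profile_completeness"].mean())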
3 . Unit Conversion¶
Standardizing units across datasets is essential for consistency, especially when working with numerical data. In the context of the Bumble dataset, users’ heights are given in inches, which may not be intuitive for all audiences.
1 . Convert the height column from inches to centimeters using the conversion factor (1 inch = 2.54 cm). Store the converted values in a new column, height_cm.¶
df["height_cm"] = df["height"] * 2.54
print(df[["height", "height_cm"]])
       height  height_cm
0        75.0     190.50
1        70.0     177.80
2        68.0     172.72
3        71.0     180.34
4        66.0     167.64
...       ...        ...
59941    62.0     157.48
59942    72.0     182.88
59943    71.0     180.34
59944    73.0     185.42
59945    68.0     172.72
[59946 rows x 2 columns]
- Here we convert the height column from inches to centimeters.
- We multiply df['height'] by 2.54 to convert it to centimeters.
Part 3 : Data Analysis¶
1 . Demographic Analysis¶
Understanding the demographics of users is essential for tailoring marketing strategies, improving user experience, and designing features that resonate with the platform’s audience. Insights into gender distribution, orientation, and relationship status can help Bumble refine its matchmaking algorithms and engagement campaigns.
1. What is the gender distribution (gender) across the platform? Are there any significant imbalances?¶
gender_distribution = df["gender"].value_counts()
print(gender_distribution)
gender
m    35829
f    24117
Name: count, dtype: int64
- We can see that the number of male users (35,829) is higher than the number of female users (24,117).
2 . What are the proportions of users in different status categories (e.g., single, married, seeing someone)? What does this suggest about the platform’s target audience?¶
status_distribution = df["status"].value_counts(normalize=True) * 100
print(status_distribution)
status
single            92.911954
seeing someone     3.443099
available          3.111133
married            0.517132
unknown            0.016682
Name: proportion, dtype: float64
- 92.91% of users are "Single", confirming that Bumble is primarily a platform for singles seeking connections.
- We used value_counts() to calculate the distribution of users across different status categories.
- The analysis shows that 92.91% of users are single, which is significantly high.
- Since the majority of users are single, marketing efforts should target this group by highlighting features that cater to serious relationships, casual dating, and meaningful connections.
- The low percentage of "Married" (0.52%) and "Seeing Someone" (3.44%) suggests that Bumble is not widely used by those in committed relationships.
3 . How does status vary by gender? For example, what proportion of men and women identify as single?¶
status_by_gender = df.groupby("gender")["status"].value_counts(normalize=True) * 100
print(status_by_gender)
gender  status
f       single            92.544678
        seeing someone     4.158892
        available          2.720073
        married            0.559771
        unknown            0.016586
m       single            93.159173
        available          3.374362
        seeing someone     2.961288
        married            0.488431
        unknown            0.016746
Name: proportion, dtype: float64
status_gender_distribution = df.groupby(["status", "gender"]).size() / len(df) * 100
status_gender_df = status_gender_distribution.reset_index(name="Percentage")
status_gender_df_sorted = status_gender_df.sort_values(by="Percentage", ascending=False)
print(status_gender_df_sorted)
           status gender  Percentage
7          single      m   55.680112
6          single      f   37.231842
1       available      m    2.016815
5  seeing someone      m    1.769926
4  seeing someone      f    1.673173
0       available      f    1.094318
3         married      m    0.291929
2         married      f    0.225203
9         unknown      m    0.010009
8         unknown      f    0.006673
- We grouped users by status and gender to analyze their distribution.
- Single men account for 55.68% of all users and single women for 37.23%, while within each gender about 93% of men and 92.5% of women are single, confirming that Bumble's primary audience is singles, with more male users overall.
- The "Married" and "Unknown" categories are extremely low (<1%), reinforcing that Bumble is mainly used by singles.
1 . What are the correlations between numerical columns such as age, income, gender? Are there any strong positive or negative relationships?¶
correlation_matrix = df[["age", "income", "height"]].corr()
print(correlation_matrix)
             age    income    height
age     1.000000 -0.001004 -0.022253
income -0.001004  1.000000  0.065048
height -0.022253  0.065048  1.000000
- There is no meaningful correlation between Age and Income (-0.001), meaning age does not impact earnings in this dataset.
- Age and Height (-0.022) show a negligible negative relationship, suggesting older users may be slightly shorter, but it's not a strong trend.
- Income and Height (0.065) have a weak positive correlation, meaning taller users might have slightly higher incomes, but the effect is minimal.
- Overall, no strong positive or negative correlations exist, indicating numerical variables in this dataset are largely independent.
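To make these relationships easier to scan, the same matrix could also be drawn as an annotated heatmap; a quick sketch using the correlation_matrix computed above:
plt.figure(figsize=(6, 4))
sns.heatmap(correlation_matrix, annot=True, fmt=".3f", cmap="coolwarm", vmin=-1, vmax=1)
plt.title("Correlation Between Age, Income and Height")
plt.show()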
3 . How does age correlate with income? Are older users more likely to report higher income levels?¶
correlation = df["age"].corr(df["income"])
print(correlation)
-0.0010038681910053897
- Correlation is -0.0010 (almost 0), meaning no significant relationship between age and income.
- Older users are not necessarily earning more, as income levels appear independent of age in this dataset.
3. Diet and Lifestyle Analysis¶
Lifestyle attributes such as diet, drinks provide insights into user habits and preferences. Analyzing these factors helps identify compatibility trends and inform product features like filters or match recommendations.
1 . How do dietary preferences (diet) distribute across the platform? For example, what percentage of users identify as vegetarian, vegan, or follow "anything" diets?¶
# Calculate the percentage of users in each diet category
diet_distribution = df["diet"].value_counts(normalize=True) * 100
# Print the distribution
print(diet_distribution)
diet
mostly anything        46.651290
anything               17.391916
strictly anything      14.382155
mostly vegetarian       9.687491
mostly other            2.832550
strictly vegetarian     2.461253
vegetarian              1.876178
strictly other          1.271413
mostly vegan            0.950747
other                   0.931057
strictly vegan          0.641332
vegan                   0.382549
mostly kosher           0.241906
mostly halal            0.135017
strictly halal          0.050631
strictly kosher         0.050631
halal                   0.030941
kosher                  0.030941
Name: proportion, dtype: float64
- value_counts(normalize=True) * 100 calculates the percentage of users in each dietary category.
Analysis¶
- The majority of users (46.65%) identify as "mostly anything", showing a flexible approach to diet.
- Vegetarian and vegan categories are far smaller, with "mostly vegetarian" at 9.69% and "vegan" at only 0.38%.
- Kosher and halal diets are rare, each making up less than 1% of users.
- Bumble could enhance the user experience by grouping similar diet types (e.g., merging the "mostly" and "strictly" variants into their base diet) for better search filters and matchmaking; a sketch of that grouping follows below.
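A rough sketch of that grouping, stripping the "mostly"/"strictly" qualifiers so every user maps to a base diet (diet_base is a hypothetical new column):
# Sketch: collapse "mostly ..." and "strictly ..." variants into their base diet
df["diet_base"] = (df["diet"]
                   .str.replace("mostly ", "", regex=False)
                   .str.replace("strictly ", "", regex=False))
print(df["diet_base"].value_counts(normalize=True) * 100)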
2 . How do drinking habits (drinks) vary across different diet categories? Are users with stricter diets (e.g., vegan) less likely to drink?¶
drink_percentage_by_diet = df.groupby("diet")["drinks"].size() / len(df) * 100
drinks_by_diet_df = drink_percentage_by_diet.reset_index(name="Drink Percentage")
sorted_drinks_df = drinks_by_diet_df.sort_values(by="Drink Percentage", ascending=False)
print(sorted_drinks_df)
                   diet  Drink Percentage
3       mostly anything         27.666567
0              anything         10.314283
10    strictly anything          8.529343
8     mostly vegetarian          5.745171
6          mostly other          1.679845
15  strictly vegetarian          1.459647
17           vegetarian          1.112668
13       strictly other          0.754012
7          mostly vegan          0.563841
9                 other          0.552164
14       strictly vegan          0.380342
16                vegan          0.226871
5         mostly kosher          0.143462
4          mostly halal          0.080072
11       strictly halal          0.030027
12      strictly kosher          0.030027
1                 halal          0.018350
2                kosher          0.018350
- The code uses groupby("diet") with .size() to count the rows in each diet group, then divides by the total number of users (len(df)) and multiplies by 100.
- Because .size() counts every user in a diet group whether or not they reported a drinks value, these figures are each diet's share of the overall user base rather than a true drinking rate; a cross-tab of drinking habits within each diet is sketched after the analysis below.
- The result is stored in a DataFrame using reset_index(), sorted in descending order with sort_values(), and printed.
Analysis¶
- "Mostly anything" is by far the largest diet group (27.67% of all users), followed by "anything" (10.31%) and "strictly anything" (8.53%), so most drinkers on the platform necessarily come from these flexible-diet groups.
- Vegetarian and vegan groups are far smaller ("vegetarian" 1.11% and "vegan" 0.23% of all users), so a within-diet comparison (see the sketch below) is needed before concluding that stricter diets drink less.
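A minimal sketch of that within-diet breakdown, using a cross-tabulation normalized by row so each diet's drinking habits sum to 100%:
# Sketch: share of each drinking habit within every diet category
drinks_within_diet = pd.crosstab(df["diet"], df["drinks"], normalize="index") * 100
print(drinks_within_diet.round(2))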
4 . Geographical Insights¶
Analyzing geographical data helps Bumble understand its user base distribution, enabling targeted regional campaigns and feature localization. For instance, identifying the top cities with active users can guide marketing efforts in those areas.
1 . Extract city and state information from the location column. What are the top 5 cities and states with the highest number of users?¶
df[['city','state']] = df['location'].str.split(', ',expand = True, n=1)
df[['city','state']].head(5)
| | city | state |
---|---|---|
0 | south san francisco | california |
1 | oakland | california |
2 | san francisco | california |
3 | berkeley | california |
4 | san francisco | california |
top_cities = df["city"].value_counts().head(5)
# Count top 5 states with the highest number of users
top_states = df["state"].value_counts().head(5)
print("Top 5 Cities with Most Users:\n", top_cities)
print("\nTop 5 States with Most Users:\n", top_states)
Top 5 Cities with Most Users:
city
san francisco    31064
oakland           7214
berkeley          4212
san mateo         1331
palo alto         1064
Name: count, dtype: int64

Top 5 States with Most Users:
state
california       59855
new york            17
illinois             8
massachusetts        5
texas                4
Name: count, dtype: int64
- The code splits the location column into city and state using str.split(', ', n=1).
- It then counts the top 5 most frequent cities and states using value_counts().
- Finally, it prints the top 5 cities and states with the highest number of users.
Analysis¶
- San Francisco dominates with 31,064 users, making it the most active city on Bumble.
- Oakland (7,214) and Berkeley (4,212) follow, showing high engagement in the Bay Area.
- California overwhelmingly leads with 59,855 users, far surpassing other states.
- New York, Illinois, Massachusetts, and Texas have significantly lower user counts, suggesting Bumble's user base is heavily concentrated in California.
- Bumble may benefit from expanding marketing efforts in underrepresented states to balance user distribution.
2 . How does age vary across the top cities? Are certain cities dominated by younger or older users?¶
city_with_average_age = df.groupby("city")["age"].mean()
print(f"Cities with high average age:\n{city_with_average_age.sort_values(ascending=False).head(5)}\n")
print(f"Cities with low average age:\n{city_with_average_age.sort_values(ascending=True).head(5)}")
Cities with high average age:
city
forest knolls     62.5
bellingham        59.0
port costa        53.0
seaside           50.0
redwood shores    47.0
Name: age, dtype: float64

Cities with low average age:
city
canyon country    19.0
canyon            19.0
long beach        19.0
isla vista        19.0
fayetteville      20.0
Name: age, dtype: float64
- The code groups the data by city and calculates the average age using groupby('city')['age'].mean().
- It sorts the cities by average age with sort_values() and displays the top and bottom 5 cities using head(5).
- This shows the cities with the highest and lowest average ages; many of them are very small, so a minimum-user-count filter is sketched after the analysis below.
Analysis¶
- Older Cities: Forest Knolls (62.5), Bellingham (59.0), and Port Costa (53.0) have the highest average ages, suggesting that these locations have an older user demographic.
- Younger Cities: Canyon Country, Long Beach, and Isla Vista (all around 19 years old) have the youngest users, likely due to a strong student presence.
- Age Distribution: The data shows a clear divide, with some cities being predominantly older, while others are dominated by younger users, impacting matchmaking trends.
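A quick sketch of that filter, keeping only cities with at least 100 users (an arbitrary threshold) before comparing average ages:
# Sketch: compare average ages only across cities with a reasonable sample size
city_counts = df["city"].value_counts()
large_cities = city_counts[city_counts >= 100].index
avg_age_large_cities = df[df["city"].isin(large_cities)].groupby("city")["age"].mean()
print(avg_age_large_cities.sort_values(ascending=False).head(5))
print(avg_age_large_cities.sort_values().head(5))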
3 . What are the average income levels in the top states or cities? Are there regional patterns in reported income?¶
top_states = df["state"].value_counts().head(5).index
avg_income_states = df[df["state"].isin(top_states)].groupby("state")["income"].mean().sort_values(ascending=False)
print("Average Income in Top 5 States:\n", avg_income_states)
Average Income in Top 5 States:
state
new york         31764.705882
california       20044.273661
massachusetts     6000.000000
texas             5000.000000
illinois             0.000000
Name: income, dtype: float64
Analysis¶
- New York has the highest average income (31,764), followed by California (20,044), suggesting higher earnings in major metropolitan areas.
- Illinois shows an average income of 0, likely because none of its handful of users reported an income; a comparison using only reported (non-zero) incomes is sketched below.
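A minimal sketch of that comparison, using the median of reported (non-zero) incomes for the same top 5 states:
# Sketch: median reported income per state, ignoring the 0 placeholder
reported_income = df[df["income"] > 0]
median_income_states = reported_income[reported_income["state"].isin(top_states)].groupby("state")["income"].median()
print("Median reported income in top 5 states:\n", median_income_states)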
5 . Height Analysis¶
Physical attributes like height are often considered important in dating preferences. Analyzing height patterns helps Bumble understand user demographics and preferences better.
1 . What is the average height of users across different gender categories?¶
height_gender = df.groupby("gender")["height"].mean()
print(height_gender)
gender
f    65.103869
m    70.443468
Name: height, dtype: float64
2 . How does height vary by age_group? Are there noticeable trends among younger vs. older users?¶
avg_height_by_age_group = df.groupby("age_group", observed=False)["height"].mean()
print(avg_height_by_age_group)
age_group
18-25    68.224532
26-35    68.406764
36-45    68.325095
46+      67.941167
Name: height, dtype: float64
- Consistent Heights: Users across all age groups have similar average heights, ranging from 67.94 to 68.41 inches.
- Slight Variation: The 26-35 age group has the highest average height (68.41), while the 46+ group has the lowest (67.94)
3 . What is the distribution of height within body_type categories (e.g., athletic, curvy, thin)? Do the distributions align with expectations?¶
height_by_body_type = df.groupby("body_type")["height"].mean().sort_values(ascending=False)
print("Height Distribution by Body Type:\n", height_by_body_type)
Height Distribution by Body Type:
body_type
athletic          69.707336
jacked            69.292162
used up           69.180282
overweight        68.948198
a little extra    68.820084
fit               68.546062
skinny            68.544176
average           68.100805
thin              67.866058
rather not say    67.272727
full figured      66.464817
curvy             65.210245
Name: height, dtype: float64
- Athletic users have the highest average height (69.7 inches), which aligns with expectations as taller individuals may be more likely to have an athletic build.
- Curvy users have the shortest average height (65.2 inches), supporting the general assumption that curvy body types may be associated with shorter stature.
- Overall, the distributions align with expectations, as athletic users tend to be taller, while curvier users tend to be shorter.
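Since the table above only shows means, a boxplot would reveal the full spread of heights within each body type; a quick sketch, ordered by the means computed above:
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x="body_type", y="height", order=height_by_body_type.index)
plt.xticks(rotation=45)
plt.title("Height Distribution by Body Type")
plt.show()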
6. Income Analysis¶
Income is often an important factor for users on dating platforms. Understanding its distribution and relationship with other variables helps refine features like user search filters or personalized recommendations.
1 . What is the distribution of income across the platform? Are there specific income brackets that dominate? (don't count 0)¶
income_without_zero = df[df['income'] != 0]
income_without_zero['income'].value_counts().sort_values(ascending = False)
income
20000      2952
100000     1621
80000      1111
30000      1048
40000      1005
50000       975
60000       736
70000       707
150000      631
1000000     521
250000      149
500000       48
Name: count, dtype: int64
- The dominant income bracket is 20,000, reported by 2,952 users.
2 . How does income vary by age_group and gender? Are older users more likely to report higher incomes?¶
average_income_by_age_gender = df.groupby(["age_group", "gender"])["income"].mean()
print(f"Income Distribution Across Age Groups and Genders:\n\n{average_income_by_age_gender}")
Analysis¶
- Men consistently report higher incomes than women across all age groups.
- The highest reported male income is in the 36-45 age group (27,787), while female incomes remain relatively stable around 11,000 across all age groups.
1 . Plot a histogram of age with a vertical line indicating the mean age. What does the distribution reveal about the most common age group on the platform?¶
plt.hist(df["age"], color="orange", edgecolor="black")
mean_age = df["age"].mean()
plt.axvline(mean_age, color="black", label="mean age")
plt.xlabel("Age")  # the x-axis shows age values, not age_group
plt.ylabel("Number of Users")
plt.title("Distribution of age across the platform")
plt.legend()
plt.show()
2 . How does the age distribution differ by gender? Are there age groups where one gender is more prevalent?¶
plt.figure(figsize=(10, 5))
sns.histplot(df, x="age", hue="gender", bins=20, kde=True, alpha=0.6)
plt.xlabel("Age")
plt.ylabel("User Count")
plt.title("Age Distribution by Gender")
# seaborn already builds the hue legend from the gender values; overriding it with manual labels risks mislabeling the groups
plt.show()
2. Income and Age¶
Visualizing the relationship between income and age helps uncover patterns in reported income levels across age groups, which could inform user segmentation strategies
1 . Use a scatterplot to visualize the relationship between income and age, with a trend line indicating overall patterns. Are older users more likely to report higher incomes?¶
plt.figure(figsize=(10, 6))
sns.regplot(data=df[df['income']>0], x='age', y='income', line_kws={'color': 'red'})
plt.xlabel('Age',fontsize =14)
plt.ylabel('Income',fontsize =14)
plt.title('Income Distribution Over Age with Trendline',fontsize =14)
plt.show()
A scatter plot was created using Seaborn's regplot to explore the relationship between age and income. The result indicates that most users, regardless of age, have similar income levels, with income not showing a clear increase as age progresses.
2 . Create boxplots of income grouped by age_group. Which age group reports the highest median income?¶
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(12, 6))
sns.boxplot(data=df[df['income'] > 0], x="age_group", y="income", showfliers=False)
plt.xlabel("Age Group", fontsize=14)
plt.ylabel("Income", fontsize=14)
plt.title("Income Distribution Over Age Groups", fontsize=14)
plt.show()
3 . Analyze income levels within gender and status categories. For example, are single men more likely to report higher incomes than single women?¶
plt.figure(figsize=(12, 6))
sns.barplot(data=df, x="status", y="income", hue="gender", errorbar=None, palette="Reds")
plt.xlabel("Status", fontsize=14)
plt.ylabel("Income", fontsize=14)
plt.title("Income by Gender and Status", fontsize=16)
plt.legend(title="Gender", fontsize=12)
plt.show()
3 . Pets and Preferences¶
Pets are often a key lifestyle preference and compatibility factor. Analyzing how pets preferences distribute across demographics can provide insights for filters or recommendations.
1 . Create a bar chart showing the distribution of pets categories (e.g., likes dogs, likes cats). Which preferences are most common?¶
plt.figure(figsize=(12,6))
pet_counts = df["pets"].value_counts()
sns.barplot(x=pet_counts.index, y=pet_counts.values, hue=pet_counts.index, palette="viridis", legend=False)
plt.xlabel("Pet Preferences", fontsize=14)
plt.ylabel("Count", fontsize=14)
plt.title("Distribution of Pet Preferences", fontsize=16)
plt.xticks(rotation=90)
plt.show()
- "Likes dogs and likes cats" is the most dominant category, indicating a strong preference for both pets.
- "Dislikes dogs" and "dislikes cats" are the least frequent, meaning most users have positive pet attitudes.
2 . How do pets preferences vary across gender and age_group? Are younger users more likely to report liking pets compared to older users?¶
pet_distribution_by_group = df.groupby(['age_group', 'gender', 'pets'], observed=False).size().reset_index(name="count")
plt.figure(figsize=(10, 6))
sns.barplot(data=pet_distribution_by_group, x="age_group", y="count", hue="pets")
plt.xlabel("Age Group", fontsize=14)
plt.ylabel("Number of Users", fontsize=14)
plt.title("Pet Preferences by Age Group and Gender", fontsize=16)
plt.legend(title="Pet Preference", fontsize=8)
plt.show()
4 . Signs and Personality¶
Users’ self-reported zodiac signs (sign) can offer insights into personality preferences or trends. While not scientifically grounded, analyzing this data helps explore fun and engaging patterns.
1 . Create a pie chart showing the distribution of zodiac signs (sign) across the platform. Which signs are most and least represented? Is this the right chart? If not, replace with right chart.¶
zodiac_counts = df["sign"].value_counts()
zodiac_counts.plot(kind='pie')
plt.title("Distribution of Zodiac Signs")
Text(0.5, 1.0, 'Distribution of Zodiac Signs')
The pie chart does not provide clear insights, making it an unsuitable choice for this representation. For a better comparison of sign popularity, a bar chart is clearer.
plt.figure(figsize=(12,6))
sns.countplot(data=df, x="sign", hue="sign", palette="Reds", legend=False)
plt.xticks(rotation = 90)
plt.title("Distribution of Zodiac Signs")
Text(0.5, 1.0, 'Distribution of Zodiac Signs')
2 . How does sign vary across gender and status? Are there noticeable patterns or imbalances?¶
import seaborn as sns
import matplotlib.pyplot as plt
zodiac_distribution = df.groupby(["gender", "status", "sign"]).size().reset_index(name="count")
plt.figure(figsize=(12, 6))
sns.barplot(data=zodiac_distribution, x="sign", y="count", hue="gender", errorbar=None, palette="coolwarm")
plt.xlabel("Zodiac Sign", fontsize=12)
plt.ylabel("Number of Users", fontsize=12)
plt.title("Zodiac Sign Distribution by Gender & Status", fontsize=14)
plt.xticks(rotation=90) # Rotate labels for readability
plt.legend(title="Gender")
plt.show()