In [1]:
print("LBRARIES USED ")
LBRARIES USED
In [3]:
#library for mathmetic functins
import numpy as np
#library for dataframes or tables
import pandas as pd
#library for visualization functins(charts,graphs)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
In [5]:
# Load the dataset
df = pd.read_csv("bumble.csv")
In [7]:
print("PART 1 : DATA CLEANING")
print("1. Inspecting Missing Data")
PART 1 : DATA CLEANING 1. Inspecting Missing Data
In [17]:
#Question 1: Which columns in the dataset have missing values, and what percentage of data is missing in each column?
missing_data = df.isnull().mean() * 100
missing_data.sort_values(ascending=False)
#Explanation:this code checks the percentage of missing values in each column and sorts them in descending order. The result will help us understand the magnitude of missing data in different fields.
Out[17]:
diet 40.694959 religion 33.740366 pets 33.231575 sign 18.443266 job 13.675641 education 11.056618 ethnicity 9.475194 body_type 8.834618 drinks 4.979482 speaks 0.083408 height 0.005005 last_online 0.000000 location 0.000000 income 0.000000 status 0.000000 gender 0.000000 age 0.000000 dtype: float64
In [19]:
#Question 2: Are there columns where more than 50% of the data is missing? Drop those columns where missing values are >50%.
df_cleaned = df.dropna(thresh=len(df)*0.5, axis=1)
#Explanation : Drop columns with more than 50% missing data from the table.
In [21]:
#Question 3: Missing numerical data (e.g., height, income) should be handled by imputing the median value of height and income for the corresponding category, such as gender, age group, or location.
df_cleaned['height'] = df_cleaned.groupby('gender')['height'].transform(lambda x: x.fillna(x.median()))
df_cleaned['income'] = df_cleaned.groupby('gender')['income'].transform(lambda x: x.fillna(x.median()))
df.head()
#Explanation : This code imputes the missing values of height and income by the median values based on the gender.
Out[21]:
age | status | gender | body_type | diet | drinks | education | ethnicity | height | income | job | last_online | location | pets | religion | sign | speaks | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | single | m | a little extra | strictly anything | socially | working on college/university | asian, white | 75.0 | -1 | transportation | 2012-06-28-20-30 | south san francisco, california | likes dogs and likes cats | agnosticism and very serious about it | gemini | english |
1 | 35 | single | m | average | mostly other | often | working on space camp | white | 70.0 | 80000 | hospitality / travel | 2012-06-29-21-41 | oakland, california | likes dogs and likes cats | agnosticism but not too serious about it | cancer | english (fluently), spanish (poorly), french (... |
2 | 38 | available | m | thin | anything | socially | graduated from masters program | NaN | 68.0 | -1 | NaN | 2012-06-27-09-10 | san francisco, california | has cats | NaN | pisces but it doesn’t matter | english, french, c++ |
3 | 23 | single | m | thin | vegetarian | socially | working on college/university | white | 71.0 | 20000 | student | 2012-06-28-14-22 | berkeley, california | likes cats | NaN | pisces | english, german (poorly) |
4 | 29 | single | m | athletic | NaN | socially | graduated from college/university | asian, black, other | 66.0 | -1 | artistic / musical / writer | 2012-06-27-21-26 | san francisco, california | likes dogs and likes cats | NaN | aquarius | english |
In [23]:
print("2.inspecting data types")
2.inspecting data types
In [25]:
#Question 1: Are there any inconsistencies in the data types across columns (e.g., numerical data stored as strings)?
df_cleaned.dtypes
#This code checks whether the given data types are correct or not
Out[25]:
age int64 status object gender object body_type object diet object drinks object education object ethnicity object height float64 income int64 job object last_online object location object pets object religion object sign object speaks object dtype: object
In [27]:
#Question 2: Which columns require conversion to numerical data types for proper analysis (e.g., income)?
df_cleaned['income'] = pd.to_numeric(df_cleaned['income'], errors='coerce')
df_cleaned['height'] = pd.to_numeric(df_cleaned['height'], errors='coerce')
df_cleaned.dtypes
#This code changes the data types for income and height into numeric data type if they are not in numeric type
Out[27]:
age int64 status object gender object body_type object diet object drinks object education object ethnicity object height float64 income int64 job object last_online object location object pets object religion object sign object speaks object dtype: object
In [33]:
#Question 3: Does the last_online column need to be converted into a datetime format? What additional insights can be gained by analyzing this as a date field?
df_cleaned['last_online'] = pd.to_datetime(df_cleaned['last_online'], errors='coerce',utc=True)
#This code changes the data type of 'last_online' column from 'object' to 'datetime' data type
In [35]:
df_cleaned.dtypes
Out[35]:
age int64 status object gender object body_type object diet object drinks object education object ethnicity object height float64 income int64 job object last_online datetime64[ns, UTC] location object pets object religion object sign object speaks object dtype: object
In [37]:
print("3.Outliers")
3.Outliers
In [39]:
#Question 1: Are there any apparent outliers in numerical columns such as age, height, or income? What are the ranges of values in these columns?
df_cleaned[['age', 'height', 'income']].describe()
#this code will display the ranges of the values like min,max,etc for the values age,height and income.
Out[39]:
age | height | income | |
---|---|---|---|
count | 59946.000000 | 59946.000000 | 59946.000000 |
mean | 32.340290 | 68.295282 | 20033.222534 |
std | 9.452779 | 3.994738 | 97346.192104 |
min | 18.000000 | 1.000000 | -1.000000 |
25% | 26.000000 | 66.000000 | -1.000000 |
50% | 30.000000 | 68.000000 | -1.000000 |
75% | 37.000000 | 71.000000 | -1.000000 |
max | 110.000000 | 95.000000 | 1000000.000000 |
In [41]:
#Question 2: Any -1 values in numerical columns like income should be replaced with 0, as they may represent missing or invalid data.
df_cleaned['income'] = df_cleaned['income'].replace(-1, 0)
df_cleaned.head()
#This code replaces all the values which where -1 to 0 i.e., a valid value
Out[41]:
age | status | gender | body_type | diet | drinks | education | ethnicity | height | income | job | last_online | location | pets | religion | sign | speaks | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | single | m | a little extra | strictly anything | socially | working on college/university | asian, white | 75.0 | 0 | transportation | NaT | south san francisco, california | likes dogs and likes cats | agnosticism and very serious about it | gemini | english |
1 | 35 | single | m | average | mostly other | often | working on space camp | white | 70.0 | 80000 | hospitality / travel | NaT | oakland, california | likes dogs and likes cats | agnosticism but not too serious about it | cancer | english (fluently), spanish (poorly), french (... |
2 | 38 | available | m | thin | anything | socially | graduated from masters program | NaN | 68.0 | 0 | NaN | 2012-06-27 19:00:00+00:00 | san francisco, california | has cats | NaN | pisces but it doesn’t matter | english, french, c++ |
3 | 23 | single | m | thin | vegetarian | socially | working on college/university | white | 71.0 | 20000 | student | 2012-06-29 12:00:00+00:00 | berkeley, california | likes cats | NaN | pisces | english, german (poorly) |
4 | 29 | single | m | athletic | NaN | socially | graduated from college/university | asian, black, other | 66.0 | 0 | artistic / musical / writer | NaT | san francisco, california | likes dogs and likes cats | NaN | aquarius | english |
In [43]:
#Question 3: For other outliers, calculate the mean and median values using only the middle 80% of the data (removing extreme high and low values).
q_low = df_cleaned['income'].quantile(0.1)
q_high = df_cleaned['income'].quantile(0.9)
df_cleaned = df_cleaned[(df_cleaned['income'] > q_low) & (df_cleaned['income'] < q_high)]
df_cleaned.shape
#this code removes the top 10% and bottem 10% of the data and gives the 80% of the data from the table.
Out[43]:
(5005, 17)
In [45]:
print("4.Missing Data Visualization")
4.Missing Data Visualization
In [47]:
#Question: Create a heatmap to visualize missing values across the dataset. Which columns show consistent missing data patterns?
plt.figure(figsize=(12,8)) #size of the map
sns.heatmap(df_cleaned.isnull(), cbar=False, cmap='viridis') #"cbar=false" which tells not to display the color bar and "cmap='viridis'" controls colors of the heat map
plt.show()
#This code shows the visual representation of the missing data as a heat map.
In [51]:
print("Part 2: Data Processing")
print("1. Binning and Grouping")
Part 2: Data Processing 1. Binning and Grouping
In [55]:
#Question: Bin the age column into categories such as "18-25", "26-35", "36-45", and "46+" to create a new column, age_group.
bins = [18, 25, 35, 45, 100]
labels = ['18-25', '26-35', '36-45', '46+']
df_cleaned['age_group'] = pd.cut(df_cleaned['age'], bins=bins, labels=labels , right=True)
df_cleaned.info()
#This code divides the age column into categories and the stores into a new column with name 'age_group'.
<class 'pandas.core.frame.DataFrame'> Index: 5005 entries, 3 to 59914 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 5005 non-null int64 1 status 5005 non-null object 2 gender 5005 non-null object 3 body_type 4726 non-null object 4 diet 3311 non-null object 5 drinks 4931 non-null object 6 education 4689 non-null object 7 ethnicity 4716 non-null object 8 height 5005 non-null float64 9 income 5005 non-null int64 10 job 4873 non-null object 11 last_online 1993 non-null datetime64[ns, UTC] 12 location 5005 non-null object 13 pets 4011 non-null object 14 religion 4056 non-null object 15 sign 4621 non-null object 16 speaks 5005 non-null object 17 age_group 4945 non-null category dtypes: category(1), datetime64[ns, UTC](1), float64(1), int64(2), object(13) memory usage: 708.9+ KB
In [57]:
print("2. Derived Features")
2. Derived Features
In [61]:
#Question: Create a new feature, profile_completeness, by calculating the percentage of non-missing values for each user profile.
df_cleaned['profile_completeness'] = df_cleaned.notnull().mean(axis=1) * 100
df_cleaned.info()
#This code show the completness of each profile within a column called 'profile_cpmleteness'.
<class 'pandas.core.frame.DataFrame'> Index: 5005 entries, 3 to 59914 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 5005 non-null int64 1 status 5005 non-null object 2 gender 5005 non-null object 3 body_type 4726 non-null object 4 diet 3311 non-null object 5 drinks 4931 non-null object 6 education 4689 non-null object 7 ethnicity 4716 non-null object 8 height 5005 non-null float64 9 income 5005 non-null int64 10 job 4873 non-null object 11 last_online 1993 non-null datetime64[ns, UTC] 12 location 5005 non-null object 13 pets 4011 non-null object 14 religion 4056 non-null object 15 sign 4621 non-null object 16 speaks 5005 non-null object 17 age_group 4945 non-null category 18 profile_completeness 5005 non-null float64 dtypes: category(1), datetime64[ns, UTC](1), float64(2), int64(2), object(13) memory usage: 748.0+ KB
In [63]:
print("3.Unit Conversion")
3.Unit Conversion
In [65]:
#Question: Convert the height column from inches to centimeters using the conversion factor (1 inch = 2.54 cm).
df_cleaned['height_cm'] = df_cleaned['height'] * 2.54
df_cleaned.info()
#This code converts the data from inches to centimeters and stores the data in the new column called 'height_cm'.
<class 'pandas.core.frame.DataFrame'> Index: 5005 entries, 3 to 59914 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 5005 non-null int64 1 status 5005 non-null object 2 gender 5005 non-null object 3 body_type 4726 non-null object 4 diet 3311 non-null object 5 drinks 4931 non-null object 6 education 4689 non-null object 7 ethnicity 4716 non-null object 8 height 5005 non-null float64 9 income 5005 non-null int64 10 job 4873 non-null object 11 last_online 1993 non-null datetime64[ns, UTC] 12 location 5005 non-null object 13 pets 4011 non-null object 14 religion 4056 non-null object 15 sign 4621 non-null object 16 speaks 5005 non-null object 17 age_group 4945 non-null category 18 profile_completeness 5005 non-null float64 19 height_cm 5005 non-null float64 dtypes: category(1), datetime64[ns, UTC](1), float64(3), int64(2), object(13) memory usage: 787.1+ KB
In [67]:
df_cleaned.head()
#This shows the new columns with the updated data from tho 5 rows
Out[67]:
age | status | gender | body_type | diet | drinks | education | ethnicity | height | income | job | last_online | location | pets | religion | sign | speaks | age_group | profile_completeness | height_cm | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3 | 23 | single | m | thin | vegetarian | socially | working on college/university | white | 71.0 | 20000 | student | 2012-06-29 12:00:00+00:00 | berkeley, california | likes cats | NaN | pisces | english, german (poorly) | 18-25 | 94.736842 | 180.34 |
11 | 28 | seeing someone | m | average | mostly anything | socially | graduated from college/university | white | 72.0 | 40000 | banking / financial / real estate | 2012-05-22 22:00:00+00:00 | daly city, california | likes cats | christianity and very serious about it | leo but it doesn’t matter | english (fluently), sign language (poorly) | 26-35 | 100.000000 | 182.88 |
13 | 30 | single | f | skinny | mostly anything | socially | graduated from high school | white | 66.0 | 30000 | sales / marketing / biz dev | 2012-06-13 22:00:00+00:00 | san francisco, california | has dogs and likes cats | christianity but not too serious about it | NaN | english | 26-35 | 94.736842 | 167.64 |
66 | 22 | single | m | athletic | mostly anything | rarely | working on college/university | asian | 65.0 | 20000 | education / academia | 2012-06-30 02:00:00+00:00 | san jose, california | NaN | buddhism and laughing about it | virgo but it doesn’t matter | english (fluently) | 18-25 | 94.736842 | 165.10 |
79 | 21 | single | m | fit | mostly anything | rarely | working on college/university | white | 71.0 | 20000 | entertainment / media | NaT | san francisco, california | likes dogs and likes cats | catholicism but not too serious about it | NaN | english | 18-25 | 89.473684 | 180.34 |
In [69]:
print("PART 3 : DATA aNALYSIS")
print("1. Demographic Analysis")
PART 3 : DATA aNALYSIS 1. Demographic Analysis
In [71]:
#Question 1: What is the gender distribution (gender) across the platform? Are there any significant imbalances?
gender_distribution = df_cleaned['gender'].value_counts(normalize=True) * 100
gender_distribution
#This code shows the percentage of users on the platform as male and female categories
Out[71]:
gender m 66.233766 f 33.766234 Name: proportion, dtype: float64
In [73]:
#Question 2: What are the proportions of users in different status categories (e.g., single, married, seeing someone)? What does this suggest about the platform’s target audience?
status_distribution = df_cleaned['status'].value_counts(normalize=True) * 100
status_distribution
#This code shows the percentage of users according to their marital status category
Out[73]:
status single 88.971029 seeing someone 5.654346 available 4.575425 married 0.759241 unknown 0.039960 Name: proportion, dtype: float64
In [75]:
#Question 3: How does status vary by gender? For example, what proportion of men and women identify as single?
status_by_gender = df_cleaned.groupby('gender')['status'].value_counts(normalize=True).unstack() * 100
status_by_gender
#This code shows the ppercentage of relationships based on gender if it varies from gender to gender
Out[75]:
status | available | married | seeing someone | single | unknown |
---|---|---|---|---|---|
gender | |||||
f | 5.088757 | 1.005917 | 7.573964 | 86.213018 | 0.118343 |
m | 4.313725 | 0.633484 | 4.675716 | 90.377074 | NaN |
In [77]:
print("2.Correlation Analysis")
2.Correlation Analysis
In [79]:
#Question 1: What are the correlations between numerical columns such as age, income, and height? Are there any strong positive or negative relationships?
correlation_matrix = df_cleaned[['age', 'income', 'height']].corr()
correlation_matrix
Out[79]:
age | income | height | |
---|---|---|---|
age | 1.000000 | 0.266827 | -0.021453 |
income | 0.266827 | 1.000000 | 0.017391 |
height | -0.021453 | 0.017391 | 1.000000 |
In [81]:
#Question 2: How does age correlate with income? Are older users more likely to report higher income levels?
# Plot a scatterplot to visualize the relationship between age and income
sns.scatterplot(x='age', y='income', data=df_cleaned)
plt.title('Age vs. Income')
plt.xlabel('Age')
plt.ylabel('Income')
plt.show()
In [83]:
print("3.Diet and lifestyle analysis")
3.Diet and lifestyle analysis
In [85]:
#Question 1: How do dietary preferences (diet) distribute across the platform? For example, what percentage of users identify as vegetarian, vegan, or follow "anything" diets?
diet_distribution = df_cleaned['diet'].value_counts(normalize=True) * 100
diet_distribution
#This code shows the percentage dietary preferences within the users
Out[85]:
diet mostly anything 46.934461 anything 15.040773 strictly anything 13.772274 mostly vegetarian 9.634551 mostly other 3.292057 strictly vegetarian 3.020236 strictly other 2.053760 mostly vegan 1.449713 strictly vegan 1.238297 vegetarian 1.208094 other 1.026880 vegan 0.543642 mostly kosher 0.362428 mostly halal 0.241619 halal 0.090607 strictly halal 0.060405 strictly kosher 0.030202 Name: proportion, dtype: float64
In [87]:
#Question 2: How do drinking habits (drinks) vary across different diet categories? Are users with stricter diets (e.g., vegan) less likely to drink?
drinks_by_diet = df_cleaned.groupby('diet')['drinks'].value_counts(normalize=True).unstack() * 100
drinks_by_diet
#This shows the percentage of users food prefences when dinking
Out[87]:
drinks | desperately | not at all | often | rarely | socially | very often |
---|---|---|---|---|---|---|
diet | ||||||
anything | 0.204918 | 7.786885 | 14.754098 | 11.680328 | 64.139344 | 1.434426 |
halal | NaN | NaN | NaN | NaN | 100.000000 | NaN |
mostly anything | 0.324254 | 8.819715 | 10.246433 | 13.488975 | 65.369650 | 1.750973 |
mostly halal | NaN | 62.500000 | 12.500000 | 12.500000 | NaN | 12.500000 |
mostly kosher | NaN | 8.333333 | NaN | 41.666667 | 50.000000 | NaN |
mostly other | NaN | 15.740741 | 7.407407 | 21.296296 | 53.703704 | 1.851852 |
mostly vegan | NaN | 18.750000 | 8.333333 | 25.000000 | 45.833333 | 2.083333 |
mostly vegetarian | 0.949367 | 6.962025 | 9.493671 | 17.721519 | 63.607595 | 1.265823 |
other | 2.941176 | 8.823529 | 11.764706 | 14.705882 | 61.764706 | NaN |
strictly anything | 1.545254 | 6.181015 | 16.777042 | 9.492274 | 63.796909 | 2.207506 |
strictly halal | NaN | NaN | NaN | NaN | 50.000000 | 50.000000 |
strictly kosher | NaN | NaN | 100.000000 | NaN | NaN | NaN |
strictly other | 3.076923 | 16.923077 | 7.692308 | 10.769231 | 56.923077 | 4.615385 |
strictly vegan | NaN | 25.641026 | 12.820513 | 20.512821 | 41.025641 | NaN |
strictly vegetarian | 3.000000 | 9.000000 | 18.000000 | 14.000000 | 54.000000 | 2.000000 |
vegan | NaN | 16.666667 | 16.666667 | 22.222222 | 38.888889 | 5.555556 |
vegetarian | 2.500000 | 12.500000 | 12.500000 | 5.000000 | 62.500000 | 5.000000 |
In [89]:
print("4.Geographical Insights")
4.Geographical Insights
In [91]:
#Question 1: Extract city and state information from the location column. What are the top 5 cities and states with the highest number of users?
df_cleaned[['city', 'state']] = df_cleaned['location'].str.split(',', expand=True)
top_cities = df_cleaned['city'].value_counts().head(5)
top_states = df_cleaned['state'].value_counts().head(5)
top_cities, top_states
Out[91]:
(city san francisco 1776 oakland 852 berkeley 432 hayward 118 san leandro 117 Name: count, dtype: int64, state california 4996 arizona 2 louisiana 1 michigan 1 texas 1 Name: count, dtype: int64)
In [93]:
#Question 2: How does age vary across the top cities? Are certain cities dominated by younger or older users?
avg_age_by_city = df_cleaned.groupby('city')['age'].mean().sort_values(ascending=False).head(5)
avg_age_by_city
#This code shows the top 5 cities according to the age percentage
Out[93]:
city forest knolls 65.0 tucson 52.0 lagunitas 41.5 larkspur 40.5 montara 38.0 Name: age, dtype: float64
In [95]:
#uestion 3: What are the average income levels in the top states or cities? Are there regional patterns in reported income?
avg_income_by_city = df_cleaned.groupby('city')['income'].mean().sort_values(ascending=False).head(5)
avg_income_by_state = df_cleaned.groupby('state')['income'].mean().sort_values(ascending=False).head(5)
avg_income_by_city, avg_income_by_state
Out[95]:
(city colma 40000.0 philadelphia 40000.0 corte madera 40000.0 concord 40000.0 forest knolls 40000.0 Name: income, dtype: float64, state new york 40000.000000 pennsylvania 40000.000000 massachusetts 30000.000000 california 26110.888711 arizona 20000.000000 Name: income, dtype: float64)
In [97]:
print("Height Analysis")
Height Analysis
In [99]:
#Question 1: What is the average height of users across different gender categories?
avg_height_by_gender = df_cleaned.groupby('gender')['height'].mean()
avg_height_by_gender
Out[99]:
gender f 65.240237 m 70.360483 Name: height, dtype: float64
In [101]:
#Question 2: How does height vary by age_group? Are there noticeable trends among younger vs. older users?
avg_height_by_age_group = df_cleaned.groupby('age_group',observed=False)['height'].mean()
avg_height_by_age_group
Out[101]:
age_group 18-25 68.631604 26-35 68.757000 36-45 68.448347 46+ 68.198529 Name: height, dtype: float64
In [103]:
#Question 3: What is the distribution of height within body_type categories (e.g., athletic, curvy, thin)? Do the distributions align with expectations?
sns.boxplot(x='height', y='body_type', data=df_cleaned)
plt.title('Height Distribution by Body Type')
plt.show()
#This code shows the distribution of height within the body type category using boxplot visualization
In [105]:
print("Income Analysis")
Income Analysis
In [107]:
#Question 1: What is the distribution of income across the platform? Are there specific income brackets that dominate? (don't count 0)
income_distribution = df_cleaned[df_cleaned['income'] > 0]['income'].describe()
income_distribution
Out[107]:
count 5005.00000 mean 26109.89011 std 7996.29661 min 20000.00000 25% 20000.00000 50% 20000.00000 75% 30000.00000 max 40000.00000 Name: income, dtype: float64
In [109]:
#Question 2: How does income correlate with age and height? Are older or taller users more likely to report higher incomes?
sns.pairplot(df_cleaned[['income', 'age', 'height']])
plt.show()
In [111]:
print("Part 4: Data Visualization")
print("1. Age Distribution")
Part 4: Data Visualization 1. Age Distribution
In [113]:
#Question 1: Plot a histogram of age with a vertical line indicating the mean age. What does the distribution reveal about the most common age group on the platform?
plt.figure(figsize=(5, 3))
sns.histplot(df_cleaned['age'], kde=False, bins=30, color='skyblue')
plt.axvline(df_cleaned['age'].mean(), color='red', linestyle='dashed', linewidth=2, label=f'Mean Age: {df_cleaned["age"].mean():.2f}')
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.legend()
plt.show()
In [115]:
#Question 2: How does the age distribution differ by gender? Are there age groups where one gender is more prevalent?
plt.figure(figsize=(5,3))
sns.histplot(df_cleaned, x='age', hue='gender', multiple='stack', bins=30, palette='pastel')
plt.title('Age Distribution by Gender')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.legend(title='Gender', labels=['Male', 'Female'])
plt.show()
In [117]:
print("Income and Age")
Income and Age
In [119]:
#Question 1: Use a scatterplot to visualize the relationship between income and age, with a trend line indicating overall patterns. Are older users more likely to report higher incomes?
plt.figure(figsize=(5,3))
sns.regplot(x='age', y='income', data=df_cleaned, scatter_kws={'s': 10}, line_kws={'color': 'red'})
plt.title('Income vs. Age')
plt.xlabel('Age')
plt.ylabel('Income')
plt.show()
In [121]:
#Question 2: Create boxplots of income grouped by age group. Which age group reports the highest median income?
plt.figure(figsize=(5,3))
sns.boxplot(x='age_group', y='income', data=df_cleaned)
plt.title('Income Distribution by Age Group')
plt.xlabel('Age Group')
plt.ylabel('Income')
plt.show()
In [123]:
print("Pets and Preferences")
Pets and Preferences
In [125]:
#Question 1: Create a bar chart showing the distribution of pet preferences (e.g., likes dogs, likes cats). Which preferences are most common?
plt.figure(figsize=(15,3))
sns.countplot(x='pets', data=df_cleaned, hue='pets', palette='Set2', legend=False)
plt.title('Distribution of Pet Preferences')
plt.xlabel('Pet Preference')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()
In [127]:
#Question 2: How do pet preferences vary across gender and age group? Are younger users more likely to report liking pets compared to older users?
# Pet preferences by gender and age group
plt.figure(figsize=(15, 3))
sns.countplot(x='pets', hue='gender', data=df_cleaned, palette='Set2')
plt.title('Pet Preferences by Gender')
plt.xlabel('Pet Preference')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend(title='Gender', labels=['Male', 'Female'])
plt.show()
# Pet preferences by age group
plt.figure(figsize=(15, 3))
sns.countplot(x='pets', hue='age_group', data=df_cleaned, palette='Set3')
plt.title('Pet Preferences by Age Group')
plt.xlabel('Pet Preference')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend(title='Age Group')
plt.show()
In [129]:
print("4.Zodiac signs and Personality")
4.Zodiac signs and Personality
In [131]:
#Question 1: Create a pie chart showing the distribution of zodiac signs (sign) across the platform. Which signs are most and least represented? Is this the right chart? If not, replace with the right chart.
sign_distribution = df_cleaned['sign'].value_counts()
plt.figure(figsize=(10,10))
sign_distribution.plot.pie(autopct='%1.1f%%', startangle=90, cmap='Set3')
plt.title('Distribution of Zodiac Signs')
plt.ylabel('')
plt.show()
In [135]:
#Question 2: How does sign vary across gender and status? Are there noticeable patterns or imbalances?
#Zodiac signs by gender
plt.figure(figsize=(20, 3))
sns.countplot(x='sign', hue='gender', data=df_cleaned, palette='Set2')
plt.title('Zodiac Signs by Gender')
plt.xlabel('Zodiac Sign')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend(title='Gender', labels=['Male', 'Female'])
plt.show()
# Zodiac signs by status
plt.figure(figsize=(20, 3))
sns.countplot(x='sign', hue='status', data=df_cleaned, palette='Set2')
plt.title('Zodiac Signs by Status')
plt.xlabel('Zodiac Sign')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend(title='Status')
plt.show()
In [ ]: