In [1]:
print("LBRARIES USED ")
LBRARIES USED 
In [3]:
    #library for mathmetic functins
import numpy as np 
    #library for dataframes or tables 
import pandas as pd  
    #library for visualization functins(charts,graphs)
import matplotlib.pyplot as plt    
%matplotlib inline
import seaborn as sns 
In [5]:
    # Load the dataset
df = pd.read_csv("bumble.csv")
In [7]:
print("PART 1 : DATA CLEANING")
print("1. Inspecting Missing Data")
PART 1 : DATA CLEANING
1. Inspecting Missing Data
In [17]:
   #Question 1: Which columns in the dataset have missing values, and what percentage of data is missing in each column?

missing_data = df.isnull().mean() * 100
missing_data.sort_values(ascending=False)

   #Explanation:this code checks the percentage of missing values in each column and sorts them in descending order. The result will help us understand the magnitude of missing data in different fields.
Out[17]:
diet           40.694959
religion       33.740366
pets           33.231575
sign           18.443266
job            13.675641
education      11.056618
ethnicity       9.475194
body_type       8.834618
drinks          4.979482
speaks          0.083408
height          0.005005
last_online     0.000000
location        0.000000
income          0.000000
status          0.000000
gender          0.000000
age             0.000000
dtype: float64
In [19]:
   #Question 2: Are there columns where more than 50% of the data is missing? Drop those columns where missing values are >50%.

   
df_cleaned = df.dropna(thresh=len(df)*0.5, axis=1)
 
   #Explanation : Drop columns with more than 50% missing data from the table.
In [21]:
   #Question 3: Missing numerical data (e.g., height, income) should be handled by imputing the median value of height and income for the corresponding category, such as gender, age group, or location.

df_cleaned['height'] = df_cleaned.groupby('gender')['height'].transform(lambda x: x.fillna(x.median()))
df_cleaned['income'] = df_cleaned.groupby('gender')['income'].transform(lambda x: x.fillna(x.median()))
df.head()

    #Explanation : This code imputes the missing values of height and income by the median values based on the gender. 
Out[21]:
age status gender body_type diet drinks education ethnicity height income job last_online location pets religion sign speaks
0 22 single m a little extra strictly anything socially working on college/university asian, white 75.0 -1 transportation 2012-06-28-20-30 south san francisco, california likes dogs and likes cats agnosticism and very serious about it gemini english
1 35 single m average mostly other often working on space camp white 70.0 80000 hospitality / travel 2012-06-29-21-41 oakland, california likes dogs and likes cats agnosticism but not too serious about it cancer english (fluently), spanish (poorly), french (...
2 38 available m thin anything socially graduated from masters program NaN 68.0 -1 NaN 2012-06-27-09-10 san francisco, california has cats NaN pisces but it doesn’t matter english, french, c++
3 23 single m thin vegetarian socially working on college/university white 71.0 20000 student 2012-06-28-14-22 berkeley, california likes cats NaN pisces english, german (poorly)
4 29 single m athletic NaN socially graduated from college/university asian, black, other 66.0 -1 artistic / musical / writer 2012-06-27-21-26 san francisco, california likes dogs and likes cats NaN aquarius english
In [23]:
print("2.inspecting data types")
2.inspecting data types
In [25]:
   #Question 1: Are there any inconsistencies in the data types across columns (e.g., numerical data stored as strings)?

df_cleaned.dtypes

   #This code checks whether the given data types are correct or not
Out[25]:
age              int64
status          object
gender          object
body_type       object
diet            object
drinks          object
education       object
ethnicity       object
height         float64
income           int64
job             object
last_online     object
location        object
pets            object
religion        object
sign            object
speaks          object
dtype: object
In [27]:
    #Question 2: Which columns require conversion to numerical data types for proper analysis (e.g., income)?

df_cleaned['income'] = pd.to_numeric(df_cleaned['income'], errors='coerce')
df_cleaned['height'] = pd.to_numeric(df_cleaned['height'], errors='coerce')
df_cleaned.dtypes

    #This code changes the data types for income and height into numeric data type if they are not in numeric type
Out[27]:
age              int64
status          object
gender          object
body_type       object
diet            object
drinks          object
education       object
ethnicity       object
height         float64
income           int64
job             object
last_online     object
location        object
pets            object
religion        object
sign            object
speaks          object
dtype: object
In [33]:
    #Question 3: Does the last_online column need to be converted into a datetime format? What additional insights can be gained by analyzing this as a date field?

df_cleaned['last_online'] = pd.to_datetime(df_cleaned['last_online'], errors='coerce',utc=True)


    #This code changes the data type of 'last_online' column from 'object' to 'datetime' data type
In [35]:
df_cleaned.dtypes
Out[35]:
age                          int64
status                      object
gender                      object
body_type                   object
diet                        object
drinks                      object
education                   object
ethnicity                   object
height                     float64
income                       int64
job                         object
last_online    datetime64[ns, UTC]
location                    object
pets                        object
religion                    object
sign                        object
speaks                      object
dtype: object
In [37]:
print("3.Outliers")
3.Outliers
In [39]:
   #Question 1: Are there any apparent outliers in numerical columns such as age, height, or income? What are the ranges of values in these columns?

df_cleaned[['age', 'height', 'income']].describe()

   #this code will display the ranges of the values like min,max,etc for the values age,height and income.
Out[39]:
age height income
count 59946.000000 59946.000000 59946.000000
mean 32.340290 68.295282 20033.222534
std 9.452779 3.994738 97346.192104
min 18.000000 1.000000 -1.000000
25% 26.000000 66.000000 -1.000000
50% 30.000000 68.000000 -1.000000
75% 37.000000 71.000000 -1.000000
max 110.000000 95.000000 1000000.000000
In [41]:
     #Question 2: Any -1 values in numerical columns like income should be replaced with 0, as they may represent missing or invalid data.

df_cleaned['income'] = df_cleaned['income'].replace(-1, 0)
df_cleaned.head()

     #This code replaces all the values which where -1 to 0 i.e., a valid value
Out[41]:
age status gender body_type diet drinks education ethnicity height income job last_online location pets religion sign speaks
0 22 single m a little extra strictly anything socially working on college/university asian, white 75.0 0 transportation NaT south san francisco, california likes dogs and likes cats agnosticism and very serious about it gemini english
1 35 single m average mostly other often working on space camp white 70.0 80000 hospitality / travel NaT oakland, california likes dogs and likes cats agnosticism but not too serious about it cancer english (fluently), spanish (poorly), french (...
2 38 available m thin anything socially graduated from masters program NaN 68.0 0 NaN 2012-06-27 19:00:00+00:00 san francisco, california has cats NaN pisces but it doesn’t matter english, french, c++
3 23 single m thin vegetarian socially working on college/university white 71.0 20000 student 2012-06-29 12:00:00+00:00 berkeley, california likes cats NaN pisces english, german (poorly)
4 29 single m athletic NaN socially graduated from college/university asian, black, other 66.0 0 artistic / musical / writer NaT san francisco, california likes dogs and likes cats NaN aquarius english
In [43]:
    #Question 3: For other outliers, calculate the mean and median values using only the middle 80% of the data (removing extreme high and low values).

q_low = df_cleaned['income'].quantile(0.1)
q_high = df_cleaned['income'].quantile(0.9)
df_cleaned = df_cleaned[(df_cleaned['income'] > q_low) & (df_cleaned['income'] < q_high)]
df_cleaned.shape

    #this code removes the top 10% and bottem 10% of the data and gives the 80% of the data from the table.
Out[43]:
(5005, 17)
In [45]:
print("4.Missing Data Visualization")
4.Missing Data Visualization
In [47]:
    #Question: Create a heatmap to visualize missing values across the dataset. Which columns show consistent missing data patterns?

plt.figure(figsize=(12,8))   #size of the map
sns.heatmap(df_cleaned.isnull(), cbar=False, cmap='viridis')     #"cbar=false" which tells not to display the color bar and "cmap='viridis'" controls colors of the heat map
plt.show()

    #This code shows the visual representation of the missing data as a heat map.
No description has been provided for this image
In [51]:
print("Part 2: Data Processing")
print("1. Binning and Grouping")
Part 2: Data Processing
1. Binning and Grouping
In [55]:
    #Question: Bin the age column into categories such as "18-25", "26-35", "36-45", and "46+" to create a new column, age_group.

bins = [18, 25, 35, 45, 100]
labels = ['18-25', '26-35', '36-45', '46+']
df_cleaned['age_group'] = pd.cut(df_cleaned['age'], bins=bins, labels=labels , right=True)
df_cleaned.info()

    #This code divides the age column into categories and the stores into a new column with name 'age_group'.
<class 'pandas.core.frame.DataFrame'>
Index: 5005 entries, 3 to 59914
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   age          5005 non-null   int64              
 1   status       5005 non-null   object             
 2   gender       5005 non-null   object             
 3   body_type    4726 non-null   object             
 4   diet         3311 non-null   object             
 5   drinks       4931 non-null   object             
 6   education    4689 non-null   object             
 7   ethnicity    4716 non-null   object             
 8   height       5005 non-null   float64            
 9   income       5005 non-null   int64              
 10  job          4873 non-null   object             
 11  last_online  1993 non-null   datetime64[ns, UTC]
 12  location     5005 non-null   object             
 13  pets         4011 non-null   object             
 14  religion     4056 non-null   object             
 15  sign         4621 non-null   object             
 16  speaks       5005 non-null   object             
 17  age_group    4945 non-null   category           
dtypes: category(1), datetime64[ns, UTC](1), float64(1), int64(2), object(13)
memory usage: 708.9+ KB
In [57]:
print("2. Derived Features")
2. Derived Features
In [61]:
   #Question: Create a new feature, profile_completeness, by calculating the percentage of non-missing values for each user profile.

df_cleaned['profile_completeness'] = df_cleaned.notnull().mean(axis=1) * 100
df_cleaned.info()

   #This code show the completness of each profile within a column called 'profile_cpmleteness'.
<class 'pandas.core.frame.DataFrame'>
Index: 5005 entries, 3 to 59914
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   age                   5005 non-null   int64              
 1   status                5005 non-null   object             
 2   gender                5005 non-null   object             
 3   body_type             4726 non-null   object             
 4   diet                  3311 non-null   object             
 5   drinks                4931 non-null   object             
 6   education             4689 non-null   object             
 7   ethnicity             4716 non-null   object             
 8   height                5005 non-null   float64            
 9   income                5005 non-null   int64              
 10  job                   4873 non-null   object             
 11  last_online           1993 non-null   datetime64[ns, UTC]
 12  location              5005 non-null   object             
 13  pets                  4011 non-null   object             
 14  religion              4056 non-null   object             
 15  sign                  4621 non-null   object             
 16  speaks                5005 non-null   object             
 17  age_group             4945 non-null   category           
 18  profile_completeness  5005 non-null   float64            
dtypes: category(1), datetime64[ns, UTC](1), float64(2), int64(2), object(13)
memory usage: 748.0+ KB
In [63]:
print("3.Unit Conversion")
3.Unit Conversion
In [65]:
   #Question: Convert the height column from inches to centimeters using the conversion factor (1 inch = 2.54 cm).

df_cleaned['height_cm'] = df_cleaned['height'] * 2.54
df_cleaned.info()

   #This code converts the data from inches to centimeters and stores the data in the new column called 'height_cm'.
<class 'pandas.core.frame.DataFrame'>
Index: 5005 entries, 3 to 59914
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   age                   5005 non-null   int64              
 1   status                5005 non-null   object             
 2   gender                5005 non-null   object             
 3   body_type             4726 non-null   object             
 4   diet                  3311 non-null   object             
 5   drinks                4931 non-null   object             
 6   education             4689 non-null   object             
 7   ethnicity             4716 non-null   object             
 8   height                5005 non-null   float64            
 9   income                5005 non-null   int64              
 10  job                   4873 non-null   object             
 11  last_online           1993 non-null   datetime64[ns, UTC]
 12  location              5005 non-null   object             
 13  pets                  4011 non-null   object             
 14  religion              4056 non-null   object             
 15  sign                  4621 non-null   object             
 16  speaks                5005 non-null   object             
 17  age_group             4945 non-null   category           
 18  profile_completeness  5005 non-null   float64            
 19  height_cm             5005 non-null   float64            
dtypes: category(1), datetime64[ns, UTC](1), float64(3), int64(2), object(13)
memory usage: 787.1+ KB
In [67]:
df_cleaned.head()

   #This shows the new columns with the updated data from tho 5 rows
Out[67]:
age status gender body_type diet drinks education ethnicity height income job last_online location pets religion sign speaks age_group profile_completeness height_cm
3 23 single m thin vegetarian socially working on college/university white 71.0 20000 student 2012-06-29 12:00:00+00:00 berkeley, california likes cats NaN pisces english, german (poorly) 18-25 94.736842 180.34
11 28 seeing someone m average mostly anything socially graduated from college/university white 72.0 40000 banking / financial / real estate 2012-05-22 22:00:00+00:00 daly city, california likes cats christianity and very serious about it leo but it doesn&rsquo;t matter english (fluently), sign language (poorly) 26-35 100.000000 182.88
13 30 single f skinny mostly anything socially graduated from high school white 66.0 30000 sales / marketing / biz dev 2012-06-13 22:00:00+00:00 san francisco, california has dogs and likes cats christianity but not too serious about it NaN english 26-35 94.736842 167.64
66 22 single m athletic mostly anything rarely working on college/university asian 65.0 20000 education / academia 2012-06-30 02:00:00+00:00 san jose, california NaN buddhism and laughing about it virgo but it doesn&rsquo;t matter english (fluently) 18-25 94.736842 165.10
79 21 single m fit mostly anything rarely working on college/university white 71.0 20000 entertainment / media NaT san francisco, california likes dogs and likes cats catholicism but not too serious about it NaN english 18-25 89.473684 180.34
In [69]:
print("PART 3 : DATA aNALYSIS")
print("1. Demographic Analysis")
PART 3 : DATA aNALYSIS
1. Demographic Analysis
In [71]:
    #Question 1: What is the gender distribution (gender) across the platform? Are there any significant imbalances?

gender_distribution = df_cleaned['gender'].value_counts(normalize=True) * 100
gender_distribution

    #This code shows the percentage of users on the platform as male and female categories
Out[71]:
gender
m    66.233766
f    33.766234
Name: proportion, dtype: float64
In [73]:
     #Question 2: What are the proportions of users in different status categories (e.g., single, married, seeing someone)? What does this suggest about the platform’s target audience?

status_distribution = df_cleaned['status'].value_counts(normalize=True) * 100
status_distribution

    #This code shows the percentage of users according to their marital status category
Out[73]:
status
single            88.971029
seeing someone     5.654346
available          4.575425
married            0.759241
unknown            0.039960
Name: proportion, dtype: float64
In [75]:
    #Question 3: How does status vary by gender? For example, what proportion of men and women identify as single?


status_by_gender = df_cleaned.groupby('gender')['status'].value_counts(normalize=True).unstack() * 100
status_by_gender

    #This code shows the ppercentage of relationships based on gender if it varies from gender to gender
Out[75]:
status available married seeing someone single unknown
gender
f 5.088757 1.005917 7.573964 86.213018 0.118343
m 4.313725 0.633484 4.675716 90.377074 NaN
In [77]:
print("2.Correlation Analysis")
2.Correlation Analysis
In [79]:
    #Question 1: What are the correlations between numerical columns such as age, income, and height? Are there any strong positive or negative relationships?


correlation_matrix = df_cleaned[['age', 'income', 'height']].corr()
correlation_matrix
Out[79]:
age income height
age 1.000000 0.266827 -0.021453
income 0.266827 1.000000 0.017391
height -0.021453 0.017391 1.000000
In [81]:
     #Question 2: How does age correlate with income? Are older users more likely to report higher income levels?

# Plot a scatterplot to visualize the relationship between age and income

sns.scatterplot(x='age', y='income', data=df_cleaned)
plt.title('Age vs. Income')
plt.xlabel('Age')
plt.ylabel('Income')
plt.show()
No description has been provided for this image
In [83]:
print("3.Diet and lifestyle analysis")
3.Diet and lifestyle analysis
In [85]:
     #Question 1: How do dietary preferences (diet) distribute across the platform? For example, what percentage of users identify as vegetarian, vegan, or follow "anything" diets?

diet_distribution = df_cleaned['diet'].value_counts(normalize=True) * 100
diet_distribution

     #This code shows the percentage dietary preferences within the users
Out[85]:
diet
mostly anything        46.934461
anything               15.040773
strictly anything      13.772274
mostly vegetarian       9.634551
mostly other            3.292057
strictly vegetarian     3.020236
strictly other          2.053760
mostly vegan            1.449713
strictly vegan          1.238297
vegetarian              1.208094
other                   1.026880
vegan                   0.543642
mostly kosher           0.362428
mostly halal            0.241619
halal                   0.090607
strictly halal          0.060405
strictly kosher         0.030202
Name: proportion, dtype: float64
In [87]:
      #Question 2: How do drinking habits (drinks) vary across different diet categories? Are users with stricter diets (e.g., vegan) less likely to drink?

drinks_by_diet = df_cleaned.groupby('diet')['drinks'].value_counts(normalize=True).unstack() * 100
drinks_by_diet

    #This shows the percentage of users food prefences when dinking 
Out[87]:
drinks desperately not at all often rarely socially very often
diet
anything 0.204918 7.786885 14.754098 11.680328 64.139344 1.434426
halal NaN NaN NaN NaN 100.000000 NaN
mostly anything 0.324254 8.819715 10.246433 13.488975 65.369650 1.750973
mostly halal NaN 62.500000 12.500000 12.500000 NaN 12.500000
mostly kosher NaN 8.333333 NaN 41.666667 50.000000 NaN
mostly other NaN 15.740741 7.407407 21.296296 53.703704 1.851852
mostly vegan NaN 18.750000 8.333333 25.000000 45.833333 2.083333
mostly vegetarian 0.949367 6.962025 9.493671 17.721519 63.607595 1.265823
other 2.941176 8.823529 11.764706 14.705882 61.764706 NaN
strictly anything 1.545254 6.181015 16.777042 9.492274 63.796909 2.207506
strictly halal NaN NaN NaN NaN 50.000000 50.000000
strictly kosher NaN NaN 100.000000 NaN NaN NaN
strictly other 3.076923 16.923077 7.692308 10.769231 56.923077 4.615385
strictly vegan NaN 25.641026 12.820513 20.512821 41.025641 NaN
strictly vegetarian 3.000000 9.000000 18.000000 14.000000 54.000000 2.000000
vegan NaN 16.666667 16.666667 22.222222 38.888889 5.555556
vegetarian 2.500000 12.500000 12.500000 5.000000 62.500000 5.000000
In [89]:
print("4.Geographical Insights")
4.Geographical Insights
In [91]:
     #Question 1: Extract city and state information from the location column. What are the top 5 cities and states with the highest number of users?

df_cleaned[['city', 'state']] = df_cleaned['location'].str.split(',', expand=True)
top_cities = df_cleaned['city'].value_counts().head(5)
top_states = df_cleaned['state'].value_counts().head(5)
top_cities, top_states
Out[91]:
(city
 san francisco    1776
 oakland           852
 berkeley          432
 hayward           118
 san leandro       117
 Name: count, dtype: int64,
 state
 california    4996
 arizona          2
 louisiana        1
 michigan         1
 texas            1
 Name: count, dtype: int64)
In [93]:
     #Question 2: How does age vary across the top cities? Are certain cities dominated by younger or older users?

avg_age_by_city = df_cleaned.groupby('city')['age'].mean().sort_values(ascending=False).head(5)
avg_age_by_city

     #This code shows the top 5 cities according to the age percentage
Out[93]:
city
forest knolls    65.0
tucson           52.0
lagunitas        41.5
larkspur         40.5
montara          38.0
Name: age, dtype: float64
In [95]:
     #uestion 3: What are the average income levels in the top states or cities? Are there regional patterns in reported income?

avg_income_by_city = df_cleaned.groupby('city')['income'].mean().sort_values(ascending=False).head(5)
avg_income_by_state = df_cleaned.groupby('state')['income'].mean().sort_values(ascending=False).head(5)
avg_income_by_city, avg_income_by_state
Out[95]:
(city
 colma            40000.0
 philadelphia     40000.0
 corte madera     40000.0
 concord          40000.0
 forest knolls    40000.0
 Name: income, dtype: float64,
 state
 new york         40000.000000
 pennsylvania     40000.000000
 massachusetts    30000.000000
 california       26110.888711
 arizona          20000.000000
 Name: income, dtype: float64)
In [97]:
print("Height Analysis")
Height Analysis
In [99]:
     #Question 1: What is the average height of users across different gender categories?

avg_height_by_gender = df_cleaned.groupby('gender')['height'].mean()
avg_height_by_gender
Out[99]:
gender
f    65.240237
m    70.360483
Name: height, dtype: float64
In [101]:
    #Question 2: How does height vary by age_group? Are there noticeable trends among younger vs. older users?

avg_height_by_age_group = df_cleaned.groupby('age_group',observed=False)['height'].mean()
avg_height_by_age_group
Out[101]:
age_group
18-25    68.631604
26-35    68.757000
36-45    68.448347
46+      68.198529
Name: height, dtype: float64
In [103]:
     #Question 3: What is the distribution of height within body_type categories (e.g., athletic, curvy, thin)? Do the distributions align with expectations?

sns.boxplot(x='height', y='body_type', data=df_cleaned)
plt.title('Height Distribution by Body Type')
plt.show()

      #This code shows the distribution of height within the body type category using boxplot visualization 
No description has been provided for this image
In [105]:
print("Income Analysis")
Income Analysis
In [107]:
     #Question 1: What is the distribution of income across the platform? Are there specific income brackets that dominate? (don't count 0)

income_distribution = df_cleaned[df_cleaned['income'] > 0]['income'].describe()
income_distribution
Out[107]:
count     5005.00000
mean     26109.89011
std       7996.29661
min      20000.00000
25%      20000.00000
50%      20000.00000
75%      30000.00000
max      40000.00000
Name: income, dtype: float64
In [109]:
      #Question 2: How does income correlate with age and height? Are older or taller users more likely to report higher incomes?

sns.pairplot(df_cleaned[['income', 'age', 'height']])
plt.show()
No description has been provided for this image
In [111]:
print("Part 4: Data Visualization")
print("1. Age Distribution")
Part 4: Data Visualization
1. Age Distribution
In [113]:
      #Question 1: Plot a histogram of age with a vertical line indicating the mean age. What does the distribution reveal about the most common age group on the platform?

plt.figure(figsize=(5, 3))
sns.histplot(df_cleaned['age'], kde=False, bins=30, color='skyblue')
plt.axvline(df_cleaned['age'].mean(), color='red', linestyle='dashed', linewidth=2, label=f'Mean Age: {df_cleaned["age"].mean():.2f}')
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.legend()
plt.show()
No description has been provided for this image
In [115]:
      #Question 2: How does the age distribution differ by gender? Are there age groups where one gender is more prevalent?

plt.figure(figsize=(5,3))
sns.histplot(df_cleaned, x='age', hue='gender', multiple='stack', bins=30, palette='pastel')
plt.title('Age Distribution by Gender')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.legend(title='Gender', labels=['Male', 'Female'])
plt.show()
No description has been provided for this image
In [117]:
print("Income and Age")
Income and Age
In [119]:
      #Question 1: Use a scatterplot to visualize the relationship between income and age, with a trend line indicating overall patterns. Are older users more likely to report higher incomes?

plt.figure(figsize=(5,3))
sns.regplot(x='age', y='income', data=df_cleaned, scatter_kws={'s': 10}, line_kws={'color': 'red'})
plt.title('Income vs. Age')
plt.xlabel('Age')
plt.ylabel('Income')
plt.show()
No description has been provided for this image
In [121]:
      #Question 2: Create boxplots of income grouped by age group. Which age group reports the highest median income?

plt.figure(figsize=(5,3))
sns.boxplot(x='age_group', y='income', data=df_cleaned)
plt.title('Income Distribution by Age Group')
plt.xlabel('Age Group')
plt.ylabel('Income')
plt.show()
No description has been provided for this image
In [123]:
print("Pets and Preferences")
Pets and Preferences
In [125]:
     #Question 1: Create a bar chart showing the distribution of pet preferences (e.g., likes dogs, likes cats). Which preferences are most common?

plt.figure(figsize=(15,3))
sns.countplot(x='pets', data=df_cleaned, hue='pets', palette='Set2', legend=False)
plt.title('Distribution of Pet Preferences')
plt.xlabel('Pet Preference')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [127]:
      #Question 2: How do pet preferences vary across gender and age group? Are younger users more likely to report liking pets compared to older users?

# Pet preferences by gender and age group
plt.figure(figsize=(15, 3))
sns.countplot(x='pets', hue='gender', data=df_cleaned, palette='Set2')
plt.title('Pet Preferences by Gender')
plt.xlabel('Pet Preference')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend(title='Gender', labels=['Male', 'Female'])
plt.show()

# Pet preferences by age group
plt.figure(figsize=(15, 3))
sns.countplot(x='pets', hue='age_group', data=df_cleaned, palette='Set3')
plt.title('Pet Preferences by Age Group')
plt.xlabel('Pet Preference')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend(title='Age Group')
plt.show()
No description has been provided for this image
No description has been provided for this image
In [129]:
print("4.Zodiac signs and Personality")
4.Zodiac signs and Personality
In [131]:
     #Question 1: Create a pie chart showing the distribution of zodiac signs (sign) across the platform. Which signs are most and least represented? Is this the right chart? If not, replace with the right chart.

sign_distribution = df_cleaned['sign'].value_counts()
plt.figure(figsize=(10,10))
sign_distribution.plot.pie(autopct='%1.1f%%', startangle=90, cmap='Set3')
plt.title('Distribution of Zodiac Signs')
plt.ylabel('')
plt.show()
No description has been provided for this image
In [135]:
      #Question 2: How does sign vary across gender and status? Are there noticeable patterns or imbalances?

#Zodiac signs by gender
plt.figure(figsize=(20, 3))
sns.countplot(x='sign', hue='gender', data=df_cleaned, palette='Set2')
plt.title('Zodiac Signs by Gender')
plt.xlabel('Zodiac Sign')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend(title='Gender', labels=['Male', 'Female'])
plt.show()

# Zodiac signs by status
plt.figure(figsize=(20, 3))
sns.countplot(x='sign', hue='status', data=df_cleaned, palette='Set2')
plt.title('Zodiac Signs by Status')
plt.xlabel('Zodiac Sign')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend(title='Status')
plt.show()
No description has been provided for this image
No description has been provided for this image
In [ ]: