print("LBRARIES USED ")

LBRARIES USED

    #library for mathmetic functins
import numpy as np 
    #library for dataframes or tables 
import pandas as pd  
    #library for visualization functins(charts,graphs)
import matplotlib.pyplot as plt    
%matplotlib inline
import seaborn as sns

    # Load the dataset
df = pd.read_csv("bumble.csv")

print("PART 1 : DATA CLEANING")
print("1. Inspecting Missing Data")

PART 1 : DATA CLEANING
1. Inspecting Missing Data

   #Question 1: Which columns in the dataset have missing values, and what percentage of data is missing in each column?

missing_data = df.isnull().mean() * 100
missing_data.sort_values(ascending=False)

   #Explanation:this code checks the percentage of missing values in each column and sorts them in descending order. The result will help us understand the magnitude of missing data in different fields.

diet           40.694959
religion       33.740366
pets           33.231575
sign           18.443266
job            13.675641
education      11.056618
ethnicity       9.475194
body_type       8.834618
drinks          4.979482
speaks          0.083408
height          0.005005
last_online     0.000000
location        0.000000
income          0.000000
status          0.000000
gender          0.000000
age             0.000000
dtype: float64

   #Question 2: Are there columns where more than 50% of the data is missing? Drop those columns where missing values are >50%.

   
df_cleaned = df.dropna(thresh=len(df)*0.5, axis=1)
 
   #Explanation : Drop columns with more than 50% missing data from the table.

   #Question 3: Missing numerical data (e.g., height, income) should be handled by imputing the median value of height and income for the corresponding category, such as gender, age group, or location.

df_cleaned['height'] = df_cleaned.groupby('gender')['height'].transform(lambda x: x.fillna(x.median()))
df_cleaned['income'] = df_cleaned.groupby('gender')['income'].transform(lambda x: x.fillna(x.median()))
df.head()

    #Explanation : This code imputes the missing values of height and income by the median values based on the gender.

print("2.inspecting data types")

2.inspecting data types

   #Question 1: Are there any inconsistencies in the data types across columns (e.g., numerical data stored as strings)?

df_cleaned.dtypes

   #This code checks whether the given data types are correct or not

age              int64
status          object
gender          object
body_type       object
diet            object
drinks          object
education       object
ethnicity       object
height         float64
income           int64
job             object
last_online     object
location        object
pets            object
religion        object
sign            object
speaks          object
dtype: object

    #Question 2: Which columns require conversion to numerical data types for proper analysis (e.g., income)?

df_cleaned['income'] = pd.to_numeric(df_cleaned['income'], errors='coerce')
df_cleaned['height'] = pd.to_numeric(df_cleaned['height'], errors='coerce')
df_cleaned.dtypes

    #This code changes the data types for income and height into numeric data type if they are not in numeric type

age              int64
status          object
gender          object
body_type       object
diet            object
drinks          object
education       object
ethnicity       object
height         float64
income           int64
job             object
last_online     object
location        object
pets            object
religion        object
sign            object
speaks          object
dtype: object

    #Question 3: Does the last_online column need to be converted into a datetime format? What additional insights can be gained by analyzing this as a date field?

df_cleaned['last_online'] = pd.to_datetime(df_cleaned['last_online'], errors='coerce',utc=True)


    #This code changes the data type of 'last_online' column from 'object' to 'datetime' data type

df_cleaned.dtypes

age                          int64
status                      object
gender                      object
body_type                   object
diet                        object
drinks                      object
education                   object
ethnicity                   object
height                     float64
income                       int64
job                         object
last_online    datetime64[ns, UTC]
location                    object
pets                        object
religion                    object
sign                        object
speaks                      object
dtype: object

print("3.Outliers")

3.Outliers

   #Question 1: Are there any apparent outliers in numerical columns such as age, height, or income? What are the ranges of values in these columns?

df_cleaned[['age', 'height', 'income']].describe()

   #this code will display the ranges of the values like min,max,etc for the values age,height and income.

     #Question 2: Any -1 values in numerical columns like income should be replaced with 0, as they may represent missing or invalid data.

df_cleaned['income'] = df_cleaned['income'].replace(-1, 0)
df_cleaned.head()

     #This code replaces all the values which where -1 to 0 i.e., a valid value

    #Question 3: For other outliers, calculate the mean and median values using only the middle 80% of the data (removing extreme high and low values).

q_low = df_cleaned['income'].quantile(0.1)
q_high = df_cleaned['income'].quantile(0.9)
df_cleaned = df_cleaned[(df_cleaned['income'] > q_low) & (df_cleaned['income'] < q_high)]
df_cleaned.shape

    #this code removes the top 10% and bottem 10% of the data and gives the 80% of the data from the table.

(5005, 17)

print("4.Missing Data Visualization")

4.Missing Data Visualization

    #Question: Create a heatmap to visualize missing values across the dataset. Which columns show consistent missing data patterns?

plt.figure(figsize=(12,8))   #size of the map
sns.heatmap(df_cleaned.isnull(), cbar=False, cmap='viridis')     #"cbar=false" which tells not to display the color bar and "cmap='viridis'" controls colors of the heat map
plt.show()

    #This code shows the visual representation of the missing data as a heat map.

print("Part 2: Data Processing")
print("1. Binning and Grouping")

Part 2: Data Processing
1. Binning and Grouping

    #Question: Bin the age column into categories such as "18-25", "26-35", "36-45", and "46+" to create a new column, age_group.

bins = [18, 25, 35, 45, 100]
labels = ['18-25', '26-35', '36-45', '46+']
df_cleaned['age_group'] = pd.cut(df_cleaned['age'], bins=bins, labels=labels , right=True)
df_cleaned.info()

    #This code divides the age column into categories and the stores into a new column with name 'age_group'.

<class 'pandas.core.frame.DataFrame'>
Index: 5005 entries, 3 to 59914
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   age          5005 non-null   int64              
 1   status       5005 non-null   object             
 2   gender       5005 non-null   object             
 3   body_type    4726 non-null   object             
 4   diet         3311 non-null   object             
 5   drinks       4931 non-null   object             
 6   education    4689 non-null   object             
 7   ethnicity    4716 non-null   object             
 8   height       5005 non-null   float64            
 9   income       5005 non-null   int64              
 10  job          4873 non-null   object             
 11  last_online  1993 non-null   datetime64[ns, UTC]
 12  location     5005 non-null   object             
 13  pets         4011 non-null   object             
 14  religion     4056 non-null   object             
 15  sign         4621 non-null   object             
 16  speaks       5005 non-null   object             
 17  age_group    4945 non-null   category           
dtypes: category(1), datetime64[ns, UTC](1), float64(1), int64(2), object(13)
memory usage: 708.9+ KB

print("2. Derived Features")

2. Derived Features

   #Question: Create a new feature, profile_completeness, by calculating the percentage of non-missing values for each user profile.

df_cleaned['profile_completeness'] = df_cleaned.notnull().mean(axis=1) * 100
df_cleaned.info()

   #This code show the completness of each profile within a column called 'profile_cpmleteness'.

<class 'pandas.core.frame.DataFrame'>
Index: 5005 entries, 3 to 59914
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   age                   5005 non-null   int64              
 1   status                5005 non-null   object             
 2   gender                5005 non-null   object             
 3   body_type             4726 non-null   object             
 4   diet                  3311 non-null   object             
 5   drinks                4931 non-null   object             
 6   education             4689 non-null   object             
 7   ethnicity             4716 non-null   object             
 8   height                5005 non-null   float64            
 9   income                5005 non-null   int64              
 10  job                   4873 non-null   object             
 11  last_online           1993 non-null   datetime64[ns, UTC]
 12  location              5005 non-null   object             
 13  pets                  4011 non-null   object             
 14  religion              4056 non-null   object             
 15  sign                  4621 non-null   object             
 16  speaks                5005 non-null   object             
 17  age_group             4945 non-null   category           
 18  profile_completeness  5005 non-null   float64            
dtypes: category(1), datetime64[ns, UTC](1), float64(2), int64(2), object(13)
memory usage: 748.0+ KB

print("3.Unit Conversion")

3.Unit Conversion

   #Question: Convert the height column from inches to centimeters using the conversion factor (1 inch = 2.54 cm).

df_cleaned['height_cm'] = df_cleaned['height'] * 2.54
df_cleaned.info()

   #This code converts the data from inches to centimeters and stores the data in the new column called 'height_cm'.

<class 'pandas.core.frame.DataFrame'>
Index: 5005 entries, 3 to 59914
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   age                   5005 non-null   int64              
 1   status                5005 non-null   object             
 2   gender                5005 non-null   object             
 3   body_type             4726 non-null   object             
 4   diet                  3311 non-null   object             
 5   drinks                4931 non-null   object             
 6   education             4689 non-null   object             
 7   ethnicity             4716 non-null   object             
 8   height                5005 non-null   float64            
 9   income                5005 non-null   int64              
 10  job                   4873 non-null   object             
 11  last_online           1993 non-null   datetime64[ns, UTC]
 12  location              5005 non-null   object             
 13  pets                  4011 non-null   object             
 14  religion              4056 non-null   object             
 15  sign                  4621 non-null   object             
 16  speaks                5005 non-null   object             
 17  age_group             4945 non-null   category           
 18  profile_completeness  5005 non-null   float64            
 19  height_cm             5005 non-null   float64            
dtypes: category(1), datetime64[ns, UTC](1), float64(3), int64(2), object(13)
memory usage: 787.1+ KB

df_cleaned.head()

   #This shows the new columns with the updated data from tho 5 rows

print("PART 3 : DATA aNALYSIS")
print("1. Demographic Analysis")

PART 3 : DATA aNALYSIS
1. Demographic Analysis

    #Question 1: What is the gender distribution (gender) across the platform? Are there any significant imbalances?

gender_distribution = df_cleaned['gender'].value_counts(normalize=True) * 100
gender_distribution

    #This code shows the percentage of users on the platform as male and female categories

gender
m    66.233766
f    33.766234
Name: proportion, dtype: float64

     #Question 2: What are the proportions of users in different status categories (e.g., single, married, seeing someone)? What does this suggest about the platform’s target audience?

status_distribution = df_cleaned['status'].value_counts(normalize=True) * 100
status_distribution

    #This code shows the percentage of users according to their marital status category

status
single            88.971029
seeing someone     5.654346
available          4.575425
married            0.759241
unknown            0.039960
Name: proportion, dtype: float64

    #Question 3: How does status vary by gender? For example, what proportion of men and women identify as single?


status_by_gender = df_cleaned.groupby('gender')['status'].value_counts(normalize=True).unstack() * 100
status_by_gender

    #This code shows the ppercentage of relationships based on gender if it varies from gender to gender

print("2.Correlation Analysis")

2.Correlation Analysis

    #Question 1: What are the correlations between numerical columns such as age, income, and height? Are there any strong positive or negative relationships?


correlation_matrix = df_cleaned[['age', 'income', 'height']].corr()
correlation_matrix

     #Question 2: How does age correlate with income? Are older users more likely to report higher income levels?

# Plot a scatterplot to visualize the relationship between age and income

sns.scatterplot(x='age', y='income', data=df_cleaned)
plt.title('Age vs. Income')
plt.xlabel('Age')
plt.ylabel('Income')
plt.show()

print("3.Diet and lifestyle analysis")

3.Diet and lifestyle analysis

     #Question 1: How do dietary preferences (diet) distribute across the platform? For example, what percentage of users identify as vegetarian, vegan, or follow "anything" diets?

diet_distribution = df_cleaned['diet'].value_counts(normalize=True) * 100
diet_distribution

     #This code shows the percentage dietary preferences within the users

diet
mostly anything        46.934461
anything               15.040773
strictly anything      13.772274
mostly vegetarian       9.634551
mostly other            3.292057
strictly vegetarian     3.020236
strictly other          2.053760
mostly vegan            1.449713
strictly vegan          1.238297
vegetarian              1.208094
other                   1.026880
vegan                   0.543642
mostly kosher           0.362428
mostly halal            0.241619
halal                   0.090607
strictly halal          0.060405
strictly kosher         0.030202
Name: proportion, dtype: float64

      #Question 2: How do drinking habits (drinks) vary across different diet categories? Are users with stricter diets (e.g., vegan) less likely to drink?

drinks_by_diet = df_cleaned.groupby('diet')['drinks'].value_counts(normalize=True).unstack() * 100
drinks_by_diet

    #This shows the percentage of users food prefences when dinking

print("4.Geographical Insights")

4.Geographical Insights

     #Question 1: Extract city and state information from the location column. What are the top 5 cities and states with the highest number of users?

df_cleaned[['city', 'state']] = df_cleaned['location'].str.split(',', expand=True)
top_cities = df_cleaned['city'].value_counts().head(5)
top_states = df_cleaned['state'].value_counts().head(5)
top_cities, top_states

(city
 san francisco    1776
 oakland           852
 berkeley          432
 hayward           118
 san leandro       117
 Name: count, dtype: int64,
 state
 california    4996
 arizona          2
 louisiana        1
 michigan         1
 texas            1
 Name: count, dtype: int64)

     #Question 2: How does age vary across the top cities? Are certain cities dominated by younger or older users?

avg_age_by_city = df_cleaned.groupby('city')['age'].mean().sort_values(ascending=False).head(5)
avg_age_by_city

     #This code shows the top 5 cities according to the age percentage

city
forest knolls    65.0
tucson           52.0
lagunitas        41.5
larkspur         40.5
montara          38.0
Name: age, dtype: float64

     #uestion 3: What are the average income levels in the top states or cities? Are there regional patterns in reported income?

avg_income_by_city = df_cleaned.groupby('city')['income'].mean().sort_values(ascending=False).head(5)
avg_income_by_state = df_cleaned.groupby('state')['income'].mean().sort_values(ascending=False).head(5)
avg_income_by_city, avg_income_by_state

(city
 colma            40000.0
 philadelphia     40000.0
 corte madera     40000.0
 concord          40000.0
 forest knolls    40000.0
 Name: income, dtype: float64,
 state
 new york         40000.000000
 pennsylvania     40000.000000
 massachusetts    30000.000000
 california       26110.888711
 arizona          20000.000000
 Name: income, dtype: float64)

print("Height Analysis")

Height Analysis

     #Question 1: What is the average height of users across different gender categories?

avg_height_by_gender = df_cleaned.groupby('gender')['height'].mean()
avg_height_by_gender

gender
f    65.240237
m    70.360483
Name: height, dtype: float64

    #Question 2: How does height vary by age_group? Are there noticeable trends among younger vs. older users?

avg_height_by_age_group = df_cleaned.groupby('age_group',observed=False)['height'].mean()
avg_height_by_age_group

age_group
18-25    68.631604
26-35    68.757000
36-45    68.448347
46+      68.198529
Name: height, dtype: float64

     #Question 3: What is the distribution of height within body_type categories (e.g., athletic, curvy, thin)? Do the distributions align with expectations?

sns.boxplot(x='height', y='body_type', data=df_cleaned)
plt.title('Height Distribution by Body Type')
plt.show()

      #This code shows the distribution of height within the body type category using boxplot visualization

print("Income Analysis")

Income Analysis

     #Question 1: What is the distribution of income across the platform? Are there specific income brackets that dominate? (don't count 0)

income_distribution = df_cleaned[df_cleaned['income'] > 0]['income'].describe()
income_distribution

count     5005.00000
mean     26109.89011
std       7996.29661
min      20000.00000
25%      20000.00000
50%      20000.00000
75%      30000.00000
max      40000.00000
Name: income, dtype: float64

      #Question 2: How does income correlate with age and height? Are older or taller users more likely to report higher incomes?

sns.pairplot(df_cleaned[['income', 'age', 'height']])
plt.show()

print("Part 4: Data Visualization")
print("1. Age Distribution")

Part 4: Data Visualization
1. Age Distribution

      #Question 1: Plot a histogram of age with a vertical line indicating the mean age. What does the distribution reveal about the most common age group on the platform?

plt.figure(figsize=(5, 3))
sns.histplot(df_cleaned['age'], kde=False, bins=30, color='skyblue')
plt.axvline(df_cleaned['age'].mean(), color='red', linestyle='dashed', linewidth=2, label=f'Mean Age: {df_cleaned["age"].mean():.2f}')
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.legend()
plt.show()

      #Question 2: How does the age distribution differ by gender? Are there age groups where one gender is more prevalent?

plt.figure(figsize=(5,3))
sns.histplot(df_cleaned, x='age', hue='gender', multiple='stack', bins=30, palette='pastel')
plt.title('Age Distribution by Gender')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.legend(title='Gender', labels=['Male', 'Female'])
plt.show()

print("Income and Age")

Income and Age

      #Question 1: Use a scatterplot to visualize the relationship between income and age, with a trend line indicating overall patterns. Are older users more likely to report higher incomes?

plt.figure(figsize=(5,3))
sns.regplot(x='age', y='income', data=df_cleaned, scatter_kws={'s': 10}, line_kws={'color': 'red'})
plt.title('Income vs. Age')
plt.xlabel('Age')
plt.ylabel('Income')
plt.show()

      #Question 2: Create boxplots of income grouped by age group. Which age group reports the highest median income?

plt.figure(figsize=(5,3))
sns.boxplot(x='age_group', y='income', data=df_cleaned)
plt.title('Income Distribution by Age Group')
plt.xlabel('Age Group')
plt.ylabel('Income')
plt.show()

print("Pets and Preferences")

Pets and Preferences

     #Question 1: Create a bar chart showing the distribution of pet preferences (e.g., likes dogs, likes cats). Which preferences are most common?

plt.figure(figsize=(15,3))
sns.countplot(x='pets', data=df_cleaned, hue='pets', palette='Set2', legend=False)
plt.title('Distribution of Pet Preferences')
plt.xlabel('Pet Preference')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()

      #Question 2: How do pet preferences vary across gender and age group? Are younger users more likely to report liking pets compared to older users?

# Pet preferences by gender and age group
plt.figure(figsize=(15, 3))
sns.countplot(x='pets', hue='gender', data=df_cleaned, palette='Set2')
plt.title('Pet Preferences by Gender')
plt.xlabel('Pet Preference')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend(title='Gender', labels=['Male', 'Female'])
plt.show()

# Pet preferences by age group
plt.figure(figsize=(15, 3))
sns.countplot(x='pets', hue='age_group', data=df_cleaned, palette='Set3')
plt.title('Pet Preferences by Age Group')
plt.xlabel('Pet Preference')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend(title='Age Group')
plt.show()

print("4.Zodiac signs and Personality")

4.Zodiac signs and Personality

     #Question 1: Create a pie chart showing the distribution of zodiac signs (sign) across the platform. Which signs are most and least represented? Is this the right chart? If not, replace with the right chart.

sign_distribution = df_cleaned['sign'].value_counts()
plt.figure(figsize=(10,10))
sign_distribution.plot.pie(autopct='%1.1f%%', startangle=90, cmap='Set3')
plt.title('Distribution of Zodiac Signs')
plt.ylabel('')
plt.show()

      #Question 2: How does sign vary across gender and status? Are there noticeable patterns or imbalances?

#Zodiac signs by gender
plt.figure(figsize=(20, 3))
sns.countplot(x='sign', hue='gender', data=df_cleaned, palette='Set2')
plt.title('Zodiac Signs by Gender')
plt.xlabel('Zodiac Sign')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend(title='Gender', labels=['Male', 'Female'])
plt.show()

# Zodiac signs by status
plt.figure(figsize=(20, 3))
sns.countplot(x='sign', hue='status', data=df_cleaned, palette='Set2')
plt.title('Zodiac Signs by Status')
plt.xlabel('Zodiac Sign')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend(title='Status')
plt.show()

	age	status	gender	body_type	diet	drinks	education	ethnicity	height	income	job	last_online	location	pets	religion	sign	speaks
0	22	single	m	a little extra	strictly anything	socially	working on college/university	asian, white	75.0	-1	transportation	2012-06-28-20-30	south san francisco, california	likes dogs and likes cats	agnosticism and very serious about it	gemini	english
1	35	single	m	average	mostly other	often	working on space camp	white	70.0	80000	hospitality / travel	2012-06-29-21-41	oakland, california	likes dogs and likes cats	agnosticism but not too serious about it	cancer	english (fluently), spanish (poorly), french (...
2	38	available	m	thin	anything	socially	graduated from masters program	NaN	68.0	-1	NaN	2012-06-27-09-10	san francisco, california	has cats	NaN	pisces but it doesn’t matter	english, french, c++
3	23	single	m	thin	vegetarian	socially	working on college/university	white	71.0	20000	student	2012-06-28-14-22	berkeley, california	likes cats	NaN	pisces	english, german (poorly)
4	29	single	m	athletic	NaN	socially	graduated from college/university	asian, black, other	66.0	-1	artistic / musical / writer	2012-06-27-21-26	san francisco, california	likes dogs and likes cats	NaN	aquarius	english

	age	height	income
count	59946.000000	59946.000000	59946.000000
mean	32.340290	68.295282	20033.222534
std	9.452779	3.994738	97346.192104
min	18.000000	1.000000	-1.000000
25%	26.000000	66.000000	-1.000000
50%	30.000000	68.000000	-1.000000
75%	37.000000	71.000000	-1.000000
max	110.000000	95.000000	1000000.000000

	age	status	gender	body_type	diet	drinks	education	ethnicity	height	income	job	last_online	location	pets	religion	sign	speaks
0	22	single	m	a little extra	strictly anything	socially	working on college/university	asian, white	75.0	0	transportation	NaT	south san francisco, california	likes dogs and likes cats	agnosticism and very serious about it	gemini	english
1	35	single	m	average	mostly other	often	working on space camp	white	70.0	80000	hospitality / travel	NaT	oakland, california	likes dogs and likes cats	agnosticism but not too serious about it	cancer	english (fluently), spanish (poorly), french (...
2	38	available	m	thin	anything	socially	graduated from masters program	NaN	68.0	0	NaN	2012-06-27 19:00:00+00:00	san francisco, california	has cats	NaN	pisces but it doesn’t matter	english, french, c++
3	23	single	m	thin	vegetarian	socially	working on college/university	white	71.0	20000	student	2012-06-29 12:00:00+00:00	berkeley, california	likes cats	NaN	pisces	english, german (poorly)
4	29	single	m	athletic	NaN	socially	graduated from college/university	asian, black, other	66.0	0	artistic / musical / writer	NaT	san francisco, california	likes dogs and likes cats	NaN	aquarius	english

	age	status	gender	body_type	diet	drinks	education	ethnicity	height	income	job	last_online	location	pets	religion	sign	speaks	age_group	profile_completeness	height_cm
3	23	single	m	thin	vegetarian	socially	working on college/university	white	71.0	20000	student	2012-06-29 12:00:00+00:00	berkeley, california	likes cats	NaN	pisces	english, german (poorly)	18-25	94.736842	180.34
11	28	seeing someone	m	average	mostly anything	socially	graduated from college/university	white	72.0	40000	banking / financial / real estate	2012-05-22 22:00:00+00:00	daly city, california	likes cats	christianity and very serious about it	leo but it doesn’t matter	english (fluently), sign language (poorly)	26-35	100.000000	182.88
13	30	single	f	skinny	mostly anything	socially	graduated from high school	white	66.0	30000	sales / marketing / biz dev	2012-06-13 22:00:00+00:00	san francisco, california	has dogs and likes cats	christianity but not too serious about it	NaN	english	26-35	94.736842	167.64
66	22	single	m	athletic	mostly anything	rarely	working on college/university	asian	65.0	20000	education / academia	2012-06-30 02:00:00+00:00	san jose, california	NaN	buddhism and laughing about it	virgo but it doesn’t matter	english (fluently)	18-25	94.736842	165.10
79	21	single	m	fit	mostly anything	rarely	working on college/university	white	71.0	20000	entertainment / media	NaT	san francisco, california	likes dogs and likes cats	catholicism but not too serious about it	NaN	english	18-25	89.473684	180.34

status	available	married	seeing someone	single	unknown
gender
f	5.088757	1.005917	7.573964	86.213018	0.118343
m	4.313725	0.633484	4.675716	90.377074	NaN

	age	income	height
age	1.000000	0.266827	-0.021453
income	0.266827	1.000000	0.017391
height	-0.021453	0.017391	1.000000

drinks	desperately	not at all	often	rarely	socially	very often
diet
anything	0.204918	7.786885	14.754098	11.680328	64.139344	1.434426
halal	NaN	NaN	NaN	NaN	100.000000	NaN
mostly anything	0.324254	8.819715	10.246433	13.488975	65.369650	1.750973
mostly halal	NaN	62.500000	12.500000	12.500000	NaN	12.500000
mostly kosher	NaN	8.333333	NaN	41.666667	50.000000	NaN
mostly other	NaN	15.740741	7.407407	21.296296	53.703704	1.851852
mostly vegan	NaN	18.750000	8.333333	25.000000	45.833333	2.083333
mostly vegetarian	0.949367	6.962025	9.493671	17.721519	63.607595	1.265823
other	2.941176	8.823529	11.764706	14.705882	61.764706	NaN
strictly anything	1.545254	6.181015	16.777042	9.492274	63.796909	2.207506
strictly halal	NaN	NaN	NaN	NaN	50.000000	50.000000
strictly kosher	NaN	NaN	100.000000	NaN	NaN	NaN
strictly other	3.076923	16.923077	7.692308	10.769231	56.923077	4.615385
strictly vegan	NaN	25.641026	12.820513	20.512821	41.025641	NaN
strictly vegetarian	3.000000	9.000000	18.000000	14.000000	54.000000	2.000000
vegan	NaN	16.666667	16.666667	22.222222	38.888889	5.555556
vegetarian	2.500000	12.500000	12.500000	5.000000	62.500000	5.000000