# Step 1: Install gdown
!pip install gdown

# Step 2: Import the file from Google Drive
import gdown

# File ID and URL
file_id = "1Jhl4lWVCMsvuWzTVY7hYCE88AMv-f1-7"
url = f"https://drive.google.com/uc?id={file_id}"

# Output file name
output = "duolingo_data.csv"

# Step 3: Download the file
gdown.download(url, output, quiet=False)

# Step 4: Load the file into a DataFrame
import pandas as pd
data = pd.read_csv(output)

Requirement already satisfied: gdown in /usr/local/lib/python3.12/dist-packages (5.2.1)
Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.12/dist-packages (from gdown) (4.13.5)
Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from gdown) (3.24.3)
Requirement already satisfied: requests[socks] in /usr/local/lib/python3.12/dist-packages (from gdown) (2.32.4)
Requirement already satisfied: tqdm in /usr/local/lib/python3.12/dist-packages (from gdown) (4.67.3)
Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4->gdown) (2.8.3)
Requirement already satisfied: typing-extensions>=4.0.0 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4->gdown) (4.15.0)
Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests[socks]->gdown) (3.4.4)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.12/dist-packages (from requests[socks]->gdown) (3.11)
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests[socks]->gdown) (2.5.0)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.12/dist-packages (from requests[socks]->gdown) (2026.1.4)
Requirement already satisfied: PySocks!=1.5.7,>=1.5.6 in /usr/local/lib/python3.12/dist-packages (from requests[socks]->gdown) (1.7.1)

Downloading...
From (original): https://drive.google.com/uc?id=1Jhl4lWVCMsvuWzTVY7hYCE88AMv-f1-7
From (redirected): https://drive.google.com/uc?id=1Jhl4lWVCMsvuWzTVY7hYCE88AMv-f1-7&confirm=t&uuid=a159de7a-11b3-4150-a3db-45ebb69f5b7d
To: /content/duolingo_data.csv
100%|██████████| 409M/409M [00:05<00:00, 75.2MB/s]

     # library for mathematical functions
import numpy as np
    #library for dataframes or tables
import pandas as pd
    # library for visualization functions (charts, graphs)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Load the CSV from the working directory into `data`
data = pd.read_csv('duolingo_data.csv')

# Print the first five rows; to_string() renders the frame as plain text
print(data.head().to_string())

   p_recall            timestamp    delta user_id learning_language ui_language                         lexeme_id                    lexeme_string  history_seen  history_correct  session_seen  session_correct
0       1.0  2013-03-03 17:13:47  1825254     5C7                fr          en  3712581f1a9fbc0894e22664992663e9                      sur/sur<pr>             2                1             2                2
1       1.0  2013-03-04 18:30:50      367    fWSx                en          es  0371d118c042c6b44ababe667bed2760             police/police<n><pl>             6                5             2                2
2       0.0  2013-03-03 18:35:44     1329    hL-s                de          en  5fa1f0fcc3b5d93b8617169e59884367  hat/haben<vbhaver><pri><p3><sg>            10               10             1                0
3       1.0  2013-03-07 17:56:03      156    h2_R                es          en  4d77de913dc3d65f1c9fac9d1c349684                        en/en<pr>           111               99             4                4
4       1.0  2013-03-05 21:41:22      257     eON                es          en  35f14d06d95a34607d6abb0e52fc6d2b        caballo/caballo<n><m><sg>             3                3             3                3

data.shape   # (number of rows, number of columns) of the dataset

(3795780, 12)

data.head()  # displays the first 5 rows of the dataset

# Count the missing entries in each column and print the per-column totals
missing_values = data.isna().sum()
print(missing_values)

p_recall             0
timestamp            0
delta                0
user_id              0
learning_language    0
ui_language          0
lexeme_id            0
lexeme_string        0
history_seen         0
history_correct      0
session_seen         0
session_correct      0
dtype: int64

data.dtypes     # shows the data type of each column

# Convert the timestamp column from strings to datetime values so the .dt accessor
# can be used on it later.
# errors='coerce' replaces unparseable entries with NaT instead of raising. The
# missing-value check earlier in the notebook ran before this conversion, so count
# the values that became NaT here and report them rather than losing them silently.
missing_before_parse = data['timestamp'].isna().sum()
data['timestamp'] = pd.to_datetime(data['timestamp'], errors='coerce')
unparsed_timestamps = data['timestamp'].isna().sum() - missing_before_parse
if unparsed_timestamps:
    print(f"Warning: {unparsed_timestamps} timestamp value(s) could not be parsed and were set to NaT")
data.dtypes

# New column: session_correct divided by session_seen, computed row by row
data['session_accuracy'] = data['session_correct'].div(data['session_seen'])
data.head()

data.head()  # displays the first 5 rows of the frame

# Write the current frame to disk without the index column.
# NOTE(review): 'duolingo_data.csv' is the same path the data was loaded from, so the
# file on disk is replaced by this processed frame — confirm the overwrite is intended.
data.to_csv('duolingo_data.csv', index=False)

data.info()  # prints the index range, the columns with their dtypes, and the memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3795780 entries, 0 to 3795779
Data columns (total 13 columns):
 #   Column             Dtype         
---  ------             -----         
 0   p_recall           float64       
 1   timestamp          datetime64[ns]
 2   delta              int64         
 3   user_id            object        
 4   learning_language  object        
 5   ui_language        object        
 6   lexeme_id          object        
 7   lexeme_string      object        
 8   history_seen       int64         
 9   history_correct    int64         
 10  session_seen       int64         
 11  session_correct    int64         
 12  session_accuracy   float64       
dtypes: datetime64[ns](1), float64(2), int64(5), object(5)
memory usage: 376.5+ MB

# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
data[['p_recall', 'delta', 'history_seen','history_correct','session_seen','session_correct']].describe()    # used to look for outliers

# Mean delta and mean p_recall for every user, one row per user_id
user_practice = (
    data.groupby('user_id', as_index=False)[['delta', 'p_recall']].mean()
)

# Correlation between the two per-user averages
correlation_delta_recall = user_practice['delta'].corr(user_practice['p_recall'])
print("Correlation between delta and recall accuracy:", correlation_delta_recall)

Correlation between delta and recall accuracy: -0.07888060432791903

# Calendar date of every row, stored as a new column on `data`
data['date'] = data['timestamp'].dt.date

# One row per user: number of distinct dates, summed history counters, and
# cumulative_accuracy = summed history_correct / summed history_seen
user_engagement = (
    data.groupby('user_id')
    .agg(
        date=('date', 'nunique'),
        history_correct=('history_correct', 'sum'),
        history_seen=('history_seen', 'sum'),
    )
    .reset_index()
    .assign(
        cumulative_accuracy=lambda per_user: per_user['history_correct'] / per_user['history_seen']
    )
)

# In this frame the 'date' column holds each user's count of distinct dates
correlation_active_days_accuracy = user_engagement['date'].corr(user_engagement['cumulative_accuracy'])
print("Correlation between active days and cumulative accuracy: ", correlation_active_days_accuracy)

Correlation between active days and cumulative accuracy:  0.06644845619084934

# Mean p_recall for each calendar date; reset_index turns the date index into a column
daily_progress = data.groupby(data['timestamp'].dt.date)['p_recall'].mean().reset_index()
print(daily_progress.head())

    timestamp  p_recall
0  2013-03-01  0.895310
1  2013-03-02  0.897230
2  2013-03-03  0.895920
3  2013-03-04  0.896910
4  2013-03-05  0.896474

# Summed history counters for every lexeme_string, one row per lexeme
lexeme_repeted = (
    data.groupby('lexeme_string', as_index=False)[['history_seen', 'history_correct']].sum()
)

# recall_accuracy = summed history_correct / summed history_seen
lexeme_repeted['recall_accuracy'] = lexeme_repeted['history_correct'].div(lexeme_repeted['history_seen'])

# Correlation between how often a lexeme was seen and its recall accuracy
correlation_lexeme_recall = lexeme_repeted['history_seen'].corr(lexeme_repeted['recall_accuracy'])
print("Correlation between history_seen and recall accuracy: ", correlation_lexeme_recall)

Correlation between history_seen and recall accuracy:  0.001616856728345985

# One row per user: mean delta, summed history counters, and
# cumulative_accuracy = summed history_correct / summed history_seen
user_frequency = (
    data.groupby('user_id')
    .agg(
        delta=('delta', 'mean'),
        history_correct=('history_correct', 'sum'),
        history_seen=('history_seen', 'sum'),
    )
    .reset_index()
    .assign(
        cumulative_accuracy=lambda per_user: per_user['history_correct'] / per_user['history_seen']
    )
)

# Correlation between a user's mean delta and their cumulative accuracy
correlation_delta_cumulative = user_frequency['delta'].corr(user_frequency['cumulative_accuracy'])
print("Correlation between delta and cumulative accuracy: ", correlation_delta_cumulative)

Correlation between delta and cumulative accuracy:  0.035299646579048556

# Mean p_recall for each learning_language, one row per language
language_difficulty = data.groupby('learning_language', as_index=False)['p_recall'].mean()
print(language_difficulty)

  learning_language  p_recall
0                de  0.892481
1                en  0.898900
2                es  0.898845
3                fr  0.882756
4                it  0.907550
5                pt  0.903441

# Mean p_recall for every (ui_language, learning_language) pair present in the data
ui_language_impact = (
    data.groupby(['ui_language', 'learning_language'])
    .agg(p_recall=('p_recall', 'mean'))
    .reset_index()
)
print(ui_language_impact)

  ui_language learning_language  p_recall
0          en                de  0.892481
1          en                es  0.898845
2          en                fr  0.882756
3          en                it  0.907550
4          en                pt  0.903441
5          es                en  0.898155
6          it                en  0.908159
7          pt                en  0.897696

# Per learning_language: total session_seen and mean p_recall
language_trends = (
    data.groupby('learning_language')
    .agg(
        session_seen=('session_seen', 'sum'),
        p_recall=('p_recall', 'mean'),
    )
    .reset_index()
)
print(language_trends)

  learning_language  session_seen  p_recall
0                de        757359  0.892481
1                en       2716155  0.898900
2                es       1805186  0.898845
3                fr       1027550  0.882756
4                it        402770  0.907550
5                pt        156236  0.903441

# One row per user: summed session_seen and history counters, plus
# cumulative_accuracy = summed history_correct / summed history_seen
user_behavior = (
    data.groupby('user_id', as_index=False)[['session_seen', 'history_correct', 'history_seen']]
    .sum()
    .assign(
        cumulative_accuracy=lambda per_user: per_user['history_correct'] / per_user['history_seen']
    )
)

# Correlation between a user's total session_seen and their cumulative accuracy
correlation_engagement_accuracy = user_behavior['session_seen'].corr(user_behavior['cumulative_accuracy'])
print("Correlation between session_seen and cumulative accuracy: ", correlation_engagement_accuracy)

Correlation between session_seen and cumulative accuracy:  0.06514007500602051

# Per user: standard deviation of session_accuracy and mean p_recall
# NOTE(review): in the sample rows printed earlier, p_recall equals
# session_correct / session_seen, so this may relate the spread and the mean of the
# same quantity — confirm before interpreting the coefficient.
performance_consistency = (
    data.groupby('user_id')
    .agg(
        session_accuracy=('session_accuracy', 'std'),
        p_recall=('p_recall', 'mean'),
    )
    .reset_index()
)

correlation_consistency_recall = performance_consistency['session_accuracy'].corr(
    performance_consistency['p_recall']
)
print("Correlation between std of session_accuracy and recall accuracy:", correlation_consistency_recall)

Correlation between std of session_accuracy and recall accuracy: -0.7522768359836207

# Total session_seen for each calendar date
daily_engagement = data['session_seen'].groupby(data['timestamp'].dt.date).sum()

# Line chart of the daily totals with a marker on every date
plt.figure(figsize=(5, 3))
daily_engagement.plot(color='blue', marker='o')
plt.xlabel('Date')
plt.ylabel('Total Sessions Seen')
plt.title('Daily Engagement Trends')
plt.xticks(rotation=45)
plt.grid()
plt.show()

# Total session_seen for each learning_language
language_engagement = data['session_seen'].groupby(data['learning_language']).sum()

# Bar chart of the totals, sorted from smallest to largest
plt.figure(figsize=(5, 3))
language_engagement.sort_values().plot.bar(color='teal')
plt.xlabel('Learning Language')
plt.ylabel('Total Sessions Seen')
plt.title('Engagement by Learning Language')
plt.xticks(rotation=45)
plt.grid()
plt.show()

# Histogram of p_recall over all rows (30 bins) with a KDE curve drawn on top
plt.figure(figsize=(4, 3))
sns.histplot(data['p_recall'], bins=30, kde=True, color='green')
plt.xlabel('Recall Accuracy')
plt.ylabel('Frequency')
plt.title('Distribution of Recall Accuracy (p_recall)')
plt.show()

# Mean p_recall for each learning_language
language_performance = data['p_recall'].groupby(data['learning_language']).mean()

# Bar chart of the averages, sorted from smallest to largest
plt.figure(figsize=(5, 3))
language_performance.sort_values().plot.bar(color='coral')
plt.xlabel('Learning Language')
plt.ylabel('Average Recall Accuracy')
plt.title('Average Recall Accuracy by Learning Language')
plt.xticks(rotation=45)
plt.grid()
plt.show()

# Rebinds `user_engagement` to a per-user frame of summed session_seen and history
# counters (one row per user_id)
user_engagement = (
    data.groupby('user_id', as_index=False)[['session_seen', 'history_correct', 'history_seen']].sum()
)

# cumulative_accuracy = summed history_correct / summed history_seen
user_engagement['cumulative_accuracy'] = user_engagement['history_correct'].div(
    user_engagement['history_seen']
)

# Scatter plot: one point per user, total session_seen against cumulative accuracy
plt.figure(figsize=(4, 3))
plt.scatter(
    x=user_engagement['session_seen'],
    y=user_engagement['cumulative_accuracy'],
    color='purple',
    alpha=0.6,
)
plt.xlabel('Total Sessions Seen')
plt.ylabel('Cumulative Accuracy')
plt.title('Engagement vs Cumulative Accuracy')
plt.grid()
plt.show()

# Pairwise correlations between the numeric columns of the dataset
correlation_matrix = data.loc[
    :,
    ['p_recall', 'delta', 'history_seen', 'history_correct', 'session_seen', 'session_correct'],
].corr()

# Heatmap of the matrix; each cell is annotated with its value to two decimals
plt.figure(figsize=(4, 3))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True, fmt='.2f')
plt.title('Correlation Matrix of Key Metrics')
plt.show()

	p_recall	delta	history_seen	history_correct	session_seen	session_correct
count	3.795780e+06	3.795780e+06	3.795780e+06	3.795780e+06	3.795780e+06	3.795780e+06
mean	8.964675e-01	7.055116e+05	2.197719e+01	1.949662e+01	1.808655e+00	1.636209e+00
std	2.711188e-01	2.211979e+06	1.283616e+02	1.136178e+02	1.350644e+00	1.309628e+00
min	0.000000e+00	1.000000e+00	1.000000e+00	1.000000e+00	1.000000e+00	0.000000e+00
25%	1.000000e+00	5.180000e+02	3.000000e+00	3.000000e+00	1.000000e+00	1.000000e+00
50%	1.000000e+00	7.609500e+04	6.000000e+00	6.000000e+00	1.000000e+00	1.000000e+00
75%	1.000000e+00	4.346412e+05	1.500000e+01	1.300000e+01	2.000000e+00	2.000000e+00
max	1.000000e+00	3.964973e+07	1.344200e+04	1.281600e+04	2.000000e+01	2.000000e+01

Duolingo's User Insights¶

Problem Statement:¶

Analysis Approach:¶

Dataset:¶

Data Analysis and Visualization¶

PART 1: DATA CLEANING¶

Part 2: Data Processing¶

PART 3 : DATA ANALYSIS¶

2. Learning Success Analysis:¶

3. Language and Platform Trends Analysis:¶

4. User Behaviour Analysis:¶

PART 4 : DATA VISUALIZATION¶

Key Insights:¶

Recommendations:¶

	p_recall	timestamp	delta	user_id	learning_language	ui_language	lexeme_id	lexeme_string	history_seen	history_correct	session_seen	session_correct
0	1.0	2013-03-03 17:13:47	1825254	5C7	fr	en	3712581f1a9fbc0894e22664992663e9	sur/sur<pr>	2	1	2	2
1	1.0	2013-03-04 18:30:50	367	fWSx	en	es	0371d118c042c6b44ababe667bed2760	police/police<n><pl>	6	5	2	2
2	0.0	2013-03-03 18:35:44	1329	hL-s	de	en	5fa1f0fcc3b5d93b8617169e59884367	hat/haben<vbhaver><pri><p3><sg>	10	10	1	0
3	1.0	2013-03-07 17:56:03	156	h2_R	es	en	4d77de913dc3d65f1c9fac9d1c349684	en/en<pr>	111	99	4	4
4	1.0	2013-03-05 21:41:22	257	eON	es	en	35f14d06d95a34607d6abb0e52fc6d2b	caballo/caballo<n><m><sg>	3	3	3	3

	0
p_recall	float64
timestamp	object
delta	int64
user_id	object
learning_language	object
ui_language	object
lexeme_id	object
lexeme_string	object
history_seen	int64
history_correct	int64
session_seen	int64
session_correct	int64