import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical computations
import seaborn as sns
import matplotlib.pyplot as plt  # For plotting and visualization
 # For advanced visualizations
from google.colab import files
from scipy.stats import pearsonr

# Step 1: Install gdown
!pip install gdown

# Step 2: Import necessary libraries
import gdown
import pandas as pd

# Step 3: Set the file ID and create a download URL
file_id = "1zeewyef5Czg1B5PEAeFkvihPaqDtzmdG"
download_url = f"https://drive.google.com/uc?id={file_id}"

# Step 4: Set the output file name
output_file = "uber_dataset.csv"

# Step 5: Download the file
gdown.download(download_url, output_file, quiet=False)

# Step 6: Load the CSV file into a Pandas DataFrame
data = pd.read_csv(output_file)

Requirement already satisfied: gdown in /usr/local/lib/python3.11/dist-packages (5.2.0)
Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.11/dist-packages (from gdown) (4.13.3)
Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from gdown) (3.18.0)
Requirement already satisfied: requests[socks] in /usr/local/lib/python3.11/dist-packages (from gdown) (2.32.3)
Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from gdown) (4.67.1)
Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.11/dist-packages (from beautifulsoup4->gdown) (2.6)
Requirement already satisfied: typing-extensions>=4.0.0 in /usr/local/lib/python3.11/dist-packages (from beautifulsoup4->gdown) (4.12.2)
Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests[socks]->gdown) (3.4.1)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests[socks]->gdown) (3.10)
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests[socks]->gdown) (2.3.0)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests[socks]->gdown) (2025.1.31)
Requirement already satisfied: PySocks!=1.5.7,>=1.5.6 in /usr/local/lib/python3.11/dist-packages (from requests[socks]->gdown) (1.7.1)

Downloading...
From (original): https://drive.google.com/uc?id=1zeewyef5Czg1B5PEAeFkvihPaqDtzmdG
From (redirected): https://drive.google.com/uc?id=1zeewyef5Czg1B5PEAeFkvihPaqDtzmdG&confirm=t&uuid=66af391a-dccd-4622-8726-4776345913d9
To: /content/uber_dataset.csv
100%|██████████| 754M/754M [00:07<00:00, 105MB/s]

print("First 5 Rows of the Dataset:")
data.head()

First 5 Rows of the Dataset:

print("\nDataset Information:")
data.info()

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6500000 entries, 0 to 6499999
Data columns (total 16 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   VendorID               int64  
 1   tpep_pickup_datetime   object 
 2   tpep_dropoff_datetime  object 
 3   passenger_count        float64
 4   trip_distance          float64
 5   PULocationID           int64  
 6   DOLocationID           int64  
 7   payment_type           float64
 8   fare_amount            float64
 9   extra                  float64
 10  gst                    float64
 11  tip_amount             float64
 12  tolls_amount           float64
 13  improvement_surcharge  float64
 14  total_amount           float64
 15  congestion_surcharge   float64
dtypes: float64(11), int64(3), object(2)
memory usage: 793.5+ MB

# copying data in DataFrame df
df = data.copy()

print(f"No. of rows in Dataset: {df.shape[0]}")
print(f"No. of columns in Dataset: {df.shape[1]}")

No. of rows in Dataset: 6500000
No. of columns in Dataset: 16

#Null values in each coloumns
missing_value_percentage = round((df.isnull().sum()) * 100 / len(df),2)
null_counts = df.isnull().sum()

missing_data = pd.DataFrame({ 'Missing Values': null_counts,'Percentage Missing': missing_value_percentage})

missing_data['Percentage Missing'] = missing_data['Percentage Missing'].astype(str) + '%'

missing_data

duplicate_count = len(df[df.duplicated()])
print(f"Number of Duplicate Rows in the Dataset: {duplicate_count}")

Number of Duplicate Rows in the Dataset: 12949

# df_cleaned = df.drop_duplicates(keep='first')

# # Check the shape of the cleaned dataset
# print("Number of rows after removing duplicates:", df_cleaned.shape[0])

df = df.drop_duplicates(keep='first')

# Check the shape of the cleaned dataset
print("Number of rows after removing duplicates:", df.shape[0])

Number of rows after removing duplicates: 6487051

# Removes only the rows where % missing value is Very Low.
df.dropna(subset=['passenger_count', 'payment_type'], inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6420771 entries, 0 to 6499999
Data columns (total 16 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   VendorID               int64  
 1   tpep_pickup_datetime   object 
 2   tpep_dropoff_datetime  object 
 3   passenger_count        float64
 4   trip_distance          float64
 5   PULocationID           int64  
 6   DOLocationID           int64  
 7   payment_type           float64
 8   fare_amount            float64
 9   extra                  float64
 10  gst                    float64
 11  tip_amount             float64
 12  tolls_amount           float64
 13  improvement_surcharge  float64
 14  total_amount           float64
 15  congestion_surcharge   float64
dtypes: float64(11), int64(3), object(2)
memory usage: 832.8+ MB

df.shape[0]

6420771

def fast_convert_datetime(series):
    """Optimized function to convert mixed date formats."""
    series = series.astype(str)  # Ensure all values are strings

    # First attempt: Convert assuming MM/DD/YYYY hh:mm:ss AM/PM format
    dt1 = pd.to_datetime(series, format='%m/%d/%Y %I:%M:%S %p', errors='coerce')

    # Second attempt: Convert assuming DD-MM-YYYY H.MM format
    dt2 = pd.to_datetime(series, format='%d-%m-%Y %H.%M', errors='coerce')

    # Combine results, keeping the first successful conversion
    return dt1.fillna(dt2)

# Apply optimized function to both columns
df['tpep_pickup_datetime'] = fast_convert_datetime(df['tpep_pickup_datetime'])
df['tpep_dropoff_datetime'] = fast_convert_datetime(df['tpep_dropoff_datetime'])

# Check for any remaining unconverted values
invalid_dates = df[df['tpep_pickup_datetime'].isna() | df['tpep_dropoff_datetime'].isna()]
print(f"Number of invalid date entries: {len(invalid_dates)}")

Number of invalid date entries: 0

df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6420771 entries, 0 to 6499999
Data columns (total 16 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  datetime64[ns]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   PULocationID           int64         
 6   DOLocationID           int64         
 7   payment_type           float64       
 8   fare_amount            float64       
 9   extra                  float64       
 10  gst                    float64       
 11  tip_amount             float64       
 12  tolls_amount           float64       
 13  improvement_surcharge  float64       
 14  total_amount           float64       
 15  congestion_surcharge   float64       
dtypes: datetime64[ns](2), float64(11), int64(3)
memory usage: 832.8 MB

location_mapping ={
    265:"Anjanapura",
175:"Anjanapura (duplicate entry)",
264:"Arekere",
8:"Attibele (duplicate entry)",
41:"Attibele (Peripheral Town)",
7:"Austin Town",
173:"Bagalakunte",
156:"Bagalur",
95:"Bagalur (Hebbal Kempapura)",
238:"Bagmane Tech Park (CV Raman Nagar)",
139:"Baiyappanahalli",
262:"Banashankari",
129:"Banashankari 3rd Stage",
193:"Banaswadi",
185:"Banaswadi Outer Ring Road",
131:"Bannerghatta (Bannerghatta Town)",
190:"Bannerghatta National Park",
247:"Bannerghatta Road",
150:"Basapura",
89:"Basavanagar",
246:"Basavanagudi",
151:"Basaveshwaranagar",
142:"Bellandur",
259:"BEML Layout (RR Nagar)",
138:"Bengaluru Cantonment Railway Station",
119:"Benson Town",
55:"Bhoopasandra",
118:"Bidarahalli (duplicate entry)",
143:"Bommanahalli",
26:"Bommanahalli (area)",
80:"Bommasandra",
75:"Brigade Road",
232:"Brookefield",
90:"BTM Layout",
228:"Byatarayanapura",
160:"Byrathi",
32:"Cambridge Layout",
258:"Chandra Layout",
207:"Channasandra",
252:"Chikka Tirupathi",
96:"Chikka Tirupathi (duplicate entry)",
23:"Chikkabanavara (duplicate entry)",
200:"Chikkalasandra",
258:"Choodasandra",
10:"Church Street",
49:"Commercial Street",
205:"Cooke Town",
42:"Cox Town",
45:"Cunningham Road",
263:"CV Raman Nagar",
201:"Devanahalli (duplicate entry)",
165:"Devinagar",
203:"Dodballapur Road (Rajanukunte)",
192:"Dodda Aalada Mara Rd (duplicate entry)",
136:"Dodda Kannalli",
191:"Doddabommasandra",
223:"Doddanekundi",
257:"Doddanekundi (EPIP Zone)",
100:"Domlur",
242:"Domlur Layout",
125:"Dommasandra",
157:"Ejipura",
186:"Electronic City Phase I",
137:"Electronic City Phase II",
164:"Embassy Tech Village (Outer Ring Road)",
50:"Frazer Town",
37:"Gandhinagar",
243:"Garuda Mall (Magrath Road)",
166:"Girinagar",
216:"Gottigere",
85:"Gunjur",
218:"HAL 2nd Stage (Indiranagar)",
112:"HAL Airport Road (Kodihalli)",
78:"Halasuru (Ulsoor)",
244:"HBR Layout",
181:"Hebbal",
68:"Hebbal Junction",
235:"Hebbal Kempapura",
189:"Hegde Nagar",
258:"Hennur",
71:"Hennur Road",
183:"Hesaraghatta",
74:"Hoodi",
82:"Horamavu",
73:"Horamavu (duplicate entry)",
29:"Horamavu Agara",
52:"Hoskote",
102:"Hosur Road (Electronics City)",
250:"Hosur Road (Singasandra)",
233:"HSR Layout",
217:"Hulimangala",
226:"Hulimavu",
86:"Hulimavu (duplicate entry)",
254:"Huskur",
117:"Immadihalli",
170:"Indiranagar",
98:"Innovative Film City",
253:"Innovative Film City (duplicate entry)",
210:"Jakkur",
188:"Jalahalli",
167:"Jalahalli East",
249:"Jayanagar",
209:"Jigani",
178:"Jigani (duplicate entry)",
43:"JP Nagar",
174:"JP Nagar Phase 6",
15:"JP Nagar Phase 8",
227:"JP Nagar Phase 9",
196:"Kadubeesanahalli",
19:"Kadubeesanahalli (duplicate entry)",
198:"Kadugodi",
124:"Kadugondanahalli (KG Halli)",
146:"Kaggadasapura",
207:"Kalkere",
130:"Kalyan Nagar",
91:"Kamakhya (Kamakshipalya)",
97:"Kamaksipalya (Kamakshipalya)",
88:"Kammanahalli",
101:"Kammanahalli (duplicate entry)",
202:"Kanakapura",
70:"Kanakapura Road",
67:"Kanchipuram (duplicate entry)",
77:"Karthik Nagar (Marathahalli)",
66:"Kasturi Nagar",
195:"Kathriguppe",
101:"Kattigenahalli",
237:"Kempegowda International Airport (KIAL)",
56:"Kengeri",
44:"Kengeri (duplicate entry)",
62:"Kengeri Satellite Town",
182:"Konanakunte",
132:"Koramangala",
69:"Kothanur",
21:"KR Puram",
161:"KSR Bengaluru City Railway Station",
3:"Kudlu Gate",
145:"Kumaraswamy Layout",
194:"Kumaraswamy Layout (ISRO)",
208:"Kumbalgodu",
36:"Laggere",
40:"Lingarajapuram",
31:"Magadi (duplicate entry)",
133:"Magadi Road (Tollgate)",
13:"Mahadevapura",
34:"Mahalakshmi Layout (duplicate entry)",
236:"Majestic Bus Station (KBS)",
169:"Malleshpalya",
114:"Malleshwaram",
5:"Malleshwaram (duplicate entry)",
148:"Malleswaram",
230:"Manyata Tech Park (Nagawara)",
48:"Marathahalli",
135:"Maruthi Nagar (Yelahanka)",
1:"Maruthi Sevanagar",
131:"Mathikere",
163:"MG Road",
127:"MS Palya",
212:"Munnekollal",
17:"Murphy Town",
183:"Murugeshpalya (duplicate entry)",
213:"Nagadevanahalli (NICE Junction)",
65:"Nagarbhavi",
116:"Nagavara",
4:"Nagawara",
64:"Nagawara (duplicate entry)",
154:"Nandi Hills",
220:"Nayandahalli (Mysore Rd)",
256:"Nelamangala",
83:"NRI Layout",
222:"Old Airport Road (Murugeshpalya)",
158:"Padmanabhanagar",
29:"Pai Layout",
225:"Panathur",
260:"Peenya",
155:"Prashanth Nagar",
197:"Rachenahalli",
231:"Rajajinagar",
184:"Rajanukunte",
224:"Rajarajeshwari Nagar",
28:"Ramamurthy Nagar",
253:"Ramohalli",
123:"Ramohalli (Big Banyan Tree)",
92:"Richards Town",
61:"Richmond Town",
107:"RMZ Ecospace (Bellandur)",
25:"RT Nagar",
261:"Sadashivanagar",
12:"Sadduguntepalya (SG Palya)",
121:"Sahakara Nagar",
126:"Sampigehalli",
215:"Sanjaynagar",
24:"Sarjapur Road",
60:"Seegahalli",
33:"Seshadripuram",
54:"Sevanagar (duplicate entry)",
39:"Shantinagar (Locality)",
140:"Shantinagar Bus Station",
211:"Shivajinagar",
229:"Shivajinagar Bus Station",
141:"Silk Board Junction",
153:"Singanahalli (duplicate entry)",
81:"Singasandra",
177:"Sompura (Nelamangala Rd)",
241:"Sompura (Sarjapur Rd)",
38:"Srirampura",
94:"Sudhama Nagar",
156:"Suryanagar",
219:"Talaghattapura",
122:"TC Palya",
240:"Thavarekere (duplicate entry)",
108:"Thippasandra",
72:"Thyagaraja Nagar",
218:"Ullal",
113:"Ulsoor",
35:"Ulsoor Lake/Cubbon Park",
255:"Varthur",
14:"Vasanthapura",
152:"Vasanthnagar",
149:"Venkatagiri Kote",
147:"Venkatapura",
179:"Vidyaranyapura",
20:"Vidyaranyapura (duplicate entry)",
144:"Vijayanagar",
128:"Vijayanagar (duplicate entry)",
168:"Vishweshwaraiah Layout",
76:"VV Puram",
51:"VV Puram (Food Street)",
162:"Whitefield",
154:"Wonderla Amusement Park",
11:"Yelachenahalli",
87:"Yelahanka",
9:"Yelahanka (duplicate entry)",
93:"Yelahanka New Town",
16:"Yelenahalli",
134:"Yemalur",
234:"Yeshwanthpur Railway Station",
63:"Yeshwantpur Industrial (Peenya)",
}

df['PULocationID'] = df['PULocationID'].map(location_mapping)
df['DOLocationID'] = df['DOLocationID'].map(location_mapping)

payment_mapping = {
    1: "UPI",
    2: "Cash",
    3: "Credit/Debit card",
    4: "Uber Money",
    5: "Voided trip",
    6: "Dispute"
}


df['payment_type'] = df['payment_type'].map(payment_mapping)

df['trip_duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds()

df.head()

df['trip_duration_hours'] = df['trip_duration'] / 60


# df['RDH'] = df['total_amount'].sum() / df['trip_duration_hours'].sum()
df['RDH'] = df['total_amount'] / df['trip_duration_hours']

# Find the minimum and maximum dates in 'tpep_pickup_datetime' column
min_date = df['tpep_pickup_datetime'].min().date()
max_date = df['tpep_pickup_datetime'].max().date()

print(f"Start date: {min_date}")
print(f"End date: {max_date}")

Start date: 2003-01-01
End date: 2021-01-02

df.head()

numeric_columns = [
    'trip_distance', 'fare_amount', 'total_amount', 'passenger_count',
    'extra', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'congestion_surcharge'
]

# Create boxplots for all numerical columns
plt.figure(figsize=(15, 10))
for i, col in enumerate(numeric_columns, 1):
    plt.subplot(3, 3, i)
    sns.boxplot(x=data[col])
    plt.title(f'Boxplot of {col}')
plt.tight_layout()
plt.show()


plt.figure(figsize=(15, 8))
df[numeric_columns].boxplot(rot=45)
plt.title("Boxplot Of Outlier")
plt.show()

df[numeric_columns].agg(['min', 'max'])

numerical_columns = ['trip_distance', 'fare_amount', 'total_amount', 'extra', 'tip_amount',
    'tolls_amount', 'improvement_surcharge', 'congestion_surcharge']

# Compute IQR
Q1 = df[numerical_columns].quantile(0.10)
Q3 = df[numerical_columns].quantile(0.90)
IQR = Q3 - Q1

# Apply the outlier removal condition only to numerical columns
mask = ~((df[numerical_columns] < (Q1 - 1.5 * IQR)) | (df[numerical_columns] > (Q3 + 1.5 * IQR))).any(axis=1)

# Use the mask to filter the full DataFrame
df = df.loc[mask]

print(f"Original dataset size: {data.shape}")
print(f"Cleaned dataset size: {df.shape}")

Original dataset size: (6500000, 16)
Cleaned dataset size: (5611796, 19)

print(f"Percentage of rows removed due to outliers: {round(((data.shape[0] - df.shape[0]) / data.shape[0]) * 100, 2)} %")

Percentage of rows removed due to outliers: 13.66 %

import seaborn as sns
import matplotlib.pyplot as plt

# Plot boxplots for numerical columns
plt.figure(figsize=(15, 8))
df[numerical_columns].boxplot(rot=45)
plt.title("Boxplot After Outlier Removal")
plt.show()

df = df[df['trip_distance'] > 0]

df[numeric_columns].agg(['min', 'max'])

df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'], errors='coerce')

# Extract hour and day of the week
df['hour'] = df['tpep_pickup_datetime'].dt.hour
df['day_of_week'] = df['tpep_pickup_datetime'].dt.day_name()

# Aggregate trips per hour
hourly_trips = df.groupby('hour').size()

# Aggregate trips per day
daily_trips = df.groupby('day_of_week').size()

# Define correct order for days of the week
days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
daily_trips = daily_trips.reindex(days_order)

# Plot hourly trip trend
plt.figure(figsize=(12, 5))
sns.lineplot(x=hourly_trips.index, y=hourly_trips.values, marker='o', color='b')
plt.title('Hourly Trip Demand Trend')
plt.xlabel('Hour of the Day')
plt.ylabel('Number of Trips')
plt.xticks(range(0, 24))
plt.grid(True)
plt.show()

# Plot daily trip trend
plt.figure(figsize=(12, 5))
sns.barplot(x=daily_trips.index, y=daily_trips.values, palette='Blues_r')
plt.title('Daily Trip Demand Trend')
plt.xlabel('Day of the Week')
plt.ylabel('Number of Trips')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

<ipython-input-29-e1304753d659>:29: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=daily_trips.index, y=daily_trips.values, palette='Blues_r')

# Ensure datetime conversion for pickup timestamps
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
df['trip_duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds()

# Extract relevant time-based features
df['hour'] = df['tpep_pickup_datetime'].dt.hour
df['day_of_week'] = df['tpep_pickup_datetime'].dt.day_name()

# Calculate Revenue Per Driver Hour (RDH)


df['driver_hours'] = df['trip_duration'] / 3600  # Convert trip duration from seconds to hours
df['revenue'] = df['fare_amount'] + df['tip_amount'] + df['tolls_amount']  # Total revenue per trip

# Aggregate data at hourly level
hourly_rdh = df.groupby('hour').agg({'revenue': 'sum', 'driver_hours': 'sum'})
hourly_rdh['RDH'] = hourly_rdh['revenue'] / hourly_rdh['driver_hours']


# Plot RDH trend by hour
plt.figure(figsize=(12, 5))
plt.plot(hourly_rdh.index, hourly_rdh['RDH'], marker='o', linestyle='-', color='blue')
plt.title('Hourly Revenue Per Driver Hour (RDH) Trend', fontsize=14)
plt.xlabel('Hour of the Day', fontsize=12)
plt.ylabel('Revenue Per Driver Hour (RDH)', fontsize=12)
plt.grid(True)
plt.xticks(range(0, 24))
plt.show()
df['hourly_rdh']=hourly_rdh['RDH']

plt.figure(figsize=(10, 5))
sns.scatterplot(x=df['trip_distance'], y=df['fare_amount'], alpha=0.5)
plt.title('Impact of Trip Distance on Fare Amount')
plt.xlabel('Trip Distance (miles)')
plt.ylabel('Fare Amount ($)')
plt.grid(True)
plt.show()

# Scatter plot for Trip Distance vs RDH
plt.figure(figsize=(10, 5))
sns.scatterplot(x=df['trip_distance'], y=df['RDH'], alpha=0.5, color='red')
plt.title('Impact of Trip Distance on RDH')
plt.xlabel('Trip Distance (miles)')
plt.ylabel('Revenue Per Driver Hour (RDH)')
plt.grid(True)
plt.show()

payment_revenue = df.groupby('payment_type')['fare_amount'].sum().reset_index()

# Sorting for better visualization
payment_revenue = payment_revenue.sort_values(by='fare_amount', ascending=False)

# Plotting the impact of payment type on revenue
plt.figure(figsize=(10, 5))
sns.barplot(x='payment_type', y='fare_amount', data=payment_revenue, hue='payment_type', palette='Blues_r', legend=False)

# Customizing the plot
plt.xlabel("Payment Type")
plt.ylabel("Total Revenue ($)")
plt.title("Impact of Payment Type on Revenue")
plt.xticks(rotation=45)  # Rotate for better readability

# Show plot
plt.show()

hourly_demand = df.groupby('hour').agg(
    total_trips=('trip_duration', 'count'),
    avg_RDH=('RDH', 'mean')
).reset_index()

# Plotting Peak Demand vs. RDH
fig, ax1 = plt.subplots(figsize=(12, 5))

# First Y-axis: Total trips (bar plot)
sns.barplot(x='hour', y='total_trips', data=hourly_demand, color='blue', alpha=0.6, ax=ax1)
ax1.set_ylabel('Total Trips (Demand)', color='blue')
ax1.set_xlabel('Hour of the Day')
ax1.set_title('Peak Demand vs. RDH Analysis')

# Second Y-axis: Average RDH (line plot)
ax2 = ax1.twinx()
sns.lineplot(x='hour', y='avg_RDH', data=hourly_demand, color='red', marker='o', ax=ax2)
ax2.set_ylabel('Average RDH', color='red')

# Show plot
plt.show()

df['total_amount'] = df['total_amount'].astype(float)

#  **1. Aggregate Earnings by Pickup & Drop-off Location**
pickup_earnings = df.groupby('PULocationID')['total_amount'].sum().reset_index().rename(columns={'total_amount': 'total_pickup_earnings'})
dropoff_earnings = df.groupby('DOLocationID')['total_amount'].sum().reset_index().rename(columns={'total_amount': 'total_dropoff_earnings'})

# Merge back into the main dataframe
df = df.merge(pickup_earnings, on='PULocationID', how='left')
df = df.merge(dropoff_earnings, on='DOLocationID', how='left')

# Compute average earnings per trip from pickup and drop-off points
df['avg_pickup_earnings'] = df['total_pickup_earnings'] / df.groupby('PULocationID')['PULocationID'].transform('count')
df['avg_dropoff_earnings'] = df['total_dropoff_earnings'] / df.groupby('DOLocationID')['DOLocationID'].transform('count')

#  **2. Top 10 High-Earning Pickup Locations**
top_pickup = pickup_earnings.sort_values(by='total_pickup_earnings', ascending=False).head(10)

plt.figure(figsize=(12, 6))
sns.barplot(data=top_pickup, x='total_pickup_earnings', y='PULocationID', palette="Blues_r")
plt.xlabel("Total Earnings (₹)")
plt.ylabel("Pickup Location")
plt.title("Top 10 Pickup Locations by Total Earnings")
plt.show()

#  **3. Top 10 High-Earning Drop-off Locations**
top_dropoff = dropoff_earnings.sort_values(by='total_dropoff_earnings', ascending=False).head(10)

plt.figure(figsize=(12, 6))
sns.barplot(data=top_dropoff, x='total_dropoff_earnings', y='DOLocationID', palette="Reds_r")
plt.xlabel("Total Earnings (₹)")
plt.ylabel("Drop-off Location")
plt.title("Top 10 Drop-off Locations by Total Earnings")
plt.show()

<ipython-input-34-7e9b6bdfec2e>:19: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=top_pickup, x='total_pickup_earnings', y='PULocationID', palette="Blues_r")

<ipython-input-34-7e9b6bdfec2e>:29: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=top_dropoff, x='total_dropoff_earnings', y='DOLocationID', palette="Reds_r")

geo_rdh = df.groupby(['PULocationID', 'hour']).agg(
    avg_rdh=('RDH', 'mean'),
    total_trips=('RDH', 'count')
).reset_index()

# Plot top 10 pickup zones with highest avg RDH
top_zones = geo_rdh.groupby('PULocationID')['avg_rdh'].mean().nlargest(10).index
plt.figure(figsize=(12,6))
sns.lineplot(
    data=geo_rdh[geo_rdh.PULocationID.isin(top_zones)],
    x='hour', y='avg_rdh', hue='PULocationID',
    marker='o'
)
plt.title("Hourly RDH Trends for Top 10 Pickup Zones")
plt.xlabel("Hour of Day")
plt.ylabel("Average RDH (₹/hr)")

Text(0, 0.5, 'Average RDH (₹/hr)')

from scipy.stats import pearsonr
df_cleaned = df[['trip_distance', 'RDH']].dropna()

# 📊 **3. Correlation Analysis**
correlation, p_value = pearsonr(df_cleaned['trip_distance'], df_cleaned['RDH'])

# 📝 **4. Print Results**
print(f"Pearson Correlation Coefficient: {correlation:.4f}")
print(f"P-value: {p_value:.4f}")

# Interpretation
if p_value < 0.05:
    print("✅ There is a statistically significant correlation between trip distance and RDH.")
    if correlation > 0:
        print("This suggests that longer trips tend to yield higher RDH.")
    else:
        print("However, the relationship is negative, meaning longer trips may not always be beneficial for RDH.")
else:
    print("❌ The correlation is not statistically significant. Trip distance may not strongly impact RDH.")

# 📊 **5. Box Plot: RDH by Trip Distance Categories**
df['trip_category'] = pd.cut(df['trip_distance'], bins=[0, 2, 5, 10, np.inf], labels=['Short', 'Medium', 'Long', 'Very Long'])

plt.figure(figsize=(10, 6))
sns.boxplot(x='trip_category', y='RDH', data=df, hue='trip_category', palette="coolwarm", legend=False)
plt.xlabel("Trip Distance Category")
plt.ylabel("Revenue per Driver Hour (RDH)")
plt.title("RDH Across Different Trip Distance Categories")
plt.show()

/usr/local/lib/python3.11/dist-packages/scipy/stats/_stats_py.py:4894: RuntimeWarning: invalid value encountered in subtract
  ym = y - ymean

Pearson Correlation Coefficient: nan
P-value: nan
❌ The correlation is not statistically significant. Trip distance may not strongly impact RDH.

from scipy.stats import ttest_ind
df['hour'] = df['tpep_pickup_datetime'].dt.hour

# 🚦 **2. Define Peak and Non-Peak Hours**
# - **Peak Hours:** 7 AM - 10 AM & 5 PM - 9 PM
# - **Non-Peak Hours:** All other times
peak_hours = [7, 8, 9, 10, 17, 18, 19, 20]

df['time_category'] = df['hour'].apply(lambda x: 'Peak' if x in peak_hours else 'Non-Peak')

# 📊 **3. Compute Average RDH for Peak vs. Non-Peak**
avg_rdh = df.groupby('time_category')['RDH'].mean().reset_index()
print(avg_rdh)

# 📊 **4. Visualization: RDH Distribution**
plt.figure(figsize=(10, 6))
sns.boxplot(x='time_category', y='RDH', data=df,hue='time_category', palette="coolwarm", legend=False)
plt.xlabel("Time of Day")
plt.ylabel("Revenue per Driver Hour (RDH)")
plt.title("RDH During Peak vs. Non-Peak Hours")
plt.show()

# 📊 **5. Hypothesis Test (T-test)**
peak_rdh_values = df[df['time_category'] == 'Peak']['RDH'].dropna()
non_peak_rdh_values = df[df['time_category'] == 'Non-Peak']['RDH'].dropna()

t_stat, p_value = ttest_ind(peak_rdh_values, non_peak_rdh_values, equal_var=False)

# 📝 **6. Print Results**
print(f"T-test Statistic: {t_stat:.4f}")
print(f"P-value: {p_value:.4f}")

if p_value < 0.05:
    print("✅ There is a statistically significant difference in RDH between Peak and Non-Peak hours.")
    if avg_rdh['RDH'].iloc[0] > avg_rdh['RDH'].iloc[1]:
        print("This suggests that RDH is higher during Peak Hours.")
    else:
        print("However, RDH is actually higher during Non-Peak Hours.")
else:
    print("❌ The difference in RDH between Peak and Non-Peak hours is NOT statistically significant.")

  time_category  RDH
0      Non-Peak  inf
1          Peak  inf

T-test Statistic: nan
P-value: nan
❌ The difference in RDH between Peak and Non-Peak hours is NOT statistically significant.

/usr/local/lib/python3.11/dist-packages/scipy/stats/_stats_py.py:1079: RuntimeWarning: invalid value encountered in subtract
  a_zero_mean = a - mean
/usr/local/lib/python3.11/dist-packages/scipy/stats/_stats_py.py:6527: RuntimeWarning: invalid value encountered in scalar subtract
  d = mean1 - mean2
/usr/local/lib/python3.11/dist-packages/scipy/stats/_stats_py.py:7025: RuntimeWarning: invalid value encountered in scalar subtract
  estimate = m1-m2

fare_median = df['fare_amount'].median()
df['fare_category'] = df['fare_amount'].apply(lambda x: 'High Fare' if x >= fare_median else 'Low Fare')

# 📊 **2. Compute Average RDH for High vs. Low Fare Trips**
avg_rdh = df.groupby('fare_category')['RDH'].mean().reset_index()
print(avg_rdh)

# 📊 **3. Visualization: RDH Distribution**
plt.figure(figsize=(10, 6))
sns.boxplot(x='fare_category', y='RDH', data=df, palette="coolwarm", hue='fare_category',legend=False)
plt.xlabel("Fare Category")
plt.ylabel("Revenue per Driver Hour (RDH)")
plt.title("RDH for High Fare vs. Low Fare Trips")
plt.show()

# 📊 **4. Hypothesis Test (T-test)**
high_fare_rdh = df[df['fare_category'] == 'High Fare']['RDH'].dropna()
low_fare_rdh = df[df['fare_category'] == 'Low Fare']['RDH'].dropna()

t_stat, p_value = ttest_ind(high_fare_rdh, low_fare_rdh, equal_var=False)

# 📝 **5. Print Results**
print(f"T-test Statistic: {t_stat:.4f}")
print(f"P-value: {p_value:.4f}")

if p_value < 0.05:
    print("✅ There is a statistically significant difference in RDH between High and Low Fare trips.")
    if avg_rdh['RDH'].iloc[0] > avg_rdh['RDH'].iloc[1]:
        print("This suggests that trips with higher fares yield higher RDH.")
    else:
        print("However, surprisingly, RDH is actually higher for Low Fare trips.")
else:
    print("❌ The difference in RDH between High and Low Fare trips is NOT statistically significant.")

  fare_category  RDH
0     High Fare  inf
1      Low Fare  inf

T-test Statistic: nan
P-value: nan
❌ The difference in RDH between High and Low Fare trips is NOT statistically significant.

/usr/local/lib/python3.11/dist-packages/scipy/stats/_stats_py.py:1079: RuntimeWarning: invalid value encountered in subtract
  a_zero_mean = a - mean
/usr/local/lib/python3.11/dist-packages/scipy/stats/_stats_py.py:6527: RuntimeWarning: invalid value encountered in scalar subtract
  d = mean1 - mean2
/usr/local/lib/python3.11/dist-packages/scipy/stats/_stats_py.py:7025: RuntimeWarning: invalid value encountered in scalar subtract
  estimate = m1-m2

df['is_weekend'] = df['tpep_pickup_datetime'].dt.dayofweek >= 5  # Saturday(5) & Sunday(6) → True

# 📊 **2. Compute Average RDH for Weekends vs. Weekdays**
avg_rdh = df.groupby('is_weekend')['RDH'].mean().reset_index()
avg_rdh['is_weekend'] = avg_rdh['is_weekend'].map({False: 'Weekday', True: 'Weekend'})
print(avg_rdh)

# 📊 **3. Visualization: RDH Distribution**
plt.figure(figsize=(8, 5))
# Changed the hue column to 'is_weekend_str' and mapped the boolean values to corresponding strings
# Also updated the palette dictionary to match with the new hue column values
df['is_weekend_str'] = df['is_weekend'].map({False: 'Weekday', True: 'Weekend'})
sns.boxplot(x='is_weekend_str', y='RDH', data=df, hue='is_weekend_str', legend=False, palette={'Weekday': 'blue', 'Weekend': 'red'})
plt.xlabel("Day Type")
plt.ylabel("Revenue per Driver Hour (RDH)")
plt.title("RDH on Weekends vs. Weekdays")
plt.show()

# 📊 **4. Hypothesis Test (T-test)**
weekend_rdh = df[df['is_weekend'] == True]['RDH'].dropna()
weekday_rdh = df[df['is_weekend'] == False]['RDH'].dropna()

t_stat, p_value = ttest_ind(weekend_rdh, weekday_rdh, equal_var=False)  # Welch’s T-test

# 📝 **5. Print Results**
print(f"T-statistic: {t_stat:.4f}")
print(f"P-value: {p_value:.4f}")

if p_value < 0.05:
    print("✅ Weekends have a statistically significant difference in RDH compared to weekdays.")
else:
    print("❌ No significant difference in RDH between weekends and weekdays.")

  is_weekend  RDH
0    Weekday  inf
1    Weekend  inf

T-statistic: nan
P-value: nan
❌ No significant difference in RDH between weekends and weekdays.

/usr/local/lib/python3.11/dist-packages/scipy/stats/_stats_py.py:1079: RuntimeWarning: invalid value encountered in subtract
  a_zero_mean = a - mean
/usr/local/lib/python3.11/dist-packages/scipy/stats/_stats_py.py:6527: RuntimeWarning: invalid value encountered in scalar subtract
  d = mean1 - mean2
/usr/local/lib/python3.11/dist-packages/scipy/stats/_stats_py.py:7025: RuntimeWarning: invalid value encountered in scalar subtract
  estimate = m1-m2

q1 = df['trip_distance'].quantile(0.33)  # 33rd percentile (Short threshold)
q2 = df['trip_distance'].quantile(0.66)  # 66th percentile (Long threshold)

# Categorization function
def categorize_trip(distance):
    if distance < q1:
        return "Short"
    elif distance < q2:
        return "Medium"
    else:
        return "Long"

df['trip_category'] = df['trip_distance'].apply(categorize_trip)

# 📊 **2. Compute Average RDH for Each Category**
avg_rdh = df.groupby('trip_category')['RDH'].mean().reset_index()
print(avg_rdh)

# 📊 **3. Visualization: RDH Distribution Based on Trip Distance**
plt.figure(figsize=(8, 5))
sns.boxplot(x='trip_category', y='RDH', data=df,
            palette={"Short": "green", "Medium": "blue", "Long": "orange"},
            hue='trip_category', legend=False)
plt.xlabel("Trip Distance Category")
plt.ylabel("Revenue per Driver Hour (RDH)")
plt.title("RDH for Short, Medium & Long Trips")
plt.show()

# 📊 **4. Correlation Analysis (Does Trip Distance Impact RDH?)**
# Drop NaN values from both columns simultaneously
df_cleaned = df[['trip_distance', 'RDH']].dropna()

if df_cleaned.shape[0] > 1:  # Ensure at least 2 non-NaN values exist
    corr, p_value = pearsonr(df_cleaned['trip_distance'], df_cleaned['RDH'])
else:
    corr, p_value = np.nan, np.nan  # Avoids error when data is insufficient

# 📝 **5. Print Results**
print(f"Pearson Correlation Coefficient: {corr:.4f}")
print(f"P-value: {p_value:.4f}")

if not np.isnan(p_value) and p_value < 0.05:
    print("✅ There is a statistically significant relationship between trip distance and RDH.")
else:
    print("❌ No significant relationship between trip distance and RDH.")

  trip_category  RDH
0          Long  inf
1        Medium  inf
2         Short  inf

Pearson Correlation Coefficient: nan
P-value: nan
❌ No significant relationship between trip distance and RDH.

/usr/local/lib/python3.11/dist-packages/scipy/stats/_stats_py.py:4894: RuntimeWarning: invalid value encountered in subtract
  ym = y - ymean

df = df[df['trip_duration'] > 0]
avg_rdh = df.groupby('passenger_count')['RDH'].mean().reset_index()
print(avg_rdh)

# 📊 **2. Visualization: RDH Distribution by Passenger Count**
plt.figure(figsize=(8, 5))
sns.boxplot(x='passenger_count', y='RDH', data=df, hue='passenger_count', legend=False, palette="Set2")
plt.xlabel("Passenger Count")
plt.ylabel("Revenue per Driver Hour (RDH)")
plt.title("RDH vs. Passenger Count")
plt.show()

# 📊 **3. Correlation Analysis (Does Passenger Count Impact RDH?)**
corr, p_value = pearsonr(df['passenger_count'].dropna(), df['RDH'].dropna())

# 📝 **4. Print Results**
print(f"Pearson Correlation Coefficient: {corr:.4f}")
print(f"P-value: {p_value:.4f}")

if p_value < 0.05:
    print("✅ There is a statistically significant relationship between passenger count and RDH.")
else:
    print("❌ No significant relationship between passenger count and RDH.")

   passenger_count         RDH
0              0.0   84.290938
1              1.0   78.496270
2              2.0   76.397067
3              3.0   75.849781
4              4.0   75.047567
5              5.0   77.882020
6              6.0   77.278463
7              7.0  334.294010
8              8.0   91.610031

Pearson Correlation Coefficient: -0.0069
P-value: 0.0000
✅ There is a statistically significant relationship between passenger count and RDH.

df_filtered = df[df['trip_duration'] > 0].copy()

# 🏷 **1. Categorizing Tip Amounts**
df_filtered['tip_category'] = pd.cut(df_filtered['tip_amount'],
                                     bins=[-1, 0, 20, 50, df_filtered['tip_amount'].max()],
                                     labels=["No Tip", "Low", "Medium", "High"])

# 🏷 **2. Calculating Average RDH by Tip Category**
avg_rdh = df_filtered.groupby('tip_category', observed=True)['RDH'].mean().reset_index()

# 📊 **3. Visualization: RDH Distribution by Tip Category**
plt.figure(figsize=(8, 5))
sns.boxplot(x='tip_category', y='RDH', data=df_filtered, hue='tip_category', legend=False, palette="coolwarm")
plt.xlabel("Tip Category")
plt.ylabel("Revenue per Driver Hour (RDH)")
plt.title("RDH vs. Tip Amount Category")
plt.show()

# 📊 **4. Correlation Analysis (Does Tip Amount Impact RDH?)**
corr, p_value = pearsonr(df_filtered['tip_amount'].dropna(), df_filtered['RDH'].dropna())

# 📝 **5. Print Results**
print(f"Pearson Correlation Coefficient: {corr:.4f}")
print(f"P-value: {p_value:.4f}")

if p_value < 0.05:
    print("✅ There is a statistically significant relationship between tip amount and RDH.")
else:
    print("❌ No significant relationship between tip amount and RDH.")

Pearson Correlation Coefficient: -0.0364
P-value: 0.0000
✅ There is a statistically significant relationship between tip amount and RDH.

# df.to_csv('uber_dataset.csv', index=False)
# from google.colab import files

# # Download the CSV file
# files.download('uber_dataset.csv')

# df.info()

	VendorID	tpep_pickup_datetime	tpep_dropoff_datetime	passenger_count	trip_distance	PULocationID	DOLocationID	payment_type	fare_amount	extra	gst	tip_amount	improvement_surcharge	total_amount	congestion_surcharge
0	1	01/01/2020 12:28:15 AM	01/01/2020 12:33:03 AM	1.0	1.9	238	239	1.0	283.73	141.87	23.64	69.52	14.18	532.93	118.22
1	1	01/01/2020 12:35:39 AM	01/01/2020 12:43:04 AM	1.0	1.9	239	238	1.0	331.01	141.87	23.64	70.93	14.18	581.63	118.22
2	1	01/01/2020 12:47:41 AM	01/01/2020 12:53:52 AM	1.0	1.0	238	238	1.0	283.73	141.87	23.64	47.29	14.18	510.71	118.22
3	1	01/01/2020 12:55:23 AM	01/01/2020 01:00:14 AM	1.0	1.3	238	151	1.0	260.08	23.64	23.64	64.31	14.18	385.87	0.00
4	1	01/01/2020 12:01:58 AM	01/01/2020 12:04:16 AM	1.0	0.0	193	193	2.0	165.51	23.64	23.64	0.00	14.18	226.98	0.00

	VendorID	tpep_pickup_datetime	tpep_dropoff_datetime	passenger_count	trip_distance	PULocationID	DOLocationID	payment_type	fare_amount	extra	gst	tip_amount	improvement_surcharge	total_amount	congestion_surcharge	trip_duration
0	1	2020-01-01 00:28:15	2020-01-01 00:33:03	1.0	1.9	Bagmane Tech Park (CV Raman Nagar)	NaN	UPI	283.73	141.87	23.64	69.52	14.18	532.93	118.22	288.0
1	1	2020-01-01 00:35:39	2020-01-01 00:43:04	1.0	1.9	NaN	Bagmane Tech Park (CV Raman Nagar)	UPI	331.01	141.87	23.64	70.93	14.18	581.63	118.22	445.0
2	1	2020-01-01 00:47:41	2020-01-01 00:53:52	1.0	1.0	Bagmane Tech Park (CV Raman Nagar)	Bagmane Tech Park (CV Raman Nagar)	UPI	283.73	141.87	23.64	47.29	14.18	510.71	118.22	371.0
3	1	2020-01-01 00:55:23	2020-01-01 01:00:14	1.0	1.3	Bagmane Tech Park (CV Raman Nagar)	Basaveshwaranagar	UPI	260.08	23.64	23.64	64.31	14.18	385.87	0.00	291.0
4	1	2020-01-01 00:01:58	2020-01-01 00:04:16	1.0	0.0	Banaswadi	Banaswadi	Cash	165.51	23.64	23.64	0.00	14.18	226.98	0.00	138.0

	VendorID	tpep_pickup_datetime	tpep_dropoff_datetime	passenger_count	trip_distance	PULocationID	DOLocationID	payment_type	fare_amount	extra	gst	tip_amount	improvement_surcharge	total_amount	congestion_surcharge	trip_duration	trip_duration_hours	RDH
0	1	2020-01-01 00:28:15	2020-01-01 00:33:03	1.0	1.9	Bagmane Tech Park (CV Raman Nagar)	NaN	UPI	283.73	141.87	23.64	69.52	14.18	532.93	118.22	288.0	4.800000	111.027083
1	1	2020-01-01 00:35:39	2020-01-01 00:43:04	1.0	1.9	NaN	Bagmane Tech Park (CV Raman Nagar)	UPI	331.01	141.87	23.64	70.93	14.18	581.63	118.22	445.0	7.416667	78.422022
2	1	2020-01-01 00:47:41	2020-01-01 00:53:52	1.0	1.0	Bagmane Tech Park (CV Raman Nagar)	Bagmane Tech Park (CV Raman Nagar)	UPI	283.73	141.87	23.64	47.29	14.18	510.71	118.22	371.0	6.183333	82.594609
3	1	2020-01-01 00:55:23	2020-01-01 01:00:14	1.0	1.3	Bagmane Tech Park (CV Raman Nagar)	Basaveshwaranagar	UPI	260.08	23.64	23.64	64.31	14.18	385.87	0.00	291.0	4.850000	79.560825
4	1	2020-01-01 00:01:58	2020-01-01 00:04:16	1.0	0.0	Banaswadi	Banaswadi	Cash	165.51	23.64	23.64	0.00	14.18	226.98	0.00	138.0	2.300000	98.686957

	trip_distance	fare_amount	total_amount	passenger_count	extra	tip_amount	tolls_amount	improvement_surcharge	congestion_surcharge
min	-35.5	-58542.30	-58745.63	0.0	-1276.77	-4303.19	-1690.06	-14.18	-118.22
max	420.6	201682.47	201838.52	9.0	5343.99	52016.58	43055.54	14.18	130.04

Maximizing Revenue per Driver Hour (RDH) through Demand Optimization for Uber in Bangalore¶

Introduction :¶

Objective :¶

Dataset Information¶

Explanation of the Data Columns¶

Data Importing and Cleaning¶

Importing Necessary Libraries¶

Viewing the First Few Rows of the Dataset¶

Displaying Dataset Information¶

Create a copy of the dataset to ensure the original remains unchanged¶

Checking for missing values in columns¶

Checking for redundant values in the rows¶

📌 Observation : There were 12949 duplicates identified, number of rows after removing duplicates is 6487051¶

➡️ "tpep_pickup_datetime" & "tpep_dropoff_datetime" must be converted to datetime format for time-based calculations.¶

Adding New and Important Columns¶

Getting the timeline of the dataset¶

Dataset after Treating missing values:¶

Box plot for each numerical columns¶

Handeling Outliers¶

Exploratory Data Analysis¶

Trip Demand Trends (Hourly & Daily)¶

Revenue Per Driver Hour (RDH) Calculation¶

Impact of Trip Distance on Fare & RDH¶

Impact of Payment Type on Revenue¶

Peak Demand vs. RDH Analysis¶

Effect of Pickup & Drop Location on Earnings¶

Geospatial Hotspots for High RDH¶

Hypothesis Testing¶

1. Hypothesis: Longer Trips Yield Higher RDH¶

2. Hypothesis: Peak Hours Have Higher RDH¶

3. Hypothesis: Trips with Higher Fare Amounts Yield Higher RDH¶

4. Hypothesis: Weekends Have Higher RDH Compared to Weekdays¶

5. Hypothesis: Shorter Trip Distances Yield Higher RDH¶

6. Hypothesis: Trips with Higher Passenger Counts Yield Higher RDH¶

7. Hypothesis: Trips with Higher Tip Amounts Yield Higher RDH¶

Business Recommendations for Uber to Optimize RDH 🚖¶

🚀 Conclusion: Actionable Strategy for Uber¶

Downloading the Updated Data¶

	Missing Values	Percentage Missing
VendorID	0	0.0%
tpep_pickup_datetime	0	0.0%
tpep_dropoff_datetime	0	0.0%
passenger_count	79229	1.22%
trip_distance	0	0.0%
PULocationID	0	0.0%
DOLocationID	0	0.0%
payment_type	79229	1.22%
fare_amount	0	0.0%
extra	0	0.0%
gst	0	0.0%
tip_amount	0	0.0%
tolls_amount	0	0.0%
improvement_surcharge	0	0.0%
total_amount	0	0.0%
congestion_surcharge	0	0.0%

	trip_distance	fare_amount	total_amount	passenger_count	extra	tip_amount	tolls_amount	improvement_surcharge	congestion_surcharge
min	0.1	0.00	132.40	0.0	0.00	0.00	0.0	14.18	118.22
max	24.0	2411.68	2996.15	8.0	165.51	526.79	0.0	14.18	118.22