Kaggle: New York City Taxi Trip Duration

原文转载自 「Neo Fung's Blog」 (https://www.neofung.org/2018/01/17/Kaggle-New-York-City-Taxi-Trip-Duration/)

预计阅读时间 0 分钟(共 0 个字, 0 张图片, 0 个链接)

预测纽约的士的行程时间。
给出的数据中,上车时间可以推导出是否节假日和周末(已实现);
上下车经纬度可以获取直线距离(已实现);
上车时间和经纬度可以获取当时所在地的天气(没有实现);
如果有点耐心,可以通过 Google Maps 或者 Bing Maps 获取路线距离(没有实现)。
LightGBM中的feature_fraction是个好东西,用于设置每次迭代中使用的特征的比例。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import os
import sys
from datetime import timedelta
from math import ceil

import numpy as np

try:
import competition as _
except ImportError as ex:
sys.path.append(os.path.abspath(__file__ + '/../..'))

import lightgbm as lgb
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar
from pandas.tseries.offsets import CustomBusinessDay
from pandas import DataFrame
from sklearn.model_selection import train_test_split
from dateutil import parser
from utils import get_current_path, calculate_distance

# Root directory for the data files and result.csv used by this script, as reported by the
# project helper get_current_path (presumably the folder containing this file — confirm in utils).
current_path = get_current_path(__file__)
# Locations of the preprocessed (feature-engineered) CSVs read at training time; the raw CSV
# paths are derived elsewhere in this script by stripping the 'preprocess_' prefix.
train_file_path = current_path + '/data/preprocess_train.csv'
test_file_path = current_path + '/data/preprocess_test.csv'


def preprocess_raw_data(file_path, is_test=False):
    """Load a raw trip CSV and replace ``pickup_datetime`` with calendar features.

    The following columns are appended, in this order: ``pickup_year``,
    ``pickup_month``, ``pickup_day``, ``pickup_weekday`` (Monday == 0),
    ``pickup_hour``, ``pickup_minute``, ``pickup_week_of_month``,
    ``pickup_is_weekend``, ``pickup_holiday``, ``pickup_near_holiday`` and
    ``pickup_businessday``. The last four are 0/1 integer flags based on the
    US federal holiday calendar. ``store_and_fwd_flag`` is re-encoded as
    1 for ``'Y'`` and 0 otherwise.

    Args:
        file_path: Path of the CSV file to read. It must contain the
            ``pickup_datetime`` and ``store_and_fwd_flag`` columns, plus
            ``dropoff_datetime`` unless ``is_test`` is true.
        is_test: When false (training data) the ``dropoff_datetime`` column
            is dropped as well; test files are not expected to have it.

    Returns:
        The transformed DataFrame, without ``pickup_datetime``. A summary of
        the frame is printed via ``DataFrame.info()`` as a side effect.
    """

    def get_week_of_month(dt):
        """Return the 1-based week of the month containing ``dt``."""
        # Shift the day-of-month by the weekday of the 1st so that weeks
        # break on Mondays rather than every 7 days counted from the 1st.
        first_weekday = dt.replace(day=1).weekday()
        return int(ceil((dt.day + first_weekday) / 7.0))

    df = pd.read_csv(file_path)

    # Parse once, vectorised, instead of calling a parser for every row.
    pickup = pd.to_datetime(df['pickup_datetime'])
    # Midnight-normalised timestamps, used for all whole-day comparisons.
    pickup_date = pickup.dt.normalize()

    df['pickup_year'] = pickup.dt.year
    df['pickup_month'] = pickup.dt.month
    df['pickup_day'] = pickup.dt.day
    df['pickup_weekday'] = pickup.dt.weekday
    df['pickup_hour'] = pickup.dt.hour
    df['pickup_minute'] = pickup.dt.minute
    df['pickup_week_of_month'] = pickup.apply(get_week_of_month)

    holidays = USFederalHolidayCalendar().holidays()

    # Business days = weekdays that are not US federal holidays.
    us_bd = CustomBusinessDay(calendar=USFederalHolidayCalendar())
    if pickup_date.empty:
        business_days = pd.DatetimeIndex([])
    else:
        # pd.DatetimeIndex(start=..., end=..., freq=...) no longer exists in
        # pandas >= 1.0; date_range is the supported spelling. The bounds are
        # normalised dates: with raw timestamps the generated days inherit the
        # start's time-of-day, so the final business day was silently dropped
        # whenever that time was later than the time-of-day of the last pickup.
        business_days = pd.date_range(start=pickup_date.min(),
                                      end=pickup_date.max(),
                                      freq=us_bd)

    df['pickup_is_weekend'] = (pickup.dt.weekday >= 5).astype(int)
    df['pickup_holiday'] = pickup_date.isin(holidays).astype(int)

    # Flag the day immediately after or immediately before a holiday.
    one_day = timedelta(days=1)
    near_holiday = pickup_date.isin(holidays + one_day) | pickup_date.isin(holidays - one_day)
    df['pickup_near_holiday'] = near_holiday.astype(int)
    df['pickup_businessday'] = pickup_date.isin(business_days).astype(int)

    df.drop(['pickup_datetime'], inplace=True, axis=1)

    if not is_test:
        df.drop(['dropoff_datetime'], inplace=True, axis=1)
    df['store_and_fwd_flag'] = (df['store_and_fwd_flag'] == 'Y').astype(int)

    # DataFrame.info() prints the summary itself and returns None, so it must
    # not be wrapped in print() (that emitted a stray "None" line).
    df.info()
    return df


def cal_distance(df):
    """Add a ``distance`` column to ``df`` in place.

    Each row's value is whatever the project helper ``calculate_distance``
    returns for that row's pickup and dropoff longitude/latitude (units and
    formula are defined by that helper — check ``utils``). Nothing is returned.
    """

    def _row_distance(row):
        # Argument order expected by the helper: pickup lon/lat, then dropoff lon/lat.
        return calculate_distance(
            row['pickup_longitude'],
            row['pickup_latitude'],
            row['dropoff_longitude'],
            row['dropoff_latitude'],
        )

    df['distance'] = df.apply(func=_row_distance, axis=1)

# One-off preprocessing recipe, kept for reference. Uncomment to rebuild the
# preprocess_*.csv files from the raw CSVs that live next to them:
# train_df = preprocess_raw_data(train_file_path.replace('preprocess_', ''))
# cal_distance(train_df)
# train_df.to_csv(train_file_path, index=False)
# test_df= preprocess_raw_data(test_file_path.replace('preprocess_', ''), is_test=True)
# cal_distance(test_df)
# test_df.to_csv(test_file_path, index=False)
#
# Load the cached features. The training 'id' is not a feature, so it is
# discarded; the test 'id' stays because the submission file needs it.
train_df = pd.read_csv(train_file_path).drop(['id'], axis=1)
test_df = pd.read_csv(test_file_path)
# Hold out 20% of the training rows as the evaluation set.
train_df, eval_df = train_test_split(train_df, test_size=0.2)


def rmsle(y_pred, y_true):
    """Root mean squared logarithmic error, shaped as a LightGBM ``feval``.

    Args:
        y_pred: Array of predicted values.
        y_true: Object exposing ``get_label()`` that returns the true values
            (the evaluation dataset passed in by the caller).

    Returns:
        A ``(name, value, is_higher_better)`` tuple: ``'rmsle'``, the metric
        value, and ``False`` because lower is better.
    """
    labels = y_true.get_label()
    # Negative predictions are written out as 0 in the submission, and
    # log1p is undefined at or below -1, so score them as 0 here too.
    clipped_pred = np.maximum(y_pred, 0)
    # log1p(x) already equals log(x + 1); adding another 1 would evaluate
    # log(x + 2), which is not RMSLE.
    log_error = np.log1p(labels) - np.log1p(clipped_pred)
    return 'rmsle', np.sqrt(np.mean(np.square(log_error))), False

def foo_2(train_df, eval_df, test_df):
    """Train a LightGBM regressor on ``trip_duration`` and predict for ``test_df``.

    Args:
        train_df: Training frame; every column except ``trip_duration`` is a feature.
        eval_df: Validation frame with the same columns, used for early stopping.
        test_df: Frame to predict on; its ``id`` column is removed before prediction.

    Returns:
        The array of predictions returned by the booster for ``test_df``,
        using the best iteration when the booster reports one.
    """
    target = 'trip_duration'

    def to_dataset(frame):
        # Split a frame into feature matrix and label for LightGBM.
        return lgb.Dataset(data=frame.drop([target], axis=1), label=frame[target])

    booster_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'application': 'regression',
        'verbose': 1,
        'device': 'gpu',
        'num_leaves': 512,
        # 'min_data_in_leaf': 28,
        'learning_rate': 0.001,
        # Fraction of the features made available on each boosting iteration.
        'feature_fraction': 0.6,
    }

    # NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments
    # were removed from lgb.train in LightGBM 4.0 (replaced by callbacks) —
    # confirm the installed version before upgrading.
    booster = lgb.train(
        booster_params,
        to_dataset(train_df),
        num_boost_round=40000,
        valid_sets=[to_dataset(eval_df)],
        feval=rmsle,
        early_stopping_rounds=500,
        verbose_eval=1,
    )

    test_features = test_df.drop(['id'], axis=1)
    # Only pin the iteration when early stopping actually recorded one;
    # otherwise leave the booster's own default in effect.
    predict_kwargs = {}
    if booster.best_iteration:
        predict_kwargs['num_iteration'] = booster.best_iteration
    return booster.predict(test_features, **predict_kwargs)


# Train on the split frames and predict a duration for every test row.
predictions = foo_2(train_df, eval_df, test_df)

# Series.as_matrix() was removed in pandas 1.0; .values yields the same
# ndarray and is available in old and new pandas alike.
submission = DataFrame({'id': test_df['id'].values,
                        'trip_duration': predictions.astype(np.int64)})
# A trip cannot last a negative number of seconds: clamp such predictions to 0.
submission['trip_duration'] = submission['trip_duration'].clip(lower=0)
submission.to_csv(current_path + '/result.csv', index=False, float_format='%.9f')
more_vert