天池-新人实战赛o2o优惠券使用预测-学习记录

1.学习在anaconda中使用jupyter notebook调试python代码

新建python 3开始逐行编写和调试代码,记住及时保存。

2.学会在阿里云天池中学习机器学习相关知识,参与各类论坛和挑战赛,通过调试别人的代码学习做数据挖掘比赛。

技巧:论坛找资源、看视频学习、多动手尝试、多看多想多请教、学会百度和翻墙很重要;不断尝试、不轻言放弃、敢于迎接挑战。

3.实战-<第1章 O2O优惠券使用新人赛-初试>

跑通了自己的第一个数据挖掘小项目,大致了解了做数据挖掘比赛的流程、方法和思想,了解了一些常用的包。

按照《生活大实惠:O2O优惠券使用预测 》

调试成功。

代码如下:

# coding: utf-8
# O2O coupon-usage prediction: builds user / merchant / user-merchant features
# from an offline-consumption window and trains SGD and LightGBM classifiers.

# In[1]:
# Import libraries necessary for this project.
import os, sys, pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from datetime import date
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb

# Display settings for this notebook.
get_ipython().magic('matplotlib inline')
get_ipython().magic("config InlineBackend.figure_format = 'retina'")


# In[2]:
# NOTE(fix): forward slashes avoid accidental backslash escapes in Windows
# paths, and the online-data path was missing its directory separator
# (the original read 'G:\code\o2occf_online_stage1_train.csv').
dfoff = pd.read_csv('G:/code/o2o/ccf_offline_stage1_train.csv')
dftest = pd.read_csv('G:/code/o2o/ccf_offline_stage1_test_revised.csv')
dfon = pd.read_csv('G:/code/o2o/ccf_online_stage1_train.csv')
dfoff.head(5)


# In[3]:
dfoff.info()


# In[4]:
# Cross-tabulate coupon reception vs. actual purchase in the training set.
print('有优惠券,购买商品条数', dfoff[(dfoff['Date_received'] != 'null') & (dfoff['Date'] != 'null')].shape[0])
print('无优惠券,购买商品条数', dfoff[(dfoff['Date_received'] == 'null') & (dfoff['Date'] != 'null')].shape[0])
print('有优惠券,不购买商品条数', dfoff[(dfoff['Date_received'] != 'null') & (dfoff['Date'] == 'null')].shape[0])
print('无优惠券,不购买商品条数', dfoff[(dfoff['Date_received'] == 'null') & (dfoff['Date'] == 'null')].shape[0])


# In[5]:
# NOTE(fix): the original cell printed these twice (copy-paste corruption)
# and the labels were inverted relative to the set difference actually
# computed (test minus train).
# Users that appear in the test set but not in the training set.
print('1. User_id in test set but not in training set', set(dftest['User_id']) - set(dfoff['User_id']))
# Merchants that appear in the test set but not in the training set.
print('2. Merchant_id in test set but not in training set', set(dftest['Merchant_id']) - set(dfoff['Merchant_id']))


# In[6]:
print('Discount_rate 类型:', dfoff['Discount_rate'].unique())
print('Distance 类型:', dfoff['Distance'].unique())


# In[7]:
# Convert Discount_rate and Distance into numeric features.
def getDiscountType(row):
    """Return 'null' for missing, 1 for full-reduction ('x:y'), 0 for a plain rate."""
    if row == 'null':
        return 'null'
    elif ':' in row:
        return 1
    else:
        return 0

def convertRate(row):
    """Convert a discount string to an effective rate in (0, 1]."""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1]) / float(rows[0])
    else:
        return float(row)

def getDiscountMan(row):
    """Threshold amount of a full-reduction coupon ('man'), else 0."""
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0

def getDiscountJian(row):
    """Reduction amount of a full-reduction coupon ('jian'), else 0."""
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0

def processData(df):
    """Derive discount_* and distance columns in place; returns df."""
    # Convert discount_rate.
    df['discount_rate'] = df['Discount_rate'].apply(convertRate)
    df['discount_man'] = df['Discount_rate'].apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].apply(getDiscountType)
    print(df['discount_rate'].unique())
    # Convert distance; -1 marks missing values.
    df['distance'] = df['Distance'].replace('null', -1).astype(int)
    print(df['distance'].unique())
    return df

dfoff = processData(dfoff)
dftest = processData(dftest)


# In[8]:
dfoff.head(2)


# In[9]:
dftest.head(2)


# In[10]:
date_received = dfoff['Date_received'].unique()
date_received = sorted(date_received[date_received != 'null'])
date_buy = dfoff['Date'].unique()
date_buy = sorted(date_buy[date_buy != 'null'])
# NOTE(fix): removed a duplicated recomputation of date_buy that sorted the
# full non-unique Date column and immediately overwrote the line above;
# min/max (all that is used below) are identical either way.
print('优惠券收到日期从', date_received[0], '到', date_received[-1])
print('消费日期从', date_buy[0], '到', date_buy[-1])


# In[11]:
# Coupons received per day, and coupons received-and-used per day.
couponbydate = dfoff[dfoff['Date_received'] != 'null'][['Date_received', 'Date']].groupby(['Date_received'], as_index=False).count()
couponbydate.columns = ['Date_received', 'count']
buybydate = dfoff[(dfoff['Date'] != 'null') & (dfoff['Date_received'] != 'null')][['Date_received', 'Date']].groupby(['Date_received'], as_index=False).count()
buybydate.columns = ['Date_received', 'count']


# In[13]:
sns.set_style('ticks')
sns.set_context("notebook", font_scale=1.4)
plt.figure(figsize=(12, 8))
date_received_dt = pd.to_datetime(date_received, format='%Y%m%d')

plt.subplot(211)
plt.bar(date_received_dt, couponbydate['count'], label='number of coupon received')
plt.bar(date_received_dt, buybydate['count'], label='number of coupon used')
plt.yscale('log')
plt.ylabel('Count')
plt.legend()

plt.subplot(212)
plt.bar(date_received_dt, buybydate['count'] / couponbydate['count'])
plt.ylabel('Ratio(coupon used/coupon received)')
plt.tight_layout()


# In[14]:
def getWeekday(row):
    """Map 'yyyymmdd' string to ISO-like weekday 1..7; pass 'null' through."""
    if row == 'null':
        return row
    else:
        return date(int(row[0:4]), int(row[4:6]), int(row[6:8])).weekday() + 1

dfoff['weekday'] = dfoff['Date_received'].astype(str).apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].astype(str).apply(getWeekday)

# weekday_type: 1 for Saturday/Sunday, 0 otherwise.
dfoff['weekday_type'] = dfoff['weekday'].apply(lambda x: 1 if x in [6, 7] else 0)
dftest['weekday_type'] = dftest['weekday'].apply(lambda x: 1 if x in [6, 7] else 0)


# In[15]:
# Change weekday to one-hot encoding.
weekdaycols = ['weekday_' + str(i) for i in range(1, 8)]
print(weekdaycols)

tmpdf = pd.get_dummies(dfoff['weekday'].replace('null', np.nan))
tmpdf.columns = weekdaycols
dfoff[weekdaycols] = tmpdf

tmpdf = pd.get_dummies(dftest['weekday'].replace('null', np.nan))
tmpdf.columns = weekdaycols
dftest[weekdaycols] = tmpdf


# In[16]:
def label(row):
    """Label: -1 no coupon, 1 used within 15 days of reception, else 0."""
    if row['Date_received'] == 'null':
        return -1
    if row['Date'] != 'null':
        td = pd.to_datetime(row['Date'], format='%Y%m%d') - pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0

dfoff['label'] = dfoff.apply(label, axis=1)


# In[17]:
print(dfoff['label'].value_counts())


# In[18]:
print('已有columns:', dfoff.columns.tolist())


# In[19]:
dfoff.head(2)


# In[20]:
# Data split by reception date: train before 2016-05-16, valid up to 2016-06-15.
df = dfoff[dfoff['label'] != -1].copy()
train = df[(df['Date_received'] < '20160516')].copy()
valid = df[(df['Date_received'] >= '20160516') & (df['Date_received'] <= '20160615')].copy()
print(train['label'].value_counts())
print(valid['label'].value_counts())


# In[21]:
# Baseline feature set.
original_feature = ['discount_rate', 'discount_type', 'discount_man',
                    'discount_jian', 'distance', 'weekday', 'weekday_type'] + weekdaycols
print(len(original_feature), original_feature)


# In[22]:
# Model 1: elastic-net logistic regression via SGD, tuned with grid search.
predictors = original_feature
print(predictors)

def check_model(data, predictors):
    """Grid-search an SGD logistic-regression pipeline on data[predictors]."""
    classifier = lambda: SGDClassifier(
        loss='log',            # logistic loss -> predict_proba available
        penalty='elasticnet',
        fit_intercept=True,
        max_iter=100,
        shuffle=True,
        n_jobs=1,
        class_weight=None)
    model = Pipeline(steps=[
        ('ss', StandardScaler()),
        ('en', classifier())
    ])
    parameters = {
        'en__alpha': [0.001, 0.01, 0.1],
        'en__l1_ratio': [0.001, 0.01, 0.1]
    }
    folder = StratifiedKFold(n_splits=3, shuffle=True)
    grid_search = GridSearchCV(
        model,
        parameters,
        cv=folder,
        n_jobs=-1,
        verbose=1)
    grid_search = grid_search.fit(data[predictors], data['label'])
    return grid_search

if not os.path.isfile('1_model.pkl'):
    model = check_model(train, predictors)
    print(model.best_score_)
    print(model.best_params_)
    with open('1_model.pkl', 'wb') as f:
        pickle.dump(model, f)
else:
    with open('1_model.pkl', 'rb') as f:
        model = pickle.load(f)


# In[24]:
# Validation predictions.
y_valid_pred = model.predict_proba(valid[predictors])
valid1 = valid.copy()
valid1['pred_prob'] = y_valid_pred[:, 1]
valid1.head(2)


# In[25]:
# Average per-coupon AUC (the competition metric); coupons whose validation
# slice contains only one class are skipped.
vg = valid1.groupby(['Coupon_id'])
aucs = []
for _, tmpdf in vg:
    if len(tmpdf['label'].unique()) != 2:
        continue
    fpr, tpr, thresholds = roc_curve(tmpdf['label'], tmpdf['pred_prob'], pos_label=1)
    aucs.append(auc(fpr, tpr))
print(np.average(aucs))


# In[26]:
# Test prediction for submission.
y_test_pred = model.predict_proba(dftest[predictors])
dftest1 = dftest[['User_id', 'Coupon_id', 'Date_received']].copy()
dftest1['label'] = y_test_pred[:, 1]
dftest1.to_csv('submit1.csv', index=False, header=False)
dftest1.head()


# In[27]:
# Feature window (history before 2016-05-16) vs. labelled data window.
feature = dfoff[(dfoff['Date'] < '20160516') | ((dfoff['Date'] == 'null') & (dfoff['Date_received'] < '20160516'))].copy()
data = dfoff[(dfoff['Date_received'] >= '20160516') & (dfoff['Date_received'] <= '20160615')].copy()
print(data['label'].value_counts())


# In[28]:
fdf = feature.copy()


# In[29]:
# Key of user.
u = fdf[['User_id']].copy().drop_duplicates()


# In[30]:
# u_coupon_count: num of coupons received by user.
u1 = fdf[fdf['Date_received'] != 'null'][['User_id']].copy()
u1['u_coupon_count'] = 1
u1 = u1.groupby(['User_id'], as_index=False).count()
u1.head(2)


# In[31]:
# u_buy_count: times user bought offline (with or without coupon).
u2 = fdf[fdf['Date'] != 'null'][['User_id']].copy()
u2['u_buy_count'] = 1
u2 = u2.groupby(['User_id'], as_index=False).count()
u2.head(2)


# In[32]:
# u_buy_with_coupon: times user bought offline with a coupon.
u3 = fdf[((fdf['Date'] != 'null') & (fdf['Date_received'] != 'null'))][['User_id']].copy()
u3['u_buy_with_coupon'] = 1
u3 = u3.groupby(['User_id'], as_index=False).count()
u3.head(2)


# In[33]:
# u_merchant_count: num of distinct merchants the user bought from.
u4 = fdf[fdf['Date'] != 'null'][['User_id', 'Merchant_id']].copy()
u4.drop_duplicates(inplace=True)
u4 = u4.groupby(['User_id'], as_index=False).count()
u4.rename(columns={'Merchant_id': 'u_merchant_count'}, inplace=True)
u4.head(2)


# In[34]:
# Distance statistics per user (coupon-redeemed purchases only); -1 (missing)
# becomes NaN so it is excluded from min/max/mean/median.
utmp = fdf[(fdf['Date'] != 'null') & (fdf['Date_received'] != 'null')][['User_id', 'distance']].copy()
utmp.replace(-1, np.nan, inplace=True)
u5 = utmp.groupby(['User_id'], as_index=False).min()
u5.rename(columns={'distance': 'u_min_distance'}, inplace=True)
u6 = utmp.groupby(['User_id'], as_index=False).max()
u6.rename(columns={'distance': 'u_max_distance'}, inplace=True)
u7 = utmp.groupby(['User_id'], as_index=False).mean()
u7.rename(columns={'distance': 'u_mean_distance'}, inplace=True)
u8 = utmp.groupby(['User_id'], as_index=False).median()
u8.rename(columns={'distance': 'u_median_distance'}, inplace=True)
u8.head(2)


# In[35]:
u.shape, u1.shape, u2.shape, u3.shape, u4.shape, u5.shape, u6.shape, u7.shape, u8.shape


# In[36]:
# Merge all the user features on key User_id.
user_feature = pd.merge(u, u1, on='User_id', how='left')
user_feature = pd.merge(user_feature, u2, on='User_id', how='left')
user_feature = pd.merge(user_feature, u3, on='User_id', how='left')
user_feature = pd.merge(user_feature, u4, on='User_id', how='left')
user_feature = pd.merge(user_feature, u5, on='User_id', how='left')
user_feature = pd.merge(user_feature, u6, on='User_id', how='left')
user_feature = pd.merge(user_feature, u7, on='User_id', how='left')
user_feature = pd.merge(user_feature, u8, on='User_id', how='left')


# In[37]:
# Derived rates; fillna(0) covers users with no coupon/purchase history.
user_feature['u_use_coupon_rate'] = user_feature['u_buy_with_coupon'].astype('float') / user_feature['u_coupon_count'].astype('float')
user_feature['u_buy_with_coupon_rate'] = user_feature['u_buy_with_coupon'].astype('float') / user_feature['u_buy_count'].astype('float')
user_feature = user_feature.fillna(0)
user_feature.head(2)


# In[38]:
# Add user features to data on key User_id, then split into train/valid.
data2 = pd.merge(data, user_feature, on='User_id', how='left').fillna(0)
train, valid = train_test_split(data2, test_size=0.2, stratify=data2['label'], random_state=100)


# In[39]:
# Model 2: baseline features + user features.
predictors = original_feature + user_feature.columns.tolist()[1:]
print(len(predictors), predictors)

if not os.path.isfile('2_model.pkl'):
    model = check_model(train, predictors)
    print(model.best_score_)
    print(model.best_params_)
    with open('2_model.pkl', 'wb') as f:
        pickle.dump(model, f)
else:
    with open('2_model.pkl', 'rb') as f:
        model = pickle.load(f)


# In[40]:
# Validation-set performance (per-coupon average AUC).
valid['pred_prob'] = model.predict_proba(valid[predictors])[:, 1]
validgroup = valid.groupby(['Coupon_id'])
aucs = []
for _, tmpdf in validgroup:
    if len(tmpdf['label'].unique()) != 2:
        continue
    fpr, tpr, thresholds = roc_curve(tmpdf['label'], tmpdf['pred_prob'], pos_label=1)
    aucs.append(auc(fpr, tpr))
print(np.average(aucs))


# In[41]:
# Key of merchant.
m = fdf[['Merchant_id']].copy().drop_duplicates()


# In[42]:
# m_coupon_count: num of coupons issued by merchant.
m1 = fdf[fdf['Date_received'] != 'null'][['Merchant_id']].copy()
m1['m_coupon_count'] = 1
m1 = m1.groupby(['Merchant_id'], as_index=False).count()
m1.head(2)


# In[43]:
# m_sale_count: num of sales from merchant (with or without coupon).
m2 = fdf[fdf['Date'] != 'null'][['Merchant_id']].copy()
m2['m_sale_count'] = 1
m2 = m2.groupby(['Merchant_id'], as_index=False).count()
m2.head(2)


# In[44]:
# m_sale_with_coupon: num of sales from merchant with coupon usage.
m3 = fdf[(fdf['Date'] != 'null') & (fdf['Date_received'] != 'null')][['Merchant_id']].copy()
m3['m_sale_with_coupon'] = 1
m3 = m3.groupby(['Merchant_id'], as_index=False).count()
m3.head(2)


# In[45]:
# Distance statistics per merchant (coupon-redeemed purchases only).
mtmp = fdf[(fdf['Date'] != 'null') & (fdf['Date_received'] != 'null')][['Merchant_id', 'distance']].copy()
mtmp.replace(-1, np.nan, inplace=True)
m4 = mtmp.groupby(['Merchant_id'], as_index=False).min()
m4.rename(columns={'distance': 'm_min_distance'}, inplace=True)
m5 = mtmp.groupby(['Merchant_id'], as_index=False).max()
m5.rename(columns={'distance': 'm_max_distance'}, inplace=True)
m6 = mtmp.groupby(['Merchant_id'], as_index=False).mean()
m6.rename(columns={'distance': 'm_mean_distance'}, inplace=True)
m7 = mtmp.groupby(['Merchant_id'], as_index=False).median()
m7.rename(columns={'distance': 'm_median_distance'}, inplace=True)
m7.head(2)


# In[46]:
m.shape, m1.shape, m2.shape, m3.shape, m4.shape, m5.shape, m6.shape, m7.shape


# In[47]:
merchant_feature = pd.merge(m, m1, on='Merchant_id', how='left')
merchant_feature = pd.merge(merchant_feature, m2, on='Merchant_id', how='left')
merchant_feature = pd.merge(merchant_feature, m3, on='Merchant_id', how='left')
merchant_feature = pd.merge(merchant_feature, m4, on='Merchant_id', how='left')
merchant_feature = pd.merge(merchant_feature, m5, on='Merchant_id', how='left')
merchant_feature = pd.merge(merchant_feature, m6, on='Merchant_id', how='left')
merchant_feature = pd.merge(merchant_feature, m7, on='Merchant_id', how='left')
merchant_feature = merchant_feature.fillna(0)
merchant_feature.head(5)


# In[48]:
merchant_feature['m_coupon_use_rate'] = merchant_feature['m_sale_with_coupon'].astype('float') / merchant_feature['m_coupon_count'].astype('float')
merchant_feature['m_sale_with_coupon_rate'] = merchant_feature['m_sale_with_coupon'].astype('float') / merchant_feature['m_sale_count'].astype('float')
merchant_feature = merchant_feature.fillna(0)
merchant_feature.head(2)


# In[49]:
# Add merchant features to data2, then split into train/valid.
data3 = pd.merge(data2, merchant_feature, on='Merchant_id', how='left').fillna(0)
train, valid = train_test_split(data3, test_size=0.2, stratify=data3['label'], random_state=100)


# In[50]:
# Model 3: baseline + user + merchant features.
predictors = original_feature + user_feature.columns.tolist()[1:] + merchant_feature.columns.tolist()[1:]
print(predictors)

if not os.path.isfile('3_model.pkl'):
    model = check_model(train, predictors)
    print(model.best_score_)
    print(model.best_params_)
    with open('3_model.pkl', 'wb') as f:
        pickle.dump(model, f)
else:
    with open('3_model.pkl', 'rb') as f:
        model = pickle.load(f)


# In[56]:
valid['pred_prob'] = model.predict_proba(valid[predictors])[:, 1]
validgroup = valid.groupby(['Coupon_id'])
aucs = []
for _, tmpdf in validgroup:
    if len(tmpdf['label'].unique()) != 2:
        continue
    fpr, tpr, thresholds = roc_curve(tmpdf['label'], tmpdf['pred_prob'], pos_label=1)
    aucs.append(auc(fpr, tpr))
print(np.average(aucs))


# In[57]:
# Key of (user, merchant).
um = fdf[['User_id', 'Merchant_id']].copy().drop_duplicates()


# In[58]:
# NOTE(fix): this cell was textually corrupted in the original (um2's code
# was pasted into the middle of the um1 line); restored um1 computation.
# um_count: num of interactions between user and merchant.
um1 = fdf[['User_id', 'Merchant_id']].copy()
um1['um_count'] = 1
um1 = um1.groupby(['User_id', 'Merchant_id'], as_index=False).count()
um1.head(2)


# In[59]:
# um_buy_count: purchases by the user at the merchant.
um2 = fdf[fdf['Date'] != 'null'][['User_id', 'Merchant_id']].copy()
um2['um_buy_count'] = 1
um2 = um2.groupby(['User_id', 'Merchant_id'], as_index=False).count()
um2.head(2)


# In[60]:
# um_coupon_count: coupons received by the user from the merchant.
um3 = fdf[fdf['Date_received'] != 'null'][['User_id', 'Merchant_id']].copy()
um3['um_coupon_count'] = 1
um3 = um3.groupby(['User_id', 'Merchant_id'], as_index=False).count()
um3.head(2)


# In[61]:
# um_buy_with_coupon: coupon-redeemed purchases by the user at the merchant.
um4 = fdf[(fdf['Date_received'] != 'null') & (fdf['Date'] != 'null')][['User_id', 'Merchant_id']].copy()
um4['um_buy_with_coupon'] = 1
um4 = um4.groupby(['User_id', 'Merchant_id'], as_index=False).count()
um4.head(2)


# In[62]:
# Merge all user-merchant features and derive rates.
user_merchant_feature = pd.merge(um, um1, on=['User_id', 'Merchant_id'], how='left')
user_merchant_feature = pd.merge(user_merchant_feature, um2, on=['User_id', 'Merchant_id'], how='left')
user_merchant_feature = pd.merge(user_merchant_feature, um3, on=['User_id', 'Merchant_id'], how='left')
user_merchant_feature = pd.merge(user_merchant_feature, um4, on=['User_id', 'Merchant_id'], how='left')
user_merchant_feature = user_merchant_feature.fillna(0)
user_merchant_feature['um_buy_rate'] = user_merchant_feature['um_buy_count'].astype('float') / user_merchant_feature['um_count'].astype('float')
user_merchant_feature['um_coupon_use_rate'] = user_merchant_feature['um_buy_with_coupon'].astype('float') / user_merchant_feature['um_coupon_count'].astype('float')
user_merchant_feature['um_buy_with_coupon_rate'] = user_merchant_feature['um_buy_with_coupon'].astype('float') / user_merchant_feature['um_buy_count'].astype('float')
user_merchant_feature = user_merchant_feature.fillna(0)
user_merchant_feature.head(2)


# In[63]:
# Add user-merchant features to data3, then split into train/valid.
data4 = pd.merge(data3, user_merchant_feature, on=['User_id', 'Merchant_id'], how='left').fillna(0)
train, valid = train_test_split(data4, test_size=0.2, stratify=data4['label'], random_state=100)


# In[64]:
# Model 4: all feature groups.
predictors = original_feature + user_feature.columns.tolist()[1:] + merchant_feature.columns.tolist()[1:] + user_merchant_feature.columns.tolist()[2:]
print(len(predictors), predictors)

if not os.path.isfile('4_model.pkl'):
    model = check_model(train, predictors)
    print(model.best_score_)
    print(model.best_params_)
    with open('4_model.pkl', 'wb') as f:
        pickle.dump(model, f)
else:
    with open('4_model.pkl', 'rb') as f:
        model = pickle.load(f)


# In[65]:
valid['pred_prob'] = model.predict_proba(valid[predictors])[:, 1]
validgroup = valid.groupby(['Coupon_id'])
aucs = []
for _, tmpdf in validgroup:
    if len(tmpdf['label'].unique()) != 2:
        continue
    fpr, tpr, thresholds = roc_curve(tmpdf['label'], tmpdf['pred_prob'], pos_label=1)
    aucs.append(auc(fpr, tpr))
print(np.average(aucs))


# In[66]:
def userFeature(df):
    """Build per-user history features (counts, distances, usage rates)."""
    u = df[['User_id']].copy().drop_duplicates()

    # u_coupon_count: num of coupons received by user.
    u1 = df[df['Date_received'] != 'null'][['User_id']].copy()
    u1['u_coupon_count'] = 1
    u1 = u1.groupby(['User_id'], as_index=False).count()

    # u_buy_count: times user bought offline (with or without coupon).
    u2 = df[df['Date'] != 'null'][['User_id']].copy()
    u2['u_buy_count'] = 1
    u2 = u2.groupby(['User_id'], as_index=False).count()

    # u_buy_with_coupon: times user bought offline with a coupon.
    u3 = df[((df['Date'] != 'null') & (df['Date_received'] != 'null'))][['User_id']].copy()
    u3['u_buy_with_coupon'] = 1
    u3 = u3.groupby(['User_id'], as_index=False).count()

    # u_merchant_count: num of distinct merchants the user bought from.
    u4 = df[df['Date'] != 'null'][['User_id', 'Merchant_id']].copy()
    u4.drop_duplicates(inplace=True)
    u4 = u4.groupby(['User_id'], as_index=False).count()
    u4.rename(columns={'Merchant_id': 'u_merchant_count'}, inplace=True)

    # Distance statistics (missing distance -1 -> NaN, excluded from stats).
    utmp = df[(df['Date'] != 'null') & (df['Date_received'] != 'null')][['User_id', 'distance']].copy()
    utmp.replace(-1, np.nan, inplace=True)
    u5 = utmp.groupby(['User_id'], as_index=False).min()
    u5.rename(columns={'distance': 'u_min_distance'}, inplace=True)
    u6 = utmp.groupby(['User_id'], as_index=False).max()
    u6.rename(columns={'distance': 'u_max_distance'}, inplace=True)
    u7 = utmp.groupby(['User_id'], as_index=False).mean()
    u7.rename(columns={'distance': 'u_mean_distance'}, inplace=True)
    u8 = utmp.groupby(['User_id'], as_index=False).median()
    u8.rename(columns={'distance': 'u_median_distance'}, inplace=True)

    user_feature = pd.merge(u, u1, on='User_id', how='left')
    user_feature = pd.merge(user_feature, u2, on='User_id', how='left')
    user_feature = pd.merge(user_feature, u3, on='User_id', how='left')
    user_feature = pd.merge(user_feature, u4, on='User_id', how='left')
    user_feature = pd.merge(user_feature, u5, on='User_id', how='left')
    user_feature = pd.merge(user_feature, u6, on='User_id', how='left')
    user_feature = pd.merge(user_feature, u7, on='User_id', how='left')
    user_feature = pd.merge(user_feature, u8, on='User_id', how='left')
    user_feature['u_use_coupon_rate'] = user_feature['u_buy_with_coupon'].astype('float') / user_feature['u_coupon_count'].astype('float')
    user_feature['u_buy_with_coupon_rate'] = user_feature['u_buy_with_coupon'].astype('float') / user_feature['u_buy_count'].astype('float')
    user_feature = user_feature.fillna(0)
    print(user_feature.columns.tolist())
    return user_feature


# In[67]:
def merchantFeature(df):
    """Build per-merchant history features (counts, distances, usage rates)."""
    m = df[['Merchant_id']].copy().drop_duplicates()

    # m_coupon_count: num of coupons issued by merchant.
    m1 = df[df['Date_received'] != 'null'][['Merchant_id']].copy()
    m1['m_coupon_count'] = 1
    m1 = m1.groupby(['Merchant_id'], as_index=False).count()

    # m_sale_count: num of sales from merchant (with or without coupon).
    m2 = df[df['Date'] != 'null'][['Merchant_id']].copy()
    m2['m_sale_count'] = 1
    m2 = m2.groupby(['Merchant_id'], as_index=False).count()

    # m_sale_with_coupon: num of sales from merchant with coupon usage.
    m3 = df[(df['Date'] != 'null') & (df['Date_received'] != 'null')][['Merchant_id']].copy()
    m3['m_sale_with_coupon'] = 1
    m3 = m3.groupby(['Merchant_id'], as_index=False).count()

    # Distance statistics (missing distance -1 -> NaN, excluded from stats).
    mtmp = df[(df['Date'] != 'null') & (df['Date_received'] != 'null')][['Merchant_id', 'distance']].copy()
    mtmp.replace(-1, np.nan, inplace=True)
    m4 = mtmp.groupby(['Merchant_id'], as_index=False).min()
    m4.rename(columns={'distance': 'm_min_distance'}, inplace=True)
    m5 = mtmp.groupby(['Merchant_id'], as_index=False).max()
    m5.rename(columns={'distance': 'm_max_distance'}, inplace=True)
    m6 = mtmp.groupby(['Merchant_id'], as_index=False).mean()
    m6.rename(columns={'distance': 'm_mean_distance'}, inplace=True)
    m7 = mtmp.groupby(['Merchant_id'], as_index=False).median()
    m7.rename(columns={'distance': 'm_median_distance'}, inplace=True)

    merchant_feature = pd.merge(m, m1, on='Merchant_id', how='left')
    merchant_feature = pd.merge(merchant_feature, m2, on='Merchant_id', how='left')
    merchant_feature = pd.merge(merchant_feature, m3, on='Merchant_id', how='left')
    merchant_feature = pd.merge(merchant_feature, m4, on='Merchant_id', how='left')
    merchant_feature = pd.merge(merchant_feature, m5, on='Merchant_id', how='left')
    merchant_feature = pd.merge(merchant_feature, m6, on='Merchant_id', how='left')
    merchant_feature = pd.merge(merchant_feature, m7, on='Merchant_id', how='left')
    merchant_feature['m_coupon_use_rate'] = merchant_feature['m_sale_with_coupon'].astype('float') / merchant_feature['m_coupon_count'].astype('float')
    merchant_feature['m_sale_with_coupon_rate'] = merchant_feature['m_sale_with_coupon'].astype('float') / merchant_feature['m_sale_count'].astype('float')
    merchant_feature = merchant_feature.fillna(0)
    print(merchant_feature.columns.tolist())
    return merchant_feature


# In[68]:
def usermerchantFeature(df):
    """Build per-(user, merchant) interaction features."""
    um = df[['User_id', 'Merchant_id']].copy().drop_duplicates()

    um1 = df[['User_id', 'Merchant_id']].copy()
    um1['um_count'] = 1
    um1 = um1.groupby(['User_id', 'Merchant_id'], as_index=False).count()

    um2 = df[df['Date'] != 'null'][['User_id', 'Merchant_id']].copy()
    um2['um_buy_count'] = 1
    um2 = um2.groupby(['User_id', 'Merchant_id'], as_index=False).count()

    um3 = df[df['Date_received'] != 'null'][['User_id', 'Merchant_id']].copy()
    um3['um_coupon_count'] = 1
    um3 = um3.groupby(['User_id', 'Merchant_id'], as_index=False).count()

    um4 = df[(df['Date_received'] != 'null') & (df['Date'] != 'null')][['User_id', 'Merchant_id']].copy()
    um4['um_buy_with_coupon'] = 1
    um4 = um4.groupby(['User_id', 'Merchant_id'], as_index=False).count()

    user_merchant_feature = pd.merge(um, um1, on=['User_id', 'Merchant_id'], how='left')
    user_merchant_feature = pd.merge(user_merchant_feature, um2, on=['User_id', 'Merchant_id'], how='left')
    user_merchant_feature = pd.merge(user_merchant_feature, um3, on=['User_id', 'Merchant_id'], how='left')
    user_merchant_feature = pd.merge(user_merchant_feature, um4, on=['User_id', 'Merchant_id'], how='left')
    user_merchant_feature = user_merchant_feature.fillna(0)
    user_merchant_feature['um_buy_rate'] = user_merchant_feature['um_buy_count'].astype('float') / user_merchant_feature['um_count'].astype('float')
    user_merchant_feature['um_coupon_use_rate'] = user_merchant_feature['um_buy_with_coupon'].astype('float') / user_merchant_feature['um_coupon_count'].astype('float')
    user_merchant_feature['um_buy_with_coupon_rate'] = user_merchant_feature['um_buy_with_coupon'].astype('float') / user_merchant_feature['um_buy_count'].astype('float')
    user_merchant_feature = user_merchant_feature.fillna(0)
    print(user_merchant_feature.columns.tolist())
    return user_merchant_feature


# In[69]:
def featureProcess(feature, train, test):
    """
    Feature engineering from the feature-window data, then attach user,
    merchant, and user-merchant features to both train and test frames.
    """
    user_feature = userFeature(feature)
    merchant_feature = merchantFeature(feature)
    user_merchant_feature = usermerchantFeature(feature)

    train = pd.merge(train, user_feature, on='User_id', how='left')
    train = pd.merge(train, merchant_feature, on='Merchant_id', how='left')
    train = pd.merge(train, user_merchant_feature, on=['User_id', 'Merchant_id'], how='left')
    train = train.fillna(0)

    test = pd.merge(test, user_feature, on='User_id', how='left')
    test = pd.merge(test, merchant_feature, on='Merchant_id', how='left')
    test = pd.merge(test, user_merchant_feature, on=['User_id', 'Merchant_id'], how='left')
    test = test.fillna(0)
    return train, test


# In[70]:
# Repeat the result above and process the dftest data.
feature = dfoff[(dfoff['Date'] < '20160516') | ((dfoff['Date'] == 'null') & (dfoff['Date_received'] < '20160516'))].copy()
data = dfoff[(dfoff['Date_received'] >= '20160516') & (dfoff['Date_received'] <= '20160615')].copy()
print(data['label'].value_counts())
# NOTE(fix): the original never invoked featureProcess here, so the `train`
# used in In[72] was a stale frame from In[63] and the `test` used in
# In[77] was undefined.
train, test = featureProcess(feature, data, dftest)


# In[71]:
# Final feature list.
predictors = ['discount_rate', 'discount_man', 'discount_jian', 'discount_type', 'distance',
              'weekday', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5',
              'weekday_6', 'weekday_7', 'weekday_type',
              'u_coupon_count', 'u_buy_count', 'u_buy_with_coupon', 'u_merchant_count',
              'u_min_distance', 'u_max_distance', 'u_mean_distance', 'u_median_distance',
              'u_use_coupon_rate', 'u_buy_with_coupon_rate',
              'm_coupon_count', 'm_sale_count', 'm_sale_with_coupon', 'm_min_distance',
              'm_max_distance', 'm_mean_distance', 'm_median_distance', 'm_coupon_use_rate',
              'm_sale_with_coupon_rate',
              'um_count', 'um_buy_count', 'um_coupon_count', 'um_buy_with_coupon',
              'um_buy_rate', 'um_coupon_use_rate', 'um_buy_with_coupon_rate']
print(len(predictors), predictors)


# In[72]:
trainSub, validSub = train_test_split(train, test_size=0.2, stratify=train['label'], random_state=100)


# In[73]:
# Linear model.
model = check_model(trainSub, predictors)


# In[74]:
validSub['pred_prob'] = model.predict_proba(validSub[predictors])[:, 1]
validgroup = validSub.groupby(['Coupon_id'])
aucs = []
for _, tmpdf in validgroup:
    if len(tmpdf['label'].unique()) != 2:
        continue
    fpr, tpr, thresholds = roc_curve(tmpdf['label'], tmpdf['pred_prob'], pos_label=1)
    aucs.append(auc(fpr, tpr))
print(np.average(aucs))


# In[75]:
# NOTE(fix): the original passed `early_stop=50` (not a valid constructor
# argument — early stopping belongs in fit() with an eval_set), the alias
# `sub_feature` duplicating `colsample_bytree`, and metric 'logloss'
# (LightGBM's binary log-loss metric is named 'binary_logloss').
model = lgb.LGBMClassifier(
    learning_rate=0.01,
    boosting_type='gbdt',
    objective='binary',
    metric='binary_logloss',
    max_depth=5,
    num_leaves=3,
    colsample_bytree=0.7,
    n_estimators=5000,
    verbose=-1)
model.fit(trainSub[predictors], trainSub['label'])


# In[76]:
validSub['pred_prob'] = model.predict_proba(validSub[predictors])[:, 1]
validgroup = validSub.groupby(['Coupon_id'])
aucs = []
for _, tmpdf in validgroup:
    if len(tmpdf['label'].unique()) != 2:
        continue
    fpr, tpr, thresholds = roc_curve(tmpdf['label'], tmpdf['pred_prob'], pos_label=1)
    aucs.append(auc(fpr, tpr))
print(np.average(aucs))


# In[77]:
# Test prediction for submission.
y_test_pred = model.predict_proba(test[predictors])
submit = test[['User_id', 'Coupon_id', 'Date_received']].copy()
submit['label'] = y_test_pred[:, 1]
submit.to_csv('submit2.csv', index=False, header=False)
submit.head()

4.实战-《第1章 O2O优惠券使用新人赛数据挖掘工程》

注:原来的 path = 'datalab/59/' 应改成 path = 'G:/code/o2o/',注意斜杠方向。

# coding: utf-8
# Exploratory data analysis of the O2O offline training set: activity
# distributions per user / merchant / coupon, discount rates, distances,
# and coupon reception vs. usage over time.

# In[1]:
import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
import matplotlib
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")
get_ipython().magic('matplotlib inline')


# In[2]:
print(plt.style.available)
print('pandas version:', pd.__version__)
print('seaborn version:', sns.__version__)
print('matplotlib version:', matplotlib.__version__)


# In[7]:
path = 'G:/code/o2o/'


# In[8]:
train = pd.read_csv(path + 'ccf_offline_stage1_train.csv')


# In[9]:
train.head()


# In[10]:
train.info()


# In[11]:
# Fraction of missing values per column.
train.isnull().sum() / len(train)


# In[12]:
len(set(train['User_id']))


# In[13]:
# Constant helper column so groupby().agg(np.sum) yields record counts.
train['sum'] = 1
user_id_count = train.groupby(['User_id'], as_index=False)['sum'].agg({'count': np.sum})


# In[14]:
user_id_count.sort_values(['count'], ascending=0)[:20]


# In[15]:
def user_count(data):
    """Bucket a user's record count into activity tiers 0..3."""
    if data > 10:
        return 3
    elif data > 5:
        return 2
    elif data > 1:
        return 1
    else:
        return 0


# In[16]:
user_id_count['user_range'] = user_id_count['count'].map(user_count)


# In[17]:
sns.set(font_scale=1.2, style="white")
f, ax = plt.subplots(1, 2, figsize=(17.5, 8))
user_id_count['user_range'].value_counts().plot.pie(explode=[0, 0.1, 0.1, 0.1], autopct='%1.1f%%', ax=ax[0])
plt.ylim([0, 300000])
ax[0].set_title('user_range_ratio')
ax[0].set_ylabel('')
sns.despine()
sns.countplot('user_range', data=user_id_count, ax=ax[1])
plt.show()


# In[18]:
Mer_id_count = train.groupby(['Merchant_id'], as_index=False)['sum'].agg({'count': np.sum})


# In[19]:
Mer_id_count.sort_values(['count'], ascending=0)[:5]


# In[20]:
def Mer_count(data):
    """Bucket a merchant's record count into activity tiers 0..3."""
    if data > 1000:
        return 3
    elif data > 100:
        return 2
    elif data > 20:
        return 1
    else:
        return 0


# In[21]:
Mer_id_count['mer_range'] = Mer_id_count['count'].map(Mer_count)


# In[22]:
sns.set(font_scale=1.2, style="white")
f, ax = plt.subplots(1, 2, figsize=(17.5, 8))
Mer_id_count['mer_range'].value_counts().plot.pie(explode=[0, 0.1, 0.1, 0.1], autopct='%1.1f%%', ax=ax[0])
plt.ylim([0, 6000])
ax[0].set_title('Mer_range_ratio')
ax[0].set_ylabel('')
sns.despine()
sns.countplot('mer_range', data=Mer_id_count, ax=ax[1])
plt.show()


# In[23]:
print('有优惠券的数据为', train[(train['Coupon_id'] != 'null')].shape[0])
print('无优惠券的数据为', train[(train['Coupon_id'] == 'null')].shape[0])


# In[24]:
train1 = train[(train['Coupon_id'] != 'null')]
Cou_id_count = train1.groupby(['Coupon_id'], as_index=False)['sum'].agg({'count': np.sum})


# In[25]:
Cou_id_count.sort_values(['count'], ascending=0)[:5]


# In[26]:
def Cou_count(data):
    """Bucket a coupon's issue count into tiers 0..3."""
    if data > 1000:
        return 3
    elif data > 100:
        return 2
    elif data > 10:
        return 1
    else:
        return 0


# In[27]:
Cou_id_count['Cou_range'] = Cou_id_count['count'].map(Cou_count)


# In[28]:
sns.set(font_scale=1.2, style="white")
f, ax = plt.subplots(1, 2, figsize=(17.5, 8))
Cou_id_count['Cou_range'].value_counts().plot.pie(explode=[0, 0.1, 0.1, 0.1], autopct='%1.1f%%', ax=ax[0])
plt.ylim([0, 7000])
ax[0].set_title('Cou_range_ratio')
ax[0].set_ylabel('')
sns.despine()
sns.countplot('Cou_range', data=Cou_id_count, ax=ax[1])
plt.show()


# In[29]:
def convertRate(row):
    """Convert a discount string to an effective rate, rounded to 2 places."""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return np.round(1.0 - float(rows[1]) / float(rows[0]), 2)
    else:
        return float(row)


# In[30]:
train['discount_rate'] = train['Discount_rate'].apply(convertRate)


# In[31]:
print('Discount_rate 类型:', train['discount_rate'].unique())


# In[32]:
sns.set(style="white")
sns.factorplot(x="discount_rate", data=train, kind="count", size=6, aspect=2, color='#ad9ee8')


# In[33]:
sns.set(style="white")
sns.factorplot(x="Distance", data=train, kind="count", size=6, aspect=2)


# In[34]:
# Coupons received per day, and coupons received-and-used per day.
couponbydate = train[train['Date_received'] != 'null'][['Date_received', 'Date']].groupby(['Date_received'], as_index=False).count()
couponbydate.columns = ['Date_received', 'count']
buybydate = train[(train['Date'] != 'null') & (train['Date_received'] != 'null')][['Date_received', 'Date']].groupby(['Date_received'], as_index=False).count()
buybydate.columns = ['Date_received', 'count']


# In[35]:
date_buy = train['Date'].unique()
date_buy = sorted(date_buy[date_buy != 'null'])
date_received = train['Date_received'].unique()
date_received = sorted(date_received[date_received != 'null'])

sns.set_style('ticks')
sns.set_context("notebook", font_scale=1.4)
plt.figure(figsize=(12, 8))
date_received_dt = pd.to_datetime(date_received, format='%Y%m%d')

plt.subplot(211)
plt.bar(date_received_dt, couponbydate['count'], label='number of coupon received', color='#a675a1')
plt.bar(date_received_dt, buybydate['count'], label='number of coupon used', color='#75a1a6')
plt.yscale('log')
plt.ylabel('Count')
plt.legend()

plt.subplot(212)
plt.bar(date_received_dt, buybydate['count'] / couponbydate['count'], color='#62a5de')
plt.ylabel('Ratio(coupon used/coupon received)')
plt.tight_layout()


# In[36]:
# Interactions per (user, merchant) pair.
um_count = train.groupby(['User_id', 'Merchant_id'], as_index=False)['sum'].agg({'count': np.sum})


# In[37]:
um_count.head(5)


# In[38]:
train1 = train[train['Coupon_id'] != 'null']


# In[39]:
# Coupons per (user, coupon) pair.
uc_count = train1.groupby(['User_id', 'Coupon_id'], as_index=False)['sum'].agg({'count': np.sum})


# In[40]:
uc_count.head(5)


# In[41]:
# NOTE(fix): removed a duplicated `train['discount_rate'] = ...` apply that
# recomputed the identical column already created in In[30].


# In[42]:
# Records per (user, discount rate) pair.
ur_count = train.groupby(['User_id', 'discount_rate'], as_index=False)['sum'].agg({'count': np.sum})


# In[43]:
ur_count.head(5)


# In[44]:
# Records per (user, distance) pair.
# NOTE(fix): the original computed this identical groupby twice (In[44] and
# In[45]); the duplicate was removed.
ud_count = train.groupby(['User_id', 'Distance'], as_index=False)['sum'].agg({'count': np.sum})


# In[46]:
from datetime import date

def getWeekday(row):
    """Map 'yyyymmdd' string to weekday 1..7; pass 'null' through."""
    if row == 'null':
        return row
    else:
        return date(int(row[0:4]), int(row[4:6]), int(row[6:8])).weekday() + 1


# In[47]:
train['received_weekday'] = train['Date_received'].astype(str).apply(getWeekday)
train['consume_weekday'] = train['Date'].astype(str).apply(getWeekday)
# weekday_type: 1 for Saturday/Sunday, 0 otherwise.
train['received_weekday_type'] = train['received_weekday'].apply(lambda x: 1 if x in [6, 7] else 0)
train['consume_weekday_type'] = train['consume_weekday'].apply(lambda x : 1 if x in [6,7] else 0 ) # In[48]: u_week_count = train.groupby(['User_id','received_weekday'], as_index = False)['sum'].agg({'count':np.sum}) u_week_type_count = train.groupby(['User_id','received_weekday_type'], as_index = False)['sum'].agg({'count':np.sum}) # In[49]: u_week_count.head(5) # In[50]: train1 = train[train['Date'] != 'null'] u_con_week_count = train1.groupby(['User_id','consume_weekday'], as_index = False)['sum'].agg({'count':np.sum}) u_con_week_type_count = train1.groupby(['User_id','consume_weekday_type'], as_index = False)['sum'].agg({'count':np.sum}) # In[51]: u_con_week_count.head(10) 

5.学习《天池o2o参考代码(0.81分数)》、《天池O2O优惠券代码分享》和《第1章 新手代码【视频直播】:O2O优惠券使用预测赛实战演练》的代码,学习了数据评价标准,以及利用滑窗法、pandas 内置函数提取特征等方法来优化预测准确率。

《天池o2o参考代码(0.81分数)》代码参考如下:

def get_offline_features(X, offline):
    """Merge engineered offline features onto the sample frame X.

    Repaired from a web scrape: restored indentation, replaced smart quotes
    and en-dashes with valid Python tokens, and added ``.reset_index()`` after
    every ``groupby(...).size()/agg`` whose result keeps the merge key in the
    index — ``pd.merge(..., on=key)`` requires the key to be a column.

    Parameters
    ----------
    X : pandas.DataFrame
        Label-window samples with User_id, Merchant_id, Coupon_id,
        Discount_rate, Distance and a datetime64 Date_received column.
    offline : pandas.DataFrame
        Feature-window offline records. NOTE(review): assumes Date and
        Date_received are already parsed to datetime64 and that a numeric
        'discount' column (converted discount rate) exists — confirm against
        the preprocessing step that calls this.

    Returns
    -------
    pandas.DataFrame
        X with user (u*), merchant (m*), coupon (c*), user-merchant (um*)
        and label-window 'other' (o*) feature columns attached.
    """
    print(X.shape)

    # records that carry a coupon
    coupon_id_notnull = offline[offline.Coupon_id.notnull()]

    temp = coupon_id_notnull
    coupon_consume = temp[temp.Date.notnull()]      # coupon received and redeemed
    coupon_no_consume = temp[temp.Date.isnull()]    # coupon received, never used

    user_coupon_consume = coupon_consume.groupby('User_id')

    X['weekday'] = X.Date_received.dt.weekday
    X['day'] = X.Date_received.dt.day

    # redemptions at this user-merchant distance
    temp = coupon_consume.groupby('Distance').size().to_frame('distance_0').reset_index()
    X = pd.merge(X, temp, how='left', on='Distance')

    # unredeemed coupons at this distance
    temp = coupon_no_consume.groupby('Distance').size().to_frame('distance_1').reset_index()
    X = pd.merge(X, temp, how='left', on='Distance')

    # coupons received at this distance
    X['distance_2'] = X.distance_0 + X.distance_1

    # redemption rate at this distance
    X['distance_3'] = X.distance_0 / X.distance_2

    '''user features'''

    # u2: user's coupon redemptions
    temp = user_coupon_consume.size().to_frame('u2').reset_index()
    X = pd.merge(X, temp, how='left', on='User_id')

    # u3: user's unredeemed coupons
    temp = coupon_no_consume.groupby('User_id').size().to_frame('u3').reset_index()
    X = pd.merge(X, temp, how='left', on='User_id')

    # u19: redeemed / unredeemed ratio
    X['u19'] = X.u2 / X.u3

    # u1: coupons received by the user
    X['u1'] = X.u2.fillna(0) + X.u3.fillna(0)

    # u4: user's coupon redemption rate
    X['u4'] = X.u2 / X.u1

    # u5: user's plain (no-coupon) purchases
    date_received_isnull_date_notnull = offline[offline.Date_received.isnull() & offline.Date.notnull()]
    temp = date_received_isnull_date_notnull.groupby('User_id').size().to_frame('u5').reset_index()
    X = pd.merge(X, temp, how='left', on='User_id')

    # u25: total purchases (coupon + plain)
    X['u25'] = X.u2.fillna(0) + X.u5.fillna(0)

    # u20: share of purchases made with a coupon
    X['u20'] = X.u2 / X.u25

    # u26: mean gap in days between the user's purchases (any kind)
    temp = offline[offline.Date.notnull()].copy()
    temp['_max'] = temp.groupby('User_id').Date.transform('max')
    temp['_min'] = temp.groupby('User_id').Date.transform('min')
    temp['_len'] = temp.groupby('User_id').Date.transform('count')
    temp['u26'] = (temp._max - temp._min).dt.days / (temp._len - 1)
    temp.drop_duplicates('User_id', inplace=True)
    X = pd.merge(X, temp[['User_id', 'u26']], how='left', on='User_id')

    # u6: mean gap between plain purchases
    temp = date_received_isnull_date_notnull.copy()
    tmp = temp.groupby('User_id').Date
    temp['_max'] = tmp.transform('max')
    temp['_min'] = tmp.transform('min')
    temp['_count'] = tmp.transform('count')
    temp['u6'] = (temp._max - temp._min).dt.days / (temp._count - 1)
    temp.drop_duplicates('User_id', inplace=True)
    X = pd.merge(X, temp[['User_id', 'u6']], how='left', on='User_id')

    # u7: mean gap between coupon redemptions
    temp = coupon_consume.copy()
    temp1 = temp.groupby('User_id').Date
    temp['_max'] = temp1.transform('max')
    temp['_min'] = temp1.transform('min')
    temp['_count'] = temp1.transform('count')
    temp['u7'] = (temp._max - temp._min).dt.days / (temp._count - 1)
    temp.drop_duplicates('User_id', inplace=True)
    X = pd.merge(X, temp[['User_id', 'u7']], how='left', on='User_id')

    # u8/u9: the gaps above expressed in 15-day units
    X['u8'] = X.u6 / 15
    X['u9'] = X.u7 / 15

    # u10: mean days from receiving a coupon to using it
    temp = coupon_consume.copy()
    temp['days'] = (temp.Date - temp.Date_received).dt.days
    temp = (temp.groupby('User_id').days.sum() / temp.groupby('User_id').size()).to_frame('u10').reset_index()
    X = pd.merge(X, temp, how='left', on='User_id')

    # u11: u10 in 15-day units
    X['u11'] = X.u10 / 15

    # u21: redemptions within 15 days of receipt
    temp = coupon_consume.copy()
    temp['days'] = (temp.Date - temp.Date_received).dt.days
    temp = temp[temp.days <= 15]
    temp = temp.groupby('User_id').size().to_frame('u21').reset_index()
    X = pd.merge(X, temp, how='left', on='User_id')

    # u22/u23/u24: 15-day redemptions over redeemed / unredeemed / received
    X['u22'] = X.u21 / X.u2
    X['u23'] = X.u21 / X.u3
    X['u24'] = X.u21 / X.u1

    # u45: mean discount rate of the user's redeemed coupons
    temp = user_coupon_consume.discount.mean().to_frame('u45').reset_index()
    X = pd.merge(X, temp, how='left', on='User_id')

    # u27: lowest redeemed discount rate
    temp = user_coupon_consume.discount.min().to_frame('u27').reset_index()
    X = pd.merge(X, temp, how='left', on='User_id')

    # u28: highest redeemed discount rate
    temp = user_coupon_consume.discount.max().to_frame('u28').reset_index()
    X = pd.merge(X, temp, how='left', on='User_id')

    # u32: number of distinct coupons the user redeemed
    temp = coupon_consume.groupby(['User_id', 'Coupon_id']).size()
    temp = temp.groupby(level='User_id').size().to_frame('u32').reset_index()
    X = pd.merge(X, temp, how='left', on='User_id')

    # u47: receipts per (user, coupon) pair
    date_received_notnull = offline[offline.Date_received.notnull()]
    temp = date_received_notnull.groupby(['User_id', 'Coupon_id']).size().to_frame('u47').reset_index()
    X = pd.merge(X, temp, how='left', on=['User_id', 'Coupon_id'])

    # u33: distinct redeemed coupons over receipts of this coupon
    X['u33'] = X.u32 / X.u47

    # u34: average redemptions per coupon kind
    X['u34'] = X.u2 / X.u47

    # u35: mean user-merchant distance over redeemed coupons
    temp = offline[offline.Date_received.notnull() & offline.Date.notnull()].copy()
    temp1 = temp.groupby('User_id').Distance
    temp['_count'] = temp1.transform('count')
    temp['_sum'] = temp1.transform('sum')
    temp['u35'] = temp._sum / temp._count
    temp = temp[['User_id', 'u35']].copy()
    temp.drop_duplicates(inplace=True)
    X = pd.merge(X, temp, how='left', on='User_id')

    # u36: min user-merchant distance over redeemed coupons
    temp = coupon_consume[coupon_consume.Distance.notnull()]
    temp = temp.groupby('User_id').Distance.min().to_frame('u36').reset_index()
    X = pd.merge(X, temp, how='left', on='User_id')

    # u37: max user-merchant distance over redeemed coupons
    temp = coupon_consume[coupon_consume.Distance.notnull()]
    temp = temp.groupby('User_id').Distance.max().to_frame('u37').reset_index()
    X = pd.merge(X, temp, how='left', on='User_id')

    # discount_type: index of the raw Discount_rate string in a fixed list
    discount_types = [
        '0.2', '0.5', '0.6', '0.7', '0.75', '0.8', '0.85', '0.9', '0.95', '30:20', '50:30', '10:5',
        '20:10', '100:50', '200:100', '50:20', '30:10', '150:50', '100:30', '20:5', '200:50', '5:1',
        '50:10', '100:20', '150:30', '30:5', '300:50', '200:30', '150:20', '10:1', '50:5', '100:10',
        '200:20', '300:30', '150:10', '300:20', '500:30', '20:1', '100:5', '200:10', '30:1', '150:5',
        '300:10', '200:5', '50:1', '100:1',
    ]
    X['discount_type'] = -1
    for k, v in enumerate(discount_types):
        X.loc[X.Discount_rate == v, 'discount_type'] = k

    # u41: receipts per (user, discount kind)
    temp = offline.groupby(['User_id', 'Discount_rate']).size().to_frame('u41').reset_index()
    X = pd.merge(X, temp, how='left', on=['User_id', 'Discount_rate'])

    # u42: redemptions per (user, discount kind)
    temp = coupon_consume.groupby(['User_id', 'Discount_rate']).size().to_frame('u42').reset_index()
    X = pd.merge(X, temp, how='left', on=['User_id', 'Discount_rate'])

    # u43: non-redemptions per (user, discount kind)
    temp = coupon_no_consume.groupby(['User_id', 'Discount_rate']).size().to_frame('u43').reset_index()
    X = pd.merge(X, temp, how='left', on=['User_id', 'Discount_rate'])

    # u44: redemption rate per discount kind
    X['u44'] = X.u42 / X.u41

    # u48: full-reduction ('x:y') coupons received by the user
    temp = offline[offline.Discount_rate.str.contains(':') == True]
    temp = temp.groupby('User_id').size().to_frame('u48').reset_index()
    X = pd.merge(X, temp, how='left', on='User_id')

    # u49: direct-discount ('0.x') coupons received by the user
    temp = offline[offline.Discount_rate.str.contains(r'\.') == True]
    temp = temp.groupby('User_id').size().to_frame('u49').reset_index()
    X = pd.merge(X, temp, how='left', on='User_id')

    '''offline merchant features'''

    # m0: merchant's sales count
    temp = offline[offline.Date.notnull()].groupby('Merchant_id').size().to_frame('m0').reset_index()
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # m1: merchant's coupon redemptions
    temp = coupon_consume.groupby('Merchant_id').size().to_frame('m1').reset_index()
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # m2: merchant's plain sales
    X['m2'] = X.m0.fillna(0) - X.m1.fillna(0)

    # m3: merchant's coupons received
    temp = date_received_notnull.groupby('Merchant_id').size().to_frame('m3').reset_index()
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # m4: merchant's coupon redemption rate
    X['m4'] = X.m1 / X.m3

    # m7: merchant's unredeemed coupons
    temp = coupon_no_consume.groupby('Merchant_id').size().to_frame('m7').reset_index()
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # m16: merchant's total appearances in the data
    temp = offline.groupby('Merchant_id').size().to_frame('m16').reset_index()
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # m5: coupons received at this merchant on this day
    temp = X[X.Date_received.notnull()]
    temp = temp.groupby(['Merchant_id', 'Date_received']).size().to_frame('m5').reset_index()
    X = pd.merge(X, temp, how='left', on=['Merchant_id', 'Date_received'])

    # m6: distinct users receiving coupons at this merchant on this day
    temp = X[X.Date_received.notnull()]
    temp = temp.groupby(['User_id', 'Merchant_id', 'Date_received']).size()
    temp = temp.groupby(level=['Merchant_id', 'Date_received']).size().to_frame('m6').reset_index()
    X = pd.merge(X, temp, how='left', on=['Merchant_id', 'Date_received'])

    # m8: mean discount rate of coupons redeemed at the merchant
    temp = coupon_consume.groupby('Merchant_id').discount.mean().to_frame('m8').reset_index()
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # m9/m10: max / min redeemed discount rate at the merchant
    # (the original Chinese comments had min/max swapped; the code's behaviour is kept)
    temp = coupon_consume.groupby('Merchant_id').discount.max().to_frame('m9').reset_index()
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    temp = coupon_consume.groupby('Merchant_id').discount.min().to_frame('m10').reset_index()
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # m11: distinct users who redeemed at the merchant
    temp = coupon_consume.groupby(['Merchant_id', 'User_id']).size()
    temp = temp.groupby(level='Merchant_id').size().to_frame('m11').reset_index()
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # m12: distinct users who received coupons from the merchant
    temp = date_received_notnull.groupby(['Merchant_id', 'User_id']).size()
    temp = temp.groupby(level='Merchant_id').size().to_frame('m12').reset_index()
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # m13: redeeming users over receiving users
    X['m13'] = X.m11 / X.m12

    # m14: average redemptions per receiving user
    X['m14'] = X.m1 / X.m12

    # m15: distinct coupons redeemed at the merchant
    temp = coupon_consume.groupby(['Merchant_id', 'Coupon_id']).size()
    temp = temp.groupby(level='Merchant_id').size().to_frame('m15').reset_index()
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # m18: distinct coupons issued by the merchant
    temp = date_received_notnull.groupby(['Merchant_id', 'Coupon_id']).size()
    temp = temp.groupby(level='Merchant_id').count().to_frame('m18').reset_index()
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # m19: redeemed coupon kinds over issued coupon kinds
    X['m19'] = X.m15 / X.m18

    # m20: mean gap in days between redemptions at the merchant
    temp = coupon_consume.copy()
    temp1 = temp.groupby('Merchant_id').Date
    temp['_max'] = temp1.transform('max')
    temp['_min'] = temp1.transform('min')
    temp['_count'] = temp1.transform('count')
    temp['m20'] = (temp._max - temp._min).dt.days / (temp._count - 1)
    temp.drop_duplicates('Merchant_id', inplace=True)
    X = pd.merge(X, temp[['Merchant_id', 'm20']], how='left', on='Merchant_id')

    # m21: mean user distance over the merchant's redeemed coupons
    temp = coupon_consume[coupon_consume.Distance.notnull()].copy()
    temp1 = temp.groupby('Merchant_id').Distance
    temp['_count'] = temp1.transform('count')
    temp['_sum'] = temp1.transform('sum')
    temp['m21'] = temp._sum / temp._count
    temp = temp[['Merchant_id', 'm21']].copy()
    temp.drop_duplicates(inplace=True)
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # m22: min user distance over redeemed coupons
    temp = coupon_consume[coupon_consume.Distance.notnull()]
    temp = temp.groupby('Merchant_id').Distance.min().to_frame('m22').reset_index()
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # m23: max user distance over redeemed coupons
    temp = coupon_consume[coupon_consume.Distance.notnull()]
    temp = temp.groupby('Merchant_id').Distance.max().to_frame('m23').reset_index()
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # m24: mean user distance over plain purchases at the merchant
    temp = offline[offline.Date_received.isnull() & offline.Date.notnull()].copy()
    temp1 = temp.groupby('Merchant_id').Distance
    temp['_count'] = temp1.transform('count')
    temp['_sum'] = temp1.transform('sum')
    temp['m24'] = temp._sum / temp._count
    temp = temp[['Merchant_id', 'm24']].copy()
    temp.drop_duplicates(inplace=True)
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    """offline coupon features"""

    # c1: receipts per coupon
    temp = coupon_id_notnull.groupby('Coupon_id').size().to_frame('c1').reset_index()
    X = pd.merge(X, temp, how='left', on='Coupon_id')

    # c2: redemptions per coupon
    temp = coupon_consume.groupby('Coupon_id').size().to_frame('c2').reset_index()
    X = pd.merge(X, temp, how='left', on='Coupon_id')

    # c3: coupon redemption rate; c4: unredeemed count
    X['c3'] = X.c2 / X.c1
    X['c4'] = X.c1 - X.c2

    # c5: receipts of this coupon on this day
    temp = X.groupby(['Coupon_id', 'Date_received']).size().to_frame('c5').reset_index()
    X = pd.merge(X, temp, how='left', on=['Coupon_id', 'Date_received'])

    # c6: coupon type (0 = direct discount, 1 = full reduction)
    X['c6'] = 0
    X.loc[X.Discount_rate.str.contains(':') == True, 'c6'] = 1

    # c8/c9/c10: receipts / redemptions / non-redemptions per discount string
    temp = offline.groupby('Discount_rate').size().to_frame('c8').reset_index()
    X = pd.merge(X, temp, how='left', on='Discount_rate')
    temp = coupon_consume.groupby('Discount_rate').size().to_frame('c9').reset_index()
    X = pd.merge(X, temp, how='left', on='Discount_rate')
    temp = coupon_no_consume.groupby('Discount_rate').size().to_frame('c10').reset_index()
    X = pd.merge(X, temp, how='left', on='Discount_rate')

    # c11: redemption rate per discount string
    X['c11'] = X.c9 / X.c8

    # c12: mean gap in days between redemptions of the same coupon
    temp = coupon_consume.copy()
    temp1 = temp.groupby('Coupon_id').Date
    temp['_max'] = temp1.transform('max')
    temp['_min'] = temp1.transform('min')
    temp['_count'] = temp1.transform('count')
    temp['c12'] = (temp._max - temp._min).dt.days / (temp._count - 1)
    temp.drop_duplicates('Coupon_id', inplace=True)
    X = pd.merge(X, temp[['Coupon_id', 'c12']], how='left', on='Coupon_id')

    '''user merchant feature'''

    # um1: coupons the user received from the merchant
    temp = coupon_id_notnull
    temp = temp.groupby(['User_id', 'Merchant_id']).size().to_frame('um1').reset_index()
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])

    # um2: unredeemed coupons at the merchant
    temp = coupon_no_consume.groupby(['User_id', 'Merchant_id']).size().to_frame('um2').reset_index()
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])

    # um3: redeemed coupons at the merchant
    temp = coupon_consume.groupby(['User_id', 'Merchant_id']).size().to_frame('um3').reset_index()
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])

    # um4: redemption rate at the merchant
    X['um4'] = X.um3 / X.um1

    # um5: merchant's share of the user's unredeemed coupons
    temp = coupon_no_consume.groupby('User_id').size().to_frame('temp').reset_index()
    X = pd.merge(X, temp, how='left', on='User_id')
    X['um5'] = X.um2 / X.temp
    X.drop(columns='temp', inplace=True)

    # um6: purchases at the merchant (any kind)
    temp = offline[offline.Date.notnull()].groupby(['User_id', 'Merchant_id']).size().to_frame('um6').reset_index()
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])

    # um7: plain purchases at the merchant
    temp = date_received_isnull_date_notnull
    temp = temp.groupby(['User_id', 'Merchant_id']).size().to_frame('um7').reset_index()
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])

    # um8: coupons received at the merchant on this day
    temp = date_received_notnull
    temp = temp.groupby(['User_id', 'Merchant_id', 'Date_received']).size().to_frame('um8').reset_index()
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id', 'Date_received'])

    # um9: distinct merchants the user received coupons from
    temp = coupon_id_notnull
    temp = temp.groupby(['User_id', 'Merchant_id']).size().reset_index()
    temp = temp.groupby('User_id').size().to_frame('um9').reset_index()
    X = pd.merge(X, temp, how='left', on='User_id')

    # um10: distinct merchants the user redeemed at
    temp = coupon_consume.groupby(['User_id', 'Merchant_id']).size()
    temp = temp.groupby(level='User_id').size().to_frame('um10').reset_index()
    X = pd.merge(X, temp, how='left', on='User_id')

    # um11: redeemed-merchant share; um12: average redemptions per merchant
    X['um11'] = X.um10 / X.um9
    X['um12'] = X.u2 / X.um9

    '''other feature'''

    # o1: coupons received by the user in the label window
    temp = X.groupby('User_id').size().to_frame('o1').reset_index()
    X = pd.merge(X, temp, how='left', on='User_id')

    # o2: receipts per (user, coupon) in the label window
    temp = X.groupby(['User_id', 'Coupon_id']).size().to_frame('o2').reset_index()
    X = pd.merge(X, temp, how='left', on=['User_id', 'Coupon_id'])

    # o3/o4: coupons received before / after this record
    X['o3'] = 1
    X['o3'] = X.sort_values(by=['User_id', 'Date_received']).groupby('User_id').o3.cumsum() - 1
    X['o4'] = 1
    X['o4'] = X.sort_values(by=['User_id', 'Date_received'], ascending=False).groupby('User_id').o4.cumsum() - 1

    # o5/o6: receipts of this coupon kind before / after this record
    X['o5'] = 1
    temp = X.sort_values(by=['User_id', 'Coupon_id', 'Date_received'])
    X['o5'] = temp.groupby('User_id').o5.cumsum() - 1
    X['o6'] = 1
    temp = X.sort_values(by=['User_id', 'Coupon_id', 'Date_received'], ascending=False)
    X['o6'] = temp.groupby('User_id').o6.cumsum() - 1

    # o7: mean gap in days between the user's coupon receipts
    temp = X.copy()
    temp1 = temp.groupby('User_id').Date_received
    temp['_max'] = temp1.transform('max')
    temp['_min'] = temp1.transform('min')
    temp['_count'] = temp1.transform('count')
    temp['o7'] = (temp._max - temp._min).dt.days / (temp._count - 1)
    temp.drop_duplicates('User_id', inplace=True)
    X = pd.merge(X, temp[['User_id', 'o7']], how='left', on='User_id')

    # o8: receipts per (user, merchant)
    temp = X.groupby(['User_id', 'Merchant_id']).size().to_frame('o8').reset_index()
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])

    # o9: distinct merchants the user received from
    temp = X.groupby(['User_id', 'Merchant_id']).size()
    temp = temp.groupby(level='User_id').size().to_frame('o9').reset_index()
    X = pd.merge(X, temp, how='left', on='User_id')

    # o10: receipts by the user on this day
    temp = X.groupby(['User_id', 'Date_received']).size().to_frame('o10').reset_index()
    X = pd.merge(X, temp, how='left', on=['User_id', 'Date_received'])

    # o11: receipts of this coupon by the user on this day
    temp = X.groupby(['User_id', 'Coupon_id', 'Date_received']).size().to_frame('o11').reset_index()
    X = pd.merge(X, temp, how='left', on=['User_id', 'Coupon_id', 'Date_received'])

    # o12: distinct coupon kinds the user received
    temp = X.groupby(['User_id', 'Coupon_id']).size()
    temp = temp.groupby(level='User_id').size().to_frame('o12').reset_index()
    X = pd.merge(X, temp, how='left', on='User_id')

    # o13: coupons received at the merchant
    temp = X.groupby('Merchant_id').size().to_frame('o13').reset_index()
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # o14: receipts per (merchant, coupon)
    temp = X.groupby(['Merchant_id', 'Coupon_id']).size().to_frame('o14').reset_index()
    X = pd.merge(X, temp, how='left', on=['Merchant_id', 'Coupon_id'])

    # o15: distinct users receiving from the merchant
    temp = X.groupby(['Merchant_id', 'User_id']).size()
    temp = temp.groupby(level='Merchant_id').size().to_frame('o15').reset_index()
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # o16: distinct coupon kinds at the merchant
    temp = X.groupby(['Merchant_id', 'Coupon_id']).size()
    temp = temp.groupby(level='Merchant_id').size().to_frame('o16').reset_index()
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # o17/o18: days since the previous / until the next receipt by the same user
    temp = X.sort_values(by=['User_id', 'Date_received']).groupby('User_id')
    X['o17'] = temp.Date_received.diff().dt.days
    X['o18'] = temp.Date_received.diff(-1).dt.days.abs()

    print(X.shape)

    return X

原文链接:https://blog.csdn.net/linyuhan3232/article/details/84800918?ops_request_misc=%257B%2522request%255Fid%2522%253A%2522165277499316781667814000%2522%252C%2522scm%2522%253A%252220140713.130102334.pc%255Fblog.%2522%257D&request_id=165277499316781667814000&biz_id=0&utm_medium=distribute.pc_search_result.none-task-blog-2~blog~first_rank_ecpm_v1~times_rank-11-84800918-null-null.nonecase&utm_term=%E4%BC%98%E6%83%A0

© 版权声明
THE END
喜欢就支持一下吧
点赞0 分享
评论 抢沙发
头像
文明发言,共建和谐米科社区
提交
头像

昵称

取消
昵称表情图片

    暂无评论内容