import  os,  sys,  pickleimport  numpy as  np
import  pandas as  pdimport  matplotlib. pyplot as  plt
import  matplotlib. dates as  mdatesimport  seaborn as  sns
import  datetime as  dtfrom  datetime import  datefrom  sklearn. linear_model import  SGDClassifier,  LogisticRegression
from  sklearn. metrics import  log_loss,  roc_auc_score,  auc,  roc_curve
% matplotlib inline
% config InlineBackend. figure_format =  'retina' 
# Load the three competition CSVs in a fixed order: offline training data,
# the revised offline test data, and the online training data.
dfoff, dftest, dfon = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/ccf_offline_stage1_train.csv',
        './data/ccf_offline_stage1_test_revised.csv',
        './data/ccf_online_stage1_train.csv',
    )
)

# Preview the first rows of the offline training frame.
dfoff.head()
 
 
# Output of dfoff.head():
#    User_id  Merchant_id  Coupon_id Discount_rate  Distance  Date_received        Date
# 0  1439408         2632        NaN           NaN       0.0            NaN  20160217.0
# 1  1439408         4663    11002.0        150:20       1.0     20160528.0         NaN
# 2  1439408         2632     8591.0          20:1       0.0     20160217.0         NaN
# 3  1439408         2632     1078.0          20:1       0.0     20160319.0         NaN
# 4  1439408         2632     8591.0          20:1       0.0     20160613.0         NaN
 
def getDiscountType(row):
    """Classify a raw ``Discount_rate`` value by its textual format.

    Args:
        row: A single ``Discount_rate`` cell (a scalar, despite the name).

    Returns:
        ``np.nan`` when the value is missing, ``1`` when it is written in the
        colon-separated ``"x:y"`` form, and ``0`` otherwise.
    """
    if pd.isnull(row):
        return np.nan
    # Coerce to str exactly like the sibling helpers do, so a non-null numeric
    # cell is classified as 0 instead of raising TypeError on the ``in`` test.
    return 1 if ':' in str(row) else 0


def convertRate(row):
    """Convert a raw ``Discount_rate`` value to a numeric rate.

    Args:
        row: A single ``Discount_rate`` cell.

    Returns:
        ``1.0`` for a missing value; ``1.0 - y / x`` for an ``"x:y"`` value;
        otherwise the value itself converted with ``float``.

    Raises:
        ZeroDivisionError: If an ``"x:y"`` value has ``x`` equal to zero.
        ValueError: If a component cannot be parsed as a float.
    """
    if pd.isnull(row):
        return 1.0
    text = str(row)
    if ':' in text:
        parts = text.split(':')
        return 1.0 - float(parts[1]) / float(parts[0])
    return float(row)


def getDiscountMan(row):
    """Return the integer before the colon of an ``"x:y"`` discount, else 0.

    NOTE(review): "man" presumably denotes the spending threshold of a
    threshold/reduction coupon -- confirm against the dataset description.
    """
    text = str(row)
    if ':' in text:
        return int(text.split(':')[0])
    return 0


def getDiscountJian(row):
    """Return the integer after the colon of an ``"x:y"`` discount, else 0.

    NOTE(review): "jian" presumably denotes the amount taken off once the
    threshold is met -- confirm against the dataset description.
    """
    text = str(row)
    if ':' in text:
        return int(text.split(':')[1])
    return 0


def processData(df):
    """Add discount and distance feature columns to ``df``.

    The frame is modified in place and also returned. New columns:

    * ``discount_rate``  -- ``convertRate`` applied to ``Discount_rate``
    * ``discount_man``   -- ``getDiscountMan`` applied to ``Discount_rate``
    * ``discount_jian``  -- ``getDiscountJian`` applied to ``Discount_rate``
    * ``discount_type``  -- ``getDiscountType`` applied to ``Discount_rate``
    * ``distance``       -- ``Distance`` with missing values replaced by -1,
      cast to ``int``

    Args:
        df: DataFrame containing ``Discount_rate`` and ``Distance`` columns.

    Returns:
        The same DataFrame object, with the columns above added.
    """
    raw_discount = df['Discount_rate']
    df['discount_rate'] = raw_discount.apply(convertRate)
    df['discount_man'] = raw_discount.apply(getDiscountMan)
    df['discount_jian'] = raw_discount.apply(getDiscountJian)
    df['discount_type'] = raw_discount.apply(getDiscountType)
    # -1 marks a missing distance so the column can be stored as integers.
    df['distance'] = df['Distance'].fillna(-1).astype(int)
    return df


dfoff = processData(dfoff)
# Run the test frame through the same feature-engineering step.
dftest = processData(df=dftest)

# Preview both processed frames.
dfoff.head()
dftest.head()
 
 
# Output of dftest.head():
#    User_id  Merchant_id  Coupon_id Discount_rate  Distance  Date_received  discount_rate  discount_man  discount_jian  discount_type  distance
# 0  4129537          450       9983          30:5       1.0       20160712       0.833333            30              5              1         1
# 1  6949378         1300       3429          30:5       NaN       20160706       0.833333            30              5              1        -1
# 2  2166529         7113       6928        200:20       5.0       20160727       0.900000           200             20              1         5
# 3  2166529         7113       1808        100:10       5.0       20160727       0.900000           100             10              1         5
# 4  6172162         7605       6500          30:1       2.0       20160708       0.966667            30              1              1         2
 
# Sorted distinct non-null values of the coupon-received date column.
date_received = sorted(
    value for value in dfoff['Date_received'].unique() if pd.notnull(value)
)

# Sorted distinct non-null values of the ``Date`` column.
date_buy = sorted(
    value for value in dfoff['Date'].unique() if pd.notnull(value)
)

date_buy