from sklearn.metrics import roc_auc_score

from nltk.corpus import stopwords
- from collections import Counter
+ from collections import Counter, defaultdict


from sklearn.model_selection import train_test_split

@@ -64,12 +64,47 @@ def get_weight(count, eps=10000, min_count=2):
    else:
        return 1 / float(count + eps)

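+ # note: set(str(...)) builds a set of *characters*, so this is a
+ # character-level Jaccard similarity between the two questions, not word-level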
+ def jaccard(row):
+     wic = set(str(row['question1'])).intersection(set(str(row['question2'])))
+     uw = set(str(row['question1'])).union(set(str(row['question2'])))
+     if len(uw) == 0:
+         uw = [1]
+     return float(len(wic)) / len(uw)

- def main():
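+ # note: despite the name, this is the difference in *character* length of
+ # the two questions, not in word count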
+ def wc_diff(row):
+     return abs(len(str(row['question1'])) - len(str(row['question2'])))
+
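+ # q1_freq/q2_freq: how many distinct questions this question is paired with
+ # anywhere in the data (q_dict is built globally below)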
+ def q1_freq(row):
+     return len(q_dict[str(row['question1'])])

- #load data
- train_df = pd.read_csv("./data/train.csv")
- test_df = pd.read_csv("./data/test.csv")
+ def q2_freq(row):
+     return len(q_dict[str(row['question2'])])
+
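+ # number of questions that are paired with both question1 and question2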
+ def q1_q2_intersect(row):
+     return len(set(q_dict[str(row['question1'])]).intersection(set(q_dict[str(row['question2'])])))
+
+
+ #load data
+ train_df = pd.read_csv("./data/train.csv")
+ test_df = pd.read_csv("./data/test.csv")
+
+ train_qs = pd.Series(train_df['question1'].tolist() + train_df['question2'].tolist()).astype(str)
+
+ #global dictionaries
+ print "creating global weights dictionary..."
+ eps = 5000
+ words = (" ".join(train_qs)).lower().split()
+ counts = Counter(words)
+ weights = {word: get_weight(count, eps) for word, count in counts.items()}
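+ # weights shrink as corpus frequency grows (and drop to 0 below min_count),
+ # so rarer shared words contribute more to the weighted word-match features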
+
+ print "creating global questions dictionary..."
+ ques = pd.concat([train_df[['question1', 'question2']], test_df[['question1', 'question2']]], axis=0).reset_index(drop=True)
+ q_dict = defaultdict(set)
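+ # map every question to the set of questions it appears alongside, across
+ # both train and test (keys are str() to match the str() lookups above)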
+ for i in range(ques.shape[0]):
+     q_dict[str(ques.question1[i])].add(str(ques.question2[i]))
+     q_dict[str(ques.question2[i])].add(str(ques.question1[i]))
+
+ def main():

    #data analysis
    qids = pd.Series(train_df['qid1'].tolist() + train_df['qid2'].tolist())
@@ -78,7 +113,6 @@ def main():
    print "total number of questions in the training data: ", len(np.unique(qids))
    print "number of questions that appear multiple times: ", np.sum(qids.value_counts() > 1)

-
    plt.figure()
    plt.hist(qids.value_counts(), bins=50)
    plt.yscale('log')
@@ -90,7 +124,6 @@ def main():
    train_qs = pd.Series(train_df['question1'].tolist() + train_df['question2'].tolist()).astype(str)
    test_qs = pd.Series(test_df['question1'].tolist() + test_df['question2'].tolist()).astype(str)

-
    dist_train = train_qs.apply(len)
    dist_test = test_qs.apply(len)

@@ -104,7 +137,7 @@ def main():
    plt.show()

    print "mean-train: %2.2f, std-train: %2.2f, mean-test: %2.2f, std-test: %2.2f" % (dist_train.mean(), dist_train.std(), dist_test.mean(), dist_test.std())
-
+
    dist_train = train_qs.apply(lambda x: len(x.split(' ')))
    dist_test = test_qs.apply(lambda x: len(x.split(' ')))

@@ -116,17 +149,11 @@ def main():
    plt.ylabel('probability')
    plt.legend()
    plt.show()
-

    #feature engineering
    train_word_match = train_df.apply(word_match_share, axis=1, raw=True)
-     print 'Original AUC:', roc_auc_score(train_df['is_duplicate'], train_word_match)
-
-     eps = 5000
-     words = (" ".join(train_qs)).lower().split()
-     counts = Counter(words)
-     weights = {word: get_weight(count, eps) for word, count in counts.items()}
-
+     print 'word match AUC:', roc_auc_score(train_df['is_duplicate'], train_word_match)
+
    print 'Most common words and weights:'
    print sorted(weights.items(), key=lambda x: x[1] if x[1] > 0 else 9999)[:10]
    print 'Least common words and weights: '
@@ -135,18 +162,33 @@ def main():
    tfidf_train_word_match = train_df.apply(tfidf_word_match_share, axis=1, raw=True)
    tfidf_train_word_match = tfidf_train_word_match.fillna(0)
    print 'TFIDF AUC:', roc_auc_score(train_df['is_duplicate'], tfidf_train_word_match)
-
-     print "creating data..."
+
+     train_jaccard = train_df.apply(jaccard, axis=1, raw=True)
+     print 'jaccard AUC:', roc_auc_score(train_df['is_duplicate'], train_jaccard)
+
+     train_wc_diff = train_df.apply(wc_diff, axis=1, raw=True)
+     train_q1_q2_intersect = train_df.apply(q1_q2_intersect, axis=1, raw=True)
+
+     print "creating training data..."
    X_train = pd.DataFrame()
    X_test = pd.DataFrame()
    X_train['word_match'] = train_word_match
    X_train['tfidf_word_match'] = tfidf_train_word_match
+     X_train['jaccard'] = train_jaccard
+     X_train['wc_diff'] = train_wc_diff
+     X_train['q1_q2_intersect'] = train_q1_q2_intersect
+
+     print "creating test data..."
    X_test['word_match'] = test_df.apply(word_match_share, axis=1, raw=True)
    X_test['tfidf_word_match'] = test_df.apply(tfidf_word_match_share, axis=1, raw=True)
+     X_test['jaccard'] = test_df.apply(jaccard, axis=1, raw=True)
+     X_test['wc_diff'] = test_df.apply(wc_diff, axis=1, raw=True)
+     X_test['q1_q2_intersect'] = test_df.apply(q1_q2_intersect, axis=1, raw=True)
+

    X_test = X_test.fillna(0)
    y_train = train_df['is_duplicate'].values

-     X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=4242)
+     X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

    params = {}
    params['objective'] = 'binary:logistic'