Commit ec89b08

quora kernel

1 parent ebd5c48 commit ec89b08

File tree

2 files changed: +431 -19 lines changed

quora/quora_kernel.py

+61 -19
@@ -8,7 +8,7 @@
 from sklearn.metrics import roc_auc_score

 from nltk.corpus import stopwords
-from collections import Counter
+from collections import Counter, defaultdict

 from sklearn.model_selection import train_test_split

@@ -64,12 +64,47 @@ def get_weight(count, eps=10000, min_count=2):
     else:
         return 1 / float(count + eps)
 
+def jaccard(row):
+    wic = set(str(row['question1'])).intersection(set(str(row['question2'])))
+    uw = set(str(row['question1'])).union(set(str(row['question2'])))
+    if len(uw) == 0:
+        uw = [1]
+    return len(wic) / len(uw)
 
-def main():
+def wc_diff(row):
+    return abs(len(str(row['question1'])) - len(str(row['question2'])))
+
+def q1_freq(row):
+    return len(q_dict[str(row['question1'])])
 
-    #load data
-    train_df = pd.read_csv("./data/train.csv")
-    test_df = pd.read_csv("./data/test.csv")
+def q2_freq(row):
+    return len(q_dict[str(row['question2'])])
+
+def q1_q2_intersect(row):
+    return len(set(q_dict[str(row['question1'])]).intersection(set(q_dict[str(row['question2'])])))
+
+
+#load data
+train_df = pd.read_csv("./data/train.csv")
+test_df = pd.read_csv("./data/test.csv")
+
+train_qs = pd.Series(train_df['question1'].tolist() + train_df['question2'].tolist()).astype(str)
+
+#global dictionaries
+print "creating global weights dictionary..."
+eps = 5000
+words = (" ".join(train_qs)).lower().split()
+counts = Counter(words)
+weights = {word: get_weight(count, eps) for word, count in counts.items()}
+
+print "creating global questions dictionary..."
+ques = pd.concat([train_df[['question1', 'question2']], test_df[['question1', 'question2']]], axis=0).reset_index(drop=True)
+q_dict = defaultdict(set)
+for i in range(ques.shape[0]):
+    q_dict[ques.question1[i]].add(ques.question2[i])
+    q_dict[ques.question2[i]].add(ques.question1[i])
+
+def main():
 
     #data analysis
     qids = pd.Series(train_df['qid1'].tolist() + train_df['qid2'].tolist())
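
Note on this hunk: set(str(q)) is a set of characters, so jaccard measures character-set overlap between the two questions rather than word overlap. q1_q2_intersect is the heavier feature: q_dict maps every question seen in train or test to the set of questions it was ever paired with, and the feature counts how many of those neighbors the two sides share. A minimal sketch on hypothetical toy rows (only the column names come from the kernel):

from collections import defaultdict

import pandas as pd

toy = pd.DataFrame({'question1': ['A', 'A', 'B'],
                    'question2': ['B', 'C', 'C']})

# same construction as the kernel's global questions dictionary
q_dict = defaultdict(set)
for i in range(toy.shape[0]):
    q_dict[toy.question1[i]].add(toy.question2[i])
    q_dict[toy.question2[i]].add(toy.question1[i])

# 'A' was paired with {'B', 'C'} and 'B' with {'A', 'C'}; they share 'C'
row = toy.iloc[0]
print(len(q_dict[row['question1']] & q_dict[row['question2']]))  # 1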
@@ -78,7 +113,6 @@ def main():
     print "total number of questions in the training data: ", len(np.unique(qids))
     print "number of questions that appear multiple times: ", np.sum(qids.value_counts() > 1)
 
-
     plt.figure()
     plt.hist(qids.value_counts(), bins=50)
     plt.yscale('log')
@@ -90,7 +124,6 @@ def main():
     train_qs = pd.Series(train_df['question1'].tolist() + train_df['question2'].tolist()).astype(str)
     test_qs = pd.Series(test_df['question1'].tolist() + test_df['question2'].tolist()).astype(str)
 
-
     dist_train = train_qs.apply(len)
     dist_test = test_qs.apply(len)
 
@@ -104,7 +137,7 @@ def main():
     plt.show()
 
     print "mean-train: %2.2f, std-train: %2.2f, mean-test: %2.2f, std-test: %2.2f" % (dist_train.mean(), dist_train.std(), dist_test.mean(), dist_test.std())
-
+
     dist_train = train_qs.apply(lambda x: len(x.split(' ')))
     dist_test = test_qs.apply(lambda x: len(x.split(' ')))
 
@@ -116,17 +149,11 @@ def main():
     plt.ylabel('probability')
     plt.legend()
     plt.show()
-
 
     #feature engineering
     train_word_match = train_df.apply(word_match_share, axis=1, raw=True)
-    print 'Original AUC:', roc_auc_score(train_df['is_duplicate'], train_word_match)
-
-    eps = 5000
-    words = (" ".join(train_qs)).lower().split()
-    counts = Counter(words)
-    weights = {word: get_weight(count, eps) for word, count in counts.items()}
-
+    print 'word match AUC:', roc_auc_score(train_df['is_duplicate'], train_word_match)
+
     print 'Most common words and weights:'
     print sorted(weights.items(), key=lambda x: x[1] if x[1] > 0 else 9999)[:10]
     print 'Least common words and weights: '
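
The weights dictionary these prints inspect is now built once at module level (see the hunk above): each word gets an inverse-frequency score 1/(count + eps), so frequent words contribute almost nothing to tfidf_word_match_share. The diff only shows get_weight's else-branch; a sketch assuming the unshown branch returns 0 for words below min_count, which would explain the x[1] > 0 guard in the sort above:

from collections import Counter

def get_weight(count, eps=5000, min_count=2):
    # assumed branch: words too rare to trust get zero weight
    if count < min_count:
        return 0.0
    return 1 / float(count + eps)

words = "what is the best way what is the point".split()
counts = Counter(words)
weights = {w: get_weight(c) for w, c in counts.items()}
print(weights['what'])  # seen twice -> 1/5002, small but nonzero
print(weights['best'])  # seen once  -> 0.0, below min_count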
@@ -135,18 +162,33 @@ def main():
     tfidf_train_word_match = train_df.apply(tfidf_word_match_share, axis=1, raw=True)
     tfidf_train_word_match = tfidf_train_word_match.fillna(0)
     print 'TFIDF AUC:', roc_auc_score(train_df['is_duplicate'], tfidf_train_word_match)
-
-    print "creating data..."
+
+    train_jaccard = train_df.apply(jaccard, axis=1, raw=True)
+    print 'jaccard AUC:', roc_auc_score(train_df['is_duplicate'], train_jaccard)
+
+    train_wc_diff = train_df.apply(wc_diff, axis=1, raw=True)
+    train_q1_q2_intersect = train_df.apply(q1_q2_intersect, axis=1, raw=True)
+
+    print "creating training data..."
     X_train = pd.DataFrame()
     X_test = pd.DataFrame()
     X_train['word_match'] = train_word_match
     X_train['tfidf_word_match'] = tfidf_train_word_match
+    X_train['jaccard'] = train_jaccard
+    X_train['wc_diff'] = train_wc_diff
+    X_train['q1_q2_intersect'] = train_q1_q2_intersect
+
+    print "creating test data..."
     X_test['word_match'] = test_df.apply(word_match_share, axis=1, raw=True)
     X_test['tfidf_word_match'] = test_df.apply(tfidf_word_match_share, axis=1, raw=True)
+    X_test['jaccard'] = test_df.apply(jaccard, axis=1, raw=True)
+    X_test['wc_diff'] = test_df.apply(wc_diff, axis=1, raw=True)
+    X_test['q1_q2_intersect'] = test_df.apply(q1_q2_intersect, axis=1, raw=True)
+
     X_test = X_test.fillna(0)
    y_train = train_df['is_duplicate'].values
 
-    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=4242)
+    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
 
     params = {}
     params['objective'] = 'binary:logistic'
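
The hunk ends right after params['objective'], so the rest of the XGBoost setup is not visible in this diff. A self-contained sketch of how a feature frame like X_train is typically trained with xgboost's native API; the toy data and every parameter except 'objective' are assumptions here, not the kernel's actual settings:

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
X = pd.DataFrame({'word_match': rng.rand(200),        # stand-ins for the
                  'wc_diff': rng.rand(200),           # kernel's features
                  'q1_q2_intersect': rng.rand(200)})
y = rng.randint(0, 2, 200)

X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.2, random_state=0)

params = {'objective': 'binary:logistic',  # from the diff
          'eval_metric': 'logloss',        # assumed: the competition metric
          'eta': 0.1, 'max_depth': 4}      # assumed hyperparameters

d_tr = xgb.DMatrix(X_tr, label=y_tr)
d_va = xgb.DMatrix(X_va, label=y_va)
bst = xgb.train(params, d_tr, num_boost_round=100,
                evals=[(d_tr, 'train'), (d_va, 'valid')],
                early_stopping_rounds=20)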
