Showing 9 changed files with 10,654 additions and 0 deletions.
@@ -0,0 +1,74 @@
import argparse


def parse_args(argv):
    """
    Parse command-line arguments.
    Arguments: argv -- An argument list without the program name.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('-i', '--dataset', metavar='str', help='dataset for dialectology', type=str, default='cmu')
    parser.add_argument('-task', '--task', metavar='str', help='regression, classify_states or classify_regions',
                        type=str, default='classify_states')
    # Note: argparse's type=bool converts any non-empty string to True, so
    # these boolean flags only behave as expected at their defaults.
    parser.add_argument('-is_classifier', '--is_classifier', metavar='bool', help='Classification or Regression',
                        type=bool, default=True)
    parser.add_argument('-modelname', '--modelname', metavar='str', help='TextCNN, TextRNN or FastText', type=str,
                        default='TextCNN')
    parser.add_argument('-num_classes', '--num_classes', metavar='int', help='number of classes', type=int, default=49)
    parser.add_argument('-num_filters', '--num_filters', metavar='int', help='number of filters', type=int, default=256)
    parser.add_argument('-learning_rate', '--learning_rate', metavar='float', help='learning rate', type=float,
                        default=0.001)
    parser.add_argument('-batch_size', '--batch_size', metavar='int', help='SGD batch size', type=int, default=32)

    parser.add_argument('-decay_steps', '--decay_steps', metavar='int',
                        help='how many steps before decaying the learning rate', type=int, default=6000)

    parser.add_argument('-decay_rate', '--decay_rate', metavar='float', help='rate of decay for the learning rate',
                        type=float, default=0.65)

    parser.add_argument('-batchnorm', '--batchnorm', metavar='bool', help='use batch normalization', type=bool,
                        default=True)
    parser.add_argument('-earlystop', '--earlystop', metavar='bool', help='use early stopping', type=bool,
                        default=True)

    parser.add_argument('-is_training', '--is_training', metavar='bool', help='training or testing', type=bool,
                        default=True)
    parser.add_argument('-num_epochs', '--num_epochs', metavar='int', help='number of epochs to run', type=int,
                        default=100)
    parser.add_argument('-sentence_len', '--sentence_len', metavar='int', help='max sentence length', type=int,
                        default=5000)
    parser.add_argument('-use_embedding', '--use_embedding', metavar='bool', help='use pre-trained embeddings',
                        type=bool, default=True)
    parser.add_argument('-validate_every', '--validate_every', metavar='int', help='validate every num steps', type=int,
                        default=10)

    parser.add_argument('-traning_data_path', '--traning_data_path', metavar='str', help='training data path', type=str,
                        default='./datasets/cmu')

    parser.add_argument('-word2vec_model_path', '--word2vec_model_path', metavar='str',
                        help="word2vec's vocabulary and vectors", type=str, default='glove.6B.300d.word2vec.txt')

    parser.add_argument('-embed_size', '--embed_size', metavar='int', help='embedding size', type=int, default=300)

    parser.add_argument('-hidden', '--hidden', metavar='int', help='hidden layer size', type=int, default=300)
    parser.add_argument('-mindf', '--mindf', metavar='int', help='minimum document frequency in BoW', type=int,
                        default=10)
    parser.add_argument('-d', '--dir', metavar='str', help='home directory', type=str, default='./datasets/cmu')
    parser.add_argument('-enc', '--encoding', metavar='str', help='data encoding (e.g. latin1, utf-8)', type=str,
                        default='latin1')
    parser.add_argument('-reg', '--regularization', metavar='float', help='regularization coefficient', type=float,
                        default=1e-6)
    parser.add_argument('-drop', '--dropout', metavar='float', help='dropout coefficient (default 0.5)', type=float,
                        default=0.5)
    parser.add_argument('-optimizer', '--optimizer', type=str, help='optimizer used for the neural network',
                        default='Adam')

    args = parser.parse_args(argv)
    # The chosen task overrides num_classes and the classifier flag.
    if args.task == "classify_regions":
        args.num_classes = 4
        args.is_classifier = True
    elif args.task == "classify_states":
        args.num_classes = 49
        args.is_classifier = True
    else:
        args.num_classes = 2
        args.is_classifier = False
    return args
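For reference, a minimal invocation sketch. The module name `params` is hypothetical, since the commit does not show this file's name:

# Hypothetical usage sketch; 'params' is an assumed module name.
import sys
from params import parse_args

args = parse_args(sys.argv[1:])  # e.g. ['--task', 'classify_regions']
print(args.task, args.num_classes, args.is_classifier)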
@@ -0,0 +1,284 @@
import numpy as np
import csv
import pandas as pd
import os
import logging
import keras.preprocessing.text
from keras.preprocessing import sequence

logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)


def load_state_labels(userids, user_text_seq_full, user_loc, labels):
    """Collect user ids and state labels for US users.

    user_loc maps a user id to [country, state, county, city]. The District
    of Columbia appears only in the county field, so it is special-cased.
    """
    # Note: X is accumulated here but not returned; the texts are vectorized
    # later by DataLoader.tosequence().
    y, X, userIDs = ([] for i in range(3))
    for user in userids:
        if 'UKN' not in user_loc[user][0]:
            if 'District of Columbia' in user_loc[user][2]:
                if user_loc[user][0] == 'United States of America':
                    X.append(user_text_seq_full[user])
                    userIDs.append(user)
                    if user_loc[user][2] not in labels:
                        labels.append(user_loc[user][2])
                    y.append(labels.index(user_loc[user][2]))
            if 'UKN' not in user_loc[user][1]:
                if user_loc[user][0] == 'United States of America':
                    X.append(user_text_seq_full[user])
                    userIDs.append(user)
                    if user_loc[user][1] not in labels:
                        labels.append(user_loc[user][1])
                    y.append(labels.index(user_loc[user][1]))
    return userIDs, y, labels


def load_region_labels(userids, user_text_seq_full, user_loc, labels):
    """Collect user ids and US Census region labels:
    0 = Northeast, 1 = Midwest, 2 = South, 3 = West."""
    regions = {
        # Northeast
        'Connecticut': 0, 'Maine': 0, 'Massachusetts': 0, 'New Hampshire': 0,
        'Rhode Island': 0, 'Vermont': 0, 'New Jersey': 0, 'New York': 0,
        'Pennsylvania': 0,
        # Midwest
        'Indiana': 1, 'Illinois': 1, 'Michigan': 1, 'Ohio': 1, 'Wisconsin': 1,
        'Iowa': 1, 'Kansas': 1, 'Minnesota': 1, 'Missouri': 1, 'Nebraska': 1,
        'North Dakota': 1, 'South Dakota': 1,
        # South
        'Delaware': 2, 'District of Columbia': 2, 'Florida': 2, 'Georgia': 2,
        'Maryland': 2, 'North Carolina': 2, 'South Carolina': 2, 'Virginia': 2,
        'West Virginia': 2, 'Alabama': 2, 'Kentucky': 2, 'Mississippi': 2,
        'Tennessee': 2, 'Arkansas': 2, 'Louisiana': 2, 'Oklahoma': 2,
        'Texas': 2,
        # West
        'Arizona': 3, 'Colorado': 3, 'Idaho': 3, 'New Mexico': 3, 'Montana': 3,
        'Utah': 3, 'Nevada': 3, 'Wyoming': 3, 'California': 3, 'Oregon': 3,
        'Washington': 3,
    }

    X, y, userIDs = [], [], []
    for user in userids:
        if 'UKN' not in user_loc[user][0]:
            if 'District of Columbia' in user_loc[user][2]:
                if user_loc[user][0] == 'United States of America':
                    region = regions[user_loc[user][2]]
                    if region not in labels:
                        labels.append(region)
                    X.append(user_text_seq_full[user])
                    userIDs.append(user)
                    y.append(labels.index(region))
            if 'UKN' not in user_loc[user][1]:
                if user_loc[user][0] == 'United States of America':
                    region = regions[user_loc[user][1]]
                    if region not in labels:
                        labels.append(region)
                    y.append(labels.index(region))
                    X.append(user_text_seq_full[user])
                    userIDs.append(user)
    return userIDs, y, labels


# Load "einstein_locations.csv" | ||
def read_user_location(dataset): | ||
user_locations = {} | ||
with open(dataset, 'r') as f: | ||
i = 0 | ||
for line in f: | ||
if i > 0: | ||
content = line.split(',') | ||
# user_location['user'] = ['country', 'state', 'county', 'city'] | ||
user_locations[content[0]] = [content[1], content[2], content[3], content[4]] | ||
i += 1 | ||
f.close() | ||
return user_locations | ||
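The exact layout of eisenstein_locations.csv is not shown in this commit; inferred from the indexing above, it would look roughly like the following (sample rows are illustrative, not data from the commit):

# Assumed format: header line, then user id plus four location fields.
# user,country,state,county,city
# user_0001,United States of America,Pennsylvania,Allegheny,Pittsburgh
# user_0002,United States of America,UKN,UKN,UKN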


def convert_y_coord(y_train, y_dev, y_test):
    # np.float was removed in NumPy 1.24; plain float is equivalent here.
    y_train = np.array(y_train).astype(float)
    y_dev = np.array(y_dev).astype(float)
    y_test = np.array(y_test).astype(float)
    return y_train, y_dev, y_test


def load_data(data_home, **kwargs):
    encoding = kwargs.get('encoding', 'utf-8')
    dtype = kwargs.get('dtype', 'float32')
    task = kwargs.get('task')
    dl = DataLoader(data_home=data_home, encoding=encoding)

    logging.info('loading dataset...')
    dl.load_data()

    Y_train, Y_dev, Y_test, labels = [], [], [], []
    if task == "regression":
        print("Using latitude and longitude")
        Y_train = np.array([[a[0], a[1]] for a in dl.df_train[['lat', 'lon']].values.tolist()], dtype=dtype)
        Y_dev = np.array([[a[0], a[1]] for a in dl.df_dev[['lat', 'lon']].values.tolist()], dtype=dtype)
        Y_test = np.array([[a[0], a[1]] for a in dl.df_test[['lat', 'lon']].values.tolist()], dtype=dtype)
    elif task == "classify_states":
        print("Using states")
        user_locations_file = "eisenstein_locations.csv"
        user_loc = read_user_location(user_locations_file)
        user_train, Y_train, labels = load_state_labels(list(dl.df_train.index), dl.df_train['text'].to_dict(),
                                                        user_loc, labels)
        user_dev, Y_dev, labels = load_state_labels(list(dl.df_dev.index), dl.df_dev['text'].to_dict(), user_loc,
                                                    labels)
        user_test, Y_test, labels = load_state_labels(list(dl.df_test.index), dl.df_test['text'].to_dict(), user_loc,
                                                      labels)
        # Keep only the users for which a state label was found.
        dl.df_train = dl.df_train[dl.df_train.index.isin(user_train)]
        dl.df_dev = dl.df_dev[dl.df_dev.index.isin(user_dev)]
        dl.df_test = dl.df_test[dl.df_test.index.isin(user_test)]
    elif task == "classify_regions":
        print("Using regions")
        user_locations_file = "eisenstein_locations.csv"
        user_loc = read_user_location(user_locations_file)
        user_train, Y_train, labels = load_region_labels(list(dl.df_train.index), dl.df_train['text'].to_dict(),
                                                         user_loc, labels)
        user_dev, Y_dev, labels = load_region_labels(list(dl.df_dev.index), dl.df_dev['text'].to_dict(), user_loc,
                                                     labels)
        user_test, Y_test, labels = load_region_labels(list(dl.df_test.index), dl.df_test['text'].to_dict(), user_loc,
                                                       labels)
        dl.df_train = dl.df_train[dl.df_train.index.isin(user_train)]
        dl.df_dev = dl.df_dev[dl.df_dev.index.isin(user_dev)]
        dl.df_test = dl.df_test[dl.df_test.index.isin(user_test)]

    dl.tosequence()

    U_test = dl.df_test.index.tolist()
    U_dev = dl.df_dev.index.tolist()
    U_train = dl.df_train.index.tolist()
    X_train = dl.X_train.astype(dtype)
    X_dev = dl.X_dev.astype(dtype)
    X_test = dl.X_test.astype(dtype)
    dl.max_features = X_train.shape[1]
    Y_train, Y_dev, Y_test = convert_y_coord(Y_train, Y_dev, Y_test)
    data = (X_train, Y_train, X_dev, Y_dev, X_test, Y_test, U_train, U_dev, U_test, labels)
    return data
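A minimal call sketch, assuming the default CMU paths used elsewhere in this commit and eisenstein_locations.csv in the working directory:

# Hedged usage sketch; paths follow the defaults above.
data = load_data('./datasets/cmu', encoding='latin1', task='classify_states')
X_train, Y_train, X_dev, Y_dev, X_test, Y_test, U_train, U_dev, U_test, labels = data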


class DataLoader:
    def __init__(self, data_home, encoding='utf-8', maxlen=None, max_features=None, char_level=False):
        self.data_home = data_home
        self.maxlen = maxlen
        self.max_features = max_features
        self.encoding = encoding
        self.char_level = char_level

    def load_data(self):
        logging.info('loading the dataset from %s' % self.data_home)
        train_file = os.path.join(self.data_home, 'user_info.train.gz')
        dev_file = os.path.join(self.data_home, 'user_info.dev.gz')
        test_file = os.path.join(self.data_home, 'user_info.test.gz')

        # error_bad_lines=False was deprecated in pandas 1.3 and removed in
        # 2.0; on newer pandas use on_bad_lines='skip' instead.
        df_train = pd.read_csv(train_file, delimiter='\t', encoding=self.encoding, names=['user', 'lat', 'lon', 'text'],
                               quoting=csv.QUOTE_NONE, error_bad_lines=False)
        df_dev = pd.read_csv(dev_file, delimiter='\t', encoding=self.encoding, names=['user', 'lat', 'lon', 'text'],
                             quoting=csv.QUOTE_NONE, error_bad_lines=False)
        df_test = pd.read_csv(test_file, delimiter='\t', encoding=self.encoding, names=['user', 'lat', 'lon', 'text'],
                              quoting=csv.QUOTE_NONE, error_bad_lines=False)
        df_train.dropna(inplace=True)
        df_dev.dropna(inplace=True)
        df_test.dropna(inplace=True)

        df_train.drop_duplicates(['user'], inplace=True, keep='last')
        df_train.set_index(['user'], drop=True, append=False, inplace=True)
        df_train.sort_index(inplace=True)

        df_dev.drop_duplicates(['user'], inplace=True, keep='last')
        df_dev.set_index(['user'], drop=True, append=False, inplace=True)
        df_dev.sort_index(inplace=True)

        df_test.drop_duplicates(['user'], inplace=True, keep='last')
        df_test.set_index(['user'], drop=True, append=False, inplace=True)
        df_test.sort_index(inplace=True)

        self.df_train = df_train
        self.df_dev = df_dev
        self.df_test = df_test

    def tosequence(self):
        self.vectorizer = SequenceVectorizer(self.char_level, self.maxlen, self.max_features)
        logging.info(self.vectorizer)
        self.X_train = self.vectorizer.fit(self.df_train.text.values)
        self.X_dev = self.vectorizer.transform(self.df_dev.text.values)
        self.X_test = self.vectorizer.transform(self.df_test.text.values)
        logging.info("training n_samples: %d, n_features: %d" % self.X_train.shape)
        logging.info("development n_samples: %d, n_features: %d" % self.X_dev.shape)
        logging.info("test n_samples: %d, n_features: %d" % self.X_test.shape)


class SequenceVectorizer:
    def __init__(self, char_level=False, maxlen=None, max_features=None):
        self.max_features = max_features
        self.char_level = char_level
        self.tokenizer = keras.preprocessing.text.Tokenizer(filters=" ", char_level=self.char_level,
                                                            num_words=self.max_features)
        self.maxlen = maxlen
        self.vocabulary_ = None

    def fit(self, X):
        self.tokenizer.fit_on_texts(X)
        X_seq = self.tokenizer.texts_to_sequences(X)
        # Pad every sequence to a fixed length of 5000 (the longest training
        # sequence, 4987, rounded up).
        X_seq = sequence.pad_sequences(X_seq, maxlen=5000, padding='post')

        self.maxlen = X_seq.shape[1]
        # Vocabulary sorted by frequency, most common words first.
        self.vocabulary_ = sorted(self.tokenizer.word_counts, key=self.tokenizer.word_counts.get, reverse=True)
        if self.max_features:
            self.vocabulary_ = self.vocabulary_[:self.max_features]
        logging.info('SequenceVectorizer maxlen:{}, #words:{}, most common words:{}'.
                     format(self.maxlen, len(self.vocabulary_), self.vocabulary_[:10]))
        return X_seq

    def transform(self, X):
        logging.info('Transforming {} texts with SequenceVectorizer'.format(len(X)))
        X_seq = self.tokenizer.texts_to_sequences(X)
        X_seq = sequence.pad_sequences(X_seq, maxlen=self.maxlen, padding='post')
        return X_seq
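A toy round trip with the vectorizer (illustrative strings only; note that fit() always pads to length 5000):

# Illustrative only; toy inputs, not data from this commit.
vec = SequenceVectorizer(max_features=10)
train_seqs = vec.fit(["the cat sat", "the dog ran"])  # fits the tokenizer, pads to 5000
dev_seqs = vec.transform(["the cat ran"])             # reuses the fitted vocabulary and maxlen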


if __name__ == '__main__':
    data_loader = DataLoader(data_home='./data/', encoding='latin1')
    data_loader.load_data()
    data_loader.tosequence()
    dtype = 'float32'

    U_test = data_loader.df_test.index.tolist()
    U_dev = data_loader.df_dev.index.tolist()
    U_train = data_loader.df_train.index.tolist()
    X_train = data_loader.X_train.astype(dtype)
    X_dev = data_loader.X_dev.astype(dtype)
    X_test = data_loader.X_test.astype(dtype)

    Y_train = np.array([[a[0], a[1]] for a in data_loader.df_train[['lat', 'lon']].values.tolist()], dtype=dtype)
    Y_dev = np.array([[a[0], a[1]] for a in data_loader.df_dev[['lat', 'lon']].values.tolist()], dtype=dtype)
    Y_test = np.array([[a[0], a[1]] for a in data_loader.df_test[['lat', 'lon']].values.tolist()], dtype=dtype)

    data = (X_train, Y_train, X_dev, Y_dev, X_test, Y_test, U_train, U_dev, U_test)