# process_data.py

__author__ = "Jerome Thai, Nicolas Laurent-Brouty"
__email__ = "jerome.thai@berkeley.edu, nicolas.lb@berkeley.edu"

'''
This module processes the *.txt files from Bar-Gera 
that can be found here: http://www.bgu.ac.il/~bargera/tntp/
'''

import argparse
import csv
import numpy as np
from utils import digits, spaces, areInside
from pyproj import Proj, transform
import igraph


def process_net(input, output):
    '''
    Convert a Bar-Gera *_net.txt file into the *_net.csv file used by our
    Frank-Wolfe algorithm.

    Every line up to and including the first one whose leading token is '~'
    is treated as header and skipped. Each later non-blank line is split on
    whitespace, its last token is dropped, and the remaining fields are read
    positionally: [0] start node, [1] end node, [2] capacity,
    [4] free-flow time, [5] a coefficient (presumably the BPR 'B' column --
    confirm against the file header).

    The output has the columns LINK,A,B,a0,a1,a2,a3,a4 where LINK is a
    0-based counter, a0 is field [4], a1..a3 are 0 and
        a4 = field[4] * field[5] / (field[2] / 4000)**4

    input  -- path of the *_net.txt file to read
    output -- path of the *_net.csv file to write

    Raises IndexError if a data line has fewer than 7 tokens, ValueError if
    a field is not numeric and ZeroDivisionError if the capacity is 0.
    '''
    in_data = False
    link_id = 0
    out = ['LINK,A,B,a0,a1,a2,a3,a4\n']
    # Text mode: csv.reader requires str rows on Python 3 and this mode also
    # works on Python 2.
    with open(input, 'r') as f:
        for row in csv.reader(f):
            if not row:
                continue
            tokens = row[0].split()
            if not tokens:
                # whitespace-only line: nothing to parse
                continue
            if not in_data:
                # the '~' line is the column header; data starts after it
                if tokens[0] == '~':
                    in_data = True
                continue
            fields = tokens[:-1]  # drop the trailing ';' token
            a4 = float(fields[4]) * float(fields[5]) / (float(fields[2])/4000)**4
            out.append('{},{},{},{},0,0,0,{}\n'.format(
                link_id, fields[0], fields[1], fields[4], a4))
            link_id += 1
    with open(output, "w") as text_file:
        text_file.write(''.join(out))

def process_net_attack(input, output,thres,beta):
    '''
    Convert a Bar-Gera *_net.txt file into the *_net.csv file used by our
    Frank-Wolfe algorithm, scaling the capacity of low-capacity links.

    Parsing is the same as in process_net: lines up to and including the
    first one starting with '~' are skipped, then each non-blank line is
    split on whitespace and its last token dropped. Field [2] is the
    capacity; a link whose capacity is strictly below thres uses
    beta * capacity instead, links at or above thres are unchanged.

    The output has the columns LINK,A,B,a0,a1,a2,a3,a4 where LINK is a
    0-based counter, a0 is field [4], a1..a3 are 0 and
        a4 = field[4] * field[5] / (capacity / 4000)**4

    input  -- path of the *_net.txt file to read
    output -- path of the *_net.csv file to write
    thres  -- capacity threshold below which a link is modified
    beta   -- multiplicative factor applied to the capacity of those links

    Raises IndexError if a data line has fewer than 7 tokens, ValueError if
    a field is not numeric and ZeroDivisionError if the resulting capacity
    is 0.
    '''
    in_data = False
    link_id = 0
    out = ['LINK,A,B,a0,a1,a2,a3,a4\n']
    # Text mode: csv.reader requires str rows on Python 3 and this mode also
    # works on Python 2.
    with open(input, 'r') as f:
        for row in csv.reader(f):
            if not row:
                continue
            tokens = row[0].split()
            if not tokens:
                # whitespace-only line: nothing to parse
                continue
            if not in_data:
                # the '~' line is the column header; data starts after it
                if tokens[0] == '~':
                    in_data = True
                continue
            fields = tokens[:-1]  # drop the trailing ';' token
            capacity = float(fields[2])
            if capacity < thres:
                capacity = beta * capacity
            a4 = float(fields[4]) * float(fields[5]) / (capacity/4000)**4
            out.append('{},{},{},{},0,0,0,{}\n'.format(
                link_id, fields[0], fields[1], fields[4], a4))
            link_id += 1
    with open(output, "w") as text_file:
        text_file.write(''.join(out))


def process_trips(input, output):
    '''
    Convert a Bar-Gera *_trips file into the *_od.csv file used by our
    Frank-Wolfe algorithm.

    Everything before the first line starting with 'Origin' is ignored. An
    'Origin <id>' line sets the current origin. Every other non-blank line
    is split on whitespace and read as repeating triples
    '<destination> : <flow>;': the first token of each triple is the
    destination and the third is the flow, whose last character (the ';')
    is stripped.

    The output has the header 'O,D,Ton' followed by one
    'origin,destination,flow' line per triple.

    input  -- path of the *_trips file to read
    output -- path of the *_od.csv file to write
    '''
    origin = -1  # sentinel: no 'Origin' line seen yet
    out = ['O,D,Ton\n']
    # Text mode: csv.reader requires str rows on Python 3 and this mode also
    # works on Python 2.
    with open(input, 'r') as f:
        for row in csv.reader(f):
            if not row:
                continue
            tokens = row[0].split()
            if not tokens:
                continue
            if tokens[0] == 'Origin':
                origin = tokens[1]
            elif origin != -1:
                for i, e in enumerate(tokens):
                    if i%3 == 0:
                        # destination id
                        out.append('{},{},'.format(origin,e))
                    if i%3 == 2:
                        # flow value, written as '<value>;'
                        out.append('{}\n'.format(e[:-1]))
    with open(output, "w") as text_file:
        text_file.write(''.join(out))


def array_to_trips(demand, output):
    '''
    Write a demand array as a _trips.txt input file for Matthew Steele's
    solver.

    demand -- 2-D numpy array whose rows are read as
              [origin, destination, flow]
    output -- path of the text file to write

    The number of zones is the largest origin id found in column 0. Rows are
    consumed strictly in order while their origin equals the zone being
    written, so demand is expected to be sorted by origin (rows that are out
    of order are not written). Five entries are written per line.
    '''
    zones = int(np.max(demand[:,0]))
    chunks = [
        '<NUMBER OF ZONES> {}\n'.format(zones),
        '<TOTAL OD FLOW> {}\n'.format(np.sum(demand[:,2])),
        '<END OF METADATA>\n\n\n',
    ]
    cursor = 0  # index of the next unread row of demand
    for origin in range(1, zones + 1):
        chunks.append('Origin')
        chunks.append(spaces(10-digits(origin)))
        chunks.append('{}\n'.format(origin))

        on_line = 0  # entries written on the current text line
        while (cursor < demand.shape[0]) and (demand[cursor,0] == origin):
            dest = int(demand[cursor,1])
            chunks.append(spaces(5-digits(dest)))
            chunks.append('{} :'.format(dest))
            chunks.append(spaces(8-digits(demand[cursor,2])))
            chunks.append('{:.2f}; '.format(demand[cursor,2]))
            cursor += 1
            on_line += 1
            if on_line == 5:
                # wrap after five entries
                chunks.append('\n')
                on_line = 0
        chunks.append('\n')
    with open(output, "w") as text_file:
        text_file.write(''.join(chunks))


def process_results(input, output, network):
    '''
    Convert the terminal output generated by Steele's algorithm into a .csv
    file with one flow value per link of the network.

    input   -- comma-delimited file whose rows are read as
               [start node, end node, value]
    output  -- path of the file to write (one value per line, in the order
               of the links in network)
    network -- comma-delimited network file with one header line; columns 1
               and 2 of each row are the link's start and end node

    Links that do not appear in input get 0. If the same node pair appears
    several times in input, the last occurrence is used.
    '''
    # ndmin=2 keeps a single-row file 2-D instead of collapsing it to 1-D
    graph = np.loadtxt(network, delimiter=',', skiprows=1, ndmin=2)
    raw = np.loadtxt(input, delimiter=',', ndmin=2)

    # Index the results once by (start, end) instead of rescanning them for
    # every link; later rows overwrite earlier ones.
    value_by_pair = {}
    for j in range(raw.shape[0]):
        value_by_pair[(float(raw[j,0]), float(raw[j,1]))] = raw[j,2]

    out = np.zeros(graph.shape[0])
    for i in range(graph.shape[0]):
        key = (float(graph[i,1]), float(graph[i,2]))
        out[i] = value_by_pair.get(key, 0.)
    np.savetxt(output, out, delimiter=",")


def process_node(input, output, min_X=None, max_X=None, min_Y=None, max_Y=None):
    '''
    Process a node file to 'interpolate' from state coordinates to lat/long.

    The first step is to convert the four extreme coordinates by hand, e.g.
    with http://www.earthpoint.us/StatePlane.aspx, and pass the results in:

    input  -- tab-delimited node file with one header line; rows are read as
              [node id, X, Y]
    output -- path of the csv file to write, with header 'node,lat,lon'
    min_X  -- longitude corresponding to the smallest X in the file
    max_X  -- longitude corresponding to the largest X in the file
    min_Y  -- latitude corresponding to the smallest Y in the file
    max_Y  -- latitude corresponding to the largest Y in the file

    Every node is mapped linearly: X from [smallest X, largest X] onto
    [min_X, max_X] and Y from [smallest Y, largest Y] onto [min_Y, max_Y].
    A bound left as None falls back to the corresponding raw extreme of the
    file, so with no bounds at all the coordinates are written unchanged.
    '''
    out = ['node,lat,lon\n']
    # ndmin=2 keeps a single-node file 2-D
    nodes = np.loadtxt(input, delimiter='\t', skiprows=1, ndmin=2)
    num_nodes = nodes.shape[0]

    # extreme state coordinates found in the file
    x_lo, x_hi = np.min(nodes[:,1]), np.max(nodes[:,1])
    y_lo, y_hi = np.min(nodes[:,2]), np.max(nodes[:,2])

    if min_X is None: min_X = x_lo
    if max_X is None: max_X = x_hi
    if min_Y is None: min_Y = y_lo
    if max_Y is None: max_Y = y_hi

    x_span = x_hi - x_lo
    y_span = y_hi - y_lo
    # do simple interpolation
    for i in range(num_nodes):
        # relative position in [0, 1]; 0 when all nodes share the coordinate
        alpha = (nodes[i,1]-x_lo) / x_span if x_span != 0 else 0.
        beta = (nodes[i,2]-y_lo) / y_span if y_span != 0 else 0.
        lon = min_X + alpha * (max_X - min_X)
        lat = min_Y + beta * (max_Y - min_Y)
        out.append('{},{},{}\n'.format(nodes[i,0],lat,lon))
    with open(output, "w") as text_file:
        text_file.write(''.join(out))

#Function to create GPS coordinate from projection coordinates
def process_node_to_GPS_Coord(input, output, min_X=None, max_X=None, min_Y=None, max_Y=None):
    '''
    Re-project the coordinates of a node file to GPS (lat/lon) coordinates.

    input  -- tab-delimited node file with one header line; rows are read as
              [node id, X, Y]
    output -- path of the csv file to write, with header 'node,lat,lon'

    min_X, max_X, min_Y and max_Y are accepted but not used.

    The source projection 'epsg:3435' is specific to Chicago, Illinois and
    must be changed when processing another region.
    '''
    rows = ['node,lat,lon\n']
    nodes = np.loadtxt(input, delimiter='\t', skiprows=1)
    num_nodes = nodes.shape[0]
    source_proj = Proj(init='epsg:3435')
    target_proj = Proj(init='epsg:4326')

    # 0.3048 is the feet-to-metres factor (an earlier comment said miles);
    # presumably the node file is in feet -- confirm against the data source.
    scale = 0.3048
    for k in range(num_nodes):
        x = nodes[k,1]*scale
        y = nodes[k,2]*scale
        lon,lat = transform(source_proj,target_proj,x, y)
        rows.append('{},{},{}\n'.format(nodes[k,0],lat,lon))
    with open(output, "w") as text_file:
        text_file.write(''.join(rows))


def process_links(net, node, features, in_order=False):
    '''
    Join data from net, node, and features arrays into links file
    returns out, a numpy array with columns
    [lat1, lon1, lat2, lon2, capacity, length, FreeFlowTime]

    net      -- 2-D array, one row per link; columns 1 and 2 are the ids of
                the link's start and end node
    node     -- 2-D array, one row per node, read as [id, lat, lon]
    features -- 2-D array with one row per link; its columns are copied
                after the four coordinates
    in_order -- if true, node id k is assumed to sit in row k-1 of node and
                is read by position; otherwise nodes are looked up by id
                (if an id is repeated, its last row is used)

    Raises ValueError when in_order is false and a link references a node id
    that is not present in node.
    '''
    links = net.shape[0]
    num_fts = features.shape[1]
    out = np.zeros((links, 4+num_fts))

    coords = {}
    if not in_order:
        # index the nodes once by id instead of scanning them for every link
        for j in range(node.shape[0]):
            coords[node[j,0]] = (node[j,1], node[j,2])

    for i in range(links):
        a, b = net[i,1], net[i,2]
        if in_order:
            lat1, lon1 = node[int(a)-1, 1], node[int(a)-1, 2]
            lat2, lon2 = node[int(b)-1, 1], node[int(b)-1, 2]
        else:
            if a not in coords or b not in coords:
                # fail loudly rather than reuse the previous link's position
                raise ValueError(
                    'link {} references unknown node id(s): {}, {}'.format(i, a, b))
            lat1, lon1 = coords[a]
            lat2, lon2 = coords[b]
        out[i,:4] = [lat1, lon1, lat2, lon2]
        out[i,4:] = features[i,:]
    return out


def join_node_demand(node, demand):
    '''
    Join data from node and demand and return out, a numpy array with columns
    [lat1, lon1, lat2, lon2, demand]

    node   -- 2-D array read as [id, lat, lon]; node id k is read from row
              k-1, so rows must be ordered by id
    demand -- 2-D array whose rows are read as [origin, destination, demand]
    '''
    num_pairs = demand.shape[0]
    joined = np.zeros((num_pairs, 5))
    for k in range(num_pairs):
        origin, dest = demand[k,0], demand[k,1]
        # node ids are 1-based, rows are 0-based
        start = node[int(origin)-1, 1], node[int(origin)-1, 2]
        end = node[int(dest)-1, 1], node[int(dest)-1, 2]
        joined[k,:4] = [start[0], start[1], end[0], end[1]]
        joined[k,4] = demand[k,2]
    return joined


def extract_features(input):
    '''
    Read the link features of a Bar-Gera *_net.txt file.

    Lines up to and including the first one whose leading token is '~' are
    skipped. From every later non-blank line, the whitespace-separated
    tokens [2:5] are converted to float.

    Returns a numpy array in the format [[capacity, length, FreeFlowTime]],
    one row per link (an empty array if the file has no data lines).
    '''
    in_data = False
    out = []
    # Text mode: csv.reader requires str rows on Python 3 and this mode also
    # works on Python 2.
    with open(input, 'r') as f:
        for row in csv.reader(f):
            if not row:
                continue
            tokens = row[0].split()
            if not tokens:
                # whitespace-only line: nothing to parse
                continue
            if not in_data:
                # the '~' line is the column header; data starts after it
                if tokens[0] == '~':
                    in_data = True
                continue
            out.append([float(e) for e in tokens[2:5]])
    return np.array(out)


# Opening of the generated JavaScript file: declares the geojson_features
# array and opens its first feature object.
begin = 'var geojson_features = [{\n'


def begin_feature(type):
    '''
    Return the opening text of a geojson feature of the given geometry type,
    up to and including the '[' that opens its coordinates.

    For 'Point' the coordinates stay on the same line; for any other type a
    newline follows the bracket.
    '''
    header = '    "type": "Feature",\n    "geometry": {\n'
    coordinates = ('        "coordinates": [' if type == 'Point'
                   else '        "coordinates": [\n')
    geometry_type = '        "type": "{}",\n'.format(type)
    return header + geometry_type + coordinates

def coord(lat,lon,type):
    '''
    Format one coordinate pair, longitude first, for the given geometry type.

    Returns an indented '[lon, lat],' line for "LineString", a bare
    'lon, lat' for "Point" and None for any other type.
    '''
    if type == "LineString":
        template = '            [{}, {}],\n'
        return template.format(lon,lat)
    if type == "Point":
        template = '{}, {}'
        return template.format(lon,lat)
    return None

# Closes the coordinates list and the geometry object of a feature, then
# opens its "properties" object.
begin_prop = '            ]},\n    "properties": {\n'

def prop(name, value):
    '''Return one '"name": "value",' property line, with the value quoted.'''
    template = '        "{}": "{}",\n'
    return template.format(name, value)

def prop_numeric(name, value):
    '''Return one '"name": value,' property line, with the value unquoted.'''
    template = '        "{}": {},\n'
    return template.format(name, value)


def geojson_link(links, features, color, weight=None):
    """
    Generate 'visualization/geojson_features.js' from an array of link
    coordinates and features.

    links    -- numpy array where each row has
                [lat1, lon1, lat2, lon2, features]
    features -- names of the feature columns, in the order they appear in
                links from column 4 on
    color    -- array with one value per link, written as the "color"
                property. Legend as noted by the original authors (it is
                applied by the visualization, not here):
                if      color < 1: blue
                if 1 <= color < 2: yellow
                if 2 <= color < 3: orange
                if 3 <= color < 4: orange-red
                if 5 <= color    : red
    weight   -- array with one value per link, written as the "weight"
                property; defaults to 2 for every link

    The file also defines lat_center_map / lon_center_map as the mean of
    columns 0 and 1 of links.
    """
    if weight is None:
        # uniform weight
        weight = 2. * np.ones((color.shape[0],))
    geometry = 'LineString'
    pieces = [begin]
    for k in range(links.shape[0]):
        pieces.append(begin_feature(geometry))
        # the two end points of the link
        pieces.append(coord(links[k,0], links[k,1], geometry))
        pieces.append(coord(links[k,2], links[k,3], geometry))
        pieces.append(begin_prop)
        for col, label in enumerate(features):
            pieces.append(prop(label, links[k,col+4]))
        pieces.append(prop('color', color[k]))
        pieces.append(prop('weight', weight[k]))
        # close this feature and open the next one
        pieces.append('    }},{\n')
    # the last separator becomes the end of the array
    pieces[-1] = '    }}];\n\n'
    lat_center = np.mean(links[:,0])
    pieces.append('var lat_center_map = {}\n'.format(lat_center))
    lon_center = np.mean(links[:,1])
    pieces.append('var lon_center_map = {}\n'.format(lon_center))
    with open('visualization/geojson_features.js', 'w') as f:
        f.write(''.join(pieces))


def geojson_link_Scenario_Study(ratio, links, features, color, name, mode, weight=None):
    """
    Generate a geojson .js file for one scenario from an array of link
    coordinates and features. The file is written to
    'visualization/geojson_features_<name>_ratio_<ratio>_<mode>.js' and its
    path is printed.

    ratio    -- value inserted (via str) into the file name
    links    -- numpy array where each row has
                [lat1, lon1, lat2, lon2, features]
    features -- names of the feature columns, in the order they appear in
                links from column 4 on
    color    -- array with one value per link, written as the "color"
                property. Legend as noted by the original authors (it is
                applied by the visualization, not here):
                if      color < 1: blue
                if 1 <= color < 2: yellow
                if 2 <= color < 3: orange
                if 3 <= color < 4: orange-red
                if 5 <= color    : red
    name     -- string inserted into the file name
    mode     -- string inserted into the file name
    weight   -- array with one value per link, written as the "weight"
                property; defaults to 2 for every link

    The file also defines lat_center_map / lon_center_map as the mean of
    columns 0 and 1 of links.
    """
    if weight is None:
        # uniform weight
        weight = 2. * np.ones((color.shape[0],))
    shape = 'LineString'
    num_links = links.shape[0]
    chunks = [begin]
    for idx in range(num_links):
        chunks.append(begin_feature(shape))
        # the two end points of the link
        chunks.append(coord(links[idx,0], links[idx,1], shape))
        chunks.append(coord(links[idx,2], links[idx,3], shape))
        chunks.append(begin_prop)
        for offset, feature_name in enumerate(features):
            chunks.append(prop(feature_name, links[idx,offset+4]))
        chunks.append(prop('color', color[idx]))
        chunks.append(prop('weight', weight[idx]))
        # close this feature and open the next one
        chunks.append('    }},{\n')
    # the last separator becomes the end of the array
    chunks[-1] = '    }}];\n\n'
    chunks.append('var lat_center_map = {}\n'.format(np.mean(links[:,0])))
    chunks.append('var lon_center_map = {}\n'.format(np.mean(links[:,1])))
    fileName = 'visualization/geojson_features_'+ name + '_ratio_' + str(ratio) + '_' + mode + '.js'
    print(fileName)
    js_text = ''.join(chunks)
    with open(fileName, 'w') as f:
        f.write(js_text)


def geojson_link_Scenario_Social_Optimum(ratio, links, features, color, weight=None):
    """
    Generate a geojson .js file for a social-optimum scenario from an array
    of link coordinates and features. The file is written to
    'visualization/geojson_features_ratio_SO_<ratio>.js' and its path is
    printed.

    ratio    -- value inserted (via str) into the file name
    links    -- numpy array where each row has
                [lat1, lon1, lat2, lon2, features]
    features -- names of the feature columns, in the order they appear in
                links from column 4 on
    color    -- array with one value per link, written as the "color"
                property. Legend as noted by the original authors (it is
                applied by the visualization, not here):
                if      color < 1: blue
                if 1 <= color < 2: yellow
                if 2 <= color < 3: orange
                if 3 <= color < 4: orange-red
                if 5 <= color    : red
    weight   -- array with one value per link, written as the "weight"
                property; defaults to 2 for every link

    The file also defines lat_center_map / lon_center_map as the mean of
    columns 0 and 1 of links.
    """
    if weight is None:
        # uniform weight
        weight = 2. * np.ones((color.shape[0],))
    kind = 'LineString'
    parts = [begin]
    row = 0
    while row < links.shape[0]:
        parts.append(begin_feature(kind))
        # the two end points of the link
        parts.append(coord(links[row,0], links[row,1], kind))
        parts.append(coord(links[row,2], links[row,3], kind))
        parts.append(begin_prop)
        for pos, feature in enumerate(features):
            parts.append(prop(feature, links[row,pos+4]))
        parts.append(prop('color', color[row]))
        parts.append(prop('weight', weight[row]))
        # close this feature and open the next one
        parts.append('    }},{\n')
        row += 1
    # the last separator becomes the end of the array
    parts[-1] = '    }}];\n\n'
    parts.append('var lat_center_map = {}\n'.format(np.mean(links[:,0])))
    parts.append('var lon_center_map = {}\n'.format(np.mean(links[:,1])))
    fileName = 'visualization/geojson_features_ratio_SO_' + str(ratio) + '.js'
    print(fileName)
    with open(fileName, 'w') as f:
        f.write(''.join(parts))

def output_file(net_name, node_name, fs, output_name):
    '''
    Write a csv file joining the network, the node coordinates and the
    flows fs.

    net_name    -- whitespace-delimited network file; its first 7 lines are
                   skipped and the columns are read as
                   [origin, destination, capacity, length, fftt, ...]
    node_name   -- comma-delimited node file with one header line; rows are
                   read as [id, lat, lon] and node id k is read from row k-1
    fs          -- values for the last two output columns (f_nr, f_r); must
                   be assignable to an array of shape (number of links, 2)
    output_name -- path of the csv file to write

    Output columns: o_index,o_long,o_lat,d_index,d_long,d_lat,capacity,
    length(mi),fftt(min),f_nr,f_r
    '''
    # atleast_2d keeps single-row files 2-D
    network = np.atleast_2d(np.genfromtxt(net_name,skip_header=7))
    nodes = np.atleast_2d(np.genfromtxt(node_name, delimiter=',', skip_header=1))

    # Node ids are parsed as floats; they must be cast to int before being
    # used as (1-based) row indices into nodes.
    origin_rows = network[:,0].astype(int) - 1
    dest_rows = network[:,1].astype(int) - 1

    # numpy array containing information of both the node and the net file
    featuredNetwork = np.zeros((network.shape[0],11))
    featuredNetwork[:,0] = network[:,0] # index of origin vertex
    featuredNetwork[:,1] = nodes[origin_rows,2] # longitude of origin
    featuredNetwork[:,2] = nodes[origin_rows,1] # latitude of origin
    featuredNetwork[:,3] = network[:,1] # index of destination vertex
    featuredNetwork[:,4] = nodes[dest_rows,2] # longitude of destination
    featuredNetwork[:,5] = nodes[dest_rows,1] # latitude of destination
    featuredNetwork[:,6] = network[:,2] # capacity
    featuredNetwork[:,7] = network[:,3] # length
    featuredNetwork[:,8] = network[:,4] # fftt
    featuredNetwork[:,9:] = fs
    np.savetxt(output_name, featuredNetwork, delimiter=',', \
        header='o_index,o_long,o_lat,d_index,d_long,d_lat,capacity,length(mi),fftt(min),f_nr,f_r')


def construct_igraph(graph):
    '''
    Build a directed igraph.Graph from a network array.

    graph -- 2-D numpy array; columns 1 and 2 hold the two end-point ids of
             each edge and column 3 the value stored as the edge "weight"
             (free-flow travel time)
    '''
    endpoints = graph[:,1:3]
    # 'vertices' contains the range of the vertices' indices in the graph
    lowest = int(np.min(endpoints))
    highest = int(np.max(endpoints))
    vertices = range(lowest, highest+1)
    # 'edges' is a list of the [id, id] pairs of the edges in the graph
    edges = endpoints.astype(int).tolist()
    g = igraph.Graph(vertex_attrs={"label":vertices}, edges=edges, directed=True)
    # fill with free-flow travel times
    g.es["weight"] = graph[:,3].tolist()
    return g


def process_demand(od_file):
    '''
    Read a Bar-Gera *_trips file into a dictionary of the form
    origin: ([destinations], [demands])

    Everything before the first line starting with 'Origin' is ignored. An
    'Origin <id>' line starts a new entry (replacing any previous entry for
    the same id). Every other non-blank line is split on whitespace and read
    as repeating triples '<destination> : <flow>;': the first token of each
    triple is the destination (int) and the third is the flow (float, after
    stripping its last character, the ';').
    '''
    origin = -1  # sentinel: no 'Origin' line seen yet
    out = {}
    # Text mode: csv.reader requires str rows on Python 3 and this mode also
    # works on Python 2.
    with open(od_file, 'r') as f:
        for row in csv.reader(f):
            if not row:
                continue
            tokens = row[0].split()
            if not tokens:
                # whitespace-only line: nothing to parse
                continue
            if tokens[0] == 'Origin':
                origin = int(tokens[1])
                out[origin] = ([],[])
            elif origin != -1:
                for i, e in enumerate(tokens):
                    if i%3 == 0:
                        out[origin][0].append(int(e))
                    if i%3 == 2:
                        out[origin][1].append(float(e[:-1]))
    return out

 
def construct_od(demand):
    '''
    Construct a dictionary of the form
    origin: ([destination],[demand])
    from a 2-D array whose rows are read as [origin, destination, demand].
    Origins and destinations are converted to int; demands are stored as
    found in the array.
    '''
    od = {}
    for k in range(demand.shape[0]):
        # create the ([], []) pair the first time an origin is met
        destinations, flows = od.setdefault(int(demand[k,0]), ([],[]))
        destinations.append(int(demand[k,1]))
        flows.append(demand[k,2])
    return od


def cities_to_js(file, by_county, color, weight):
    '''
    Keep only the cities of one county and write them as a collection of
    geojson objects to 'visualization/geojson_features.js'.

    file      -- path of the cities file; it is read with csv.reader, i.e.
                 every line is split on commas
    by_county -- county name; a line is kept when it has at least 8 fields
                 and field 7, minus its first 12 and its last character,
                 equals this name
    color     -- value written as the "color" property of every kept city
    weight    -- value written as the "weight" property of every kept city

    For each kept line, the first 25 characters of field 1 are replaced by
    ' "properties": { "city": ' and field 7 by '"county": "<by_county>"'.
    '''
    out = ['var geojson_features = [']
    # Text mode: csv.reader requires str rows on Python 3 and this mode also
    # works on Python 2.
    with open(file, 'r') as f:
        for row in csv.reader(f):
            if len(row) >= 8 and row[7][12:-1] == by_county:
                row[1] = ' "properties": { "city": ' + row[1][25:]
                # label the feature with the county actually selected
                row[7] = '"county": "{}"'.format(by_county)
                row.insert(2, '"weight": "{}"'.format(weight))
                row.insert(2, '"color": "{}"'.format(color))
                out.append(','.join(row))
    out.append('];')
    # NOTE(review): the map center below is fixed and does not depend on
    # by_county -- presumably it is the Los Angeles area; confirm before
    # using another county.
    out.append('\nvar lat_center_map = 34.0374876369')
    out.append('var lon_center_map = -118.130124211')
    with open('visualization/geojson_features.js', 'w') as f:
        f.write('\n'.join(out))


def map_nodes_to_one_city(city, city_file, node):
    '''
    Test which nodes lie inside the polygon of one city.

    city      -- city name; the first line of city_file that has at least 2
                 fields and whose field 1, minus its first 26 and its last
                 character, equals this name is used
    city_file -- path of the cities file; it is read with csv.reader, i.e.
                 every line is split on commas
    node      -- 2-D array whose columns 1 and 2 are read as lat and lon

    The polygon is read from fields 13 onwards of the matching line: the
    non-empty fields at even positions start a new vertex (last
    space-separated token) and those at odd positions complete it (second
    space-separated token).

    Returns the result of areInside(polygon, len(polygon), points), where
    points is the list of [lon, lat] of every node.
    '''
    polygon = []
    # Text mode: csv.reader requires str rows on Python 3 and this mode also
    # works on Python 2.
    with open(city_file, 'r') as f:
        for row in csv.reader(f):
            if len(row) >= 2 and row[1][26:-1] == city:
                for j, e in enumerate(row[13:]):
                    if len(e) == 0:
                        continue
                    if j%2 == 1:
                        # second coordinate of the current vertex
                        polygon[-1].append(float(e.split(' ')[1]))
                    else:
                        # first coordinate: start a new vertex
                        polygon.append([float(e.split(' ')[-1])])
                break
    # swap the (lat, lon) columns of node into [lon, lat] points
    ps = [[node[i,2], node[i,1]] for i in range(node.shape[0])]
    return areInside(polygon, len(polygon), ps)


def map_nodes_to_cities(cities, city_file, node_file, output_file):
    '''
    Save into a file the mapping from node id to the city the node belongs to.

    cities      -- iterable of city names to look for
    city_file   -- path of the cities file, passed to map_nodes_to_one_city
    node_file   -- comma-delimited node file loaded with np.loadtxt without
                   skipping any line (so it must not have a header);
                   column 0 is the node id
    output_file -- path of the csv file to write, with header 'id,city'

    Nodes found in no city are labelled 'other'. When a node is found in
    several cities, the last one in cities wins.
    '''
    node = np.loadtxt(node_file, delimiter=',')
    out = ['other'] * node.shape[0]
    for city in cities:
        # print() with a single argument behaves the same on Python 2 and 3
        print('process {}'.format(city))
        tmp = np.array(map_nodes_to_one_city(city, city_file, node)).nonzero()[0]
        print('found {} nodes'.format(len(tmp)))
        for i in tmp:
            out[i] = city
    out = np.reshape(np.array(out), (node.shape[0],1))
    ids = np.reshape(node[:,0], (node.shape[0],1))
    # joining the float ids with the city names yields an array of strings
    out2 = np.concatenate((ids,out), axis=1)
    np.savetxt(output_file, out2, delimiter=',', header='id,city', \
        comments='', fmt="%s")


def map_links_to_cities(nodeToCity_file, net_file, output_file):
    '''
    Save into a file the mapping from link id to the city it belongs to.
    A link is assumed to be in a city if both of its nodes are inside it;
    any other link is labelled 'other'.

    nodeToCity_file -- comma-delimited file with one header line; rows are
                       read as [node id, city] and node id k is read from
                       row k-1
    net_file        -- comma-delimited network file with one header line;
                       columns 0, 1 and 2 are the link id and the ids of
                       its two end nodes
    output_file     -- path of the csv file to write, with header 'id,city'
    '''
    # genfromtxt takes skip_header (its old skiprows keyword was removed
    # from numpy); atleast_2d / ndmin keep single-row files 2-D
    nodeToCity = np.atleast_2d(np.genfromtxt(nodeToCity_file, delimiter=',', \
        skip_header=1, dtype='str'))
    graph = np.loadtxt(net_file, delimiter=',', skiprows=1, ndmin=2)
    out = ['other'] * graph.shape[0]
    for i in range(graph.shape[0]):
        fr, to = int(graph[i,1]), int(graph[i,2])
        city = nodeToCity[fr-1,1]
        if (city != 'other') and (city == nodeToCity[to-1,1]):
            out[i] = city
    out = np.reshape(np.array(out), (graph.shape[0],1))
    # make the float -> string conversion of the ids explicit
    ids = np.reshape(graph[:,0], (graph.shape[0],1)).astype(str)
    out2 = np.concatenate((ids,out), axis=1)
    np.savetxt(output_file, out2, delimiter=',', header='id,city', \
        comments='', fmt="%s")

def main():
    '''
    Command-line entry point: convert the Bar-Gera files of one network.

    Takes the network name as its only argument and converts
    data/<name>_net.txt, data/<name>_node.txt and data/<name>_trips.txt to
    data/<name>_net.csv, data/<name>_node.csv and data/<name>_od.csv.
    '''
    #Input the name of the network we need to process
    parser = argparse.ArgumentParser(description='Process network data')
    parser.add_argument("name", type = str, help = "name of network")
    args = parser.parse_args()

    #Location of files
    netOriginFile = 'data/'+ args.name + '_net.txt'
    netDestinationFile = 'data/'+ args.name + '_net.csv'
    nodeOriginFile = 'data/'+ args.name + '_node.txt'
    nodeDestinationFile = 'data/'+ args.name + '_node.csv'
    odOriginFile = 'data/'+ args.name + '_trips.txt'
    odDestinationFile = 'data/'+ args.name + '_od.csv'

    #process network data to get csv files
    process_net(netOriginFile, netDestinationFile)
    #Process nodes
    process_node_to_GPS_Coord(nodeOriginFile, nodeDestinationFile)
    #Process the trips to get od (origin-destination) pairs
    process_trips(odOriginFile, odDestinationFile)

# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()