feat: support spaces in column names for CSV files (#1388)

fikremen · americast · xzdandy · web-flow · commit e5a91909ab17 · 2023-12-03T01:09:14.000-08:00
Takes a `REVERSE_QUOTE_ID` token, strips the backticks, and converts it to a
`simple_id`.

---------

Co-authored-by: americast <sayan.sinha@cc.gatech.edu>
Co-authored-by: Andy Xu <xzdandy@gmail.com>
diff --git a/docs/source/overview/faq.rst b/docs/source/overview/faq.rst
@@ -34,3 +34,16 @@ If a query runs a complex AI task (e.g., sentiment analysis) on a large table, t
     top
     pgrep evadb_server
 
+Can column names contain spaces?
+--------------------------------
+
+To use a column name that contains spaces, enclose the name in backticks (reverse quotes). Below are example ``CREATE TABLE`` and ``SELECT`` queries:
+
+.. code-block:: sql
+
+   CREATE TABLE IF NOT EXISTS MyVideoCSV (
+        id INTEGER UNIQUE,
+        `frame id` INTEGER
+   );
+
+   SELECT id, `frame id` FROM MyVideoCSV;
diff --git a/evadb/parser/lark_visitor/_common_clauses_ids.py b/evadb/parser/lark_visitor/_common_clauses_ids.py
@@ -43,6 +43,13 @@ def full_id(self, tree):
             return (self.visit(tree.children[0]), self.visit(tree.children[1]))
 
     def uid(self, tree):
+        if (
+            hasattr(tree.children[0], "type")
+            and tree.children[0].type == "REVERSE_QUOTE_ID"
+        ):
+            tree.children[0].type = "simple_id"
+            non_tick_string = str(tree.children[0]).replace("`", "")
+            return non_tick_string
         return self.visit(tree.children[0])
 
     def full_column_name(self, tree):
diff --git a/test/integration_tests/short/test_load_executor.py b/test/integration_tests/short/test_load_executor.py
@@ -17,6 +17,7 @@
 import unittest
 from pathlib import Path
 from test.util import (
+    create_csv_with_comlumn_name_spaces,
     create_dummy_csv_batches,
     create_sample_csv,
     create_sample_video,
@@ -117,6 +118,49 @@ def test_should_load_csv_in_table(self):
         drop_query = "DROP TABLE IF EXISTS MyVideoCSV;"
         execute_query_fetch_all(self.evadb, drop_query)
 
+    ###################################
+    # integration tests for csv files with spaces in column names
+    def test_should_load_csv_in_table_with_spaces_in_column_name(self):
+        # loading a csv requires a table to be created first
+        create_table_query = """
+
+            CREATE TABLE IF NOT EXISTS MyVideoCSV (
+                id INTEGER UNIQUE,
+                `frame id` INTEGER,
+                `video id` INTEGER,
+                `dataset name` TEXT(30),
+                label TEXT(30),
+                bbox NDARRAY FLOAT32(4),
+                `object id` INTEGER
+            );
+
+            """
+        execute_query_fetch_all(self.evadb, create_table_query)
+
+        # load the CSV
+        load_query = (
+            f"LOAD CSV '{create_csv_with_comlumn_name_spaces()}' INTO MyVideoCSV;"
+        )
+        execute_query_fetch_all(self.evadb, load_query)
+
+        # execute a select query
+        select_query = """SELECT id, `frame id`, `video id`,
+                          `dataset name`, label, bbox,
+                          `object id`
+                          FROM MyVideoCSV;"""
+
+        actual_batch = execute_query_fetch_all(self.evadb, select_query)
+        actual_batch.sort()
+
+        # assert the batches are equal
+        expected_batch = next(create_dummy_csv_batches())
+        expected_batch.modify_column_alias("myvideocsv")
+        self.assertEqual(actual_batch, expected_batch)
+
+        # clean up
+        drop_query = "DROP TABLE IF EXISTS MyVideoCSV;"
+        execute_query_fetch_all(self.evadb, drop_query)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/util.py b/test/util.py
@@ -319,6 +319,37 @@ def create_sample_csv(num_frames=NUM_FRAMES):
     return os.path.join(get_tmp_dir(), "dummy.csv")
 
 
+def create_csv_with_comlumn_name_spaces(num_frames=NUM_FRAMES):
+    try:
+        os.remove(os.path.join(get_tmp_dir(), "dummy.csv"))
+    except FileNotFoundError:
+        pass
+
+    sample_meta = {}
+
+    index = 0
+    sample_labels = ["car", "pedestrian", "bicycle"]
+    num_videos = 2
+    for video_id in range(num_videos):
+        for frame_id in range(num_frames):
+            random_coords = 200 + 300 * np.random.random(4)
+            sample_meta[index] = {
+                "id": index,
+                "frame id": frame_id,
+                "video id": video_id,
+                "dataset name": "test_dataset",
+                "label": sample_labels[np.random.choice(len(sample_labels))],
+                "bbox": ",".join([str(coord) for coord in random_coords]),
+                "object id": np.random.choice(3),
+            }
+
+            index += 1
+
+    df_sample_meta = pd.DataFrame.from_dict(sample_meta, "index")
+    df_sample_meta.to_csv(os.path.join(get_tmp_dir(), "dummy.csv"), index=False)
+    return os.path.join(get_tmp_dir(), "dummy.csv")
+
+
 def create_dummy_csv_batches(target_columns=None):
     if target_columns:
         df = pd.read_csv(