Skip to content

Commit e5a9190

Browse files
fikremenamericastxzdandy
authored
feat: supporting spaces in column names for csv files (#1388)
takes `reverse quote id`, removes back ticks, and converts it to `simple id`. --------- Co-authored-by: americast <[email protected]> Co-authored-by: Andy Xu <[email protected]>
1 parent c2457b2 commit e5a9190

File tree

4 files changed

+95
-0
lines changed

4 files changed

+95
-0
lines changed

docs/source/overview/faq.rst

+13
Original file line numberDiff line numberDiff line change
@@ -34,3 +34,16 @@ If a query runs a complex AI task (e.g., sentiment analysis) on a large table, t
3434
top
3535
pgrep evadb_server
3636
37+
Can column names have spaces?
38+
-----------------------------
39+
40+
Column names that contain spaces are supported: enclose the column name in backticks (reverse quotes). Below are example `CREATE TABLE` and `SELECT` queries:
41+
42+
.. code-block:: sql
43+
44+
CREATE TABLE IF NOT EXISTS MyVideoCSV (
45+
id INTEGER UNIQUE,
46+
`frame id` INTEGER
47+
);
48+
49+
SELECT id, `frame id` FROM MyVideoCSV;

evadb/parser/lark_visitor/_common_clauses_ids.py

+7
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,13 @@ def full_id(self, tree):
4343
return (self.visit(tree.children[0]), self.visit(tree.children[1]))
4444

4545
def uid(self, tree):
    """Resolve a ``uid`` node to its identifier.

    When the first child is a ``REVERSE_QUOTE_ID`` token (an identifier
    wrapped in back ticks, e.g. a column name containing spaces), the
    token is relabelled as ``simple_id`` and its text is returned with
    every back tick removed. Any other child is handed to ``self.visit``
    and that result is returned unchanged.
    """
    first_child = tree.children[0]

    # Children without a ``type`` attribute, or with a different type,
    # take the regular visit path.
    if getattr(first_child, "type", None) != "REVERSE_QUOTE_ID":
        return self.visit(first_child)

    # Relabel the token in place, then drop the quoting back ticks.
    first_child.type = "simple_id"
    return str(first_child).replace("`", "")
4754

4855
def full_column_name(self, tree):

test/integration_tests/short/test_load_executor.py

+44
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import unittest
1818
from pathlib import Path
1919
from test.util import (
20+
create_csv_with_comlumn_name_spaces,
2021
create_dummy_csv_batches,
2122
create_sample_csv,
2223
create_sample_video,
@@ -117,6 +118,49 @@ def test_should_load_csv_in_table(self):
117118
drop_query = "DROP TABLE IF EXISTS MyVideoCSV;"
118119
execute_query_fetch_all(self.evadb, drop_query)
119120

121+
###################################
122+
# integration tests for csv files with spaces in column names
123+
def test_should_load_csv_in_table_with_spaces_in_column_name(self):
    """Load a CSV whose column names contain spaces and read it back.

    Creates ``MyVideoCSV`` with back-tick quoted column names, loads the
    generated CSV into it, selects every column, and compares the sorted
    result with the first batch from ``create_dummy_csv_batches``.
    """
    # the destination table has to exist before LOAD CSV can fill it
    execute_query_fetch_all(
        self.evadb,
        """

        CREATE TABLE IF NOT EXISTS MyVideoCSV (
            id INTEGER UNIQUE,
            `frame id` INTEGER,
            `video id` INTEGER,
            `dataset name` TEXT(30),
            label TEXT(30),
            bbox NDARRAY FLOAT32(4),
            `object id` INTEGER
        );

        """,
    )

    # generate the CSV file and load it into the table
    csv_load_query = (
        f"LOAD CSV '{create_csv_with_comlumn_name_spaces()}' INTO MyVideoCSV;"
    )
    execute_query_fetch_all(self.evadb, csv_load_query)

    # read every column back, quoting the names that contain spaces
    fetched_batch = execute_query_fetch_all(
        self.evadb,
        """SELECT id, `frame id`, `video id`,
                  `dataset name`, label, bbox,
                  `object id`
                  FROM MyVideoCSV;""",
    )
    fetched_batch.sort()

    # the loaded rows must match the reference batch under the table alias
    reference_batch = next(create_dummy_csv_batches())
    reference_batch.modify_column_alias("myvideocsv")
    self.assertEqual(fetched_batch, reference_batch)

    # clean up
    execute_query_fetch_all(self.evadb, "DROP TABLE IF EXISTS MyVideoCSV;")
163+
120164

121165
if __name__ == "__main__":
122166
unittest.main()

test/util.py

+31
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,37 @@ def create_sample_csv(num_frames=NUM_FRAMES):
319319
return os.path.join(get_tmp_dir(), "dummy.csv")
320320

321321

322+
def create_csv_with_comlumn_name_spaces(num_frames=NUM_FRAMES):
    """Write ``dummy.csv`` with space-containing column names; return its path.

    The file is placed in the directory returned by ``get_tmp_dir()`` and
    holds one row per (video, frame) pair for two videos of ``num_frames``
    frames each. Columns are ``id``, ``frame id``, ``video id``,
    ``dataset name``, ``label``, ``bbox`` and ``object id``; labels,
    bounding-box coordinates and object ids are drawn from ``np.random``.
    """
    csv_path = os.path.join(get_tmp_dir(), "dummy.csv")

    # discard any file left by an earlier run; a missing file is fine
    try:
        os.remove(csv_path)
    except FileNotFoundError:
        pass

    labels = ["car", "pedestrian", "bicycle"]
    video_count = 2
    frame_keys = (
        (video_id, frame_id)
        for video_id in range(video_count)
        for frame_id in range(num_frames)
    )

    rows = {}
    for row_id, (video_id, frame_id) in enumerate(frame_keys):
        # four coordinates drawn uniformly from [200, 500)
        coords = 200 + 300 * np.random.random(4)
        rows[row_id] = {
            "id": row_id,
            "frame id": frame_id,
            "video id": video_id,
            "dataset name": "test_dataset",
            "label": labels[np.random.choice(len(labels))],
            "bbox": ",".join([str(value) for value in coords]),
            "object id": np.random.choice(3),
        }

    # one CSV line per row id, without writing the index column
    pd.DataFrame.from_dict(rows, "index").to_csv(csv_path, index=False)
    return csv_path
351+
352+
322353
def create_dummy_csv_batches(target_columns=None):
323354
if target_columns:
324355
df = pd.read_csv(

0 commit comments

Comments
 (0)