Skip to content

Commit c9c197b

Browse files
Added rowFinder tool for locating specific rows in preConversion files
1 parent 92b31a5 commit c9c197b

File tree

1 file changed

+40
-0
lines changed

1 file changed

+40
-0
lines changed

src/tools/rowFinder.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
from lib.data.argParser import ArgParser
2+
from lib.processing.files import Step
3+
import pandas as pd
4+
5+
def findRows(chunks, column, values):
    """Collect the rows of *chunks* whose *column* holds one of *values*.

    Args:
        chunks: Iterable of pandas DataFrames, each containing *column*.
        column: Name of the column to test.
        values: Values to look for; membership is tested with ``Series.isin``.

    Returns:
        A list with one filtered DataFrame per chunk that had at least one
        matching row, in chunk order. The list is empty when nothing matched.
    """
    foundRows = []
    for idx, df in enumerate(chunks, start=1):
        # The carriage return keeps the progress counter on a single line.
        print(f"At chunk: {idx}", end="\r")

        matches = df[df[column].isin(values)]
        if not matches.empty:
            foundRows.append(matches)

    return foundRows


def main():
    """Print every row of each source's stage file matching the given values.

    Reads the column name, the values to search for and the chunk size from
    the command line, then scans each selected source's latest processing
    file chunk by chunk. Sources whose file is missing, or whose file lacks
    the requested column, are reported and skipped.
    """
    parser = ArgParser(description="Get column examples of pre-Conversion files")
    parser.addArgument("column", type=str, help="Column name to search")
    parser.addArgument("values", type=str, nargs="*", help="Value to check for in column")
    parser.addArgument('-c', '--chunksize', type=int, default=1024, help="File chunk size to read at a time")

    sources, flags, args = parser.parseArgs()

    for source in sources:
        source._prepare(Step.PROCESSING, flags)
        stageFile = source.processingManager.getLatestNodeFile()  # Should be singular stage file before DwC

        if not stageFile.exists():
            print(f"File {stageFile.path} does not exist, please run all required downloading/processing.")
            continue

        if args.column not in stageFile.getColumns():
            print(f"Column '{args.column}' does not exist in file")
            continue

        # NOTE(review): values are parsed as str, so they will only match if
        # the column is read as strings -- confirm how readIterator types columns.
        chunks = stageFile.readIterator(args.chunksize, on_bad_lines="skip")
        foundRows = findRows(chunks, args.column, args.values)

        if not foundRows:
            # Leading newline moves past the "\r" progress line.
            print("\nNo rows found")
            continue

        print()
        print(pd.concat(foundRows))


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)