|
| 1 | +from lib.data.argParser import ArgParser |
| 2 | +from lib.processing.files import Step |
| 3 | +import pandas as pd |
| 4 | + |
def findMatchingRows(stageFile, column, values, chunksize=1024):
    """Collect the rows of ``stageFile`` whose ``column`` holds one of ``values``.

    The file is consumed chunk by chunk through ``stageFile.readIterator`` so
    that it does not have to be held in memory at once. A progress counter is
    written to stdout while reading.

    Args:
        stageFile: Object exposing ``readIterator(chunksize, on_bad_lines=...)``
            which yields pandas DataFrames.
        column: Name of the column to inspect in every chunk.
        values: Values to look for in ``column``.
        chunksize: Chunk size forwarded to ``readIterator``.

    Returns:
        A DataFrame holding every matching row (chunks concatenated in read
        order), or ``None`` when nothing matched.
    """
    if not values:
        # Nothing can match an empty value list, so do not read the file at all.
        return None

    matches = []
    for idx, chunk in enumerate(stageFile.readIterator(chunksize, on_bad_lines="skip"), start=1):
        # "\r" rewinds the cursor so the counter overwrites itself on one line.
        print(f"At chunk: {idx}", end="\r")

        hits = chunk[chunk[column].isin(values)]
        if not hits.empty:
            matches.append(hits)

    return pd.concat(matches) if matches else None


if __name__ == '__main__':
    # Command line: a column name, zero or more values to look for, and an
    # optional chunk size used when reading each source's stage file.
    parser = ArgParser(description="Get column examples of pre-Conversion files")
    parser.addArgument("column", type=str, help="Column name to search")
    parser.addArgument("values", type=str, nargs="*", help="Value to check for in column")
    parser.addArgument('-c', '--chunksize', type=int, default=1024, help="File chunk size to read at a time")

    sources, flags, args = parser.parseArgs()

    for source in sources:
        source._prepare(Step.PROCESSING, flags)
        stageFile = source.processingManager.getLatestNodeFile()  # Should be singular stage file before DwC

        if not stageFile.exists():
            print(f"File {stageFile.path} does not exist, please run all required downloading/processing.")
            continue

        if args.column not in stageFile.getColumns():
            print(f"Column '{args.column}' does not exist in file")
            continue

        foundRows = findMatchingRows(stageFile, args.column, args.values, args.chunksize)
        if foundRows is None:
            # Leading newline moves past the "\r" progress line before reporting.
            print("\nNo rows found")
            continue

        print()
        print(foundRows)
0 commit comments