Skip to content

Commit fbd5b96

Browse files
Fixed additional field being added to output
Sample output now contains source name
1 parent 822a6b8 commit fbd5b96

File tree

1 file changed

+7
-4
lines changed

1 file changed

+7
-4
lines changed

src/tools/samplePreConversion.py

Lines changed: 7 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -40,7 +40,6 @@ def _collectRecords(iterator: Generator[pd.DataFrame, None, None], entryLimit: i
4040
df = df.sort_values(nanColumn, axis=0, ignore_index=True)
4141
df = df.drop([nanColumn], axis=1)
4242
df = df.head(entryLimit)
43-
df.reset_index()
4443

4544
return df
4645

@@ -70,13 +69,17 @@ def _collectRecords(iterator: Generator[pd.DataFrame, None, None], entryLimit: i
7069

7170
seed = kwargs.seed if kwargs.seed >= 0 else random.randrange(2**32 - 1) # Max value for pandas seed
7271
random.seed(seed)
73-
outputPath = outputDir / f"{'fields' if kwargs.ignoreRecord else 'records'}_{kwargs.chunksize}_{seed}.tsv"
72+
outputPath = outputDir / f"{source.name}_{'fields' if kwargs.ignoreRecord else 'records'}_{kwargs.chunksize}_{seed}.tsv"
7473

75-
dfIterator = stageFile.readIterator(kwargs.chunksize, on_bad_lines="skip")
74+
dfIterator = stageFile.readIterator(kwargs.chunksize, on_bad_lines="skip", low_memory=False)
7675
df = _collectFields(dfIterator, kwargs.entries, seed) if kwargs.ignoreRecord else _collectRecords(dfIterator, kwargs.entries, seed)
7776

7877
df = dff.removeSpaces(df)
79-
df = df.reset_index()
8078
df.index += 1 # Increment index so output is 1-indexed numbers
79+
80+
unknownColumn = "Unnamed: 0"
81+
if unknownColumn in df.columns:
82+
df = df.drop([unknownColumn], axis=1)
83+
8184
df.to_csv(outputPath, sep="\t", index_label="Example #")
8285
logging.info(f"Created file {outputPath}")

0 commit comments

Comments
 (0)