@@ -40,7 +40,6 @@ def _collectRecords(iterator: Generator[pd.DataFrame, None, None], entryLimit: i
4040 df = df .sort_values (nanColumn , axis = 0 , ignore_index = True )
4141 df = df .drop ([nanColumn ], axis = 1 )
4242 df = df .head (entryLimit )
43- df .reset_index ()
4443
4544 return df
4645
@@ -70,13 +69,17 @@ def _collectRecords(iterator: Generator[pd.DataFrame, None, None], entryLimit: i
7069
7170 seed = kwargs .seed if kwargs .seed >= 0 else random .randrange (2 ** 32 - 1 ) # Max value for pandas seed
7271 random .seed (seed )
73- outputPath = outputDir / f"{ 'fields' if kwargs .ignoreRecord else 'records' } _{ kwargs .chunksize } _{ seed } .tsv"
72+ outputPath = outputDir / f"{ source . name } _ { 'fields' if kwargs .ignoreRecord else 'records' } _{ kwargs .chunksize } _{ seed } .tsv"
7473
75- dfIterator = stageFile .readIterator (kwargs .chunksize , on_bad_lines = "skip" )
74+ dfIterator = stageFile .readIterator (kwargs .chunksize , on_bad_lines = "skip" , low_memory = False )
7675 df = _collectFields (dfIterator , kwargs .entries , seed ) if kwargs .ignoreRecord else _collectRecords (dfIterator , kwargs .entries , seed )
7776
7877 df = dff .removeSpaces (df )
79- df = df .reset_index ()
8078 df .index += 1 # Increment index so output is 1-indexed numbers
79+
80+ unknownColumn = "Unnamed: 0"
81+ if unknownColumn in df .columns :
82+ df = df .drop ([unknownColumn ], axis = 1 )
83+
8184 df .to_csv (outputPath , sep = "\t " , index_label = "Example #" )
8285 logging .info (f"Created file { outputPath } " )
0 commit comments