
Commit f29737d

Updated samplePreConversion to better collect varied samples and fuller samples
1 parent 7b03c60 commit f29737d

1 file changed: +25 −10 lines changed


src/tools/samplePreConversion.py

Lines changed: 25 additions & 10 deletions
@@ -7,25 +7,40 @@
 from typing import Generator
 
 def _collectFields(iterator: Generator[pd.DataFrame, None, None], entryLimit: int, seed: int) -> dict[str, pd.Series]:
-    df = next(iterator).fillna("").drop_duplicates()
+
+    def columnCleanup(series: pd.Series) -> pd.Series:
+        shortSeries = series.dropna()
+        if len(shortSeries) > entryLimit:
+            return shortSeries.sample(n=entryLimit, random_state=seed)
+
+        return shortSeries.add(["" * (entryLimit - len(shortSeries))])
+
+    df = next(iterator)
     for idx, chunk in enumerate(iterator, start=1):
         print(f"Scanning chunk: {idx}", end='\r')
-        chunk = chunk.fillna("").drop_duplicates()
         df = pd.concat([df, chunk], ignore_index=True)
-    df = df.drop_duplicates().sample(n=entryLimit, replace=True, random_state=seed)
+    df = df.drop_duplicates()
+    df = df.apply(columnCleanup, axis=0)
 
     return df
 
 def _collectRecords(iterator: Generator[pd.DataFrame, None, None], entryLimit: int, seed: int) -> dict[str, pd.Series]:
+    nanColumn = "NaN"
     df = next(iterator)
-    df = df.sample(n=min(len(df), entryLimit), random_state=seed)
     for idx, chunk in enumerate(iterator, start=1):
         print(f"Scanning chunk: {idx}", end='\r')
-        chunk = chunk.drop_duplicates().sample(n=min(len(chunk), entryLimit), random_state=seed)
-        df = pd.concat([df, chunk])
-    emptyDF = df.isna().sum(axis=1)
-    indexes = [idx for idx, _ in sorted(emptyDF.items(), key=lambda x: x[1])]
-    df = df.loc[indexes[:entryLimit]]
+        df = pd.concat([df, chunk], ignore_index=True)
+        df = df.drop_duplicates()
+
+        if len(df) > entryLimit:
+            df = df.sample(n=entryLimit, random_state=seed)
+
+    df.reset_index()
+    df[nanColumn] = df.isna().sum(axis=1).sort_values(ascending=True)
+    df = df.sort_values(nanColumn, axis=0, ignore_index=True)
+    df = df.drop([nanColumn], axis=1)
+    df = df.head(entryLimit)
+    df.reset_index()
 
     return df
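
For context, here is a minimal standalone sketch (not part of the commit) of the per-column strategy that the new columnCleanup helper applies through df.apply(columnCleanup, axis=0): each column is reduced to at most entryLimit non-null values by reproducible random sampling, and shorter columns are padded so every field contributes the same number of entries. The helper name sampleOrPad, the padding via pd.concat with empty strings, and the toy data are illustrative assumptions about the intended behaviour, not the committed code.

import pandas as pd

def sampleOrPad(series: pd.Series, entryLimit: int, seed: int) -> pd.Series:
    # Keep only the real values for this column.
    values = series.dropna()

    # More values than the limit: draw a reproducible random sample.
    if len(values) > entryLimit:
        return values.sample(n=entryLimit, random_state=seed).reset_index(drop=True)

    # Fewer values than the limit: pad with empty strings so every column
    # ends up the same length (assumed intent of the padding branch).
    padding = pd.Series([""] * (entryLimit - len(values)))
    return pd.concat([values, padding], ignore_index=True)

# Hypothetical data: one column has more values than the limit, one has fewer.
df = pd.DataFrame({"name": ["a", "b", "c", "d"],
                   "note": ["x", None, None, None]})
print(df.apply(sampleOrPad, axis=0, entryLimit=2, seed=42))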

@@ -57,7 +72,7 @@ def _collectRecords(iterator: Generator[pd.DataFrame, None, None], entryLimit: i
     random.seed(seed)
     outputPath = outputDir / f"{'fields' if kwargs.ignoreRecord else 'records'}_{kwargs.chunksize}_{seed}.tsv"
 
-    dfIterator = stageFile.loadDataFrameIterator(kwargs.chunksize, kwargs.firstrow, kwargs.rows)
+    dfIterator = stageFile.readIterator(kwargs.chunksize, on_bad_lines="skip")
     df = _collectFields(dfIterator, kwargs.entries, seed) if kwargs.ignoreRecord else _collectRecords(dfIterator, kwargs.entries, seed)
 
     df = dff.removeSpaces(df)
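
The switch to stageFile.readIterator(kwargs.chunksize, on_bad_lines="skip") presumably forwards on_bad_lines to pandas' CSV reader so malformed rows are skipped rather than aborting the scan; that reading of readIterator's behaviour is an assumption, not something the diff states.

Similarly, a small sketch (again not the committed code, using a made-up helper name keepFullestRecords and hypothetical data) of the record-selection idea that the reworked _collectRecords expresses with its temporary "NaN" column: after deduplicating and capping the accumulated chunks, records are ranked by how many fields are missing and only the most complete entryLimit rows are kept, which is what gives the "fuller samples" in the commit message.

import pandas as pd

def keepFullestRecords(df: pd.DataFrame, entryLimit: int) -> pd.DataFrame:
    # Count missing fields per record, then keep the rows with the fewest gaps.
    nanCounts = df.isna().sum(axis=1)
    fullest = df.loc[nanCounts.sort_values(ascending=True).index]
    return fullest.head(entryLimit).reset_index(drop=True)

# Hypothetical records: the middle row is the sparsest, so it is dropped first.
records = pd.DataFrame({
    "colA": ["a1", None, "a3"],
    "colB": ["b1", None, "b3"],
    "colC": ["c1", "c2", None],
})
print(keepFullestRecords(records, entryLimit=2))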
