parallelize metadata collection loop #60

Open
@carloshorn


When collecting the metadata, the following loop consumes most of the processing time and would benefit from parallelisation:

def _collect_metadata(self, filenames):
    """Collect metadata from the given level 1c files."""
    records = []
    for filename in filenames:
        LOG.debug('Collecting metadata from {}'.format(filename))
        with xr.open_dataset(filename) as ds:
            midnight_line = np.float64(self._get_midnight_line(ds['acq_time']))
            eq_cross_lons, eq_cross_times = self._get_equator_crossings(ds)
            rec = {'platform': ds.attrs['platform'].split('>')[-1].strip(),
                   'start_time': ds['acq_time'].values[0],
                   'end_time': ds['acq_time'].values[-1],
                   'along_track': ds.dims['y'],
                   'filename': filename,
                   'orbit_number_start': ds.attrs['orbit_number_start'],
                   'orbit_number_end': ds.attrs['orbit_number_end'],
                   'equator_crossing_longitude_1': eq_cross_lons[0],
                   'equator_crossing_time_1': eq_cross_times[0],
                   'equator_crossing_longitude_2': eq_cross_lons[1],
                   'equator_crossing_time_2': eq_cross_times[1],
                   'midnight_line': midnight_line,
                   'overlap_free_start': np.nan,
                   'overlap_free_end': np.nan,
                   'global_quality_flag': QualityFlags.OK}
            records.append(rec)
    return records
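
To parallelise this, the per-file work first needs to live in a callable that multiprocessing can pickle, i.e. a module-level function rather than a bound method. A minimal sketch, assuming LOG, QualityFlags and the two helper functions from the existing module are made reachable at module level (the name extract_metadata matches the snippet below; everything else is taken from the loop above):

import numpy as np
import xarray as xr


def extract_metadata(filename):
    """Collect metadata from a single level 1c file (the former loop body).

    Module-level so that multiprocessing can pickle it; assumes that
    _get_midnight_line and _get_equator_crossings have been moved out of
    the class (or exposed as staticmethods) accordingly.
    """
    LOG.debug('Collecting metadata from {}'.format(filename))
    with xr.open_dataset(filename) as ds:
        midnight_line = np.float64(_get_midnight_line(ds['acq_time']))
        eq_cross_lons, eq_cross_times = _get_equator_crossings(ds)
        return {'platform': ds.attrs['platform'].split('>')[-1].strip(),
                'start_time': ds['acq_time'].values[0],
                'end_time': ds['acq_time'].values[-1],
                # ... remaining fields exactly as in the loop above ...
                'global_quality_flag': QualityFlags.OK}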

In combination with #31, this could be implemented using:

import multiprocessing

[...]
with multiprocessing.Pool(n_worker) as pool:
    for metadata in pool.imap_unordered(extract_metadata, filenames):
        session.add(metadata)
        [...]
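
Note that pool.imap_unordered yields each record as soon as its worker finishes, so one slow file does not hold up the rest, but the results arrive in completion order rather than input order. If later processing (e.g. filling overlap_free_start/overlap_free_end) depends on file order, pool.imap preserves the input order instead, or the collected records could be sorted by start_time afterwards. For large numbers of small files, passing a chunksize greater than 1 to either method reduces the inter-process communication overhead.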
