
Commit 9905227

Merge branch 'feature/improve_ci' into 'master'
Tidy up CI failures in tag script

See merge request minknow/mkr-file-format!4
2 parents: a92312d + 4c2719a

2 files changed: +28, -39 lines

.gitlab-ci.yml (+1, -1)
@@ -12,7 +12,7 @@ tag_version_check:
   stage: pre-flight
   only:
     - tags
-  image: ${PYTHON_IMAGE}
+  image: git.oxfordnanolabs.local:4567/minknow/images/build-x86_64-gcc9:latest
   script:
     - mkr_version="$(cmake -P ci/get_tag_version.cmake 2>&1)"
     - tag_version="${CI_COMMIT_TAG/#v/}"
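
The one-line fix swaps the job's container image: the script drives cmake, which the generic ${PYTHON_IMAGE} presumably does not provide. Stitched together from the hunk's context lines, the job after this change reads roughly as follows (script steps beyond line 18 are not visible in the diff and are omitted):

tag_version_check:
  stage: pre-flight
  only:
    - tags
  image: git.oxfordnanolabs.local:4567/minknow/images/build-x86_64-gcc9:latest
  script:
    - mkr_version="$(cmake -P ci/get_tag_version.cmake 2>&1)"
    - tag_version="${CI_COMMIT_TAG/#v/}"
    # (later script steps are not shown in the diff)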

benchmarks/tools/find_and_get_mkr.py (+27, -38)
@@ -3,7 +3,6 @@
 import argparse
 import multiprocessing as mp
 from pathlib import Path
-import pickle
 from queue import Empty
 from uuid import UUID
 
@@ -33,9 +32,7 @@ def process_read(get_columns, read, read_ids, extracted_columns):
         col.append(getattr(read, c))
 
 
-def do_batch_bulk_work(
-    filename, batches, select_read_ids, get_columns, c_api, result_q
-):
+def do_batch_work(filename, batches, get_columns, c_api, result_q):
     read_ids = []
     extracted_columns = {"read_id": read_ids}
 
@@ -47,19 +44,13 @@ def do_batch_bulk_work(
     result_q.put(pd.DataFrame(extracted_columns))
 
 
-def do_batch_search_work(
-    filename, batches, select_read_ids_pickled, get_columns, c_api, result_q
-):
+def do_search_work(files, select_read_ids, get_columns, c_api, result_q):
     read_ids = []
     extracted_columns = {"read_id": read_ids}
+    for file in files:
+        file = mkr_format.open_combined_file(file, use_c_api=c_api)
 
-    select_read_ids = pickle.loads(select_read_ids_pickled)
-
-    file = mkr_format.open_combined_file(filename, use_c_api=c_api)
-    for batch in batches:
-        for read in filter(
-            lambda x: x.read_id in select_read_ids, file.get_batch(batch).reads()
-        ):
+        for read in file.select_reads(UUID(s) for s in select_read_ids):
             process_read(get_columns, read, read_ids, extracted_columns)
 
     result_q.put(pd.DataFrame(extracted_columns))
@@ -82,36 +73,34 @@ def run(input_dir, output, select_read_ids=None, get_columns=[], c_api=False):
     files = list(input_dir.glob("*.mkr"))
     print(f"Searching for read ids in {[str(f) for f in files]}")
 
-    fn_to_call = do_batch_bulk_work
-    if select_read_ids is not None:
-        fn_to_call = do_batch_search_work
-
-    select_read_ids = pickle.dumps(
-        set(UUID(s) for s in select_read_ids) if select_read_ids is not None else None
-    )
-
     processes = []
-    for filename in files:
-        file = mkr_format.open_combined_file(filename, use_c_api=c_api)
-        batches = list(range(file.batch_count))
-        approx_chunk_size = max(1, len(batches) // runners)
+    if select_read_ids is not None:
+        approx_chunk_size = max(1, len(select_read_ids) // runners)
         start_index = 0
-        while start_index < len(batches):
-            select_batches = batches[start_index : start_index + approx_chunk_size]
+        while start_index < len(select_read_ids):
+            select_ids = select_read_ids[start_index : start_index + approx_chunk_size]
             p = mp.Process(
-                target=fn_to_call,
-                args=(
-                    filename,
-                    select_batches,
-                    select_read_ids,
-                    get_columns,
-                    c_api,
-                    result_queue,
-                ),
+                target=do_search_work,
+                args=(files, select_ids, get_columns, c_api, result_queue),
             )
             p.start()
            processes.append(p)
-            start_index += len(select_batches)
+            start_index += len(select_ids)
+    else:
+        for filename in files:
+            file = mkr_format.open_combined_file(filename, use_c_api=c_api)
+            batches = list(range(file.batch_count))
+            approx_chunk_size = max(1, len(batches) // runners)
+            start_index = 0
+            while start_index < len(batches):
+                select_batches = batches[start_index : start_index + approx_chunk_size]
+                p = mp.Process(
+                    target=do_batch_work,
+                    args=(filename, select_batches, get_columns, c_api, result_queue),
+                )
+                p.start()
+                processes.append(p)
+                start_index += len(select_batches)
 
     print("Wait for processes...")
     items = []
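
The restructured run() now uses one fan-out pattern in both branches: slice a work list into roughly `runners` equal chunks, start one mp.Process per chunk, and collect one DataFrame per worker from a shared queue. Below is a minimal, self-contained sketch of that pattern; do_work and fan_out are hypothetical stand-ins for do_search_work / do_batch_work and the surrounding script, not part of this commit:

import multiprocessing as mp

import pandas as pd


def do_work(chunk, result_q):
    # Stand-in for do_search_work / do_batch_work: emit one row per work item.
    result_q.put(pd.DataFrame({"item": chunk}))


def fan_out(work_items, runners=4):
    # Same chunking arithmetic as the script: at least one item per chunk,
    # which can spawn up to runners + 1 processes when the split is uneven.
    result_q = mp.Queue()
    approx_chunk_size = max(1, len(work_items) // runners)
    processes = []
    start_index = 0
    while start_index < len(work_items):
        chunk = work_items[start_index : start_index + approx_chunk_size]
        p = mp.Process(target=do_work, args=(chunk, result_q))
        p.start()
        processes.append(p)
        start_index += len(chunk)

    # Drain one result per worker before joining, so no child blocks on a full queue.
    frames = [result_q.get() for _ in processes]
    for p in processes:
        p.join()
    return pd.concat(frames, ignore_index=True)


if __name__ == "__main__":
    print(fan_out(list(range(10)), runners=3))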
