Skip to content

Commit f67113b

Browse files
committed
New procedure 0.9.2:
- README updated - LolliPop's CLI now integrated into V-pipe Curves are generated entirely by V-pipe (no more notebooks to run for the curves) - Display code now moved before the uploader - Code to diagnose problems with signature and data quality moved into its own notebook
1 parent f7da204 commit f67113b

10 files changed

+1195
-2880
lines changed

DeconvolutionPrediagnostics.ipynb

+381
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,381 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"id": "e66db2b9",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": [
10+
"from IPython.display import display, HTML\n",
11+
"display(HTML(\"<style>.container { width:100% !important; }</style>\"))\n",
12+
"plotwidth=40"
13+
]
14+
},
15+
{
16+
"cell_type": "code",
17+
"execution_count": null,
18+
"id": "84cbfad0",
19+
"metadata": {},
20+
"outputs": [],
21+
"source": [
22+
"from WwDec.main import *\n",
23+
"\n",
24+
"import matplotlib.pyplot as plt\n",
25+
"import seaborn as sns\n",
26+
"from matplotlib.colors import ListedColormap\n"
27+
]
28+
},
29+
{
30+
"cell_type": "markdown",
31+
"id": "5bf5e25c",
32+
"metadata": {},
33+
"source": [
34+
"# Globals"
35+
]
36+
},
37+
{
38+
"cell_type": "code",
39+
"execution_count": null,
40+
"id": "d1d4d3c5",
41+
"metadata": {},
42+
"outputs": [],
43+
"source": [
44+
"# Source of inspiration from covariatns, see:\n",
45+
"# https://github.com/hodcroftlab/covariants/blob/master/web/data/clusters.json\n",
46+
"#\n",
47+
"# Keep in sync with covspectrum, see:\n",
48+
"# https://github.com/cevo-public/cov-spectrum-website/blob/develop/src/models/wasteWater/constants.ts\n",
49+
"color_map = {\n",
50+
" 'B.1.1.7': '#D16666',\n",
51+
" 'B.1.351': '#FF6665',\n",
52+
" 'P.1': '#FFB3B3',\n",
53+
" 'B.1.617.1': '#66C265',\n",
54+
" 'B.1.617.2': '#66A366',\n",
55+
" 'BA.1': '#A366A3',\n",
56+
" 'BA.2': '#cfafcf',\n",
57+
" 'BA.4': '#8a66ff',\n",
58+
" 'BA.5': '#585eff',\n",
59+
" 'BA.2.12.1': '#0400e0',\n",
60+
" 'BA.2.75': '#008fe0',\n",
61+
" 'BA.2.75.2': '#208fe0', # improv\n",
62+
" 'BQ.1.1': '#8fe000', # improv\n",
63+
" 'XBB': '#dd6bff',\n",
64+
" 'undetermined': '#969696',\n",
65+
"}"
66+
]
67+
},
68+
{
69+
"cell_type": "code",
70+
"execution_count": null,
71+
"id": "a4277738",
72+
"metadata": {},
73+
"outputs": [],
74+
"source": [
75+
"# Overwrite globals set by WwDec.main:\n",
76+
"# temporary, globals\n",
77+
"\n",
78+
"# zst needs python's Zstandard \n",
79+
"tally_data = \"./work-vp-test/variants/tallymut.tsv.zst\" # zst needs python's Zstandard # \"./tallymut_line.tsv\"\n",
80+
"out_dir = (\n",
81+
" \"./out\"\n",
82+
")\n",
83+
"\n",
84+
"import yaml\n",
85+
"# load variants configuations\n",
86+
"with open(\"work-vp-test/variant_config.yaml.old\", \"r\") as file:\n",
87+
" conf_yaml = yaml.load(file, Loader=yaml.SafeLoader)\n",
88+
"variants_list = conf_yaml[\"variants_list\"]\n",
89+
"variants_pangolin = conf_yaml[\"variants_pangolin\"]\n",
90+
"variants_not_reported = conf_yaml[\"variants_not_reported\"]\n",
91+
"start_date = conf_yaml[\"start_date\"]\n",
92+
"end_date = conf_yaml.get(\"end_date\") # optionnal, usually absent in ongoing surveillance, and present in articles with subset of historical data\n",
93+
"\n",
94+
"to_drop = conf_yaml[\"to_drop\"]\n",
95+
"cities_list = conf_yaml[\"locations_list\"]\n",
96+
"\n",
97+
"# var dates\n",
98+
"with open(\"work-vp-test/var_dates.yaml\", \"r\") as file:\n",
99+
" conf_yaml.update(yaml.load(file, Loader=yaml.SafeLoader))\n",
100+
"\n",
101+
"# display the current config\n",
102+
"conf_yaml"
103+
]
104+
},
105+
{
106+
"cell_type": "markdown",
107+
"id": "fb8868cd",
108+
"metadata": {},
109+
"source": [
110+
"# Load and preprocess data"
111+
]
112+
},
113+
{
114+
"cell_type": "code",
115+
"execution_count": null,
116+
"id": "73776b2b",
117+
"metadata": {},
118+
"outputs": [],
119+
"source": [
120+
"df_tally = pd.read_csv(tally_data, sep=\"\\t\", parse_dates = [ \"date\" ], dtype={\"location_code\": \"str\"})#.drop(columns=['proto'])\n",
121+
"df_tally.head()"
122+
]
123+
},
124+
{
125+
"cell_type": "code",
126+
"execution_count": null,
127+
"id": "612dc75c",
128+
"metadata": {},
129+
"outputs": [],
130+
"source": [
131+
"set(df_tally.columns) - set(variants_pangolin.keys()) - {'base','batch','cov','date','frac','gene','plantcode','plantname','pos','proto','sample','var'}"
132+
]
133+
},
134+
{
135+
"cell_type": "code",
136+
"execution_count": null,
137+
"id": "af36e51f",
138+
"metadata": {},
139+
"outputs": [],
140+
"source": [
141+
"preproc = DataPreprocesser(df_tally)\n",
142+
"preproc = preproc.general_preprocess(\n",
143+
" variants_list=variants_list,\n",
144+
" variants_pangolin=variants_pangolin,\n",
145+
" variants_not_reported=variants_not_reported,\n",
146+
" to_drop=[\"subset\"],\n",
147+
" start_date=start_date,\n",
148+
" remove_deletions=True,\n",
149+
")\n",
150+
"t_df_tally = preproc.df_tally\n",
151+
"# split into v41 and not v41, filter mutations and join\n",
152+
"df_tally_v41 = preproc.df_tally[preproc.df_tally.proto == 'v41'] \n",
153+
"print(df_tally_v41.shape)\n",
154+
"preproc.df_tally = preproc.df_tally[preproc.df_tally.proto != 'v41'] \n",
155+
"preproc = preproc.filter_mutations()\n",
156+
"print(preproc.df_tally.shape)\n",
157+
"\n",
158+
"preproc.df_tally = pd.concat([preproc.df_tally,df_tally_v41])\n",
159+
"print(preproc.df_tally.shape)\n",
160+
"#preproc.df_tally['']"
161+
]
162+
},
163+
{
164+
"cell_type": "markdown",
165+
"id": "c3690b89",
166+
"metadata": {},
167+
"source": [
168+
"# Look at design of mutations"
169+
]
170+
},
171+
{
172+
"cell_type": "code",
173+
"execution_count": null,
174+
"id": "b7a694a5",
175+
"metadata": {},
176+
"outputs": [],
177+
"source": [
178+
"des_matrix = preproc.df_tally[variants_list + [\"undetermined\", \"mutations\"]].drop_duplicates(\"mutations\").set_index(\"mutations\")\n",
179+
"des_matrix_mut = des_matrix[~des_matrix.index.str.startswith(\"-\")]\n",
180+
"des_matrix_wt = des_matrix[des_matrix.index.str.startswith(\"-\")]\n"
181+
]
182+
},
183+
{
184+
"cell_type": "code",
185+
"execution_count": null,
186+
"id": "a695c887",
187+
"metadata": {},
188+
"outputs": [],
189+
"source": [
190+
"fig, axes = plt.subplots(ncols=1, nrows=2, figsize=(plotwidth*0.5,plotwidth/8))\n",
191+
"cmap_binary = ListedColormap(['white', 'red'])\n",
192+
"# sns.heatmap(des_matrix.T, square=True, cmap=cmap_binary, cbar=False)\n",
193+
"\n",
194+
"sns.heatmap(des_matrix_mut.T, square=True, cmap=cmap_binary, cbar=False, ax=axes[0])\n",
195+
"sns.heatmap(des_matrix_wt.T, square=True, cmap=cmap_binary, cbar=False, ax=axes[1])\n",
196+
"\n",
197+
"# axes[0].tick_params(labelsize=9)\n",
198+
"\n",
199+
"\n",
200+
"plt.show()"
201+
]
202+
},
203+
{
204+
"cell_type": "code",
205+
"execution_count": null,
206+
"id": "3fef97d4",
207+
"metadata": {},
208+
"outputs": [],
209+
"source": [
210+
"# np.linalg.cond(des_matrix_mut.drop('undetermined', axis=1))\n",
211+
"print(\"condition number = {:.2f}\".format(np.linalg.cond(des_matrix)))\n"
212+
]
213+
},
214+
{
215+
"cell_type": "code",
216+
"execution_count": null,
217+
"id": "2770304e",
218+
"metadata": {},
219+
"outputs": [],
220+
"source": [
221+
"fig, axes = plt.subplots(1,3, figsize=(22,7))\n",
222+
"\n",
223+
"common_mut = des_matrix_mut.T.dot(des_matrix_mut)\n",
224+
"sns.heatmap(common_mut, square=True, cmap=\"viridis\", annot=common_mut, ax=axes[0])\n",
225+
"axes[0].set_title(\"common mutations\")\n",
226+
"\n",
227+
"corr_mut = (des_matrix_mut).corr()\n",
228+
"sns.heatmap(corr_mut, square=True, cmap=\"viridis\", annot=corr_mut, ax=axes[1], fmt=\".1g\")\n",
229+
"axes[1].set_title(\"correlation\")\n",
230+
"\n",
231+
"from sklearn.metrics.pairwise import pairwise_distances\n",
232+
"jac_sim = 1 - pairwise_distances(des_matrix_mut.T, metric = \"hamming\")\n",
233+
"jac_sim = pd.DataFrame(jac_sim, index=des_matrix_mut.columns, columns=des_matrix_mut.columns)\n",
234+
"sns.heatmap(jac_sim, square=True, cmap=\"viridis\", annot=jac_sim, ax=axes[2])\n",
235+
"axes[2].set_title(\"jaccard similarity ((A∩B)/(A∪B))\")\n",
236+
"\n",
237+
"fig.show()"
238+
]
239+
},
240+
{
241+
"cell_type": "code",
242+
"execution_count": null,
243+
"id": "12d49d6f",
244+
"metadata": {},
245+
"outputs": [],
246+
"source": [
247+
"locations_1 = ['Lugano (TI)',\n",
248+
" 'Zürich (ZH)',\n",
249+
" 'Genève (GE)',\n",
250+
" 'Chur (GR)',\n",
251+
" 'Altenrhein (SG)',\n",
252+
" 'Laupen (BE)',\n",
253+
" 'Lausanne (Vidy)',\n",
254+
" 'Sion (VS)',\n",
255+
" 'Porrentruy (JU)',\n",
256+
" 'Basel (catchment area ARA Basel)']"
257+
]
258+
},
259+
{
260+
"cell_type": "code",
261+
"execution_count": null,
262+
"id": "1fd71359",
263+
"metadata": {},
264+
"outputs": [],
265+
"source": [
266+
"all_conds_df = []\n",
267+
"for proto in preproc.df_tally.proto.unique(): \n",
268+
" \n",
269+
" for location in locations_1:\n",
270+
"\n",
271+
" t_df_tally_zh = preproc.df_tally[preproc.df_tally.location == location]\n",
272+
" t_df_tally_zh = t_df_tally_zh[(t_df_tally_zh.proto == proto) & (t_df_tally_zh[\"cov\"] >= 5)]\n",
273+
"\n",
274+
" conds = []\n",
275+
" for date in t_df_tally_zh.date.unique():\n",
276+
" des_matrix = t_df_tally_zh[\n",
277+
" (t_df_tally_zh.date == date)][variants_list + [\"undetermined\", \"mutations\"]].drop_duplicates(\"mutations\").set_index(\"mutations\")\n",
278+
" des_matrix_mut = des_matrix[~des_matrix.index.str.startswith(\"-\")]\n",
279+
" des_matrix_wt = des_matrix[des_matrix.index.str.startswith(\"-\")]\n",
280+
" \n",
281+
"# print((location, date))\n",
282+
" \n",
283+
" jac_sim = 1 - pairwise_distances(des_matrix_mut.T, metric = \"hamming\")\n",
284+
" jac_sim = pd.DataFrame(jac_sim)\n",
285+
" jac_arr = jac_sim.values\n",
286+
" np.fill_diagonal(jac_arr, np.nan)\n",
287+
" maxjac = np.nanmax(jac_arr)\n",
288+
"\n",
289+
" corr_mut = (des_matrix_mut).corr()\n",
290+
" corr_arr = corr_mut.values\n",
291+
" np.fill_diagonal(corr_arr, np.nan)\n",
292+
" maxcorr = np.nanmax(corr_arr)\n",
293+
"\n",
294+
"\n",
295+
" conds.append({\"n_mut\":des_matrix_mut.shape[0],\n",
296+
" \"cond_number\":np.linalg.cond(des_matrix),\n",
297+
" \"cond_number_omicron\":np.linalg.cond(des_matrix[[\"BA.1\", \"BA.2\", \"BA.4\", \"BA.5\"]]), \n",
298+
" \"max_jac\":maxjac, \n",
299+
" \"max_corr\":maxcorr,\n",
300+
" \"location\":location\n",
301+
" })\n",
302+
"\n",
303+
"\n",
304+
" conds_df = pd.DataFrame(\n",
305+
" conds,\n",
306+
" index=t_df_tally_zh.date.unique()\n",
307+
" )\n",
308+
" conds_df[\"proto\"] = proto\n",
309+
" all_conds_df.append(conds_df)\n",
310+
" # print(np.linalg.cond(des_matrix_mut.drop('undetermined', axis=1)))\n",
311+
"\n"
312+
]
313+
},
314+
{
315+
"cell_type": "code",
316+
"execution_count": null,
317+
"id": "3e6d4ec7",
318+
"metadata": {},
319+
"outputs": [],
320+
"source": [
321+
"all_conds_df_conc = pd.concat(all_conds_df)\n",
322+
"all_conds_df_conc = all_conds_df_conc.reset_index()\n",
323+
"all_conds_df_conc.head()"
324+
]
325+
},
326+
{
327+
"cell_type": "code",
328+
"execution_count": null,
329+
"id": "167aa670",
330+
"metadata": {},
331+
"outputs": [],
332+
"source": [
333+
"fig, axes = plt.subplots(5,2,figsize=(14,12))\n",
334+
"axes = axes.flatten()\n",
335+
"\n",
336+
"for i, location in enumerate(all_conds_df_conc.location.unique()):\n",
337+
" tmp_df = all_conds_df_conc[all_conds_df_conc.location == location]\n",
338+
" h = sns.lineplot(\n",
339+
" x=tmp_df[\"index\"],\n",
340+
" y=tmp_df[\"max_jac\"], \n",
341+
" hue = tmp_df[\"proto\"], \n",
342+
" ax=axes[i]\n",
343+
" )\n",
344+
" # h.set_ylim(top=20)\n",
345+
" h.set_xlim(left=np.datetime64(\"2021-12-01\"))\n",
346+
" axes[i].set_title(location)\n",
347+
" axes[i].set_ylabel(\"max jaccard sim\")\n",
348+
" axes[i].set_xlabel(\"\")\n",
349+
"# axes[i].set_xticks(rotation = 45) # Rotates X-Axis Ticks by 45-degrees\n",
350+
"\n",
351+
" for tick in axes[i].get_xticklabels():\n",
352+
" tick.set_rotation(45)\n",
353+
" \n",
354+
"fig.tight_layout() # Or equivalently, \"plt.tight_layout()\"\n",
355+
"\n",
356+
" "
357+
]
358+
}
359+
],
360+
"metadata": {
361+
"kernelspec": {
362+
"display_name": "bio1",
363+
"language": "python",
364+
"name": "bio1"
365+
},
366+
"language_info": {
367+
"codemirror_mode": {
368+
"name": "ipython",
369+
"version": 3
370+
},
371+
"file_extension": ".py",
372+
"mimetype": "text/x-python",
373+
"name": "python",
374+
"nbconvert_exporter": "python",
375+
"pygments_lexer": "ipython3",
376+
"version": "3.9.16"
377+
}
378+
},
379+
"nbformat": 4,
380+
"nbformat_minor": 5
381+
}

0 commit comments

Comments
 (0)