SBRG · Gaoyuan-Li · Feb 13, 2025 · Feb 25, 2025 · Feb 25, 2025 · Mar 3, 2025
diff --git a/examples/5c_functional_enrichments.ipynb b/examples/5c_functional_enrichments.ipynb
@@ -0,0 +1,226 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Base imports\n",
+    "import os\n",
+    "import pickle\n",
+    "\n",
+    "# Compute imports\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import scipy\n",
+    "from tqdm.notebook import tqdm, trange\n",
+    "\n",
+    "# Plotting imports\n",
+    "import matplotlib\n",
+    "matplotlib.rcParams['pdf.fonttype'] = 42\n",
+    "from matplotlib import pyplot as plt\n",
+    "import seaborn as sns\n",
+    "from plotly import express as px\n",
+    "\n",
+    "# ML import\n",
+    "from sklearn.decomposition import NMF\n",
+    "from sklearn.metrics import mean_squared_error, median_absolute_error\n",
+    "from sklearn.metrics.pairwise import cosine_similarity\n",
+    "from pyphylon.util import load_config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "CONFIG = load_config(\"config.yml\")\n",
+    "WORKDIR = CONFIG[\"WORKDIR\"]\n",
+    "SPECIES = CONFIG[\"PG_NAME\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "L_BIN = os.path.join(WORKDIR, f'processed/nmf-outputs/L_binarized.csv')\n",
+    "L_BIN = pd.read_csv(L_BIN, index_col=0)\n",
+    "L_BIN.columns = [f'phylon{i}' for i in range(1, L_BIN.shape[1]+1)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# collection functions...\n",
+    "\n",
+    "from pyphylon.biointerp import collect_functions\n",
+    "# only run me once:\n",
+    "# all_functions = collect_functions(WORKDIR, 'processed/bakta/')\n",
+    "# all_functions.to_csv(os.path.join(WORKDIR, 'processed/all_functions.csv'))\n",
+    "\n",
+    "all_functions = pd.read_csv(os.path.join(WORKDIR, 'processed/all_functions.csv'), index_col=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get the pan-genome\n",
+    "df_genes = pd.read_pickle(os.path.join(WORKDIR, f'processed/cd-hit-results/{SPECIES}_strain_by_gene.pickle.gz'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyphylon.biointerp import get_pg_to_locus_map\n",
+    "# Data wrangling to get the functions for each cluster            \n",
+    "pg2locus_map = get_pg_to_locus_map(WORKDIR, SPECIES)\n",
+    "functions2genes = pd.merge(all_functions, pg2locus_map, left_on='locus', right_on='gene_id')\n",
+    "cluster_functions = functions2genes.groupby('cluster').first().reset_index()[['cluster','product','go']]\n",
+    "cluster_functions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyphylon.biointerp import explode_go_annos\n",
+    "cluster_to_go_functions = explode_go_annos(cluster_functions)\n",
+    "cluster_to_go_functions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "go_functions_count = cluster_to_go_functions.groupby('go').count()\n",
+    "go_functions = go_functions_count[go_functions_count['cluster'] > 3].sort_values('cluster', ascending=False)\n",
+    "go_functions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#calculate a single engirchment\n",
+    "from pyphylon.biointerp import calc_enrichment\n",
+    "go_term = 'GO:0005524'\n",
+    "phylon = 'phylon1'\n",
+    "calc_enrichment(L_BIN, cluster_to_go_functions, go_term, functions2genes, phylon, phylon_contribution_cutoff=0)  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyphylon.biointerp import calc_all_phylon_go_enrichments, get_go_mapping  # TODO need to speed this up - shrinking functions2genes to only accessory genes seemed to help...\n",
+    "\n",
+    "phylon_go_enrichments = calc_all_phylon_go_enrichments(L_BIN, functions2genes, cluster_to_go_functions, go_functions, phylon_contribution_cutoff=0.5)\n",
+    "phylon_go_enrichments = phylon_go_enrichments[phylon_go_enrichments['p_value']<0.05]\n",
+    "\n",
+    "go_mapping = get_go_mapping()\n",
+    "phylon_go_enrichments = pd.merge(phylon_go_enrichments, go_mapping, left_on='function', right_index=True, how='left')\n",
+    "missing_go = phylon_go_enrichments[phylon_go_enrichments['name'].isnull()].index\n",
+    "phylon_go_enrichments.loc[missing_go, 'name'] = phylon_go_enrichments.loc[missing_go,'function']\n",
+    "\n",
+    "phylon_go_enrichments = phylon_go_enrichments[phylon_go_enrichments['function']!='SO:0001217'] # filter out SO:0001217 is just a category for \"protein encoding gene\"\n",
+    "phylon_go_enrichments.to_csv(os.path.join(WORKDIR, 'processed/phylon_go_enrichment.csv'))\n",
+    "\n",
+    "phylon_go_enrichments = pd.read_csv(os.path.join(WORKDIR, 'processed/phylon_go_enrichment.csv'), index_col=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "phylon_go_enrichments_mat = pd.pivot_table(phylon_go_enrichments, index='phylon', columns='function', values='p_value')\n",
+    "sns.clustermap(phylon_go_enrichments_mat.fillna(0.05), cmap='rocket_r')\n",
+    "plt.title('phylon functional enrichments')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Explore a single phylon:\n",
+    "phylon = 'phylon1'\n",
+    "phylon_go_enrichments[phylon_go_enrichments['phylon']==phylon][:10]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Explore a all phylons:\n",
+    "from pyphylon.biointerp import gen_phylon_wordcloud\n",
+    "for phylon in phylon_go_enrichments['phylon'].unique():\n",
+    "    phylon_enr = phylon_go_enrichments[phylon_go_enrichments['phylon']==phylon]\n",
+    "    phylon_enr.loc[:,'products'] = phylon_enr['products'].str.replace(';', '<br>')\n",
+    "    fig = px.scatter(phylon_enr, x='overlap', y='logp', text='name', size='overlap', hover_data='products')\n",
+    "    print(phylon)\n",
+    "    fig.show()\n",
+    "    gen_phylon_wordcloud(L_BIN, functions2genes, phylon, cutoff=0)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "pyphylon",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}