diff --git a/docs/_data/navigation.yml b/docs/_data/navigation.yml index 5adcaf257a..57b7e6a2bd 100755 --- a/docs/_data/navigation.yml +++ b/docs/_data/navigation.yml @@ -330,6 +330,10 @@ jsl: url: /docs/en/jsl/aws-emr-utils - title: Utilities for AWS Glue url: /docs/en/jsl/aws-glue-utils + - title: Utilities for Haystack + url: /docs/en/jsl/haystack-utils + - title: Utilities for Langchain + url: /docs/en/jsl/langchain-utils - title: Release Testing Utilities url: /docs/en/jsl/testing-utils - title: Module Structure diff --git a/docs/en/jsl/databricks_utils.md b/docs/en/jsl/databricks_utils.md index 4cdc5ed96e..18ae64acbb 100644 --- a/docs/en/jsl/databricks_utils.md +++ b/docs/en/jsl/databricks_utils.md @@ -24,47 +24,53 @@ You must create endpoints from a Databricks cluster created by [nlp.install](htt See [Cluster Creation Notebook](https://github.com/JohnSnowLabs/johnsnowlabs/tree/main/notebooks/create_databricks_cluster.ipynb) and [Databricks Endpoint Tutorial Notebook](https://github.com/JohnSnowLabs/johnsnowlabs/tree/main/notebooks/databricks_endpoints_tutorial.ipynb) + ```python # You need `mlflow_by_johnsnowlabs` installed until next mlflow is released ! pip install mlflow_by_johnsnowlabs - from johnsnowlabs import nlp -nlp.query_and_deploy_if_missing('bert','My String to embed') +nlp.deploy_endpoint('bert') +nlp.query_endpoint('bert_ENDPOINT','My String to embed') ``` -`nlp.query_and_deploy_if_missing` has the following parameters related to **deploying your model**: - -| Parameter | Description | -|------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `model` | Model to be deployed as endpoint which is [converted into NluPipelines](https://nlp.johnsnowlabs.com/docs/en/jsl/utils_for_spark_nlp#nlptonlupipepipe), supported classes are: `String` Reference to NLU Pipeline name like 'bert', `NLUPipeline`, `List[Annotator]`, `Pipeline`, `LightPipeline`, `PretrainedPipeline`, `PipelineModel`, | -| `query` | str or list of strings or raw json string. If raw json, is_json_query must be True | -| `is_json_query` | if True, query is treated as raw json string | -| `base_name` | Name-Prefix for all resources created (Endpoints, Models, etc). If using non nlu referenced based models, you must specify this. | -| `re_create_endpoint` | if False, endpoint creation is skipped if one already exists. If True, it will delete existing endpoint if it exists | -| `re_create_model` | if False, model creation is skipped if one already exists. If True, model will be re-logged again, bumping the current version by 2 | -| `workload_size` | one of Small, Medium, Large. | -| `gpu` | `True`/`False` to load GPU-optimized jars or CPU-optimized jars in the container. Must use a gpu based `workload_type` if `gpu=true` | -| `new_run` | if True, mlflow will start a new run before logging the model | -| `db_host` | the databricks host URL. If not specified, the DATABRICKS_HOST environment variable is used | -| `db_token` | the databricks Access Token. If not specified, the DATABRICKS_TOKEN environment variable is used | -| `block_until_deployed` | if True, this function will block until the endpoint is created | -| `workload_type` | `CPU` by default, use `GPU_SMALL` to spawn a GPU based endpoint instead. 
Check Databricks docs for alternative values |
-
-`nlp.query_and_deploy_if_missing` has the following parameters related to **querying your model**,
-which are forwarded to the [model.predict()](https://nlp.johnsnowlabs.com/docs/en/jsl/predict_api) call:
-
-| Parameter | Description |
-|-----------------------------|----------------------------------------------------------------------------------------------------|
-| `output_level` | One of `token`, `chunk`, `sentence`, `relation`, `document` to shape outputs |
-| `positions` | Set `True`/`False` to include or exclude character index position of predictions |
-| `metadata` | Set `True`/`False` to include additional metadata |
-| `drop_irrelevant_cols` | Set `True`/`False` to drop irrelevant columns |
-| `get_embeddings` | Set `True`/`False` to include embedding or not |
+`nlp.deploy_endpoint` will register an MLflow model in your registry and deploy an endpoint with a JSL license.
+It has the following parameters:
+
+| Parameter | Description |
+|------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `model` | Model to be deployed as endpoint which is [converted into NluPipelines](https://nlp.johnsnowlabs.com/docs/en/jsl/utils_for_spark_nlp#nlptonlupipepipe), supported classes are: `String` Reference to NLU Pipeline name like 'bert', `NLUPipeline`, `List[Annotator]`, `Pipeline`, `LightPipeline`, `PretrainedPipeline`, `PipelineModel`. In case of an NLU reference, the endpoint name is auto-generated as `<nlu_reference>_ENDPOINT`, i.e. `bert_ENDPOINT`. '.' is replaced with '_' in the NLU reference for the endpoint name |
+| `endpoint_name` | Name for the deployed endpoint. Optional if using an NLU model reference but mandatory for custom pipelines. |
+| `re_create_endpoint` | if False, endpoint creation is skipped if one already exists. If True, it will delete the existing endpoint if it exists |
+| `re_create_model` | if False, model creation is skipped if one already exists. If True, the model will be re-logged again, bumping the current version by 2 |
+| `workload_size` | one of Small, Medium, Large. |
+| `gpu` | `True`/`False` to load GPU-optimized jars or CPU-optimized jars in the container. Must use a GPU-based `workload_type` if `gpu=True` |
+| `new_run` | if True, mlflow will start a new run before logging the model |
+| `block_until_deployed` | if True, this function will block until the endpoint is created |
+| `workload_type` | `CPU` by default, use `GPU_SMALL` to spawn a GPU-based endpoint instead. Check Databricks docs for alternative values |
+| `db_host` | the databricks host URL. If not specified, the DATABRICKS_HOST environment variable is used |
+| `db_token` | the databricks Access Token. If not specified, the DATABRICKS_TOKEN environment variable is used |
+
+`nlp.query_endpoint` translates your query to JSON, sends it to the endpoint and returns the result as a pandas DataFrame.
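+For example, a single call can send several strings and shape the returned DataFrame. A sketch, using parameters from the table that follows; `bert_ENDPOINT` assumes the default auto-generated endpoint name for the `bert` NLU reference:
+
+```python
+from johnsnowlabs import nlp
+
+# Query an existing endpoint with two strings and request document-level output plus embeddings
+df = nlp.query_endpoint(
+    endpoint_name='bert_ENDPOINT',
+    query=['First string to embed', 'Second string to embed'],
+    output_level='document',
+    get_embeddings=True,
+)
+print(df.columns)
+```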
+It has the following parameters which are forwarded to the [model.predict()](https://nlp.johnsnowlabs.com/docs/en/jsl/predict_api) call inside of the endpoint: + +| Parameter | Description | +|-----------------------------|---------------------------------------------------------------------------------------------------| +| `endpoint_name` | Name of the endpoint to query | +| `query` | str or list of strings or raw json string. If raw json, is_json_query must be True | +| `is_json_query` | if True, query is treated as raw json string | +| `output_level` | One of `token`, `chunk`, `sentence`, `relation`, `document` to shape outputs | +| `positions` | Set `True`/`False` to include or exclude character index position of predictions | +| `metadata` | Set `True`/`False` to include additional metadata | +| `drop_irrelevant_cols` | Set `True`/`False` to drop irrelevant columns | +| `get_embeddings` | Set `True`/`False` to include embedding or not | | `keep_stranger_features` | Set `True`/`False` to return columns not named "text", 'image" or "file_type" from your input data | -| `multithread` | Set `True`/`False` to use multi-Threading for inference. Auto-inferred if not set | +| `multithread` | Set `True`/`False` to use multi-Threading for inference. Auto-inferred if not set | +| `db_host` | the databricks host URL. If not specified, the DATABRICKS_HOST environment variable is used | +| `db_token` | the databricks Access Token. If not specified, the DATABRICKS_TOKEN environment variable is used | + -`nlp.query_and_deploy_if_missing` checks the following Env vars +`nlp.query_endpoint` and `nlp.deploy_endpoint` check the following **mandatory** env vars to resolve wheels for endpoints | Env Var Name | Description | |-----------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------| diff --git a/docs/en/jsl/haystack_utils.md b/docs/en/jsl/haystack_utils.md new file mode 100644 index 0000000000..41def8696d --- /dev/null +++ b/docs/en/jsl/haystack_utils.md @@ -0,0 +1,66 @@ +--- +layout: docs +seotitle: NLP | John Snow Labs +title: Utilities for Haystack +permalink: /docs/en/jsl/haystack-utils +key: docs-install +modify_date: "2020-05-26" +header: true +show_nav: true +sidebar: + nav: jsl +--- + +
+
+
+Johnsnowlabs provides the following nodes which can be used inside the [Haystack Framework](https://haystack.deepset.ai/) for scalable pre-processing & embedding on
+[Spark clusters](https://spark.apache.org/). With this you can build easily scalable, production-grade LLM & RAG applications.
+See the [Haystack with Johnsnowlabs Tutorial Notebook](https://github.com/JohnSnowLabs/johnsnowlabs/blob/release/master/notebooks/haystack_with_johnsnowlabs.ipynb).
+
+## JohnSnowLabsHaystackProcessor
+Pre-process your documents in a scalable fashion in Haystack,
+based on [Spark-NLP's DocumentCharacterTextSplitter](https://sparknlp.org/docs/en/annotators#documentcharactertextsplitter). It supports all of its [parameters](https://sparknlp.org/api/python/reference/autosummary/sparknlp/annotator/document_character_text_splitter/index.html#sparknlp.annotator.document_character_text_splitter.DocumentCharacterTextSplitter).
+
+```python
+# Create a pre-processor which is connected to the Spark cluster
+from johnsnowlabs.llm import embedding_retrieval
+processor = embedding_retrieval.JohnSnowLabsHaystackProcessor(
+    chunk_overlap=2,
+    chunk_size=20,
+    explode_splits=True,
+    keep_seperators=True,
+    patterns_are_regex=False,
+    split_patterns=["\n\n", "\n", " ", ""],
+    trim_whitespace=True,
+)
+# Process documents (a list of Haystack Documents) distributed on the Spark cluster
+processor.process(some_documents)
+```
+
+## JohnSnowLabsHaystackEmbedder
+Scalable embedding computation with [any Sentence Embedding](https://nlp.johnsnowlabs.com/models?task=Embeddings) from John Snow Labs in Haystack.
+You must provide the **NLU reference** of a sentence embedding model to load it.
+If you want to use a GPU with the embedding model, set `use_gpu=True`; on localhost this starts a Spark session with GPU jars.
+For clusters, you must set up the cluster environment correctly; using [nlp.install_to_databricks()](https://nlp.johnsnowlabs.com/docs/en/jsl/install_advanced#into-a-freshly-created-databricks-cluster-automatically) is recommended.
+
+```python
+from johnsnowlabs.llm import embedding_retrieval
+from haystack.document_stores import InMemoryDocumentStore
+
+# Write some processed data to the Doc store, so we can retrieve it later
+document_store = InMemoryDocumentStore(embedding_dim=512)
+document_store.write_documents(some_documents)
+
+# Create an embedder which is connected to the Spark cluster
+retriever = embedding_retrieval.JohnSnowLabsHaystackEmbedder(
+    embedding_model='en.embed_sentence.bert_base_uncased',
+    document_store=document_store,
+    use_gpu=False,
+)
+
+# Compute embeddings distributed on the cluster
+document_store.update_embeddings(retriever)
+
+```
+</div>
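+
+Once the embeddings are written, the same retriever can be used to fetch the most similar documents for a query. A minimal sketch, assuming the standard Haystack v1 `retrieve()` API; `some_query` is a placeholder string:
+
+```python
+# Embed the query with the same John Snow Labs model and retrieve the top matches
+top_docs = retriever.retrieve(query='some_query', top_k=3)
+for doc in top_docs:
+    print(doc.content)
+```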
\ No newline at end of file
diff --git a/docs/en/jsl/install_advanced.md b/docs/en/jsl/install_advanced.md
index 0fd46a5f41..50f1cda72c 100644
--- a/docs/en/jsl/install_advanced.md
+++ b/docs/en/jsl/install_advanced.md
@@ -177,8 +177,12 @@ Where to find your Databricks Access Token:
 You can set the following parameters on the `nlp.install()` function to define properties of the cluster which will be created.
 See [Databricks Cluster Creation](https://docs.databricks.com/dev-tools/api/latest/clusters.html#create) for a detailed description of all parameters.
+You can use the `extra_pip_installs` parameter to install a list of additional PyPI libraries to the cluster.
+Just set `nlp.install_to_databricks(extra_pip_installs=['langchain','farm-haystack==1.2.3'])` to install the libraries.
+
 | Cluster creation Parameter | Default Value |
 |----------------------------|--------------------------------------------|
+| extra_pip_installs | `None` |
 | block_till_cluster_ready | `True` |
 | num_workers | `1` |
 | cluster_name | `John-Snow-Labs-Databricks-Auto-Cluster🚀` |
@@ -390,7 +394,7 @@ Your can get it from:
 ``` python
 # Create a new Cluster with Spark NLP and all licensed libraries ready to go:
-nlp.install(databricks_host='https://your_host.cloud.databricks.com', databricks_token = 'dbapi_token123',)
+nlp.install_to_databricks(databricks_host='https://your_host.cloud.databricks.com', databricks_token = 'dbapi_token123',)
 ```
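+
+If the same cluster should also get extra Python libraries, the `extra_pip_installs` parameter documented above can be passed in the same call. A sketch; the host, token and library list are placeholders:
+
+``` python
+# Create the cluster and additionally install langchain and farm-haystack on it
+nlp.install_to_databricks(
+    databricks_host='https://your_host.cloud.databricks.com',
+    databricks_token='dbapi_token123',
+    extra_pip_installs=['langchain', 'farm-haystack'],
+)
+```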
diff --git a/docs/en/jsl/jsl_release_notes.md b/docs/en/jsl/jsl_release_notes.md index 0a6094abaf..c691973e1b 100644 --- a/docs/en/jsl/jsl_release_notes.md +++ b/docs/en/jsl/jsl_release_notes.md @@ -13,7 +13,25 @@ sidebar:
-See [Github Releases](https://github.com/JohnSnowLabs/johnsnowlabs/releases) for detailed information on Release History and Featuresasdas
+See [Github Releases](https://github.com/JohnSnowLabs/johnsnowlabs/releases) for detailed information on Release History and Features
+
+
+## 5.1.8
+Release date: 17-11-2023
+
+The John Snow Labs 5.1.8 Library was released with the following pre-installed and recommended dependencies:
+
+
+| Library | Version |
+|-----------------------------------------------------------------------------------------|---------|
+| [Visual NLP](https://nlp.johnsnowlabs.com/docs/en/spark_ocr_versions/ocr_release_notes) | `5.0.2` |
+| [Enterprise NLP](https://nlp.johnsnowlabs.com/docs/en/licensed_annotators) | `5.1.3` |
+| [Finance NLP](https://nlp.johnsnowlabs.com/docs/en/financial_release_notes) | `1.X.X` |
+| [Legal NLP](https://nlp.johnsnowlabs.com/docs/en/legal_release_notes) | `1.X.X` |
+| [NLU](https://github.com/JohnSnowLabs/nlu/releases) | `5.1.0` |
+| [Spark-NLP-Display](https://sparknlp.org/docs/en/display) | `4.4` |
+| [Spark-NLP](https://github.com/JohnSnowLabs/spark-nlp/releases/) | `5.1.4` |
+| [Pyspark](https://spark.apache.org/docs/latest/api/python/) | `3.1.2` |

 ## 5.1.7
 Release date: 19-10-2023
diff --git a/docs/en/jsl/langchain_utils.md b/docs/en/jsl/langchain_utils.md
new file mode 100644
index 0000000000..5604c45064
--- /dev/null
+++ b/docs/en/jsl/langchain_utils.md
@@ -0,0 +1,94 @@
+---
+layout: docs
+seotitle: NLP | John Snow Labs
+title: Utilities for Langchain
+permalink: /docs/en/jsl/langchain-utils
+key: docs-install
+modify_date: "2020-05-26"
+header: true
+show_nav: true
+sidebar:
+    nav: jsl
+---
+
+<div class="h3-box" markdown="1">
+
+
+
+
+
+Johnsnowlabs provides the following components which can be used inside the [Langchain Framework](https://www.langchain.com/) for scalable pre-processing & embedding on
+[Spark clusters](https://spark.apache.org/), as Agent Tools and Pipeline components. With this you can build easily scalable, production-grade LLM & RAG applications.
+See the [Langchain with Johnsnowlabs Tutorial Notebook](https://github.com/JohnSnowLabs/johnsnowlabs/blob/release/master/notebooks/langchain_with_johnsnowlabs.ipynb).
+
+## JohnSnowLabsLangChainCharSplitter
+Pre-process your documents in a scalable fashion in Langchain,
+based on [Spark-NLP's DocumentCharacterTextSplitter](https://sparknlp.org/docs/en/annotators#documentcharactertextsplitter). It supports all of its [parameters](https://sparknlp.org/api/python/reference/autosummary/sparknlp/annotator/document_character_text_splitter/index.html#sparknlp.annotator.document_character_text_splitter.DocumentCharacterTextSplitter).
+
+```python
+from langchain.document_loaders import TextLoader
+from johnsnowlabs.llm import embedding_retrieval
+
+loader = TextLoader('/content/state_of_the_union.txt')
+documents = loader.load()
+
+# Create a pre-processor which is connected to the Spark cluster
+jsl_splitter = embedding_retrieval.JohnSnowLabsLangChainCharSplitter(
+    chunk_overlap=2,
+    chunk_size=20,
+    explode_splits=True,
+    keep_seperators=True,
+    patterns_are_regex=False,
+    split_patterns=["\n\n", "\n", " ", ""],
+    trim_whitespace=True,
+)
+# Split documents distributed on the Spark cluster
+pre_processed_docs = jsl_splitter.split_documents(documents)
+
+```
+
+## JohnSnowLabsLangChainEmbedder
+Scalable embedding computation with [any Sentence Embedding](https://nlp.johnsnowlabs.com/models?task=Embeddings) from John Snow Labs.
+You must provide the **NLU reference** of a sentence embedding model to load it.
+You can start a Spark session by setting `hardware_target` to one of `cpu`, `gpu`, `apple_silicon`, or `aarch` in localhost environments.
+For clusters, you must set up the cluster environment correctly; using [nlp.install_to_databricks()](https://nlp.johnsnowlabs.com/docs/en/jsl/install_advanced#into-a-freshly-created-databricks-cluster-automatically) is recommended.
+
+```python
+# Create an embedder which is connected to the Spark cluster
+from johnsnowlabs.llm import embedding_retrieval
+embeddings = embedding_retrieval.JohnSnowLabsLangChainEmbedder('en.embed_sentence.bert_base_uncased', hardware_target='cpu')
+
+# Compute embeddings distributed on the cluster
+from langchain.vectorstores import FAISS
+retriever = FAISS.from_documents(pre_processed_docs, embeddings).as_retriever()
+
+# Create a retriever tool
+from langchain.agents.agent_toolkits import create_retriever_tool
+tool = create_retriever_tool(
+    retriever,
+    "search_state_of_union",
+    "Searches and returns documents regarding the state-of-the-union."
+)
+
+
+# Create an LLM Agent that uses the tool
+from langchain.agents.agent_toolkits import create_conversational_retrieval_agent
+from langchain.chat_models import ChatOpenAI
+llm = ChatOpenAI(openai_api_key='YOUR_API_KEY')
+agent_executor = create_conversational_retrieval_agent(llm, [tool], verbose=True)
+result = agent_executor({"input": "what did the president say about going to east of Columbus?"})
+result['output']
+
+>>>
+> Entering new AgentExecutor chain...
+Invoking: `search_state_of_union` with `{'query': 'going to east of Columbus'}` +[Document(page_content='miles east of', metadata={'source': '/content/state_of_the_union.txt'}), Document(page_content='in America.', metadata={'source': '/content/state_of_the_union.txt'}), Document(page_content='out of America.', metadata={'source': '/content/state_of_the_union.txt'}), Document(page_content='upside down.', metadata={'source': '/content/state_of_the_union.txt'})]I'm sorry, but I couldn't find any specific information about the president's statement regarding going to the east of Columbus in the State of the Union address. +> Finished chain. +I'm sorry, but I couldn't find any specific information about the president's statement regarding going to the east of Columbus in the State of the Union address. +``` + + +
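+Outside the agent flow, the embedder can also be used directly to sanity-check the Spark-backed model. A minimal sketch; it reuses the `embeddings` object loaded above:
+
+```python
+# Embed one query string -> a single vector (list of floats)
+query_vector = embeddings.embed_query('foo bar')
+print(len(query_vector))
+
+# Embed several texts at once -> one vector per text
+doc_vectors = embeddings.embed_documents(['foo bar', 'another text'])
+print(len(doc_vectors))
+```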
\ No newline at end of file diff --git a/johnsnowlabs/__init__.py b/johnsnowlabs/__init__.py index 31d7e11d16..dbb46bd25e 100644 --- a/johnsnowlabs/__init__.py +++ b/johnsnowlabs/__init__.py @@ -13,6 +13,8 @@ if try_import_lib("sparkocr") and try_import_lib("sparknlp"): from johnsnowlabs import visual +from johnsnowlabs import llm + def new_version_online(): from .utils.pip_utils import get_latest_lib_version_on_pypi diff --git a/johnsnowlabs/auto_install/databricks/endpoints.py b/johnsnowlabs/auto_install/databricks/endpoints.py index 5407d82841..66d769d1ce 100644 --- a/johnsnowlabs/auto_install/databricks/endpoints.py +++ b/johnsnowlabs/auto_install/databricks/endpoints.py @@ -340,10 +340,10 @@ def model_exists(name): ############### High level Deployment & Query -def query_endpoint(data, nlu_model_name, db_host, db_token, base_name=None): +def _query_endpoint(data, endpoint_name, db_host, db_token): # 5. Query the Endpoint # endpoint_name = f"{nlu_model_name.replace('.','_')}_ENDPOINT" - endpoint_name = base_name if base_name else nlu_name_to_endpoint(nlu_model_name) + url = f"{db_host}/serving-endpoints/{endpoint_name}/invocations" headers = { "Authorization": f"Bearer {db_token}", @@ -459,12 +459,76 @@ def is_nlu_pipe(pipe): return isinstance(pipe, NLUPipeline) +def validate_db_creds(db_host, db_token): + if not db_host: + db_host = os.environ.get("DATABRICKS_HOST") + if not db_token: + db_token = os.environ.get("DATABRICKS_TOKEN") + if not db_host: + raise Exception( + "You must specify DATABRICKS_HOST and DATABRICKS_TOKEN en variables" + ) + return db_host, db_token + + +def deploy_model( + model, + re_create_endpoint, + re_create_model, + db_host, + db_token, + workload_size, + block_until_deployed, + gpu, + workload_type, + endpoint_name, +): + # convert to nlu pipe if needed + if isinstance(model, str): + return deploy_nlu_model_as_endpoint( + model, + re_create_endpoint=re_create_endpoint, + re_create_model=re_create_model, + db_host=db_host, + db_token=db_token, + workload_size=workload_size, + block_until_deployed=block_until_deployed, + gpu=gpu, + workload_type=workload_type, + endpoint_name=endpoint_name, + ) + else: + if not endpoint_name: + raise Exception( + "If you want to deploy custom pipes, you need to specify a endpoint_name" + ) + try: + import nlu + + if not isinstance(model, nlu.NLUPipeline): + model = nlp.to_nlu_pipe(model) + except: + raise Exception("Failure converting your model to NLU pipe") + return deploy_nlu_model_as_endpoint( + model, + re_create_endpoint=re_create_endpoint, + re_create_model=re_create_model, + endpoint_name=endpoint_name, + db_host=db_host, + db_token=db_token, + workload_size=workload_size, + block_until_deployed=block_until_deployed, + gpu=gpu, + workload_type=workload_type, + ) + + def query_and_deploy_if_missing( model, query, re_create_endpoint=False, re_create_model=False, - base_name=None, + endpoint_name=None, is_json_query=False, db_host=None, db_token=None, @@ -501,7 +565,7 @@ def query_and_deploy_if_missing( query: str or list of strings or raw json string. If raw json, is_json_query must be True is_json_query: if True, query is treated as raw json string - base_name: Name-Prefix for all resources created (Endpoints, Models, etc). If using non nlu referenced based models, you must specify this. + endpoint_name: Name-Prefix for all resources created (Endpoints, Models, etc). If using non nlu referenced based models, you must specify this. re_create_endpoint: if False, endpoint creation is skipped if one already exists. 
If True, it will delete existing endpoint if it exists re_create_model: if False, model creation is skipped if one already exists. If True, model will be re-logged again, bumping the current version by 2 workload_size: one of Small, Medium, Large. @@ -519,15 +583,10 @@ def query_and_deploy_if_missing( keep_stranger_features: Return columns not named "text", 'image" or "file_type" multithread: Use multi-Threading for inference """ - - if not db_host: - db_host = os.environ.get("DATABRICKS_HOST") - if not db_token: - db_token = os.environ.get("DATABRICKS_TOKEN") - if not db_host: - raise Exception( - "You must specify DATABRICKS_HOST and DATABRICKS_TOKEN en variables" - ) + print( + "query_and_deploy_if_missing is deprecated. It will be dropped in johnsnowlabs==5.2.0 Please use nlp.deploy_endpoint() and nlp.query_endpoint() instead." + ) + db_host, db_token = validate_db_creds(db_host, db_token) if gpu and workload_type == "CPU": raise ValueError( @@ -555,46 +614,23 @@ def query_and_deploy_if_missing( mlflow.end_run() mlflow.start_run() - if isinstance(model, str): - deploy_nlu_model_as_endpoint( - model, - re_create_endpoint=re_create_endpoint, - re_create_model=re_create_model, - db_host=db_host, - db_token=db_token, - workload_size=workload_size, - block_until_deployed=block_until_deployed, - gpu=gpu, - workload_type=workload_type, - base_name=base_name, - ) - else: - if not base_name: - raise Exception( - "If you want to deploy custom pipes, you need to specify base_name" - ) - try: - import nlu + endpoint_name = deploy_model( + model, + re_create_endpoint=re_create_endpoint, + re_create_model=re_create_model, + db_host=db_host, + db_token=db_token, + workload_size=workload_size, + block_until_deployed=block_until_deployed, + gpu=gpu, + workload_type=workload_type, + endpoint_name=endpoint_name, + ) - if not isinstance(model, nlu.NLUPipeline): - model = nlp.to_nlu_pipe(model) - except: - raise Exception("Failure converting your model to NLU pipe") - deploy_nlu_model_as_endpoint( - model, - re_create_endpoint=re_create_endpoint, - re_create_model=re_create_model, - base_name=base_name, - db_host=db_host, - db_token=db_token, - workload_size=workload_size, - block_until_deployed=block_until_deployed, - gpu=gpu, - workload_type=workload_type, - ) if not block_until_deployed: return - return query_endpoint( + + query = ( query if is_json_query else query_to_json( @@ -606,11 +642,21 @@ def query_and_deploy_if_missing( get_embeddings=get_embeddings, keep_stranger_features=keep_stranger_features, multithread=multithread, - ), - model, - db_host, - db_token, - base_name, + ) + ) + return query_endpoint( + endpoint_name=endpoint_name, + query=query, + is_json_query=is_json_query, + output_level=output_level, + positions=positions, + metadata=metadata, + drop_irrelevant_cols=drop_irrelevant_cols, + get_embeddings=get_embeddings, + keep_stranger_features=keep_stranger_features, + multithread=multithread, + db_host=db_host, + db_token=db_token, ) @@ -618,7 +664,7 @@ def deploy_nlu_model_as_endpoint( model_name, re_create_endpoint=False, re_create_model=False, - base_name=None, + endpoint_name=None, db_host=None, db_token=None, workload_size="Small", @@ -631,9 +677,9 @@ def deploy_nlu_model_as_endpoint( SECRET_NAME = "JSL_SECRET_NAME" SECRET_VALUE = os.environ["JOHNSNOWLABS_LICENSE_JSON_FOR_CONTAINER"] REGISTERD_MODEL_NAME = ( - base_name if base_name else nlu_name_to_registerd_model(model_name) + endpoint_name if endpoint_name else nlu_name_to_registerd_model(model_name) ) - ENDPOINT_NAME = 
base_name if base_name else nlu_name_to_endpoint(model_name) + ENDPOINT_NAME = endpoint_name if endpoint_name else nlu_name_to_endpoint(model_name) if not model_exists(REGISTERD_MODEL_NAME) or re_create_model: # 1. Log the model @@ -678,3 +724,133 @@ def deploy_nlu_model_as_endpoint( print( f"Endpoint {ENDPOINT_NAME} already exists! Set re_create_endpoint=True if you want to re-create it " ) + return ENDPOINT_NAME + + +def deploy_endpoint( + model, + re_create_endpoint=False, + re_create_model=False, + endpoint_name=None, + db_host=None, + db_token=None, + workload_size="Small", + new_run=True, + block_until_deployed=True, + gpu=False, + workload_type="CPU", +): + """ + Using to_nlu_pipeline() https://nlp.johnsnowlabs.com/docs/en/jsl/utils_for_spark_nlp#nlptonlupipepipe + + nlu_model: reference to nlu_model you want to query or NLU convertable pipe + Supported types are + - List[Annotator] + - Pipeline + - LightPipeline + - PretrainedPipeline + - PipelineModel + - NLUPipeline + - String Reference to NLU Pipeline name + See https://nlp.johnsnowlabs.com/docs/en/jsl/utils_for_spark_nlp#nlptonlupipepipe for more details + + endpoint_name: Name-Prefix for all resources created (Endpoints, Models, etc). If using non nlu referenced based models, you must specify this. + re_create_endpoint: if False, endpoint creation is skipped if one already exists. If True, it will delete existing endpoint if it exists + re_create_model: if False, model creation is skipped if one already exists. If True, model will be re-logged again, bumping the current version by 2 + workload_size: one of Small, Medium, Large. + new_run: if True, mlflow will start a new run before logging the model + db_host: the databricks host URL. If not specified, the DATABRICKS_HOST environment variable is used + db_token: the databricks Access Token. If not specified, the DATABRICKS_TOKEN environment variable is used + block_until_deployed: if True, this function will block until the endpoint is deployed. If False, it will return immediately after the endpoint is created + gpu: Use GPU for inference + workload_type: 'CPU' or 'GPU_SMALL' or see official docs + """ + + db_host, db_token = validate_db_creds(db_host, db_token) + if workload_size not in ["Small", "Medium", "Large"]: + print( + "WARNING! workload_size should be one of Small, Medium, Large for most users." 
+ ) + + if new_run: + import mlflow + + mlflow.end_run() + mlflow.start_run() + + return deploy_model( + model, + re_create_endpoint=re_create_endpoint, + re_create_model=re_create_model, + db_host=db_host, + db_token=db_token, + workload_size=workload_size, + block_until_deployed=block_until_deployed, + gpu=gpu, + workload_type=workload_type, + endpoint_name=endpoint_name, + ) + + +def query_endpoint( + endpoint_name, + query, + is_json_query=False, + output_level: Optional[str] = None, + positions: Optional[bool] = None, + metadata: Optional[bool] = None, + drop_irrelevant_cols: Optional[bool] = None, + get_embeddings: Optional[bool] = None, + keep_stranger_features: Optional[bool] = None, + multithread: Optional[bool] = None, + db_host=None, + db_token=None, +): + """ + Using the NLU predict() https://nlp.johnsnowlabs.com/docs/en/jsl/predict_api inside a Databricks Endpoint + + nlu_model: reference to nlu_model you want to query or NLU convertable pipe + Supported types are + - List[Annotator] + - Pipeline + - LightPipeline + - PretrainedPipeline + - PipelineModel + - NLUPipeline + - String Reference to NLU Pipeline name + See https://nlp.johnsnowlabs.com/docs/en/jsl/utils_for_spark_nlp#nlptonlupipepipe for more details + + query: str or list of strings or raw json string. If raw json, is_json_query must be True + is_json_query: if True, query is treated as raw json string + endpoint_name: Name-Prefix for all resources created (Endpoints, Models, etc). If using non nlu referenced based models, you must specify this. + db_host: the databricks host URL. If not specified, the DATABRICKS_HOST environment variable is used + db_token: the databricks Access Token. If not specified, the DATABRICKS_TOKEN environment variable is used + output_level : token, chunk, sentence, relation, document + positions: include or exclude character index position of predictions + metadata: include additional metadata + drop_irrelevant_cols: drop irrelevant columns + get_embeddings: Include embedding or not + keep_stranger_features: Return columns not named "text", 'image" or "file_type" + multithread: Use multi-Threading for inference""" + + db_host, db_token = validate_db_creds(db_host, db_token) + query = ( + query + if is_json_query + else query_to_json( + in_data=query, + output_level=output_level, + positions=positions, + metadata=metadata, + drop_irrelevant_cols=drop_irrelevant_cols, + get_embeddings=get_embeddings, + keep_stranger_features=keep_stranger_features, + multithread=multithread, + ) + ) + return _query_endpoint( + data=query, + endpoint_name=endpoint_name, + db_host=db_host, + db_token=db_token, + ) diff --git a/johnsnowlabs/auto_install/databricks/install_utils.py b/johnsnowlabs/auto_install/databricks/install_utils.py index 711860d753..bea783dd2f 100644 --- a/johnsnowlabs/auto_install/databricks/install_utils.py +++ b/johnsnowlabs/auto_install/databricks/install_utils.py @@ -5,7 +5,12 @@ from johnsnowlabs.auto_install import jsl_home from johnsnowlabs.auto_install.softwares import Software -from johnsnowlabs.py_models.install_info import InstallSuite, LocalPy4JLib, LocalPyLib +from johnsnowlabs.py_models.install_info import ( + InstallSuite, + LocalPy4JLib, + LocalPyLib, + RootInfo, +) from johnsnowlabs.py_models.lib_version import LibVersion from johnsnowlabs.utils.env_utils import is_running_in_databricks from .dbfs import * @@ -87,6 +92,15 @@ def create_cluster( license_path = "/johnsnowlabs/license.json" put_file_on_dbfs(db, license_path, lic, overwrite=True) + info = 
RootInfo.get_from_jsl_home().dict() + info["version"] = info["version"].as_str() + put_file_on_dbfs( + db, + "/johnsnowlabs/info.json", + info, + overwrite=True, + ) + default_spark_env_vars = dict( SPARK_NLP_LICENSE_FILE=f"/dbfs{license_path}", AWS_ACCESS_KEY_ID=install_suite.secrets.AWS_ACCESS_KEY_ID, diff --git a/johnsnowlabs/auto_install/databricks/work_utils.py b/johnsnowlabs/auto_install/databricks/work_utils.py index f075faf794..0427a276a2 100644 --- a/johnsnowlabs/auto_install/databricks/work_utils.py +++ b/johnsnowlabs/auto_install/databricks/work_utils.py @@ -295,6 +295,7 @@ def run_in_databricks( block_till_complete=True, dst_path: str = None, parameters: Any = None, + return_job_url: bool = False, ): """ @@ -313,7 +314,8 @@ def run_in_databricks( :param dst_path: path to store the python script/notebook. in databricks, mandatory for notebooks. I.e. /Users//test.ipynb :param parameters: parameters to pass to the python script/notebook formatted accordingly to https://docs.databricks.com/en/workflows/jobs/create-run-jobs.html#pass-parameters-to-a-databricks-job-task - :return: job_id + :param return_job_url: returns job_url instead of job_id + :return: job_id if return_job_url=False else job_url """ from johnsnowlabs.auto_install.databricks.install_utils import ( get_db_client_for_token, @@ -341,14 +343,20 @@ def run_in_databricks( if "result_state" in job_status["state"]: print(f"Job has a result! its {job_status['state']}") - return job_status + if return_job_url: + return job_status, job_status["run_page_url"] + else: + return job_status elif "life_cycle_state" in job_status["state"]: print( f"Waiting 30 seconds, job {job_id} is still running, its {job_status['state']}" ) time.sleep(30) - return job_id + if return_job_url: + return job_id, job_status["run_page_url"] + else: + return job_id def submit_notebook_to_databricks( diff --git a/johnsnowlabs/auto_install/emr/install_utils.py b/johnsnowlabs/auto_install/emr/install_utils.py index a376e54916..df1137de97 100644 --- a/johnsnowlabs/auto_install/emr/install_utils.py +++ b/johnsnowlabs/auto_install/emr/install_utils.py @@ -2,9 +2,6 @@ from os import path from typing import Optional -import boto3 -import botocore - from johnsnowlabs import settings from johnsnowlabs.auto_install.emr.enums import EMRClusterStates from johnsnowlabs.auto_install.emr.work_utils import create_emr_bucket @@ -18,7 +15,7 @@ def create_emr_cluster( - boto_session: boto3.Session, + boto_session: "boto3.Session", secrets: JslSecrets, bootstrap_bucket: Optional[str] = None, s3_logs_path: Optional[str] = None, @@ -55,6 +52,8 @@ def create_emr_cluster( # Refer Also: https://docs.aws.amazon.com/code-library/latest/ug/python_3_emr_code_examples.html """ + import botocore + try: if not boto_session: raise Exception("Boto session is required") @@ -205,7 +204,9 @@ def block_till_emr_cluster_ready(emr_client, cluster_id: str): print(f"👌 Cluster-Id {cluster_id} is ready!") -def create_initialization_step_script(boto_session: boto3.Session, bucket: str) -> str: +def create_initialization_step_script( + boto_session: "boto3.Session", bucket: str +) -> str: """Creates a EMR initialization step script and uploads it to s3 bucket. 
Returns the s3 path of the script :param boto_session: Boto3 session :param s3_client: S3 boto3 client @@ -223,6 +224,7 @@ def create_initialization_step_script(boto_session: boto3.Session, bucket: str) sudo python3 -m pip install scipy scikit-learn "tensorflow==2.11.0" tensorflow-addons exit 0 """ + return upload_content( boto_session=boto_session, bucket=bucket, @@ -232,7 +234,7 @@ def create_initialization_step_script(boto_session: boto3.Session, bucket: str) def create_bootstrap_script( - boto_session: boto3.Session, + boto_session: "boto3.Session", bucket: str, secrets: JslSecrets, spark_nlp: bool = True, diff --git a/johnsnowlabs/auto_install/glue/install_utils.py b/johnsnowlabs/auto_install/glue/install_utils.py index 05b0349388..6d7462b127 100644 --- a/johnsnowlabs/auto_install/glue/install_utils.py +++ b/johnsnowlabs/auto_install/glue/install_utils.py @@ -1,6 +1,5 @@ from typing import List, Optional, Tuple -import boto3 import botocore from johnsnowlabs.py_models.install_info import InstallSuite, LocalPy4JLib @@ -8,11 +7,13 @@ from johnsnowlabs.utils.s3_utils import create_bucket, upload_file_to_s3 -def create_glue_bucket(boto_session: boto3.Session, bucket=None): +def create_glue_bucket(boto_session: "boto3.Session", bucket=None): """Create a bucket for EMR cluster logs :param boto_session: Boto3 session :param bucket: Bucket name """ + import boto3 + try: sts_client = boto_session.client("sts") account_id = sts_client.get_caller_identity()["Account"] @@ -31,7 +32,7 @@ def create_glue_bucket(boto_session: boto3.Session, bucket=None): def upload_pylibs_jars_to_glue_bucket( - boto_session: boto3.Session, install_suite: InstallSuite, bucket: Optional[str] + boto_session: "boto3.Session", install_suite: InstallSuite, bucket: Optional[str] ) -> Tuple[List[str], List[str]]: """Uploads jars and python packages to glue bucket :param boto_session: Boto3 session @@ -39,6 +40,8 @@ def upload_pylibs_jars_to_glue_bucket( :param bucket: Bucket name :return: List of uploaded jars and python packages """ + import boto3 + if not boto_session: raise ValueError("Boto3 session is required") diff --git a/johnsnowlabs/auto_install/glue/utils.py b/johnsnowlabs/auto_install/glue/utils.py index 69e5bc2fde..b9e68cd192 100644 --- a/johnsnowlabs/auto_install/glue/utils.py +++ b/johnsnowlabs/auto_install/glue/utils.py @@ -1,6 +1,5 @@ from typing import List, Optional -import boto3 from johnsnowlabs import settings from johnsnowlabs.auto_install.softwares import Software @@ -9,7 +8,7 @@ def get_printable_glue_notebook_commands( - boto_session: boto3.Session, + boto_session: "boto3.Session", glue_assets_bucket: str, packages_s3_location: List[str], jars_s3_location: List[str], diff --git a/johnsnowlabs/auto_install/health_checks/endpoint_test.py b/johnsnowlabs/auto_install/health_checks/endpoint_test.py index 7a2f269a57..a42963ecf1 100644 --- a/johnsnowlabs/auto_install/health_checks/endpoint_test.py +++ b/johnsnowlabs/auto_install/health_checks/endpoint_test.py @@ -3,22 +3,61 @@ # def new_req(): -# from mlflow.utils.requirements_utils import _get_pinned_requirement +# """ +# :return: A list of default pip requirements for MLflow Models produced by this flavor. +# Calls to :func:`save_model()` and :func:`log_model()` produce a pip environment +# that, at minimum, contains these requirements. 
+# """ +# import os # from johnsnowlabs import settings +# from mlflow.utils.requirements_utils import _get_pinned_requirement +# +# nlp.start() +# _JOHNSNOWLABS_ENV_JSON_LICENSE_KEY = "JOHNSNOWLABS_LICENSE_JSON" +# _JOHNSNOWLABS_ENV_HEALTHCARE_SECRET = "HEALTHCARE_SECRET" +# _JOHNSNOWLABS_ENV_VISUAL_SECRET = "VISUAL_SECRET" +# if ( +# _JOHNSNOWLABS_ENV_HEALTHCARE_SECRET not in os.environ +# and _JOHNSNOWLABS_ENV_VISUAL_SECRET not in os.environ +# ): +# raise Exception( +# f"You need to set the {_JOHNSNOWLABS_ENV_HEALTHCARE_SECRET} or {_JOHNSNOWLABS_ENV_VISUAL_SECRET} environment variable set." +# f" Please contact John Snow Labs to get one" +# ) # # _SPARK_NLP_JSL_WHEEL_URI = ( # "https://pypi.johnsnowlabs.com/{secret}/spark-nlp-jsl/spark_nlp_jsl-" # + f"{settings.raw_version_medical}-py3-none-any.whl" # ) # -# return [ -# f"johnsnowlabs_for_databricks_by_ckl=={settings.raw_version_jsl_lib}", +# _SPARK_NLP_VISUAL_WHEEL_URI = ( +# "https://pypi.johnsnowlabs.com/{secret}/spark-ocr/" +# f"spark_ocr-{settings.raw_version_ocr}-py3-none-any.whl" +# ) +# +# deps = [ +# f"johnsnowlabs_for_databricks=={settings.raw_version_jsl_lib}", # TODO UNDO THIS!! # _get_pinned_requirement("pyspark"), -# _SPARK_NLP_JSL_WHEEL_URI.format(secret=os.environ["SECRET"]), +# # TODO remove pandas constraint when NLU supports it +# # https://github.com/JohnSnowLabs/nlu/issues/176 # "pandas<=1.5.3", +# "nlu_by_ckl==5.0.2rc1", # ] # -# mlflow.johnsnowlabs.get_default_pip_requirements = new_req +# if _JOHNSNOWLABS_ENV_HEALTHCARE_SECRET in os.environ: +# _SPARK_NLP_JSL_WHEEL_URI = _SPARK_NLP_JSL_WHEEL_URI.format( +# secret=os.environ[_JOHNSNOWLABS_ENV_HEALTHCARE_SECRET] +# ) +# deps.append(_SPARK_NLP_JSL_WHEEL_URI) +# +# if _JOHNSNOWLABS_ENV_VISUAL_SECRET in os.environ: +# _SPARK_NLP_VISUAL_WHEEL_URI = _SPARK_NLP_VISUAL_WHEEL_URI.format( +# secret=os.environ[_JOHNSNOWLABS_ENV_VISUAL_SECRET] +# ) +# deps.append(_SPARK_NLP_VISUAL_WHEEL_URI) +# print("RETRNING DPES!!!!!!!!!!") +# print(deps) +# return deps def print_query_df(query_df): @@ -44,6 +83,20 @@ def find_writable_directory(): # mlflow.set_experiment(find_writable_directory()) +import json + + +def write_string_to_json(input_string, filename="output.json"): + with open(filename, "w") as json_file: + json.dump({"data": input_string}, json_file) + + +def read_json_from_file(filename="output.json"): + with open(filename, "r") as json_file: + data = json.load(json_file) + + return data + def run_test(): import mlflow @@ -54,12 +107,23 @@ def run_test(): nlu_name_to_endpoint, ) + # mlflow.johnsnowlabs.get_default_pip_requirements = new_req + + # write_string_to_json(lic, "lic.json") + os.environ["HEALTHCARE_SECRET"] = json.loads(lic)["SECRET"] + # mlflow.set_experiment(find_writable_directory()) mlflow.set_experiment("/Users/christian@johnsnowlabs.com/my-experiment123") os.environ["JOHNSNOWLABS_LICENSE_JSON_FOR_CONTAINER"] = lic os.environ["JOHNSNOWLABS_LICENSE_JSON"] = lic + + print("USING LIC : ", lic) + from johnsnowlabs.auto_install.jsl_home import get_install_suite_from_jsl_home + + get_install_suite_from_jsl_home(only_jars=True) + # get_install_suite_from_jsl_home(only_jars=True, hc_secret=json.loads(lic)["SECRET"]) # 1) one query to construct endpoint and run with default output level - print(query_and_deploy_if_missing(model, "Hello World", True, True)) + print_query_df(query_and_deploy_if_missing(model, "Hello World", True, True)) # # 2) One query for every output level # for o in ["token", "sentence", "document"]: @@ -76,7 +140,42 @@ def run_test(): ) +def 
run_test_v2(): + import mlflow + import os + from johnsnowlabs.auto_install.databricks.endpoints import ( + delete_endpoint, + nlu_name_to_endpoint, + ) + from johnsnowlabs.auto_install.jsl_home import get_install_suite_from_jsl_home + from johnsnowlabs.utils.enums import JvmHardwareTarget + from johnsnowlabs import nlp + + os.environ["HEALTHCARE_SECRET"] = json.loads(lic)["SECRET"] + mlflow.set_experiment("/Users/christian@johnsnowlabs.com/my-experiment123") + os.environ["JOHNSNOWLABS_LICENSE_JSON_FOR_CONTAINER"] = lic + os.environ["JOHNSNOWLABS_LICENSE_JSON"] = lic + + # print("USING LIC : ", lic) + + nlp.deploy_endpoint(model, True, True) + df = nlp.query_endpoint(model, "Hello World") + + # get_install_suite_from_jsl_home(only_jars=True, hc_secret=json.loads(lic)["SECRET"]) + # 1) one query to construct endpoint and run with default output level + print_query_df(df) + + # 2) cleanup + print(f"Deleting: {nlu_name_to_endpoint(model)}") + delete_endpoint( + nlu_name_to_endpoint(model), + host=os.environ["DATABRICKS_HOST"], + token=os.environ["DATABRICKS_TOKEN"], + ) + + if __name__ == "__main__": - dbutils.library.installPyPI("mlflow_by_johnsnowlabs") # , version="2.10.0") - dbutils.library.installPyPI("pandas", version="1.5.0") - run_test() + dbutils.library.installPyPI("mlflow_by_johnsnowlabs") + dbutils.library.installPyPI("johnsnowlabs_for_databricks", version="5.1.8rc4") + run_test_v2() + # run_test() diff --git a/johnsnowlabs/auto_install/health_checks/generate_endpoint_test.py b/johnsnowlabs/auto_install/health_checks/generate_test.py similarity index 55% rename from johnsnowlabs/auto_install/health_checks/generate_endpoint_test.py rename to johnsnowlabs/auto_install/health_checks/generate_test.py index 7557122848..dc7ac6d1ba 100644 --- a/johnsnowlabs/auto_install/health_checks/generate_endpoint_test.py +++ b/johnsnowlabs/auto_install/health_checks/generate_test.py @@ -1,6 +1,7 @@ import inspect import johnsnowlabs.auto_install.health_checks.endpoint_test as endp_test +import johnsnowlabs.auto_install.health_checks.load_predict_test as load_predict_test def generate_endpoint_test(model, lic): @@ -10,3 +11,8 @@ def generate_endpoint_test(model, lic): .replace("ENDPOINT LICENSE", lic) .replace("MODEL TO TEST", model) ) + + +def generate_load_predict_test(model): + # read source of endpoint_test.py and replace placeholders with actual values and return new source code + return inspect.getsource(load_predict_test).replace("", model) diff --git a/johnsnowlabs/auto_install/health_checks/load_predict_test.py b/johnsnowlabs/auto_install/health_checks/load_predict_test.py new file mode 100644 index 0000000000..aba097b8d1 --- /dev/null +++ b/johnsnowlabs/auto_install/health_checks/load_predict_test.py @@ -0,0 +1,17 @@ +# Convert pdf to image +from johnsnowlabs import nlp + + +def run_test(): + # nlp.start() + data = ["hello world", "I love apples"] + model = nlp.load("") + print("model loaded", model) + + df = model.predict(data) + for c in df.columns: + print(df[c]) + + +if __name__ == "__main__": + run_test() diff --git a/johnsnowlabs/auto_install/install_flow.py b/johnsnowlabs/auto_install/install_flow.py index f96d110f70..6fbf45d334 100644 --- a/johnsnowlabs/auto_install/install_flow.py +++ b/johnsnowlabs/auto_install/install_flow.py @@ -3,7 +3,6 @@ import sys from typing import Optional, List -import boto3 from johnsnowlabs import settings from johnsnowlabs.auto_install.databricks.dbfs import dbfs_rm @@ -400,7 +399,7 @@ def install_to_databricks( def install_to_emr( - boto_session: 
Optional[boto3.Session] = None, + boto_session: Optional["boto3.Session"] = None, # EMR specific configs bootstrap_bucket: Optional[str] = None, s3_logs_path: Optional[str] = None, @@ -449,6 +448,8 @@ def install_to_emr( :param auto_terminate_hours : Idle hour to wait before terminating the cluster :return: EMR cluster id """ + import boto3 + secrets: JslSecrets = JslSecrets.build_or_try_find_secrets( browser_login=browser_login, force_browser=force_browser, @@ -486,7 +487,7 @@ def install_to_emr( def install_to_glue( - boto_session: Optional[boto3.Session] = None, + boto_session: Optional["boto3.Session"] = None, glue_assets_bucket: Optional[str] = None, # Browser Auth browser_login: bool = True, @@ -522,6 +523,8 @@ def install_to_glue( :param hardware_platform: Hardware platform """ + import boto3 + if not boto_session: boto_session = boto3.Session() diff --git a/johnsnowlabs/frameworks/__init__.py b/johnsnowlabs/frameworks/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/johnsnowlabs/frameworks/embedding_retrieval/__init__.py b/johnsnowlabs/frameworks/embedding_retrieval/__init__.py new file mode 100644 index 0000000000..189cacd723 --- /dev/null +++ b/johnsnowlabs/frameworks/embedding_retrieval/__init__.py @@ -0,0 +1,13 @@ +from johnsnowlabs import try_import_lib + +if try_import_lib("haystack"): + from johnsnowlabs.frameworks.embedding_retrieval.haystack_node import ( + JohnSnowLabsHaystackEmbedder, + JohnSnowLabsHaystackProcessor, + ) + +if try_import_lib("langchain"): + from johnsnowlabs.frameworks.embedding_retrieval.langchain_node import ( + JohnSnowLabsLangChainEmbedder, + JohnSnowLabsLangChainCharSplitter, + ) diff --git a/johnsnowlabs/frameworks/embedding_retrieval/haystack_node.py b/johnsnowlabs/frameworks/embedding_retrieval/haystack_node.py new file mode 100644 index 0000000000..f31b3aa925 --- /dev/null +++ b/johnsnowlabs/frameworks/embedding_retrieval/haystack_node.py @@ -0,0 +1,269 @@ +# Embedder Node compatible with haystack framework +import os +import sys +from pathlib import Path +from typing import List, Optional, Union, Dict, Any, Tuple + +import numpy as np +from haystack import BaseComponent +from haystack.nodes.retriever import EmbeddingRetriever +from haystack.nodes.retriever._base_embedding_encoder import _BaseEmbeddingEncoder +from haystack.schema import Document, MultiLabel +from tqdm.auto import tqdm + +from johnsnowlabs.frameworks.embedding_retrieval.utils import get_docsplitter_pipe + + +class _JohnsnowlabsEmbeddingEncoder(_BaseEmbeddingEncoder): + def __init__(self, retriever: "EmbeddingRetriever"): + # 1) Check imports + try: + from johnsnowlabs import nlp + from nlu.pipe.pipeline import NLUPipeline + except ImportError as exc: + raise ImportError( + "Could not import johnsnowlabs python package. " + "Please install it with `pip install johnsnowlabs`." 
+ ) from exc + + # 2) Start a Spark Session + try: + os.environ["PYSPARK_PYTHON"] = sys.executable + os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable + nlp.start(hardware_target="gpu" if retriever.use_gpu else "cpu") + except Exception as exc: + raise Exception("Failure starting Spark Session") from exc + # 3) Load the model + try: + self.embedding_model = nlp.load(retriever.embedding_model) + except Exception as exc: + raise Exception("Failure loading model") from exc + + def embed(self, texts: Union[List[str], str]) -> np.ndarray: + return np.asarray( + self.embedding_model.predict_embeds(texts), + dtype=float, + ) + + def embed_queries(self, queries: List[str]) -> np.ndarray: + return self.embed(queries) + + def embed_documents(self, docs: List[Document]) -> np.ndarray: + return self.embed([d.content for d in docs]) + + def train( + **kwargs, + ): + raise NotImplementedError("Training not supported") + + def save(self, save_dir: Union[Path, str]): + raise NotImplementedError("Saving not supported") + + +class JohnSnowLabsHaystackEmbedder(EmbeddingRetriever): + def __init__(self, **kwargs): + inject() + kwargs["model_format"] = "johnsnowlabs" + super().__init__(**kwargs) + + +class JohnSnowLabsHaystackProcessor(BaseComponent): # BasePreProcessor + outgoing_edges = 1 + + def __init__( + self, + chunk_overlap=2, + chunk_size=20, + explode_splits=True, + keep_seperators=True, + patterns_are_regex=False, + split_patterns=["\n\n", "\n", " ", ""], + trim_whitespace=True, + ##### OLD PARAMS ##### + progress_bar: bool = True, + add_page_number: bool = False, + max_chars_check: int = 10_000, + ): + """ + :param clean_header_footer: Use heuristic to remove footers and headers across different pages by searching + for the longest common string. This heuristic uses exact matches and therefore + works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4" + or similar. + :param clean_whitespace: Strip whitespaces before or after each line in the text. + :param clean_empty_lines: Remove more than two empty lines in the text. + :param remove_substrings: Remove specified substrings from the text. If no value is provided an empty list is created by default. + :param split_by: Unit for splitting the document. Can be "word", "sentence", or "passage". Set to None to disable splitting. + :param split_length: Max. number of the above split unit (e.g. words) that are allowed in one document. For instance, if n -> 10 & split_by -> + "sentence", then each output document will have 10 sentences. + :param split_overlap: Word overlap between two adjacent documents after a split. + Setting this to a positive number essentially enables the sliding window approach. + For example, if split_by -> `word`, + split_length -> 5 & split_overlap -> 2, then the splits would be like: + [w1 w2 w3 w4 w5, w4 w5 w6 w7 w8, w7 w8 w10 w11 w12]. + Set the value to 0 to ensure there is no overlap among the documents after splitting. + :param split_respect_sentence_boundary: Whether to split in partial sentences if split_by -> `word`. If set + to True, the individual split will always have complete sentences & + the number of words will be <= split_length. + :param language: The language used by "nltk.tokenize.sent_tokenize" in iso639 format. + Available options: "ru","sl","es","sv","tr","cs","da","nl","en","et","fi","fr","de","el","it","no","pl","pt","ml" + :param tokenizer_model_folder: Path to the folder containing the NTLK PunktSentenceTokenizer models, if loading a model from a local path. 
Leave empty otherwise. + :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's + attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are + not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). + In this case the id will be generated by using the content and the defined metadata. + :param progress_bar: Whether to show a progress bar. + :param add_page_number: Add the number of the page a paragraph occurs in to the Document's meta + field `"page"`. Page boundaries are determined by `"\f"` character which is added + in between pages by `PDFToTextConverter`, `TikaConverter`, `ParsrConverter` and + `AzureConverter`. + :param max_chars_check: the maximum length a document is expected to have. Each document that is longer than max_chars_check in characters after pre-processing will raise a warning. + """ + super().__init__() + self.progress_bar = progress_bar + self.pipe = get_docsplitter_pipe( + chunk_overlap, + chunk_size, + explode_splits, + keep_seperators, + patterns_are_regex, + split_patterns, + trim_whitespace, + ) + + def process( + self, + documents: Union[dict, Document, List[Union[dict, Document]]] = None, + ) -> List[Document]: + """ + Perform document cleaning and splitting. Can take a single document or a list of documents as input and returns a list of documents. + """ + if isinstance(documents, (Document, dict)): + ret = self._process_single(document=documents) + elif isinstance(documents, list): + ret = self._process_batch(documents=list(documents)) + elif documents is None: + ret = [] + else: + raise Exception( + f"documents provided to PreProcessor.prepreprocess() is not of type list nor Document or None but is {type(documents)} " + ) + + return ret + + def _process_single( + self, + document: Union[dict, Document], + ) -> List[Document]: + return self.split(document=self.clean(document=document)) + + def _process_batch(self, documents: List[Union[dict, Document]]) -> List[Document]: + nested_docs = [ + self._process_single(d) + for d in tqdm( + documents, + disable=not self.progress_bar, + desc="Preprocessing", + unit="docs", + ) + ] + return [d for x in nested_docs for d in x] + + def clean( + self, + document: Union[dict, Document], + # not implemented + clean_whitespace: bool = None, + clean_header_footer: bool = None, + clean_empty_lines: bool = None, + remove_substrings: Optional[List[str]] = None, + ) -> Document: + """ + Stub for Feature cleaning procedure + """ + return document + + def split( + self, + document: Union[dict, Document], + ) -> List[Document]: + """Perform document splitting on a single document. This method can split on different units, at different lengths, + with different strides. It can also respect sentence boundaries. Its exact functionality is defined by + the parameters passed into PreProcessor.__init__(). Takes a single document as input and returns a list of documents. + """ + # Todo pass params dynamically? + text = document.content + texts = [split for split in self.pipe.annotate(text)["splits"]] + return [Document(id=str(i), content=text) for i, text in enumerate(texts)] + + def run( + self, + query: Optional[str] = None, + file_paths: Optional[List[str]] = None, + labels: Optional[MultiLabel] = None, + documents: Optional[List[Document]] = None, + meta: Optional[dict] = None, + ) -> Tuple[Dict, str]: + """ + Method that will be executed when the node in the graph is called. 
+ + The argument that are passed can vary between different types of nodes + (e.g. retriever nodes expect different args than a reader node) + + + See an example for an implementation in haystack/reader/base/BaseReader.py + :return: + """ + # result = {"documents": d.to_dict() for d in self.process(documents)} + result = {"documents": d for d in self.process(documents)} + return result, "output_1" + + def run_batch( + self, + queries: Optional[Union[str, List[str]]] = None, + file_paths: Optional[List[str]] = None, + labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None, + documents: Optional[Union[List[Document], List[List[Document]]]] = None, + meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, + params: Optional[dict] = None, + debug: Optional[bool] = None, + ): + result = {"documents": d for d in self.process(documents)} + return result, "output_1" + + +def inject(): + # inject the emd encoder into haystack + from haystack.nodes.retriever import _embedding_encoder + + _embedding_encoder._EMBEDDING_ENCODERS[ + "johnsnowlabs" + ] = _JohnsnowlabsEmbeddingEncoder + # inject the retriever into haystack + + +inject() + +# if id_hash_keys is None: +# id_hash_keys = self.id_hash_keys +# +# if isinstance(document, dict): +# document["id_hash_keys"] = id_hash_keys +# document = Document.from_dict(document) +# +# # Mainly needed for type checking +# if not isinstance(document, Document): +# raise HaystackError( +# "Document must not be of type 'dict' but of type 'Document'." +# ) +# +# if type(document.content) is not str: +# return document +# +# text = document.content +# # TODO APPLY PIPE +# self.pipe.annotate(text) +# texts = [split for split in self.pipe.annotate(text)["splits"]] +# if text != document.content: +# document = deepcopy(document) +# document.content = text +# return document diff --git a/johnsnowlabs/frameworks/embedding_retrieval/langchain_node.py b/johnsnowlabs/frameworks/embedding_retrieval/langchain_node.py new file mode 100644 index 0000000000..c025143236 --- /dev/null +++ b/johnsnowlabs/frameworks/embedding_retrieval/langchain_node.py @@ -0,0 +1,120 @@ +import os +import sys +from typing import Any, List + +from langchain.embeddings.base import Embeddings + +# from langchain.pydantic_v1 import BaseModel, Extra +from pydantic import BaseModel, Extra + +from johnsnowlabs.frameworks.embedding_retrieval.utils import get_docsplitter_pipe + + +class JohnSnowLabsLangChainEmbedder(BaseModel, Embeddings): + """JohnSnowLabs embedding models + + To use, you should have the ``johnsnowlabs`` python package installed. + Example: .. code-block:: python + from langchain.embeddings.johnsnowlabs import JohnSnowLabsEmbeddings + document = "foo bar" + embedding = JohnSnowLabsEmbeddings('embed_sentence.bert') + output = embedding.embed_query(document) + """ + + async def aembed_documents(self, texts: List[str]) -> List[List[float]]: + raise NotImplementedError("JohnSnowLabsEmbeddings does not support async yet") + + async def aembed_query(self, text: str) -> List[float]: + raise NotImplementedError("JohnSnowLabsEmbeddings does not support async yet") + + model: Any + + def __init__( + self, model="embed_sentence.bert", hardware_target="cpu", **kwargs: Any + ): + """Initialize the johnsnowlabs model.""" + super().__init__(**kwargs) + # 1) Check imports + try: + from johnsnowlabs import nlp + from nlu.pipe.pipeline import NLUPipeline + except ImportError as exc: + raise ImportError( + "Could not import johnsnowlabs python package. 
" + "Please install it with `pip install johnsnowlabs`." + ) from exc + + # 2) Start a Spark Session + try: + os.environ["PYSPARK_PYTHON"] = sys.executable + os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable + nlp.start(hardware_target=hardware_target) + except Exception as exc: + raise Exception("Failure starting Spark Session") from exc + # 3) Load the model + try: + if isinstance(model, str): + self.model = nlp.load(model) + elif isinstance(model, NLUPipeline): + self.model = model + else: + self.model = nlp.to_nlu_pipe(model) + except Exception as exc: + raise Exception("Failure loading model") from exc + + class Config: + """Configuration for this pydantic object.""" + + extra = Extra.forbid + + def embed_documents(self, texts: List[str]) -> List[List[float]]: + """Compute doc embeddings using a JohnSnowLabs transformer model. + + Args: texts: The list of texts to embed. + Returns: List of embeddings, one for each text.""" + + return self.model.predict_embeds(texts) + + def embed_query(self, text: str) -> List[float]: + """Compute query embeddings using a JohnSnowLabs transformer model. + Args: text: The text to embed. + Returns: Embeddings for the text.""" + return self.model.predict_embeds(text)[0] + + +class JohnSnowLabsLangChainCharSplitter: + def __init__( + self, + chunk_overlap=2, + chunk_size=20, + explode_splits=True, + keep_seperators=True, + patterns_are_regex=False, + split_patterns=["\n\n", "\n", " ", ""], + trim_whitespace=True, + ): + self.pipe = get_docsplitter_pipe( + chunk_overlap, + chunk_size, + explode_splits, + keep_seperators, + patterns_are_regex, + split_patterns, + trim_whitespace, + ) + + def split_documents(self, docs): + return split_lang_docs(self.pipe, docs) + + +def split_lang_doc(pipe, doc): + from langchain.schema.document import Document + + return [ + Document(page_content=split, metadata=doc.metadata) + for split in pipe.annotate(doc.page_content)["splits"] + ] + + +def split_lang_docs(pipe, docs): + return [split for doc in docs for split in split_lang_doc(pipe, doc)] diff --git a/johnsnowlabs/frameworks/embedding_retrieval/utils.py b/johnsnowlabs/frameworks/embedding_retrieval/utils.py new file mode 100644 index 0000000000..2ea6393217 --- /dev/null +++ b/johnsnowlabs/frameworks/embedding_retrieval/utils.py @@ -0,0 +1,30 @@ +def get_docsplitter_pipe( + chunk_overlap=2, + chunk_size=20, + explode_splits=True, + keep_separators=True, + patterns_are_regex=False, + split_patterns=["\n\n", "\n", " ", ""], + trim_whitespace=True, +): + from johnsnowlabs import nlp + + nlp.start() + + return nlp.LightPipeline( + nlp.PipelineModel( + stages=[ + nlp.DocumentAssembler().setInputCol("text"), + nlp.DocumentCharacterTextSplitter() + .setInputCols(["document"]) + .setOutputCol("splits") + .setChunkSize(chunk_size) + .setChunkOverlap(chunk_overlap) + .setExplodeSplits(explode_splits) + .setPatternsAreRegex(patterns_are_regex) + .setSplitPatterns(split_patterns) + .setTrimWhitespace(trim_whitespace) + .setKeepSeparators(keep_separators), + ] + ) + ) diff --git a/johnsnowlabs/llm.py b/johnsnowlabs/llm.py new file mode 100644 index 0000000000..bcb525fe95 --- /dev/null +++ b/johnsnowlabs/llm.py @@ -0,0 +1 @@ +from johnsnowlabs.frameworks import embedding_retrieval diff --git a/johnsnowlabs/nlp.py b/johnsnowlabs/nlp.py index ca53916bc3..6f3f7962ec 100644 --- a/johnsnowlabs/nlp.py +++ b/johnsnowlabs/nlp.py @@ -1,6 +1,10 @@ from johnsnowlabs import lab, settings, viz from johnsnowlabs.abstract_base.lib_resolver import try_import_lib -from 
johnsnowlabs.auto_install.databricks.endpoints import query_and_deploy_if_missing +from johnsnowlabs.auto_install.databricks.endpoints import ( + query_and_deploy_if_missing, + query_endpoint, + deploy_endpoint, +) from johnsnowlabs.auto_install.databricks.work_utils import run_in_databricks from johnsnowlabs.auto_install.emr.work_utils import run_in_emr diff --git a/johnsnowlabs/settings.py b/johnsnowlabs/settings.py index 47d2328946..cff6ee7594 100644 --- a/johnsnowlabs/settings.py +++ b/johnsnowlabs/settings.py @@ -11,21 +11,19 @@ # These versions are used for auto-installs and version checks +raw_version_jsl_lib = "5.1.8" -raw_version_jsl_lib = "5.1.7" +raw_version_nlp = "5.1.4" -raw_version_nlp = "5.1.2" - -raw_version_nlu = "5.0.4rc2" - +raw_version_nlu = "5.1.0" raw_version_pyspark = "3.1.2" raw_version_nlp_display = "4.1" -raw_version_medical = "5.1.2" -raw_version_secret_medical = "5.1.2" +raw_version_medical = "5.1.3" +raw_version_secret_medical = "5.1.3" raw_version_secret_ocr = "5.0.2" raw_version_ocr = "5.0.2" diff --git a/johnsnowlabs/utils/boto_utils.py b/johnsnowlabs/utils/boto_utils.py index 074ed5a3e3..91301e5072 100644 --- a/johnsnowlabs/utils/boto_utils.py +++ b/johnsnowlabs/utils/boto_utils.py @@ -1,8 +1,3 @@ -from os import environ - -import boto3 - - class BotoException(Exception): def __init__(self, code, message): self.code = code diff --git a/johnsnowlabs/utils/s3_utils.py b/johnsnowlabs/utils/s3_utils.py index cee0703220..6f87fc29a3 100644 --- a/johnsnowlabs/utils/s3_utils.py +++ b/johnsnowlabs/utils/s3_utils.py @@ -1,8 +1,5 @@ from typing import Tuple -import boto3 -import botocore - from johnsnowlabs.utils.boto_utils import BotoException @@ -11,11 +8,13 @@ def parse_s3_url(s3_url: str) -> Tuple[str, str]: return s3_url.split("/")[2], "/".join(s3_url.split("/")[3:]).rstrip("/") -def create_bucket(boto_session: boto3.Session, bucket: str): +def create_bucket(boto_session: "boto3.Session", bucket: str): """Create a bucket for EMR cluster logs :param boto_session: Botocore session :param bucket: Bucket name """ + import botocore + try: s3_client = boto_session.client("s3") region = boto_session.region_name @@ -37,11 +36,13 @@ def create_bucket(boto_session: boto3.Session, bucket: str): ) -def check_if_file_exists_in_s3(boto_session: boto3.Session, s3_url: str): +def check_if_file_exists_in_s3(boto_session: "boto3.Session", s3_url: str): """Check if file exists in s3 using s3 client :param boto_session : Botocore session :param s3_url: S3 url to check """ + import botocore + try: s3_client = boto_session.client("s3") bucket, key = parse_s3_url(s3_url) @@ -55,7 +56,7 @@ def check_if_file_exists_in_s3(boto_session: boto3.Session, s3_url: str): def upload_file_to_s3( - boto_session: boto3.Session, file_path: str, bucket: str, file_name: str + boto_session: "boto3.Session", file_path: str, bucket: str, file_name: str ) -> str: """Upload a file to s3 bucket :botocore_session: Botocore session @@ -64,6 +65,8 @@ def upload_file_to_s3( :param file_name: File name to create :return s3_url: S3 url of uploaded file """ + import botocore + try: s3_client = boto_session.client("s3") s3_client.upload_file(file_path, bucket, file_name) @@ -75,8 +78,10 @@ def upload_file_to_s3( def upload_content( - boto_session: boto3.Session, content: str, bucket: str, file_name: str + boto_session: "boto3.Session", content: str, bucket: str, file_name: str ) -> str: + import botocore + """Upload content to s3 bucket :param boto_session: Botocore session :param content: Content to upload diff 
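The s3_utils changes above defer the boto3/botocore imports: the type hints become string annotations and `botocore` is imported inside each function, so the module can be imported even when the AWS dependencies are absent. The same pattern in isolation; `upload_report` is a made-up helper, not part of the library:

```python
# Illustrative sketch of the deferred-import pattern used above; upload_report is hypothetical.
def upload_report(boto_session: "boto3.Session", bucket: str, key: str, body: str) -> str:
    # boto3 appears only inside a string annotation and botocore is imported lazily,
    # so neither package is required until this function is actually called.
    import botocore

    try:
        boto_session.client("s3").put_object(Bucket=bucket, Key=key, Body=body)
    except botocore.exceptions.ClientError as e:
        raise RuntimeError(f"Upload to s3://{bucket}/{key} failed: {e}") from e
    return f"s3://{bucket}/{key}"
```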
--git a/johnsnowlabs/utils/sparksession_utils.py b/johnsnowlabs/utils/sparksession_utils.py index 5e1434a82f..1e2cb7d417 100644 --- a/johnsnowlabs/utils/sparksession_utils.py +++ b/johnsnowlabs/utils/sparksession_utils.py @@ -115,6 +115,14 @@ def start( store_in_jsl_home=store_in_jsl_home, ) + try: + # We use this to resolve some obscure import bugs during .check_installed for some envs + Software.spark_nlp.check_installed(None) if spark_nlp else None + Software.spark_ocr.check_installed(None) if suite.hc else None + Software.spark_hc.check_installed(None) if suite.ocr else None + except: + pass + # Collect all local Jar Paths we have access to for the SparkSession jars = [] if ( diff --git a/notebooks/databricks_endpoints_tutorial.ipynb b/notebooks/databricks_endpoints_tutorial.ipynb index 64f1d0ed1f..414cdfe210 100644 --- a/notebooks/databricks_endpoints_tutorial.ipynb +++ b/notebooks/databricks_endpoints_tutorial.ipynb @@ -1,2478 +1,2420 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "2bbd4bff-33b8-49af-9957-46bd65875cf8", - "showTitle": false, - "title": "" - } - }, - "source": [ - "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", - "# John Snow Labs Models for Databricks Serve Endpoints \n", - "You can Query&Deploy John Snow Labs models with 1 line of code as [Databricks Model Serve Endpoints](https://docs.databricks.com/en/machine-learning/model-serving/index.html). \n", - "Data is passed to the [predict()](https://nlp.johnsnowlabs.com/docs/en/jsl/predict_api) function and predictions are shaped accordingly. \n", - "You must create endpoints from a Databricks cluster created by [nlp.install](https://nlp.johnsnowlabs.com/docs/en/jsl/install_advanced#automatic-databricks-installation)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "dbcb2375-3aef-402c-82be-4d8ec130e862", - "showTitle": false, - "title": "" - } - }, - "source": [ - "# 0) Setup your env\n", - "Install relevant libs & set license env" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "e41b5144-55f9-47a5-8b59-67ebe6f16bde", - "showTitle": false, - "title": "" - }, - "pycharm": { - "is_executing": true - } - }, - "outputs": [], - "source": [ - "%pip install -U mlflow_by_johnsnowlabs\n", - "dbutils.library.restartPython()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "08c65230-719f-48f2-be79-ddedbc73cf5d", - "showTitle": false, - "title": "" - } - }, - "source": [ - "# 1) Provide Credentials\n", - "You need\n", - "1. John Snow Labs License JSON\n", - "2. Databricks Access Token & Databricks Host URL" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "b0dacf7a-5c0a-4b9b-a070-c63fa2d6b3a4", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "
", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "type": "html" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "from johnsnowlabs import nlp\n", - "import mlflow\n", - "import os\n", - "\n", - "# Enter your John Snow Labs Container License. Your John Snow Labs Container License. NOT your Datbricks cluster license\n", - "endpoint_license = \"\"\"\n", - "{\n", - " \"SPARK_NLP_LICENSE\": \"\",\n", - " \n", - " \"SECRET\":\"\",\n", - " \"JSL_VERSION\":\"\",\n", - " \"PUBLIC_VERSION\":\"\",\n", - "\n", - " \"AWS_ACCESS_KEY_ID\": \"\",\n", - " \"AWS_SECRET_ACCESS_KEY\": \"\",\n", - "}\n", - "\"\"\"\n", - "os.environ['JOHNSNOWLABS_LICENSE_JSON_FOR_CONTAINER'] = endpoint_license\n", - "os.environ['JOHNSNOWLABS_LICENSE_JSON'] = endpoint_license\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "314cc765-d946-448e-b64a-8565dccd5d81", - "showTitle": false, - "title": "" - } - }, - "source": [ - "# 2) Deploy John Snow Labs Model & Query\n", - "\n", - "\n", - "\n", - "\n", - "[nlp.query_and_deploy_if_missing](https://nlp.johnsnowlabs.com/docs/en/jsl/databricks-utils#endpoint-creation) will register a Mlflow model if not already exiting and deploy it as an edpoint if not already existing and query it. \n", - "\n", - "\n", - "\n", - "\n", - "`nlp.query_and_deploy_if_missing` has the following parameters related to **deploying your model**:\n", - "\n", - "| Parameter | Description |\n", - "|------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| \n", - "| `model` | Model to be deployed as endpoint which is [converted into NluPipelines](https://nlp.johnsnowlabs.com/docs/en/jsl/utils_for_spark_nlp#nlptonlupipepipe), supported classes are: `String` Reference to NLU Pipeline name like 'bert', `NLUPipeline`, `List[Annotator]`, `Pipeline`, `LightPipeline`, `PretrainedPipeline`, `PipelineModel`, |\n", - "| `query` | str or list of strings or raw json string. If raw json, is_json_query must be True |\n", - "| `is_json_query` | if True, query is treated as raw json string |\n", - "| `base_name` | Name-Prefix for all resources created (Endpoints, Models, etc). If using non nlu referenced based models, you must specify this. |\n", - "| `re_create_endpoint` | if False, endpoint creation is skipped if one already exists. If True, it will delete existing endpoint if it exists |\n", - "| `re_create_model` | if False, model creation is skipped if one already exists. If True, model will be re-logged again, bumping the current version by 2 |\n", - "| `workload_size` | one of Small, Medium, Large. |\n", - "| `gpu` | `True`/`False` to load GPU-optimized jars or CPU-optimized jars in the container |\n", - "| `new_run` | if True, mlflow will start a new run before logging the model |\n", - "| `db_host` | the databricks host URL. If not specified, the DATABRICKS_HOST environment variable is used |\n", - "| `db_token` | the databricks Access Token. 
If not specified, the DATABRICKS_TOKEN environment variable is used |\n", - "| `block_until_deployed` | if True, this function will block until the endpoint is created |\n", - "\n", - "\n", - "\n", - "\n", - "`nlp.query_and_deploy_if_missing` has the following parameters related to **querying your model**, \n", - "which are forwarded to the [model.predict()](https://nlp.johnsnowlabs.com/docs/en/jsl/predict_api) call:\n", - "\n", - "| Parameter | Description |\n", - "|-----------------------------|----------------------------------------------------------------------------------------------------| \n", - "| `output_level` | One of `token`, `chunk`, `sentence`, `relation`, `document` to shape outputs | \n", - "| `positions` | Set `True`/`False` to include or exclude character index position of predictions | \n", - "| `metadata` | Set `True`/`False` to include additional metadata | \n", - "| `drop_irrelevant_cols` | Set `True`/`False` to drop irrelevant columns | \n", - "| `get_embeddings` | Set `True`/`False` to include embedding or not | \n", - "| `keep_stranger_features` | Set `True`/`False` to return columns not named \"text\", 'image\" or \"file_type\" from your input data | \n", - "| `multithread` | Set `True`/`False` to use multi-Threading for inference. Auto-inferred if not set | \n", - "\n", - "\n", - "`nlp.query_and_deploy_if_missing` checks the following Env vars \n", - "\n", - "| Env Var Name | Description | \n", - "|-----------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------|\n", - "| `HEALTHCARE_SECRET` | Automatically set on your cluster if you run nlp.install() |\n", - "| `VISUAL_SECRET` | Automatically set if you run. `nlp.install(..., visual=True)`. You can only spawn visual endpoint from a cluster created by `nlp.install(..., visual=True)` |\n", - "| `JOHNSNOWLABS_LICENSE_JSON` | JSON content of your john snow labs licensed to use for endpoints. Should be **airgap license** |" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "5deb994e-db2a-43c8-82f0-ed809352870b", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "\n", - "general_text = \"\"\"John Snow is a medical doctor from England. Peter likes America\"\"\"\n", - "\n", - "cancer_text = \"\"\" with his breathing while feeding (but negative for any perioral cyanosis or retractions). One day ago, mom also noticed a tactile temperature and gave the patient Tylenol. Baby-girl also has had some decreased p.o. intake. His normal breast-feeding is down from 20 minutes q.2h. to 5 to 10 minutes s\"\"\"\n", - "\n", - "ade_text = \"\"\"So glad I am off effexor, so sad it ruined my teeth. tip Please be carefull taking antideppresiva and read about it 1st\"\"\"\n", - "\n", - "body_re_text = \"\"\"MRI demonstrated infarction in the upper brain stem , left cerebellum and right basil ganglia\"\"\"\n", - "\n", - "radiology_text = \"\"\"INTERPRETATION: There has been interval development of a moderate left-sided pneumothorax with near complete collapse of the left upper lobe. The lower lobe appears aerated. There is stable, diffuse, bilateral interstitial thickening with no definite acute air space consolidation. The heart and pulmonary vascularity are within normal limits. 
Left-sided port is seen with Groshong tip at the SVC/RA junction. No evidence for acute fracture, malalignment, or dislocation.\"\"\"\n", - "\n", - "long_text = \"\"\"\n", - "Patient with hypertension, syncope, and spinal stenosis - for recheck.\n", - "(Medical Transcription Sample Report)\n", - "SUBJECTIVE:\n", - "The patient is a 78-year-old female who returns for recheck. She has hypertension. She denies difficulty with chest pain, palpations, orthopnea, nocturnal dyspnea, or edema.\n", - "PAST MEDICAL HISTORY / SURGERY / HOSPITALIZATIONS:\n", - "Reviewed and unchanged from the dictation on 12/03/2003.\n", - "MEDICATIONS:\n", - "Atenolol 50 mg daily, Premarin 0.625 mg daily, calcium with vitamin D two to three pills daily, multivitamin daily, aspirin as needed, and TriViFlor 25 mg two pills daily. She also has Elocon cream 0.1% and Synalar cream 0.01% that she uses as needed for rash\n", - "\"\"\"\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "18b8f61c-974b-4f58-ba33-b1232bb32cca", - "showTitle": false, - "title": "" - } - }, - "outputs": [ + "cells": [ { - "data": { - "text/html": [ - "\n", - "
Deleting registered model ner_REGISTERD_MODEL\n", - "Warning::Spark Session already created, some configs may not take.\n", - "Warning::Spark Session already created, some configs may not take.\n", - "onto_recognize_entities_sm download started this may take some time.\n", - "Approx size to download 159 MB\n", - "\r[ | ]\r[OK!]\n", - "Warning::Spark Session already created, some configs may not take.\n", - "2023/09/03 03:24:48 WARNING mlflow.models.model: Model logged without a signature. Signatures will be required for upcoming model registry features as they validate model inputs and denote the expected schema of model outputs. Please visit https://www.mlflow.org/docs/2.30.0/models.html#set-signature-on-logged-model for instructions on setting a model signature on your logged model.\n", - "Successfully registered model 'ner_REGISTERD_MODEL'.\n", - "2023/09/03 03:24:49 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: ner_REGISTERD_MODEL, version 1\n", - "Created version '1' of model 'ner_REGISTERD_MODEL'.\n", - "2023/09/03 03:27:35 WARNING mlflow.models.model: Model logged without a signature. Signatures will be required for upcoming model registry features as they validate model inputs and denote the expected schema of model outputs. Please visit https://www.mlflow.org/docs/2.30.0/models.html#set-signature-on-logged-model for instructions on setting a model signature on your logged model.\n", - "Registered model 'ner_REGISTERD_MODEL' already exists. Creating a new version of this model...\n", - "2023/09/03 03:27:36 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: ner_REGISTERD_MODEL, version 2\n", - "Created version '2' of model 'ner_REGISTERD_MODEL'.\n", - "Deleting exisiting Endpoint ner_ENDPOINT\n", - "Writing license to scope JSL_SCOPE\n", - "Creating new serving endpoint: ner_ENDPOINT\n", - "Deployment starting, this may take 10 to 20 minutes...\n", - "Created serving endpoint ner_ENDPOINT at https://dbc-3d4c44aa-a512.cloud.databricks.com/#mlflow/endpoints/ner_ENDPOINT\n", - "Out[4]:
" + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "2bbd4bff-33b8-49af-9957-46bd65875cf8", + "showTitle": false, + "title": "" + }, + "id": "baySg2v7jNUG" + }, + "source": [ + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "# John Snow Labs Models for Databricks Serve Endpoints\n", + "You can Query&Deploy John Snow Labs models with 1 line of code as [Databricks Model Serve Endpoints](https://docs.databricks.com/en/machine-learning/model-serving/index.html). \n", + "Data is passed to the [predict()](https://nlp.johnsnowlabs.com/docs/en/jsl/predict_api) function and predictions are shaped accordingly. \n", + "You must create endpoints from a Databricks cluster created by [nlp.install](https://nlp.johnsnowlabs.com/docs/en/jsl/install_advanced#automatic-databricks-installation).\n" ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "
Deleting registered model ner_REGISTERD_MODEL\nWarning::Spark Session already created, some configs may not take.\nWarning::Spark Session already created, some configs may not take.\nonto_recognize_entities_sm download started this may take some time.\nApprox size to download 159 MB\n\r[ | ]\r[OK!]\nWarning::Spark Session already created, some configs may not take.\n2023/09/03 03:24:48 WARNING mlflow.models.model: Model logged without a signature. Signatures will be required for upcoming model registry features as they validate model inputs and denote the expected schema of model outputs. Please visit https://www.mlflow.org/docs/2.30.0/models.html#set-signature-on-logged-model for instructions on setting a model signature on your logged model.\nSuccessfully registered model 'ner_REGISTERD_MODEL'.\n2023/09/03 03:24:49 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: ner_REGISTERD_MODEL, version 1\nCreated version '1' of model 'ner_REGISTERD_MODEL'.\n2023/09/03 03:27:35 WARNING mlflow.models.model: Model logged without a signature. Signatures will be required for upcoming model registry features as they validate model inputs and denote the expected schema of model outputs. Please visit https://www.mlflow.org/docs/2.30.0/models.html#set-signature-on-logged-model for instructions on setting a model signature on your logged model.\nRegistered model 'ner_REGISTERD_MODEL' already exists. Creating a new version of this model...\n2023/09/03 03:27:36 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: ner_REGISTERD_MODEL, version 2\nCreated version '2' of model 'ner_REGISTERD_MODEL'.\nDeleting exisiting Endpoint ner_ENDPOINT\nWriting license to scope JSL_SCOPE\nCreating new serving endpoint: ner_ENDPOINT\nDeployment starting, this may take 10 to 20 minutes...\nCreated serving endpoint ner_ENDPOINT at https://dbc-3d4c44aa-a512.cloud.databricks.com/#mlflow/endpoints/ner_ENDPOINT\nOut[4]:
", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "type": "html" - } - }, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
indexdocumententities_from_diskentities_from_disk_classentities_from_disk_confidenceentities_from_disk_origin_chunkentities_from_disk_origin_sentencetextword_embedding_from_disk
00John Snow is a medical doctor from England. Pe...John SnowPERSON0.9836499700John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
10John Snow is a medical doctor from England. Pe...EnglandGPE0.976210John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
20John Snow is a medical doctor from England. Pe...PeterPERSON0.997621John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
30John Snow is a medical doctor from England. Pe...AmericaGPE0.960331John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
\n", - "
" + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "dbcb2375-3aef-402c-82be-4d8ec130e862", + "showTitle": false, + "title": "" + }, + "id": "Kg9n5iYljNUI" + }, + "source": [ + "# 0) Setup your env\n", + "Install relevent libs & set license env" ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
indexdocumententities_from_diskentities_from_disk_classentities_from_disk_confidenceentities_from_disk_origin_chunkentities_from_disk_origin_sentencetextword_embedding_from_disk
00John Snow is a medical doctor from England. Pe...John SnowPERSON0.9836499700John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
10John Snow is a medical doctor from England. Pe...EnglandGPE0.976210John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
20John Snow is a medical doctor from England. Pe...PeterPERSON0.997621John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
30John Snow is a medical doctor from England. Pe...AmericaGPE0.960331John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
\n
", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "textData": null, - "type": "htmlSandbox" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "# Deploy & Query general ner model\n", - "nlp.query_and_deploy_if_missing('ner',\n", - " general_text,\n", - " workload_size='Large'\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "055fb7db-61e8-4e23-9d7e-ac730cafb158", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to cre-create it\n", - "Endpoint ner_ENDPOINT already exists! Set re_create_endpoint=True if you want to re-create it \n", - "Out[7]:
" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "
Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to cre-create it\nEndpoint ner_ENDPOINT already exists! Set re_create_endpoint=True if you want to re-create it \nOut[7]:
", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "type": "html" - } - }, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
indexdocumententities_from_diskentities_from_disk_classentities_from_disk_confidenceentities_from_disk_origin_chunkentities_from_disk_origin_sentenceoutput_leveltextword_embedding_from_disk
00John Snow is a medical doctor from England. Pe...[John Snow, England, Peter, America][PERSON, GPE, PERSON, GPE][0.98364997, 0.9762, 0.9976, 0.9603][0, 1, 2, 3][0, 0, 1, 1]documentJohn Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
\n", - "
" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
indexdocumententities_from_diskentities_from_disk_classentities_from_disk_confidenceentities_from_disk_origin_chunkentities_from_disk_origin_sentenceoutput_leveltextword_embedding_from_disk
00John Snow is a medical doctor from England. Pe...[John Snow, England, Peter, America][PERSON, GPE, PERSON, GPE][0.98364997, 0.9762, 0.9976, 0.9603][0, 1, 2, 3][0, 0, 1, 1]documentJohn Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
\n
", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "textData": null, - "type": "htmlSandbox" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "# example of defining output-level\n", - "nlp.query_and_deploy_if_missing('ner',general_text,output_level='document')\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "a5ebfae5-9b55-4bb3-8a1a-49ea46436bcd", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to cre-create it\n", - "Endpoint ner_ENDPOINT already exists! Set re_create_endpoint=True if you want to re-create it \n", - "Out[24]:
" + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "efb5db15-74a7-45ef-b125-c52e9c2f23be", + "showTitle": false, + "title": "" + }, + "id": "jgs0amD6jNUJ" + }, + "outputs": [], + "source": [ + "%pip install -U johnsnowlabs_for_databricks\n", + "%pip install -U mlflow_by_johnsnowlabs\n", + "dbutils.library.restartPython()" ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "
Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to cre-create it\nEndpoint ner_ENDPOINT already exists! Set re_create_endpoint=True if you want to re-create it \nOut[24]:
", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "type": "html" - } - }, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
indexdocumententities_from_diskentities_from_disk_classentities_from_disk_confidenceentities_from_disk_origin_chunkentities_from_disk_origin_sentencetextword_embedding_from_disk
00John Snow is a medical doctor from England. Pe...John SnowPERSON0.9836499700John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
10John Snow is a medical doctor from England. Pe...EnglandGPE0.976210John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
20John Snow is a medical doctor from England. Pe...PeterPERSON0.997621John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
30John Snow is a medical doctor from England. Pe...AmericaGPE0.960331John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
\n", - "
" + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "08c65230-719f-48f2-be79-ddedbc73cf5d", + "showTitle": false, + "title": "" + }, + "id": "bJQYDASWjNUK" + }, + "source": [ + "# 1) Provide Credentials\n", + "You need\n", + "1. John Snow Labs License JSON\n", + "2. Databricks Access Token & Databricks Host URL" ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
indexdocumententities_from_diskentities_from_disk_classentities_from_disk_confidenceentities_from_disk_origin_chunkentities_from_disk_origin_sentencetextword_embedding_from_disk
00John Snow is a medical doctor from England. Pe...John SnowPERSON0.9836499700John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
10John Snow is a medical doctor from England. Pe...EnglandGPE0.976210John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
20John Snow is a medical doctor from England. Pe...PeterPERSON0.997621John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
30John Snow is a medical doctor from England. Pe...AmericaGPE0.960331John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
\n
", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "textData": null, - "type": "htmlSandbox" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "nlp.query_and_deploy_if_missing('ner',general_text,workload_size='Large')\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "c6b86281-ad2c-45fc-bc76-eb0363035eaa", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to cre-create it\n", - "Endpoint tokenize_ENDPOINT already exists! Set re_create_endpoint=True if you want to re-create it \n", - "Out[5]:
" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "
Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to cre-create it\nEndpoint tokenize_ENDPOINT already exists! Set re_create_endpoint=True if you want to re-create it \nOut[5]:
", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "type": "html" - } - }, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
indextexttoken
00Hello World How Are you!Hello
10Hello World How Are you!World
20Hello World How Are you!How
30Hello World How Are you!Are
40Hello World How Are you!you
50Hello World How Are you!!
\n", - "
" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
indextexttoken
00Hello World How Are you!Hello
10Hello World How Are you!World
20Hello World How Are you!How
30Hello World How Are you!Are
40Hello World How Are you!you
50Hello World How Are you!!
\n
", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "textData": null, - "type": "htmlSandbox" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "nlp.query_and_deploy_if_missing('tokenize', 'Hello World How Are you!')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "6fa0494e-19d5-4009-8c1c-418b36eaa676", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to cre-create it\n", - "Endpoint en_classify_bert_token_ner_jsl_pipeline_ENDPOINT already exists! Set re_create_endpoint=True if you want to re-create it \n", - "Out[8]:
" + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b0dacf7a-5c0a-4b9b-a070-c63fa2d6b3a4", + "showTitle": false, + "title": "" + }, + "id": "7lqsEUdTjNUK", + "outputId": "43d3f3dd-2b1e-4888-b7f0-ba3c9f8c6b04" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + } + } + ], + "source": [ + "from johnsnowlabs import nlp\n", + "import mlflow\n", + "import os\n", + "\n", + "# Enter your John Snow Labs Container License.\n", + "endpoint_license = \"\"\"\n", + "Your John Snow Labs Container License. NOT your Datbricks cluster license\n", + "\"\"\"\n", + "os.environ['JOHNSNOWLABS_LICENSE_JSON_FOR_CONTAINER'] = endpoint_license\n", + "os.environ['JOHNSNOWLABS_LICENSE_JSON'] = endpoint_license\n" ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "
Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to cre-create it\nEndpoint en_classify_bert_token_ner_jsl_pipeline_ENDPOINT already exists! Set re_create_endpoint=True if you want to re-create it \nOut[8]:
", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "type": "html" - } - }, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
indexclassified_tokendocumententitiesentities_classentities_confidenceentities_origin_chunkentities_origin_sentencetext
00[O, B-Demographics, O, O, O, O, O, O, O, O, B-...\\n with his breathing while feeding (but negat...hisDemographics0.999558800\\n with his breathing while feeding (but negat...
10[O, B-Demographics, O, O, O, O, O, O, O, O, B-...\\n with his breathing while feeding (but negat...perioral cyanosisSymptom0.9840110510\\n with his breathing while feeding (but negat...
20[O, B-Demographics, O, O, O, O, O, O, O, O, B-...\\n with his breathing while feeding (but negat...retractionsSymptom0.9992540520\\n with his breathing while feeding (but negat...
30[O, B-Demographics, O, O, O, O, O, O, O, O, B-...\\n with his breathing while feeding (but negat...One day agoDate_Time0.9993855431\\n with his breathing while feeding (but negat...
40[O, B-Demographics, O, O, O, O, O, O, O, O, B-...\\n with his breathing while feeding (but negat...momDemographics0.9998348441\\n with his breathing while feeding (but negat...
50[O, B-Demographics, O, O, O, O, O, O, O, O, B-...\\n with his breathing while feeding (but negat...tactile temperatureSymptom0.99935251\\n with his breathing while feeding (but negat...
60[O, B-Demographics, O, O, O, O, O, O, O, O, B-...\\n with his breathing while feeding (but negat...TylenolDrug0.999762561\\n with his breathing while feeding (but negat...
70[O, B-Demographics, O, O, O, O, O, O, O, O, B-...\\n with his breathing while feeding (but negat...Baby-girlAge0.9805288372\\n with his breathing while feeding (but negat...
80[O, B-Demographics, O, O, O, O, O, O, O, O, B-...\\n with his breathing while feeding (but negat...decreased p.o. intakeSymptom0.9989776682\\n with his breathing while feeding (but negat...
90[O, B-Demographics, O, O, O, O, O, O, O, O, B-...\\n with his breathing while feeding (but negat...HisDemographics0.9999116793\\n with his breathing while feeding (but negat...
100[O, B-Demographics, O, O, O, O, O, O, O, O, B-...\\n with his breathing while feeding (but negat...breast-feedingBody_Part0.9993943103\\n with his breathing while feeding (but negat...
\n", - "
" + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "314cc765-d946-448e-b64a-8565dccd5d81", + "showTitle": false, + "title": "" + }, + "id": "JiBB4vq_jNUL" + }, + "source": [ + "# 2) Deploy John Snow Labs Model & Query\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "## Endpoint Creation\n", + "\n", + "You can Query&Deploy John Snow Labs models with 1 line of code as [Databricks Model Serve Endpoints](https://docs.databricks.com/en/machine-learning/model-serving/index.html). \n", + "Data is passed to the [predict()](https://nlp.johnsnowlabs.com/docs/en/jsl/predict_api) function and predictions are shaped accordingly. \n", + "You must create endpoints from a Databricks cluster created by [nlp.install](https://nlp.johnsnowlabs.com/docs/en/jsl/install_advanced#automatic-databricks-installation).\n", + "\n", + "See [Cluster Creation Notebook](https://github.com/JohnSnowLabs/johnsnowlabs/tree/main/notebooks/create_databricks_cluster.ipynb)\n", + "and [Databricks Endpoint Tutorial Notebook](https://github.com/JohnSnowLabs/johnsnowlabs/tree/main/notebooks/databricks_endpoints_tutorial.ipynb) \n", + "\n", + "\n", + "```python\n", + "# You need `mlflow_by_johnsnowlabs` installed until next mlflow is released\n", + "! pip install mlflow_by_johnsnowlabs\n", + "from johnsnowlabs import nlp\n", + "nlp.deploy_endpoint('bert')\n", + "nlp.query_endpoint('bert_ENDPOINT','My String to embed')\n", + "```\n", + "\n", + "`nlp.deploy_endpoint` will register a ML-FLow model into your registry and deploy an Endpoint with a JSL license.\n", + "It has the following parameters:\n", + "\n", + "| Parameter | Description |\n", + "|------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n", + "| `model` | Model to be deployed as endpoint which is [converted into NluPipelines](https://nlp.johnsnowlabs.com/docs/en/jsl/utils_for_spark_nlp#nlptonlupipepipe), supported classes are: `String` Reference to NLU Pipeline name like 'bert', `NLUPipeline`, `List[Annotator]`, `Pipeline`, `LightPipeline`, `PretrainedPipeline`, `PipelineModel`. In case of a NLU reference, the endpoint name is auto-generated aus `_ENDPOINT` i.e. `bert_ENDPOINT` |\n", + "| `endpoint_name` | Name for the deployed endpoint. Optional if using NLU model reference but mandatory for custom pipelines. |\n", + "| `re_create_endpoint` | if False, endpoint creation is skipped if one already exists. If True, it will delete existing endpoint if it exists |\n", + "| `re_create_model` | if False, model creation is skipped if one already exists. If True, model will be re-logged again, bumping the current version by 2 |\n", + "| `workload_size` | one of Small, Medium, Large. |\n", + "| `gpu` | `True`/`False` to load GPU-optimized jars or CPU-optimized jars in the container. 
Must use a gpu based `workload_type` if `gpu=true` |\n", + "| `new_run` | if True, mlflow will start a new run before logging the model |\n", + "| `block_until_deployed` | if True, this function will block until the endpoint is created |\n", + "| `workload_type` | `CPU` by default, use `GPU_SMALL` to spawn a GPU based endpoint instead. Check Databricks docs for alternative values |\n", + "| `db_host` | the databricks host URL. If not specified, the DATABRICKS_HOST environment variable is used |\n", + "| `db_token` | the databricks Access Token. If not specified, the DATABRICKS_TOKEN environment variable is used |\n", + "\n", + "`nlp.query_endpoint` translates your query to JSON, sends it to the endpoint and returns the result as pandas DataFrame.\n", + "It has the following parameters which are forwarded to the [model.predict()](https://nlp.johnsnowlabs.com/docs/en/jsl/predict_api) call inside of the endpoint:\n", + "\n", + "| Parameter | Description |\n", + "|-----------------------------|----------------------------------------------------------------------------------------------------|\n", + "| `endpoint_name` | Name of the endpoint to query. If used an NLU reference, its simply `_ENDPOINT` |\n", + "| `query` | str or list of strings or raw json string. If raw json, is_json_query must be True |\n", + "| `is_json_query` | if True, query is treated as raw json string |\n", + "| `output_level` | One of `token`, `chunk`, `sentence`, `relation`, `document` to shape outputs |\n", + "| `positions` | Set `True`/`False` to include or exclude character index position of predictions |\n", + "| `metadata` | Set `True`/`False` to include additional metadata |\n", + "| `drop_irrelevant_cols` | Set `True`/`False` to drop irrelevant columns |\n", + "| `get_embeddings` | Set `True`/`False` to include embedding or not |\n", + "| `keep_stranger_features` | Set `True`/`False` to return columns not named \"text\", 'image\" or \"file_type\" from your input data |\n", + "| `multithread` | Set `True`/`False` to use multi-Threading for inference. Auto-inferred if not set |\n", + "| `db_host` | the databricks host URL. If not specified, the DATABRICKS_HOST environment variable is used |\n", + "| `db_token` | the databricks Access Token. If not specified, the DATABRICKS_TOKEN environment variable is used |\n", + "\n", + "\n", + "\n", + "`nlp.query_endpoint` and `nlp.deploy_endpoint` check the following **mandatory** env vars to resolve wheels for endpoints\n", + "\n", + "| Env Var Name | Description |\n", + "|-----------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------|\n", + "| `HEALTHCARE_SECRET` | Automatically set on your cluster if you run nlp.install() |\n", + "| `VISUAL_SECRET` | Automatically set if you run. `nlp.install(..., visual=True)`. You can only spawn visual endpoint from a cluster created by `nlp.install(..., visual=True)` |\n", + "| `JOHNSNOWLABS_LICENSE_JSON` | JSON content of your john snow labs licensed to use for endpoints. Should be **airgap license** |\n", + "\n" ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
indexclassified_tokendocumententitiesentities_classentities_confidenceentities_origin_chunkentities_origin_sentencetext
00[O, B-Demographics, O, O, O, O, O, O, O, O, B-...\\n with his breathing while feeding (but negat...hisDemographics0.999558800\\n with his breathing while feeding (but negat...
10[O, B-Demographics, O, O, O, O, O, O, O, O, B-...\\n with his breathing while feeding (but negat...perioral cyanosisSymptom0.9840110510\\n with his breathing while feeding (but negat...
20[O, B-Demographics, O, O, O, O, O, O, O, O, B-...\\n with his breathing while feeding (but negat...retractionsSymptom0.9992540520\\n with his breathing while feeding (but negat...
30[O, B-Demographics, O, O, O, O, O, O, O, O, B-...\\n with his breathing while feeding (but negat...One day agoDate_Time0.9993855431\\n with his breathing while feeding (but negat...
40[O, B-Demographics, O, O, O, O, O, O, O, O, B-...\\n with his breathing while feeding (but negat...momDemographics0.9998348441\\n with his breathing while feeding (but negat...
50[O, B-Demographics, O, O, O, O, O, O, O, O, B-...\\n with his breathing while feeding (but negat...tactile temperatureSymptom0.99935251\\n with his breathing while feeding (but negat...
60[O, B-Demographics, O, O, O, O, O, O, O, O, B-...\\n with his breathing while feeding (but negat...TylenolDrug0.999762561\\n with his breathing while feeding (but negat...
70[O, B-Demographics, O, O, O, O, O, O, O, O, B-...\\n with his breathing while feeding (but negat...Baby-girlAge0.9805288372\\n with his breathing while feeding (but negat...
80[O, B-Demographics, O, O, O, O, O, O, O, O, B-...\\n with his breathing while feeding (but negat...decreased p.o. intakeSymptom0.9989776682\\n with his breathing while feeding (but negat...
90[O, B-Demographics, O, O, O, O, O, O, O, O, B-...\\n with his breathing while feeding (but negat...HisDemographics0.9999116793\\n with his breathing while feeding (but negat...
100[O, B-Demographics, O, O, O, O, O, O, O, O, B-...\\n with his breathing while feeding (but negat...breast-feedingBody_Part0.9993943103\\n with his breathing while feeding (but negat...
\n
", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "textData": null, - "type": "htmlSandbox" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "# https://nlp.johnsnowlabs.com/2023/06/17/bert_token_classifier_ner_jsl_pipeline_en.html\n", - "nlp.query_and_deploy_if_missing('en.classify.bert_token_ner_jsl.pipeline',cancer_text)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "14634db9-c400-45fa-ade1-511f94150f70", - "showTitle": false, - "title": "" - } - }, - "outputs": [ + }, { - "data": { - "text/html": [ - "\n", - "
Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to cre-create it\n", - "Endpoint ner_ENDPOINT already exists! Set re_create_endpoint=True if you want to re-create it \n", - "Out[9]:
" + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "5deb994e-db2a-43c8-82f0-ed809352870b", + "showTitle": false, + "title": "" + }, + "id": "-RMtuWefjNUM", + "outputId": "9eb77692-9773-41e2-e03d-b64340abeac6" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + } + } + ], + "source": [ + "general_text = \"John Snow is a medical doctor from England. Peter likes America\"\n", + "cancer_text = \"\"\"with his breathing while feeding (but negative for any perioral cyanosis or retractions). One day ago, mom also noticed a tactile temperature and gave the patient Tylenol. Baby-girl also has had some decreased p.o. intake. His normal breast-feeding is down from 20 minutes q.2h. to 5 to 10 minutes s\"\"\"\n", + "\n", + "ade_text = \"\"\"So glad I am off effexor, so sad it ruined my teeth. tip Please be carefull taking antideppresiva and read about it 1st\"\"\"\n", + "\n", + "body_re_text = \"\"\"MRI demonstrated infarction in the upper brain stem , left cerebellum and right basil ganglia\"\"\"\n", + "radiology_text = \"\"\"INTERPRETATION: There has been interval development of a moderate left-sided pneumothorax with near complete collapse of the left upper lobe. The lower lobe appears aerated. There is stable, diffuse, bilateral interstitial thickening with no definite acute air space consolidation. The heart and pulmonary vascularity are within normal limits. Left-sided port is seen with Groshong tip at the SVC/RA junction. No evidence for acute fracture, malalignment, or dislocation.\"\"\"\n", + "\n", + "\n", + "long_text = \"\"\"\n", + "Patient with hypertension, syncope, and spinal stenosis - for recheck.\n", + "(Medical Transcription Sample Report)\n", + "SUBJECTIVE:\n", + "The patient is a 78-year-old female who returns for recheck. She has hypertension. She denies difficulty with chest pain, palpations, orthopnea, nocturnal dyspnea, or edema.\n", + "PAST MEDICAL HISTORY / SURGERY / HOSPITALIZATIONS:\n", + "Reviewed and unchanged from the dictation on 12/03/2003.\n", + "MEDICATIONS:\n", + "Atenolol 50 mg daily, Premarin 0.625 mg daily, calcium with vitamin D two to three pills daily, multivitamin daily, aspirin as needed, and TriViFlor 25 mg two pills daily. She also has Elocon cream 0.1% and Synalar cream 0.01% that she uses as needed for rash\n", + "\"\"\"\n", + "all_data = [general_text,cancer_text,ade_text,body_re_text, radiology_text, long_text ]" ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "
Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to cre-create it\nEndpoint ner_ENDPOINT already exists! Set re_create_endpoint=True if you want to re-create it \nOut[9]:
", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "type": "html" - } - }, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
indexdocumententities_from_diskentities_from_disk_classentities_from_disk_confidenceentities_from_disk_origin_chunkentities_from_disk_origin_sentencetextword_embedding_from_disk
00John Snow is a medical doctor from England. Pe...John SnowPERSON0.9836499700John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
10John Snow is a medical doctor from England. Pe...EnglandGPE0.976210John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
20John Snow is a medical doctor from England. Pe...PeterPERSON0.997621John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
30John Snow is a medical doctor from England. Pe...AmericaGPE0.960331John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
\n", - "
" + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "290afadb-e662-4d10-964a-68b27551c614", + "showTitle": false, + "title": "" + }, + "id": "ahKcmZHdjNUM", + "outputId": "046ff9fb-1815-4f1f-a40c-a7d3cf9819d7" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to re-create it\n",
+ "Writing license to scope JSL_SCOPE\n",
+ "Creating new serving endpoint: tokenize_ENDPOINT\n",
+ "Deployment starting, this may take 10 to 20 minutes...\n",
+ "Created serving endpoint tokenize_ENDPOINT at https://dbc-3d4c44aa-a512.cloud.databricks.com/#mlflow/endpoints/tokenize_ENDPOINT\n",
+ "Out[16]:
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to re-create it\nWriting license to scope JSL_SCOPE\nCreating new serving endpoint: tokenize_ENDPOINT\nDeployment starting, this may take 10 to 20 minutes...\nCreated serving endpoint tokenize_ENDPOINT at https://dbc-3d4c44aa-a512.cloud.databricks.com/#mlflow/endpoints/tokenize_ENDPOINT\nOut[16]:
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + } + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indextexttoken
00John Snow is a medical doctor from England. Pe...John
10John Snow is a medical doctor from England. Pe...Snow
20John Snow is a medical doctor from England. Pe...is
30John Snow is a medical doctor from England. Pe...a
40John Snow is a medical doctor from England. Pe...medical
............
3065\\nPatient with hypertension, syncope, and spin...uses
3075\\nPatient with hypertension, syncope, and spin...as
3085\\nPatient with hypertension, syncope, and spin...needed
3095\\nPatient with hypertension, syncope, and spin...for
3105\\nPatient with hypertension, syncope, and spin...rash
\n", + "

311 rows × 3 columns

\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
indextexttoken
00John Snow is a medical doctor from England. Pe...John
10John Snow is a medical doctor from England. Pe...Snow
20John Snow is a medical doctor from England. Pe...is
30John Snow is a medical doctor from England. Pe...a
40John Snow is a medical doctor from England. Pe...medical
............
3065\\nPatient with hypertension, syncope, and spin...uses
3075\\nPatient with hypertension, syncope, and spin...as
3085\\nPatient with hypertension, syncope, and spin...needed
3095\\nPatient with hypertension, syncope, and spin...for
3105\\nPatient with hypertension, syncope, and spin...rash
\n

311 rows × 3 columns

\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + } + } + ], + "source": [ + "# simple tokenizer endpoint\n", + "nlp.deploy_endpoint('tokenize')\n", + "nlp.query_endpoint('tokenize_ENDPOINT',all_data)" ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
indexdocumententities_from_diskentities_from_disk_classentities_from_disk_confidenceentities_from_disk_origin_chunkentities_from_disk_origin_sentencetextword_embedding_from_disk
00John Snow is a medical doctor from England. Pe...John SnowPERSON0.9836499700John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
10John Snow is a medical doctor from England. Pe...EnglandGPE0.976210John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
20John Snow is a medical doctor from England. Pe...PeterPERSON0.997621John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
30John Snow is a medical doctor from England. Pe...AmericaGPE0.960331John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
\n
", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "textData": null, - "type": "htmlSandbox" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "# Deploy & Query general ner model\n", - "nlp.query_and_deploy_if_missing('ner',general_text,workload_size='Large')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "74638827-ea43-4bfa-9e67-49f37cd07114", - "showTitle": false, - "title": "" - } - }, - "outputs": [ + }, { - "data": { - "text/html": [ - "\n", - "
Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to cre-create it\n", - "Endpoint en_relation_bodypart_directions_pipeline_ENDPOINT already exists! Set re_create_endpoint=True if you want to re-create it \n", - "Out[13]:
" + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4405286c-65c1-4ad5-8f54-2a017c9c9760", + "showTitle": false, + "title": "" + }, + "id": "pSWhjYF9jNUM", + "outputId": "8cebfaae-603e-4a54-c9bb-57b42738c6f0" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to re-create it\n",
+ "Endpoint ner_ENDPOINT already exists! Set re_create_endpoint=True if you want to re-create it \n",
+ "Out[10]:
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to re-create it\nEndpoint ner_ENDPOINT already exists! Set re_create_endpoint=True if you want to re-create it \nOut[10]:
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + } + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexdocumententities_from_diskentities_from_disk_classentities_from_disk_confidenceentities_from_disk_origin_chunkentities_from_disk_origin_sentencetextword_embedding_from_disk
00John Snow is a medical doctor from England. Pe...John SnowPERSON0.9836499700John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
10John Snow is a medical doctor from England. Pe...EnglandGPE0.976210John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
20John Snow is a medical doctor from England. Pe...PeterPERSON0.997621John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
30John Snow is a medical doctor from England. Pe...AmericaGPE0.960331John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
indexdocumententities_from_diskentities_from_disk_classentities_from_disk_confidenceentities_from_disk_origin_chunkentities_from_disk_origin_sentencetextword_embedding_from_disk
00John Snow is a medical doctor from England. Pe...John SnowPERSON0.9836499700John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
10John Snow is a medical doctor from England. Pe...EnglandGPE0.976210John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
20John Snow is a medical doctor from England. Pe...PeterPERSON0.997621John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
30John Snow is a medical doctor from England. Pe...AmericaGPE0.960331John Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + } + } + ], + "source": [ + "# Deploy & Query general ner model. Default output level is auto-inferred to chunk\n", + "nlp.deploy_endpoint('ner')\n", + "nlp.query_endpoint('ner_ENDPOINT', general_text)\n" ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "
Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to cre-create it\nEndpoint en_relation_bodypart_directions_pipeline_ENDPOINT already exists! Set re_create_endpoint=True if you want to re-create it \nOut[13]:
", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "type": "html" - } - }, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
indexdocumententities_from_diskentities_from_disk_classentities_from_disk_confidenceentities_from_disk_origin_chunkentities_from_disk_origin_sentenceposrelationrelation_confidencerelation_entity1relation_entity1_beginrelation_entity1_classrelation_entity1_endrelation_entity2relation_entity2_beginrelation_entity2_classrelation_entity2_endrelation_origin_sentencetextunlabeled_dependencyword_embedding_from_disk
00MRI demonstrated infarction in the upper brain...[MRI, infarction, upper, brain stem, left, cer...[Test, Disease_Syndrome_Disorder, Direction, I...[0.998, 0.4944, 0.2124, 0.27865, 0.5153, 0.790...[0, 1, 2, 3, 4, 5, 6, 7][0, 0, 0, 0, 0, 0, 0, 0][NN, VVD, NN, II, DD, JJ, NN, NN, NN, JJ, NN, ...10.27865upper35Direction39brain stem41Internal_organ_or_component500MRI demonstrated infarction in the upper brain...[infarction, infarction, ROOT, stem, stem, ste...[[0.3066935539, -0.1863229275, 0.6304829717, -...
10MRI demonstrated infarction in the upper brain...[MRI, infarction, upper, brain stem, left, cer...[Test, Disease_Syndrome_Disorder, Direction, I...[0.998, 0.4944, 0.2124, 0.27865, 0.5153, 0.790...[0, 1, 2, 3, 4, 5, 6, 7][0, 0, 0, 0, 0, 0, 0, 0][NN, VVD, NN, II, DD, JJ, NN, NN, NN, JJ, NN, ...00.7909upper35Direction39cerebellum59Internal_organ_or_component680MRI demonstrated infarction in the upper brain...[infarction, infarction, ROOT, stem, stem, ste...[[0.3066935539, -0.1863229275, 0.6304829717, -...
20MRI demonstrated infarction in the upper brain...[MRI, infarction, upper, brain stem, left, cer...[Test, Disease_Syndrome_Disorder, Direction, I...[0.998, 0.4944, 0.2124, 0.27865, 0.5153, 0.790...[0, 1, 2, 3, 4, 5, 6, 7][0, 0, 0, 0, 0, 0, 0, 0][NN, VVD, NN, II, DD, JJ, NN, NN, NN, JJ, NN, ...00.2843upper35Direction39basil ganglia81Internal_organ_or_component930MRI demonstrated infarction in the upper brain...[infarction, infarction, ROOT, stem, stem, ste...[[0.3066935539, -0.1863229275, 0.6304829717, -...
30MRI demonstrated infarction in the upper brain...[MRI, infarction, upper, brain stem, left, cer...[Test, Disease_Syndrome_Disorder, Direction, I...[0.998, 0.4944, 0.2124, 0.27865, 0.5153, 0.790...[0, 1, 2, 3, 4, 5, 6, 7][0, 0, 0, 0, 0, 0, 0, 0][NN, VVD, NN, II, DD, JJ, NN, NN, NN, JJ, NN, ...00.5153brain stem41Internal_organ_or_component50left54Direction570MRI demonstrated infarction in the upper brain...[infarction, infarction, ROOT, stem, stem, ste...[[0.3066935539, -0.1863229275, 0.6304829717, -...
40MRI demonstrated infarction in the upper brain...[MRI, infarction, upper, brain stem, left, cer...[Test, Disease_Syndrome_Disorder, Direction, I...[0.998, 0.4944, 0.2124, 0.27865, 0.5153, 0.790...[0, 1, 2, 3, 4, 5, 6, 7][0, 0, 0, 0, 0, 0, 0, 0][NN, VVD, NN, II, DD, JJ, NN, NN, NN, JJ, NN, ...00.4533brain stem41Internal_organ_or_component50right75Direction790MRI demonstrated infarction in the upper brain...[infarction, infarction, ROOT, stem, stem, ste...[[0.3066935539, -0.1863229275, 0.6304829717, -...
50MRI demonstrated infarction in the upper brain...[MRI, infarction, upper, brain stem, left, cer...[Test, Disease_Syndrome_Disorder, Direction, I...[0.998, 0.4944, 0.2124, 0.27865, 0.5153, 0.790...[0, 1, 2, 3, 4, 5, 6, 7][0, 0, 0, 0, 0, 0, 0, 0][NN, VVD, NN, II, DD, JJ, NN, NN, NN, JJ, NN, ...10.7909left54Direction57cerebellum59Internal_organ_or_component680MRI demonstrated infarction in the upper brain...[infarction, infarction, ROOT, stem, stem, ste...[[0.3066935539, -0.1863229275, 0.6304829717, -...
60MRI demonstrated infarction in the upper brain...[MRI, infarction, upper, brain stem, left, cer...[Test, Disease_Syndrome_Disorder, Direction, I...[0.998, 0.4944, 0.2124, 0.27865, 0.5153, 0.790...[0, 1, 2, 3, 4, 5, 6, 7][0, 0, 0, 0, 0, 0, 0, 0][NN, VVD, NN, II, DD, JJ, NN, NN, NN, JJ, NN, ...00.2843left54Direction57basil ganglia81Internal_organ_or_component930MRI demonstrated infarction in the upper brain...[infarction, infarction, ROOT, stem, stem, ste...[[0.3066935539, -0.1863229275, 0.6304829717, -...
70MRI demonstrated infarction in the upper brain...[MRI, infarction, upper, brain stem, left, cer...[Test, Disease_Syndrome_Disorder, Direction, I...[0.998, 0.4944, 0.2124, 0.27865, 0.5153, 0.790...[0, 1, 2, 3, 4, 5, 6, 7][0, 0, 0, 0, 0, 0, 0, 0][NN, VVD, NN, II, DD, JJ, NN, NN, NN, JJ, NN, ...00.4533cerebellum59Internal_organ_or_component68right75Direction790MRI demonstrated infarction in the upper brain...[infarction, infarction, ROOT, stem, stem, ste...[[0.3066935539, -0.1863229275, 0.6304829717, -...
80MRI demonstrated infarction in the upper brain...[MRI, infarction, upper, brain stem, left, cer...[Test, Disease_Syndrome_Disorder, Direction, I...[0.998, 0.4944, 0.2124, 0.27865, 0.5153, 0.790...[0, 1, 2, 3, 4, 5, 6, 7][0, 0, 0, 0, 0, 0, 0, 0][NN, VVD, NN, II, DD, JJ, NN, NN, NN, JJ, NN, ...10.2843right75Direction79basil ganglia81Internal_organ_or_component930MRI demonstrated infarction in the upper brain...[infarction, infarction, ROOT, stem, stem, ste...[[0.3066935539, -0.1863229275, 0.6304829717, -...
\n", - "
" + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "bd7de6ad-83c4-41d5-8412-121a9334109c", + "showTitle": false, + "title": "" + }, + "id": "xlwQUiZUjNUN", + "outputId": "1aa4e6a1-fd9d-4a8a-c0f3-43c6e69a75d1" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
Out[11]:
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
Out[11]:
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + } + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexdocumententities_from_diskentities_from_disk_classentities_from_disk_confidenceentities_from_disk_origin_chunkentities_from_disk_origin_sentenceoutput_leveltextword_embedding_from_disk
00John Snow is a medical doctor from England. Pe...John SnowPERSON0.9836499700documentJohn Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
10John Snow is a medical doctor from England. Pe...EnglandGPE0.976210documentJohn Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
20John Snow is a medical doctor from England. Pe...PeterPERSON0.997621documentJohn Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
30John Snow is a medical doctor from England. Pe...AmericaGPE0.960331documentJohn Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
indexdocumententities_from_diskentities_from_disk_classentities_from_disk_confidenceentities_from_disk_origin_chunkentities_from_disk_origin_sentenceoutput_leveltextword_embedding_from_disk
00John Snow is a medical doctor from England. Pe...John SnowPERSON0.9836499700documentJohn Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
10John Snow is a medical doctor from England. Pe...EnglandGPE0.976210documentJohn Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
20John Snow is a medical doctor from England. Pe...PeterPERSON0.997621documentJohn Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
30John Snow is a medical doctor from England. Pe...AmericaGPE0.960331documentJohn Snow is a medical doctor from England. Pe...[[-0.2747400105, 0.4868099988, -0.0717220008, ...
\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + } + } + ], + "source": [ + "# can change output level like this\n", + "nlp.query_endpoint('ner_ENDPOINT', general_text,output_level='document')\n" ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
indexdocumententities_from_diskentities_from_disk_classentities_from_disk_confidenceentities_from_disk_origin_chunkentities_from_disk_origin_sentenceposrelationrelation_confidencerelation_entity1relation_entity1_beginrelation_entity1_classrelation_entity1_endrelation_entity2relation_entity2_beginrelation_entity2_classrelation_entity2_endrelation_origin_sentencetextunlabeled_dependencyword_embedding_from_disk
00MRI demonstrated infarction in the upper brain...[MRI, infarction, upper, brain stem, left, cer...[Test, Disease_Syndrome_Disorder, Direction, I...[0.998, 0.4944, 0.2124, 0.27865, 0.5153, 0.790...[0, 1, 2, 3, 4, 5, 6, 7][0, 0, 0, 0, 0, 0, 0, 0][NN, VVD, NN, II, DD, JJ, NN, NN, NN, JJ, NN, ...10.27865upper35Direction39brain stem41Internal_organ_or_component500MRI demonstrated infarction in the upper brain...[infarction, infarction, ROOT, stem, stem, ste...[[0.3066935539, -0.1863229275, 0.6304829717, -...
10MRI demonstrated infarction in the upper brain...[MRI, infarction, upper, brain stem, left, cer...[Test, Disease_Syndrome_Disorder, Direction, I...[0.998, 0.4944, 0.2124, 0.27865, 0.5153, 0.790...[0, 1, 2, 3, 4, 5, 6, 7][0, 0, 0, 0, 0, 0, 0, 0][NN, VVD, NN, II, DD, JJ, NN, NN, NN, JJ, NN, ...00.7909upper35Direction39cerebellum59Internal_organ_or_component680MRI demonstrated infarction in the upper brain...[infarction, infarction, ROOT, stem, stem, ste...[[0.3066935539, -0.1863229275, 0.6304829717, -...
20MRI demonstrated infarction in the upper brain...[MRI, infarction, upper, brain stem, left, cer...[Test, Disease_Syndrome_Disorder, Direction, I...[0.998, 0.4944, 0.2124, 0.27865, 0.5153, 0.790...[0, 1, 2, 3, 4, 5, 6, 7][0, 0, 0, 0, 0, 0, 0, 0][NN, VVD, NN, II, DD, JJ, NN, NN, NN, JJ, NN, ...00.2843upper35Direction39basil ganglia81Internal_organ_or_component930MRI demonstrated infarction in the upper brain...[infarction, infarction, ROOT, stem, stem, ste...[[0.3066935539, -0.1863229275, 0.6304829717, -...
30MRI demonstrated infarction in the upper brain...[MRI, infarction, upper, brain stem, left, cer...[Test, Disease_Syndrome_Disorder, Direction, I...[0.998, 0.4944, 0.2124, 0.27865, 0.5153, 0.790...[0, 1, 2, 3, 4, 5, 6, 7][0, 0, 0, 0, 0, 0, 0, 0][NN, VVD, NN, II, DD, JJ, NN, NN, NN, JJ, NN, ...00.5153brain stem41Internal_organ_or_component50left54Direction570MRI demonstrated infarction in the upper brain...[infarction, infarction, ROOT, stem, stem, ste...[[0.3066935539, -0.1863229275, 0.6304829717, -...
40MRI demonstrated infarction in the upper brain...[MRI, infarction, upper, brain stem, left, cer...[Test, Disease_Syndrome_Disorder, Direction, I...[0.998, 0.4944, 0.2124, 0.27865, 0.5153, 0.790...[0, 1, 2, 3, 4, 5, 6, 7][0, 0, 0, 0, 0, 0, 0, 0][NN, VVD, NN, II, DD, JJ, NN, NN, NN, JJ, NN, ...00.4533brain stem41Internal_organ_or_component50right75Direction790MRI demonstrated infarction in the upper brain...[infarction, infarction, ROOT, stem, stem, ste...[[0.3066935539, -0.1863229275, 0.6304829717, -...
50MRI demonstrated infarction in the upper brain...[MRI, infarction, upper, brain stem, left, cer...[Test, Disease_Syndrome_Disorder, Direction, I...[0.998, 0.4944, 0.2124, 0.27865, 0.5153, 0.790...[0, 1, 2, 3, 4, 5, 6, 7][0, 0, 0, 0, 0, 0, 0, 0][NN, VVD, NN, II, DD, JJ, NN, NN, NN, JJ, NN, ...10.7909left54Direction57cerebellum59Internal_organ_or_component680MRI demonstrated infarction in the upper brain...[infarction, infarction, ROOT, stem, stem, ste...[[0.3066935539, -0.1863229275, 0.6304829717, -...
60MRI demonstrated infarction in the upper brain...[MRI, infarction, upper, brain stem, left, cer...[Test, Disease_Syndrome_Disorder, Direction, I...[0.998, 0.4944, 0.2124, 0.27865, 0.5153, 0.790...[0, 1, 2, 3, 4, 5, 6, 7][0, 0, 0, 0, 0, 0, 0, 0][NN, VVD, NN, II, DD, JJ, NN, NN, NN, JJ, NN, ...00.2843left54Direction57basil ganglia81Internal_organ_or_component930MRI demonstrated infarction in the upper brain...[infarction, infarction, ROOT, stem, stem, ste...[[0.3066935539, -0.1863229275, 0.6304829717, -...
70MRI demonstrated infarction in the upper brain...[MRI, infarction, upper, brain stem, left, cer...[Test, Disease_Syndrome_Disorder, Direction, I...[0.998, 0.4944, 0.2124, 0.27865, 0.5153, 0.790...[0, 1, 2, 3, 4, 5, 6, 7][0, 0, 0, 0, 0, 0, 0, 0][NN, VVD, NN, II, DD, JJ, NN, NN, NN, JJ, NN, ...00.4533cerebellum59Internal_organ_or_component68right75Direction790MRI demonstrated infarction in the upper brain...[infarction, infarction, ROOT, stem, stem, ste...[[0.3066935539, -0.1863229275, 0.6304829717, -...
80MRI demonstrated infarction in the upper brain...[MRI, infarction, upper, brain stem, left, cer...[Test, Disease_Syndrome_Disorder, Direction, I...[0.998, 0.4944, 0.2124, 0.27865, 0.5153, 0.790...[0, 1, 2, 3, 4, 5, 6, 7][0, 0, 0, 0, 0, 0, 0, 0][NN, VVD, NN, II, DD, JJ, NN, NN, NN, JJ, NN, ...10.2843right75Direction79basil ganglia81Internal_organ_or_component930MRI demonstrated infarction in the upper brain...[infarction, infarction, ROOT, stem, stem, ste...[[0.3066935539, -0.1863229275, 0.6304829717, -...
\n
", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "textData": null, - "type": "htmlSandbox" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "# https://nlp.johnsnowlabs.com/2023/06/17/re_bodypart_directions_pipeline_en.html \n", - "# Relation Example\n", - "nlp.query_and_deploy_if_missing(\"en.relation.bodypart_directions.pipeline\",body_re_text)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "7d48af06-622d-4ffb-bbdf-049218834d41", - "showTitle": false, - "title": "" - } - }, - "outputs": [ + }, { - "data": { - "text/html": [ - "\n", - "
Deleting registered model en_classify_ade_seq_biobert_REGISTERD_MODEL\n", - "Warning::Spark Session already created, some configs may not take.\n", - "Warning::Spark Session already created, some configs may not take.\n", - "bert_sequence_classifier_ade download started this may take some time.\n", - "\r[ | ]\r[ / ]\r[ — ]\r[ \\ ]\r[ | ]\r[ / ]\r[ — ]\r[ \\ ]\r[ | ]\r[ / ]\r[OK!]\n", - "sentence_detector_dl download started this may take some time.\n", - "Approximate size to download 354.6 KB\n", - "\r[ | ]\r[OK!]\n", - "Warning::Spark Session already created, some configs may not take.\n", - "Successfully registered model 'en_classify_ade_seq_biobert_REGISTERD_MODEL'.\n", - "2023/09/02 17:45:18 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: en_classify_ade_seq_biobert_REGISTERD_MODEL, version 1\n", - "Created version '1' of model 'en_classify_ade_seq_biobert_REGISTERD_MODEL'.\n", - "2023/09/02 17:47:18 WARNING mlflow.models.model: Model logged without a signature. Signatures will be required for upcoming model registry features as they validate model inputs and denote the expected schema of model outputs. Please visit https://www.mlflow.org/docs/2.20.0/models.html#set-signature-on-logged-model for instructions on setting a model signature on your logged model.\n", - "Registered model 'en_classify_ade_seq_biobert_REGISTERD_MODEL' already exists. Creating a new version of this model...\n", - "2023/09/02 17:47:18 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: en_classify_ade_seq_biobert_REGISTERD_MODEL, version 2\n", - "Created version '2' of model 'en_classify_ade_seq_biobert_REGISTERD_MODEL'.\n", - "Deleting exisiting Endpoint en_classify_ade_seq_biobert_ENDPOINT\n", - "Writing license to scope JSL_SCOPE\n", - "Creating new serving endpoint: en_classify_ade_seq_biobert_ENDPOINT\n", - "Deployment starting, this may take 10 to 20 minutes...\n", - "Created serving endpoint en_classify_ade_seq_biobert_ENDPOINT at https://dbc-3d4c44aa-a512.cloud.databricks.com/#mlflow/endpoints/en_classify_ade_seq_biobert_ENDPOINT\n", - "Out[7]:
" + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6fa0494e-19d5-4009-8c1c-418b36eaa676", + "showTitle": false, + "title": "" + }, + "id": "3hyeh58wjNUN", + "outputId": "654ab833-96fb-4dfa-9426-89019a589ef4" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
Deleting registered model en_classify_bert_token_ner_jsl_pipeline_REGISTERD_MODEL\n", + "Warning::Spark Session already created, some configs may not take.\n", + "Warning::Spark Session already created, some configs may not take.\n", + "bert_token_classifier_ner_jsl_pipeline download started this may take some time.\n", + "Approx size to download 386.3 MB\n", + "\r[ | ]\r[ / ]\r[ — ]\r[ \\ ]\r[ | ]\r[ / ]\r[ — ]\r[ \\ ]\r[ | ]\r[ / ]\r[ — ]\r[ \\ ]\r[ | ]\r[ / ]\r[ — ]\r[ \\ ]\r[ | ]\r[ / ]\r[ — ]\r[ \\ ]\r[ | ]\r[ / ]\r[ — ]\r[ \\ ]\r[ | ]\r[ / ]\r[ — ]\r[OK!]\n", + "2023/11/17 03:43:16 WARNING mlflow.models.model: Model logged without a signature. Signatures will be required for upcoming model registry features as they validate model inputs and denote the expected schema of model outputs. Please visit https://www.mlflow.org/docs/2.40.0/models.html#set-signature-on-logged-model for instructions on setting a model signature on your logged model.\n", + "Saving model to /tmp/mlflow/d821b627-7ea6-4cf9-b0f9-3747d0e6b82c\n", + "Warning::Spark Session already created, some configs may not take.\n", + "WARNING: No Visual NLP jar found!\n", + "Successfully registered model 'en_classify_bert_token_ner_jsl_pipeline_REGISTERD_MODEL'.\n", + "2023/11/17 03:43:54 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: en_classify_bert_token_ner_jsl_pipeline_REGISTERD_MODEL, version 1\n", + "Created version '1' of model 'en_classify_bert_token_ner_jsl_pipeline_REGISTERD_MODEL'.\n", + "2023/11/17 03:44:40 WARNING mlflow.models.model: Model logged without a signature. Signatures will be required for upcoming model registry features as they validate model inputs and denote the expected schema of model outputs. Please visit https://www.mlflow.org/docs/2.40.0/models.html#set-signature-on-logged-model for instructions on setting a model signature on your logged model.\n", + "Registered model 'en_classify_bert_token_ner_jsl_pipeline_REGISTERD_MODEL' already exists. Creating a new version of this model...\n", + "2023/11/17 03:45:55 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: en_classify_bert_token_ner_jsl_pipeline_REGISTERD_MODEL, version 2\n", + "Created version '2' of model 'en_classify_bert_token_ner_jsl_pipeline_REGISTERD_MODEL'.\n", + "Deleting exisiting Endpoint en_classify_bert_token_ner_jsl_pipeline_ENDPOINT\n", + "Writing license to scope JSL_SCOPE\n", + "Creating new serving endpoint: en_classify_bert_token_ner_jsl_pipeline_ENDPOINT\n", + "Deployment starting, this may take 10 to 20 minutes...\n", + "Created serving endpoint en_classify_bert_token_ner_jsl_pipeline_ENDPOINT at https://dbc-3d4c44aa-a512.cloud.databricks.com/#mlflow/endpoints/en_classify_bert_token_ner_jsl_pipeline_ENDPOINT\n", + "endpoint name is en_classify_bert_token_ner_jsl_pipeline_ENDPOINT\n", + "Out[30]:
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
Deleting registered model en_classify_bert_token_ner_jsl_pipeline_REGISTERD_MODEL\nWarning::Spark Session already created, some configs may not take.\nWarning::Spark Session already created, some configs may not take.\nbert_token_classifier_ner_jsl_pipeline download started this may take some time.\nApprox size to download 386.3 MB\n\r[ | ]\r[ / ]\r[ — ]\r[ \\ ]\r[ | ]\r[ / ]\r[ — ]\r[ \\ ]\r[ | ]\r[ / ]\r[ — ]\r[ \\ ]\r[ | ]\r[ / ]\r[ — ]\r[ \\ ]\r[ | ]\r[ / ]\r[ — ]\r[ \\ ]\r[ | ]\r[ / ]\r[ — ]\r[ \\ ]\r[ | ]\r[ / ]\r[ — ]\r[OK!]\n2023/11/17 03:43:16 WARNING mlflow.models.model: Model logged without a signature. Signatures will be required for upcoming model registry features as they validate model inputs and denote the expected schema of model outputs. Please visit https://www.mlflow.org/docs/2.40.0/models.html#set-signature-on-logged-model for instructions on setting a model signature on your logged model.\nSaving model to /tmp/mlflow/d821b627-7ea6-4cf9-b0f9-3747d0e6b82c\nWarning::Spark Session already created, some configs may not take.\nWARNING: No Visual NLP jar found!\nSuccessfully registered model 'en_classify_bert_token_ner_jsl_pipeline_REGISTERD_MODEL'.\n2023/11/17 03:43:54 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: en_classify_bert_token_ner_jsl_pipeline_REGISTERD_MODEL, version 1\nCreated version '1' of model 'en_classify_bert_token_ner_jsl_pipeline_REGISTERD_MODEL'.\n2023/11/17 03:44:40 WARNING mlflow.models.model: Model logged without a signature. Signatures will be required for upcoming model registry features as they validate model inputs and denote the expected schema of model outputs. Please visit https://www.mlflow.org/docs/2.40.0/models.html#set-signature-on-logged-model for instructions on setting a model signature on your logged model.\nRegistered model 'en_classify_bert_token_ner_jsl_pipeline_REGISTERD_MODEL' already exists. Creating a new version of this model...\n2023/11/17 03:45:55 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: en_classify_bert_token_ner_jsl_pipeline_REGISTERD_MODEL, version 2\nCreated version '2' of model 'en_classify_bert_token_ner_jsl_pipeline_REGISTERD_MODEL'.\nDeleting exisiting Endpoint en_classify_bert_token_ner_jsl_pipeline_ENDPOINT\nWriting license to scope JSL_SCOPE\nCreating new serving endpoint: en_classify_bert_token_ner_jsl_pipeline_ENDPOINT\nDeployment starting, this may take 10 to 20 minutes...\nCreated serving endpoint en_classify_bert_token_ner_jsl_pipeline_ENDPOINT at https://dbc-3d4c44aa-a512.cloud.databricks.com/#mlflow/endpoints/en_classify_bert_token_ner_jsl_pipeline_ENDPOINT\nendpoint name is en_classify_bert_token_ner_jsl_pipeline_ENDPOINT\nOut[30]:
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + } + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexclassified_tokendocumententitiesentities_classentities_confidenceentities_origin_chunkentities_origin_sentencetext
00[O, O, O, O, B-Demographics, I-Demographics, O...John Snow is a medical doctor from England. Pe...medical doctorDemographics0.994721400John Snow is a medical doctor from England. Pe...
10[O, O, O, O, B-Demographics, I-Demographics, O...John Snow is a medical doctor from England. Pe...PeterDemographics0.982508911John Snow is a medical doctor from England. Pe...
\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
indexclassified_tokendocumententitiesentities_classentities_confidenceentities_origin_chunkentities_origin_sentencetext
00[O, O, O, O, B-Demographics, I-Demographics, O...John Snow is a medical doctor from England. Pe...medical doctorDemographics0.994721400John Snow is a medical doctor from England. Pe...
10[O, O, O, O, B-Demographics, I-Demographics, O...John Snow is a medical doctor from England. Pe...PeterDemographics0.982508911John Snow is a medical doctor from England. Pe...
\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + } + } + ], + "source": [ + "# https://nlp.johnsnowlabs.com/2023/06/17/bert_token_classifier_ner_jsl_pipeline_en.html\n", + "# '.' is replaced with '_' in the nlu reference for the endpoint name\n", + "endpoint_name = nlp.deploy_endpoint('en.classify.bert_token_ner_jsl.pipeline')\n", + "print(\"endpoint name is \", endpoint_name)\n", + "nlp.query_endpoint(endpoint_name, general_text)" ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "
Deleting registered model en_classify_ade_seq_biobert_REGISTERD_MODEL\nWarning::Spark Session already created, some configs may not take.\nWarning::Spark Session already created, some configs may not take.\nbert_sequence_classifier_ade download started this may take some time.\n\r[ | ]\r[ / ]\r[ — ]\r[ \\ ]\r[ | ]\r[ / ]\r[ — ]\r[ \\ ]\r[ | ]\r[ / ]\r[OK!]\nsentence_detector_dl download started this may take some time.\nApproximate size to download 354.6 KB\n\r[ | ]\r[OK!]\nWarning::Spark Session already created, some configs may not take.\nSuccessfully registered model 'en_classify_ade_seq_biobert_REGISTERD_MODEL'.\n2023/09/02 17:45:18 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: en_classify_ade_seq_biobert_REGISTERD_MODEL, version 1\nCreated version '1' of model 'en_classify_ade_seq_biobert_REGISTERD_MODEL'.\n2023/09/02 17:47:18 WARNING mlflow.models.model: Model logged without a signature. Signatures will be required for upcoming model registry features as they validate model inputs and denote the expected schema of model outputs. Please visit https://www.mlflow.org/docs/2.20.0/models.html#set-signature-on-logged-model for instructions on setting a model signature on your logged model.\nRegistered model 'en_classify_ade_seq_biobert_REGISTERD_MODEL' already exists. Creating a new version of this model...\n2023/09/02 17:47:18 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: en_classify_ade_seq_biobert_REGISTERD_MODEL, version 2\nCreated version '2' of model 'en_classify_ade_seq_biobert_REGISTERD_MODEL'.\nDeleting exisiting Endpoint en_classify_ade_seq_biobert_ENDPOINT\nWriting license to scope JSL_SCOPE\nCreating new serving endpoint: en_classify_ade_seq_biobert_ENDPOINT\nDeployment starting, this may take 10 to 20 minutes...\nCreated serving endpoint en_classify_ade_seq_biobert_ENDPOINT at https://dbc-3d4c44aa-a512.cloud.databricks.com/#mlflow/endpoints/en_classify_ade_seq_biobert_ENDPOINT\nOut[7]:
", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "type": "html" - } - }, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
indexclassified_sequenceclassified_sequence_confidencesentencetext
00True0.998191So glad I am off effexor, so sad it ruined my ...So glad I am off effexor, so sad it ruined my ...
10False0.999601tip Please be carefull taking antideppresiva ...So glad I am off effexor, so sad it ruined my ...
\n", - "
" + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "e9d00f23-fb0a-4f93-a690-3cd1b0f9b647", + "showTitle": false, + "title": "" + }, + "id": "Tp0tDMKdjNUN", + "outputId": "f7d59010-3c23-4072-ba16-02cd6eee3fa1" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to re-create it\n",
+ "Endpoint en_classify_bert_token_ner_jsl_pipeline_ENDPOINT already exists! Set re_create_endpoint=True if you want to re-create it \n",
+ "Out[36]:
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to re-create it\nEndpoint en_classify_bert_token_ner_jsl_pipeline_ENDPOINT already exists! Set re_create_endpoint=True if you want to re-create it \nOut[36]:
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + } + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexclassified_tokendocumententitiesentities_classentities_confidenceentities_origin_chunkentities_origin_sentencetext
00[O, B-Demographics, O, O, O, O, O, O, O, O, B-...with his breathing while feeding (but negative...hisDemographics0.999558800with his breathing while feeding (but negative...
10[O, B-Demographics, O, O, O, O, O, O, O, O, B-...with his breathing while feeding (but negative...perioral cyanosisSymptom0.9840110510with his breathing while feeding (but negative...
20[O, B-Demographics, O, O, O, O, O, O, O, O, B-...with his breathing while feeding (but negative...retractionsSymptom0.9992540520with his breathing while feeding (but negative...
30[O, B-Demographics, O, O, O, O, O, O, O, O, B-...with his breathing while feeding (but negative...One day agoDate_Time0.9993855431with his breathing while feeding (but negative...
40[O, B-Demographics, O, O, O, O, O, O, O, O, B-...with his breathing while feeding (but negative...momDemographics0.9998348441with his breathing while feeding (but negative...
50[O, B-Demographics, O, O, O, O, O, O, O, O, B-...with his breathing while feeding (but negative...tactile temperatureSymptom0.99935251with his breathing while feeding (but negative...
60[O, B-Demographics, O, O, O, O, O, O, O, O, B-...with his breathing while feeding (but negative...TylenolDrug0.999762561with his breathing while feeding (but negative...
70[O, B-Demographics, O, O, O, O, O, O, O, O, B-...with his breathing while feeding (but negative...Baby-girlAge0.9805288372with his breathing while feeding (but negative...
80[O, B-Demographics, O, O, O, O, O, O, O, O, B-...with his breathing while feeding (but negative...decreased p.o. intakeSymptom0.9989776682with his breathing while feeding (but negative...
90[O, B-Demographics, O, O, O, O, O, O, O, O, B-...with his breathing while feeding (but negative...HisDemographics0.9999116793with his breathing while feeding (but negative...
100[O, B-Demographics, O, O, O, O, O, O, O, O, B-...with his breathing while feeding (but negative...breast-feedingBody_Part0.9993943103with his breathing while feeding (but negative...
\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
indexclassified_tokendocumententitiesentities_classentities_confidenceentities_origin_chunkentities_origin_sentencetext
00[O, B-Demographics, O, O, O, O, O, O, O, O, B-...with his breathing while feeding (but negative...hisDemographics0.999558800with his breathing while feeding (but negative...
10[O, B-Demographics, O, O, O, O, O, O, O, O, B-...with his breathing while feeding (but negative...perioral cyanosisSymptom0.9840110510with his breathing while feeding (but negative...
20[O, B-Demographics, O, O, O, O, O, O, O, O, B-...with his breathing while feeding (but negative...retractionsSymptom0.9992540520with his breathing while feeding (but negative...
30[O, B-Demographics, O, O, O, O, O, O, O, O, B-...with his breathing while feeding (but negative...One day agoDate_Time0.9993855431with his breathing while feeding (but negative...
40[O, B-Demographics, O, O, O, O, O, O, O, O, B-...with his breathing while feeding (but negative...momDemographics0.9998348441with his breathing while feeding (but negative...
50[O, B-Demographics, O, O, O, O, O, O, O, O, B-...with his breathing while feeding (but negative...tactile temperatureSymptom0.99935251with his breathing while feeding (but negative...
60[O, B-Demographics, O, O, O, O, O, O, O, O, B-...with his breathing while feeding (but negative...TylenolDrug0.999762561with his breathing while feeding (but negative...
70[O, B-Demographics, O, O, O, O, O, O, O, O, B-...with his breathing while feeding (but negative...Baby-girlAge0.9805288372with his breathing while feeding (but negative...
80[O, B-Demographics, O, O, O, O, O, O, O, O, B-...with his breathing while feeding (but negative...decreased p.o. intakeSymptom0.9989776682with his breathing while feeding (but negative...
90[O, B-Demographics, O, O, O, O, O, O, O, O, B-...with his breathing while feeding (but negative...HisDemographics0.9999116793with his breathing while feeding (but negative...
100[O, B-Demographics, O, O, O, O, O, O, O, O, B-...with his breathing while feeding (but negative...breast-feedingBody_Part0.9993943103with his breathing while feeding (but negative...
\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + } + } + ], + "source": [ + "# Relation Example\n", + "# https://nlp.johnsnowlabs.com/2023/06/17/bert_token_classifier_ner_jsl_pipeline_en.html\n", + "endpoint_name = nlp.deploy_endpoint('en.classify.bert_token_ner_jsl.pipeline')\n", + "nlp.query_endpoint(endpoint_name, cancer_text)" ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
indexclassified_sequenceclassified_sequence_confidencesentencetext
00True0.998191So glad I am off effexor, so sad it ruined my ...So glad I am off effexor, so sad it ruined my ...
10False0.999601tip Please be carefull taking antideppresiva ...So glad I am off effexor, so sad it ruined my ...
\n
", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "textData": null, - "type": "htmlSandbox" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "# Sequence Classifier Example https://nlp.johnsnowlabs.com/2022/02/08/bert_sequence_classifier_ade_en.html\n", - "nlp.query_and_deploy_if_missing(\"en.classify.ade.seq_biobert\", ade_text,True,True )\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "372f65b0-f258-4640-b3d2-7b3bed3a6571", - "showTitle": false, - "title": "" - } - }, - "outputs": [ + }, { - "data": { - "text/html": [ - "\n", - "
Warning::Spark Session already created, some configs may not take.\n", - "Warning::Spark Session already created, some configs may not take.\n", - "assertion_dl_radiology download started this may take some time.\n", - "\r[ | ]\r[OK!]\n", - "embeddings_clinical download started this may take some time.\n", - "Approximate size to download 1.6 GB\n", - "\r[ | ]\r[OK!]\n", - "ner_jsl download started this may take some time.\n", - "\r[ | ]\r[OK!]\n", - "sentence_detector_dl download started this may take some time.\n", - "Approximate size to download 354.6 KB\n", - "\r[ | ]\r[OK!]\n", - "Warning::Spark Session already created, some configs may not take.\n", - "2023/09/03 02:49:58 WARNING mlflow.models.model: Model logged without a signature. Signatures will be required for upcoming model registry features as they validate model inputs and denote the expected schema of model outputs. Please visit https://www.mlflow.org/docs/2.30.0/models.html#set-signature-on-logged-model for instructions on setting a model signature on your logged model.\n", - "Successfully registered model 'en_assert_radiology_REGISTERD_MODEL'.\n", - "2023/09/03 02:49:58 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: en_assert_radiology_REGISTERD_MODEL, version 1\n", - "Created version '1' of model 'en_assert_radiology_REGISTERD_MODEL'.\n", - "2023/09/03 02:53:30 WARNING mlflow.models.model: Model logged without a signature. Signatures will be required for upcoming model registry features as they validate model inputs and denote the expected schema of model outputs. Please visit https://www.mlflow.org/docs/2.30.0/models.html#set-signature-on-logged-model for instructions on setting a model signature on your logged model.\n", - "Registered model 'en_assert_radiology_REGISTERD_MODEL' already exists. Creating a new version of this model...\n", - "2023/09/03 02:53:31 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: en_assert_radiology_REGISTERD_MODEL, version 2\n", - "Created version '2' of model 'en_assert_radiology_REGISTERD_MODEL'.\n", - "Deleting exisiting Endpoint en_assert_radiology_ENDPOINT\n", - "Writing license to scope JSL_SCOPE\n", - "Creating new serving endpoint: en_assert_radiology_ENDPOINT\n", - "Deployment starting, this may take 10 to 20 minutes...\n", - "
" + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7d48af06-622d-4ffb-bbdf-049218834d41", + "showTitle": false, + "title": "" + }, + "id": "QfJ6mflCjNUN", + "outputId": "475e6980-eea2-4e71-9514-fb093e6a6e79" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to re-create it\n",
+ "Endpoint en_classify_ade_seq_biobert_ENDPOINT already exists! Set re_create_endpoint=True if you want to re-create it \n",
+ "Out[13]:
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to re-create it\nEndpoint en_classify_ade_seq_biobert_ENDPOINT already exists! Set re_create_endpoint=True if you want to re-create it \nOut[13]:
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + } + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexclassified_sequenceclassified_sequence_confidencesentencetext
00True0.998191So glad I am off effexor, so sad it ruined my ...So glad I am off effexor, so sad it ruined my ...
10False0.999601tip Please be carefull taking antideppresiva ...So glad I am off effexor, so sad it ruined my ...
\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
indexclassified_sequenceclassified_sequence_confidencesentencetext
00True0.998191So glad I am off effexor, so sad it ruined my ...So glad I am off effexor, so sad it ruined my ...
10False0.999601tip Please be carefull taking antideppresiva ...So glad I am off effexor, so sad it ruined my ...
\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + } + } + ], + "source": [ + "# Sequence Classifier Example https://nlp.johnsnowlabs.com/2022/02/08/bert_sequence_classifier_ade_en.html\n", + "endpoint_name = nlp.deploy_endpoint('en.classify.ade.seq_biobert')\n", + "nlp.query_endpoint(endpoint_name, ade_text)" ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "
Warning::Spark Session already created, some configs may not take.\nWarning::Spark Session already created, some configs may not take.\nassertion_dl_radiology download started this may take some time.\n\r[ | ]\r[OK!]\nembeddings_clinical download started this may take some time.\nApproximate size to download 1.6 GB\n\r[ | ]\r[OK!]\nner_jsl download started this may take some time.\n\r[ | ]\r[OK!]\nsentence_detector_dl download started this may take some time.\nApproximate size to download 354.6 KB\n\r[ | ]\r[OK!]\nWarning::Spark Session already created, some configs may not take.\n2023/09/03 02:49:58 WARNING mlflow.models.model: Model logged without a signature. Signatures will be required for upcoming model registry features as they validate model inputs and denote the expected schema of model outputs. Please visit https://www.mlflow.org/docs/2.30.0/models.html#set-signature-on-logged-model for instructions on setting a model signature on your logged model.\nSuccessfully registered model 'en_assert_radiology_REGISTERD_MODEL'.\n2023/09/03 02:49:58 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: en_assert_radiology_REGISTERD_MODEL, version 1\nCreated version '1' of model 'en_assert_radiology_REGISTERD_MODEL'.\n2023/09/03 02:53:30 WARNING mlflow.models.model: Model logged without a signature. Signatures will be required for upcoming model registry features as they validate model inputs and denote the expected schema of model outputs. Please visit https://www.mlflow.org/docs/2.30.0/models.html#set-signature-on-logged-model for instructions on setting a model signature on your logged model.\nRegistered model 'en_assert_radiology_REGISTERD_MODEL' already exists. Creating a new version of this model...\n2023/09/03 02:53:31 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: en_assert_radiology_REGISTERD_MODEL, version 2\nCreated version '2' of model 'en_assert_radiology_REGISTERD_MODEL'.\nDeleting exisiting Endpoint en_assert_radiology_ENDPOINT\nWriting license to scope JSL_SCOPE\nCreating new serving endpoint: en_assert_radiology_ENDPOINT\nDeployment starting, this may take 10 to 20 minutes...\n
", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "type": "html" - } - }, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "372f65b0-f258-4640-b3d2-7b3bed3a6571", + "showTitle": false, + "title": "" + }, + "id": "emhJA7ScjNUN", + "outputId": "ee632830-f48d-42de-a115-51ad62b45a0b" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to re-create it\n",
+            "Writing license to scope JSL_SCOPE\n",
+            "Creating new serving endpoint: en_assert_radiology_ENDPOINT\n",
+            "Deployment starting, this may take 10 to 20 minutes...\n",
+            "Created serving endpoint en_assert_radiology_ENDPOINT at https://dbc-3d4c44aa-a512.cloud.databricks.com/#mlflow/endpoints/en_assert_radiology_ENDPOINT\n",
+            "Out[14]:</div>
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to re-create it\nWriting license to scope JSL_SCOPE\nCreating new serving endpoint: en_assert_radiology_ENDPOINT\nDeployment starting, this may take 10 to 20 minutes...\nCreated serving endpoint en_assert_radiology_ENDPOINT at https://dbc-3d4c44aa-a512.cloud.databricks.com/#mlflow/endpoints/en_assert_radiology_ENDPOINT\nOut[14]:</div>
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + } + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexassertiondocumententitiesentities_classentities_confidenceentities_origin_chunkentities_origin_sentencetextword_embedding_from_disk
00ConfirmedINTERPRETATION: There has been interval develo...INTERPRETATION:Section_Header0.9476500INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
10ConfirmedINTERPRETATION: There has been interval develo...moderateModifier0.897211INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
20ConfirmedINTERPRETATION: There has been interval develo...left-sidedDirection0.972421INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
30ConfirmedINTERPRETATION: There has been interval develo...pneumothoraxDisease_Syndrome_Disorder0.92631INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
40ConfirmedINTERPRETATION: There has been interval develo...nearModifier0.719641INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
50ConfirmedINTERPRETATION: There has been interval develo...completeModifier0.693351INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
60ConfirmedINTERPRETATION: There has been interval develo...collapseSymptom0.664361INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
70ConfirmedINTERPRETATION: There has been interval develo...leftDirection0.661371INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
80ConfirmedINTERPRETATION: There has been interval develo...upper lobeInternal_organ_or_component0.3519581INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
90ConfirmedINTERPRETATION: There has been interval develo...lower lobeInternal_organ_or_component0.5610592INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
100ConfirmedINTERPRETATION: There has been interval develo...stableModifier0.8667103INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
110ConfirmedINTERPRETATION: There has been interval develo...diffuseModifier0.8833113INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
120ConfirmedINTERPRETATION: There has been interval develo...bilateralDirection0.9789123INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
130ConfirmedINTERPRETATION: There has been interval develo...interstitial thickeningSymptom0.3513133INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
140NegativeINTERPRETATION: There has been interval develo...definiteModifier0.9115143INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
150NegativeINTERPRETATION: There has been interval develo...acuteModifier0.8942153INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
160NegativeINTERPRETATION: There has been interval develo...air space consolidationSymptom0.25703335163INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
170ConfirmedINTERPRETATION: There has been interval develo...heartInternal_organ_or_component0.9672174INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
180ConfirmedINTERPRETATION: There has been interval develo...pulmonary vascularityInternal_organ_or_component0.72284997184INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
190ConfirmedINTERPRETATION: There has been interval develo...Left-sidedDirection0.9675195INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
200ConfirmedINTERPRETATION: There has been interval develo...portMedical_Device0.5644205INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
210ConfirmedINTERPRETATION: There has been interval develo...Groshong tipMedical_Device0.82495215INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
220ConfirmedINTERPRETATION: There has been interval develo...SVC/RA junctionInternal_organ_or_component0.5879225INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
230NegativeINTERPRETATION: There has been interval develo...acuteModifier0.8571236INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
240NegativeINTERPRETATION: There has been interval develo...fractureInjury_or_Poisoning0.8191246INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
250NegativeINTERPRETATION: There has been interval develo...malalignmentSymptom0.9013256INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
260NegativeINTERPRETATION: There has been interval develo...dislocationInjury_or_Poisoning0.9575266INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
indexassertiondocumententitiesentities_classentities_confidenceentities_origin_chunkentities_origin_sentencetextword_embedding_from_disk
00ConfirmedINTERPRETATION: There has been interval develo...INTERPRETATION:Section_Header0.9476500INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
10ConfirmedINTERPRETATION: There has been interval develo...moderateModifier0.897211INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
20ConfirmedINTERPRETATION: There has been interval develo...left-sidedDirection0.972421INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
30ConfirmedINTERPRETATION: There has been interval develo...pneumothoraxDisease_Syndrome_Disorder0.92631INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
40ConfirmedINTERPRETATION: There has been interval develo...nearModifier0.719641INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
50ConfirmedINTERPRETATION: There has been interval develo...completeModifier0.693351INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
60ConfirmedINTERPRETATION: There has been interval develo...collapseSymptom0.664361INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
70ConfirmedINTERPRETATION: There has been interval develo...leftDirection0.661371INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
80ConfirmedINTERPRETATION: There has been interval develo...upper lobeInternal_organ_or_component0.3519581INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
90ConfirmedINTERPRETATION: There has been interval develo...lower lobeInternal_organ_or_component0.5610592INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
100ConfirmedINTERPRETATION: There has been interval develo...stableModifier0.8667103INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
110ConfirmedINTERPRETATION: There has been interval develo...diffuseModifier0.8833113INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
120ConfirmedINTERPRETATION: There has been interval develo...bilateralDirection0.9789123INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
130ConfirmedINTERPRETATION: There has been interval develo...interstitial thickeningSymptom0.3513133INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
140NegativeINTERPRETATION: There has been interval develo...definiteModifier0.9115143INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
150NegativeINTERPRETATION: There has been interval develo...acuteModifier0.8942153INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
160NegativeINTERPRETATION: There has been interval develo...air space consolidationSymptom0.25703335163INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
170ConfirmedINTERPRETATION: There has been interval develo...heartInternal_organ_or_component0.9672174INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
180ConfirmedINTERPRETATION: There has been interval develo...pulmonary vascularityInternal_organ_or_component0.72284997184INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
190ConfirmedINTERPRETATION: There has been interval develo...Left-sidedDirection0.9675195INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
200ConfirmedINTERPRETATION: There has been interval develo...portMedical_Device0.5644205INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
210ConfirmedINTERPRETATION: There has been interval develo...Groshong tipMedical_Device0.82495215INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
220ConfirmedINTERPRETATION: There has been interval develo...SVC/RA junctionInternal_organ_or_component0.5879225INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
230NegativeINTERPRETATION: There has been interval develo...acuteModifier0.8571236INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
240NegativeINTERPRETATION: There has been interval develo...fractureInjury_or_Poisoning0.8191246INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
250NegativeINTERPRETATION: There has been interval develo...malalignmentSymptom0.9013256INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
260NegativeINTERPRETATION: There has been interval develo...dislocationInjury_or_Poisoning0.9575266INTERPRETATION: There has been interval develo...[[0.0482680351, 0.8041833639, 0.2013939619, 0....
\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + } + } + ], + "source": [ + "# Assertion Example https://nlp.johnsnowlabs.com/2021/03/18/assertion_dl_radiology_en.html\n", + "endpoint_name = nlp.deploy_endpoint('en.assert.radiology')\n", + "nlp.query_endpoint(endpoint_name, radiology_text)" ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "Cancelled", - "errorTraceType": "html", - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "# Assertion Example https://nlp.johnsnowlabs.com/2021/03/18/assertion_dl_radiology_en.html\n", - "nlp.query_and_deploy_if_missing(\"en.assert.radiology\", radiology_text,workload_size='Large')\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "641a6d60-5fa1-4076-9d8b-5ea088dc0901", - "showTitle": false, - "title": "" - } - }, - "outputs": [ + }, { - "data": { - "text/html": [ - "\n", - "
Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to cre-create it\n", - "Deleting exisiting Endpoint en_summarize_clinical_jsl_ENDPOINT\n", - "Writing license to scope JSL_SCOPE\n", - "Creating new serving endpoint: en_summarize_clinical_jsl_ENDPOINT\n", - "Deployment starting, this may take 10 to 20 minutes...\n", - "Created serving endpoint en_summarize_clinical_jsl_ENDPOINT at https://dbc-3d4c44aa-a512.cloud.databricks.com/#mlflow/endpoints/en_summarize_clinical_jsl_ENDPOINT\n", - "Out[7]:
" + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "641a6d60-5fa1-4076-9d8b-5ea088dc0901", + "showTitle": false, + "title": "" + }, + "id": "sA2Z9cRwjptp", + "outputId": "272bc438-a155-4e5a-9b5d-f671f67a1fb3" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to re-create it\n",
+            "Deleting existing Endpoint en_summarize_clinical_jsl_ENDPOINT\n",
+            "Writing license to scope JSL_SCOPE\n",
+            "Creating new serving endpoint: en_summarize_clinical_jsl_ENDPOINT\n",
+            "Deployment starting, this may take 10 to 20 minutes...\n",
+            "Created serving endpoint en_summarize_clinical_jsl_ENDPOINT at https://dbc-3d4c44aa-a512.cloud.databricks.com/#mlflow/endpoints/en_summarize_clinical_jsl_ENDPOINT\n",
+            "Out[7]:</div>
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to re-create it\nDeleting existing Endpoint en_summarize_clinical_jsl_ENDPOINT\nWriting license to scope JSL_SCOPE\nCreating new serving endpoint: en_summarize_clinical_jsl_ENDPOINT\nDeployment starting, this may take 10 to 20 minutes...\nCreated serving endpoint en_summarize_clinical_jsl_ENDPOINT at https://dbc-3d4c44aa-a512.cloud.databricks.com/#mlflow/endpoints/en_summarize_clinical_jsl_ENDPOINT\nOut[7]:</div>
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + }, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexsentencesummarytext
00Patient with hypertension, syncope, and spinal...The patient has hypertension, syncope, and spi...Patient with hypertension, syncope, and spinal...
10The patient is a 78-year-old female who return...A 78-year-old female returns for a recheck aft...Patient with hypertension, syncope, and spinal...
20She has hypertension.The patient has hypertension.Patient with hypertension, syncope, and spinal...
30She denies difficulty with chest pain, palpati...The patient denies any difficulty with chest p...Patient with hypertension, syncope, and spinal...
\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
indexsentencesummarytext
00Patient with hypertension, syncope, and spinal...The patient has hypertension, syncope, and spi...Patient with hypertension, syncope, and spinal...
10The patient is a 78-year-old female who return...A 78-year-old female returns for a recheck aft...Patient with hypertension, syncope, and spinal...
20She has hypertension.The patient has hypertension.Patient with hypertension, syncope, and spinal...
30She denies difficulty with chest pain, palpati...The patient denies any difficulty with chest p...Patient with hypertension, syncope, and spinal...
\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "# Deploy & Query summarizer model https://nlp.johnsnowlabs.com/2023/03/25/summarizer_clinical_jsl.html\n", + "endpoint_name = nlp.deploy_endpoint('en.summarize.clinical_jsl')\n", + "nlp.query_endpoint(endpoint_name, long_text,workload_size='Large')" ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "
Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to cre-create it\nDeleting exisiting Endpoint en_summarize_clinical_jsl_ENDPOINT\nWriting license to scope JSL_SCOPE\nCreating new serving endpoint: en_summarize_clinical_jsl_ENDPOINT\nDeployment starting, this may take 10 to 20 minutes...\nCreated serving endpoint en_summarize_clinical_jsl_ENDPOINT at https://dbc-3d4c44aa-a512.cloud.databricks.com/#mlflow/endpoints/en_summarize_clinical_jsl_ENDPOINT\nOut[7]:
", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "type": "html" - } - }, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
indexsentencesummarytext
00Patient with hypertension, syncope, and spinal...The patient has hypertension, syncope, and spi...Patient with hypertension, syncope, and spinal...
10The patient is a 78-year-old female who return...A 78-year-old female returns for a recheck aft...Patient with hypertension, syncope, and spinal...
20She has hypertension.The patient has hypertension.Patient with hypertension, syncope, and spinal...
30She denies difficulty with chest pain, palpati...The patient denies any difficulty with chest p...Patient with hypertension, syncope, and spinal...
\n", - "
" + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "92febe30-157d-4071-9b1f-ffcc2f524afc", + "showTitle": false, + "title": "" + }, + "id": "-xjDmeErjNUO" + }, + "source": [ + "# 3) Deploying&Query Custom Pipelines\n", + "\n", + "You must specify `endpoint_name` when deploying custom pipelines" ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
indexsentencesummarytext
00Patient with hypertension, syncope, and spinal...The patient has hypertension, syncope, and spi...Patient with hypertension, syncope, and spinal...
10The patient is a 78-year-old female who return...A 78-year-old female returns for a recheck aft...Patient with hypertension, syncope, and spinal...
20She has hypertension.The patient has hypertension.Patient with hypertension, syncope, and spinal...
30She denies difficulty with chest pain, palpati...The patient denies any difficulty with chest p...Patient with hypertension, syncope, and spinal...
\n
", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "textData": null, - "type": "htmlSandbox" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "# Deploy & Query summarizer model https://nlp.johnsnowlabs.com/2023/03/25/summarizer_clinical_jsl.html \n", - "nlp.query_and_deploy_if_missing('en.summarize.clinical_jsl',long_text,workload_size='Large')\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "92febe30-157d-4071-9b1f-ffcc2f524afc", - "showTitle": false, - "title": "" - } - }, - "source": [ - "# 3) Deploying&Query Custom Pipelines" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "cd65798b-6a61-4005-8ab7-2e894abd99ae", - "showTitle": false, - "title": "" - } - }, - "outputs": [ + }, { - "data": { - "text/html": [ - "\n", - "
Warning::Spark Session already created, some configs may not take.\n", - "Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to cre-create it\n", - "Endpoint custom_pipe_ENDPOINT already exists! Set re_create_endpoint=True if you want to re-create it \n", - "Out[14]:
" + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "cd65798b-6a61-4005-8ab7-2e894abd99ae", + "showTitle": false, + "title": "" + }, + "id": "eWAlbw-DjNUO", + "outputId": "0e4e20d4-c563-400b-e12b-ae1422d56891" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
Warning::Spark Session already created, some configs may not take.\n", + "2023/11/17 01:57:51 WARNING mlflow.models.model: Model logged without a signature. Signatures will be required for upcoming model registry features as they validate model inputs and denote the expected schema of model outputs. Please visit https://www.mlflow.org/docs/2.40.0/models.html#set-signature-on-logged-model for instructions on setting a model signature on your logged model.\n", + "Saving model to /tmp/mlflow/4b24734b-4afc-467a-a292-adc8d651ea36\n", + "Warning::Spark Session already created, some configs may not take.\n", + "WARNING: No Visual NLP jar found!\n", + "Successfully registered model 'my_simple_custom_pipe'.\n", + "2023/11/17 01:57:59 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: my_simple_custom_pipe, version 1\n", + "Created version '1' of model 'my_simple_custom_pipe'.\n", + "2023/11/17 01:58:27 WARNING mlflow.models.model: Model logged without a signature. Signatures will be required for upcoming model registry features as they validate model inputs and denote the expected schema of model outputs. Please visit https://www.mlflow.org/docs/2.40.0/models.html#set-signature-on-logged-model for instructions on setting a model signature on your logged model.\n", + "Registered model 'my_simple_custom_pipe' already exists. Creating a new version of this model...\n", + "2023/11/17 01:59:36 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: my_simple_custom_pipe, version 2\n", + "Created version '2' of model 'my_simple_custom_pipe'.\n", + "Writing license to scope JSL_SCOPE\n", + "Creating new serving endpoint: my_simple_custom_pipe\n", + "Deployment starting, this may take 10 to 20 minutes...\n", + "Created serving endpoint my_simple_custom_pipe at https://dbc-3d4c44aa-a512.cloud.databricks.com/#mlflow/endpoints/my_simple_custom_pipe\n", + "Out[7]:
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
Warning::Spark Session already created, some configs may not take.\n2023/11/17 01:57:51 WARNING mlflow.models.model: Model logged without a signature. Signatures will be required for upcoming model registry features as they validate model inputs and denote the expected schema of model outputs. Please visit https://www.mlflow.org/docs/2.40.0/models.html#set-signature-on-logged-model for instructions on setting a model signature on your logged model.\nSaving model to /tmp/mlflow/4b24734b-4afc-467a-a292-adc8d651ea36\nWarning::Spark Session already created, some configs may not take.\nWARNING: No Visual NLP jar found!\nSuccessfully registered model 'my_simple_custom_pipe'.\n2023/11/17 01:57:59 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: my_simple_custom_pipe, version 1\nCreated version '1' of model 'my_simple_custom_pipe'.\n2023/11/17 01:58:27 WARNING mlflow.models.model: Model logged without a signature. Signatures will be required for upcoming model registry features as they validate model inputs and denote the expected schema of model outputs. Please visit https://www.mlflow.org/docs/2.40.0/models.html#set-signature-on-logged-model for instructions on setting a model signature on your logged model.\nRegistered model 'my_simple_custom_pipe' already exists. Creating a new version of this model...\n2023/11/17 01:59:36 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: my_simple_custom_pipe, version 2\nCreated version '2' of model 'my_simple_custom_pipe'.\nWriting license to scope JSL_SCOPE\nCreating new serving endpoint: my_simple_custom_pipe\nDeployment starting, this may take 10 to 20 minutes...\nCreated serving endpoint my_simple_custom_pipe at https://dbc-3d4c44aa-a512.cloud.databricks.com/#mlflow/endpoints/my_simple_custom_pipe\nOut[7]:
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + } + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indextexttoken
00hello worldhello
10hello worldworld
\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
indextexttoken
00hello worldhello
10hello worldworld
\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + } + } + ], + "source": [ + "# Example of simple custom Pipeline\n", + "documentAssembler = nlp.DocumentAssembler().setInputCol(\"text\").setOutputCol(\"document\")\n", + "tokenizer = nlp.Tokenizer().setInputCols(\"document\").setOutputCol(\"token\")\n", + "fitted_pipe = nlp.Pipeline(\n", + " stages=[documentAssembler, tokenizer]).fit(\n", + " spark.createDataFrame([[\"\"]]).toDF(\"text\"))\n", + "\n", + "\n", + "# You must specify endpoint_name for custom pipes\n", + "nlp.deploy_endpoint(fitted_pipe,endpoint_name='my_simple_custom_pipe')\n", + "nlp.query_endpoint('my_simple_custom_pipe','hello world')" ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "
Warning::Spark Session already created, some configs may not take.\nModel already has been logged, skipping logging and using latest. Set re_create_model=True if you want to cre-create it\nEndpoint custom_pipe_ENDPOINT already exists! Set re_create_endpoint=True if you want to re-create it \nOut[14]:
", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "type": "html" - } - }, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
indextexttoken
00This isThis
10This isis
21my custom pipemy
31my custom pipecustom
41my custom pipepipe
52and data for itand
62and data for itdata
72and data for itfor
82and data for itit
\n", - "
" + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "914e3bf8-cf56-4bb8-bcad-96413f5bb266", + "showTitle": false, + "title": "" + }, + "id": "UeLPAnSEjNUO", + "outputId": "94f181b0-2e20-49a7-b2e2-0c241f9a2257" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
glove_100d download started this may take some time.\n",
+            "Approximate size to download 145.3 MB\n",
+            "\r[ | ]\r[OK!]\n",
+            "Warning::Spark Session already created, some configs may not take.\n",
+            "Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to re-create it\n",
+            "Endpoint complex_custom_pipe already exists! Set re_create_endpoint=True if you want to re-create it \n",
+            "Out[21]:</div>
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
glove_100d download started this may take some time.\nApproximate size to download 145.3 MB\n\r[ | ]\r[OK!]\nWarning::Spark Session already created, some configs may not take.\nModel already has been logged, skipping logging and using latest. Set re_create_model=True if you want to re-create it\nEndpoint complex_custom_pipe already exists! Set re_create_endpoint=True if you want to re-create it \nOut[21]:</div>
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + } + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexassertionassertion_confidenceassertion_idbeginchunkchunk_idchunk_numdocumentendentitylabelsent_beginsent_endsentence_idtask_idtexttkn_endtkn_startword_embedding_from_disk
0000.584610Lorem11Lorem1A00311Lorem10[[-0.028802, 0.1998700052, -0.3181200027, 0.21...
\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
indexassertionassertion_confidenceassertion_idbeginchunkchunk_idchunk_numdocumentendentitylabelsent_beginsent_endsentence_idtask_idtexttkn_endtkn_startword_embedding_from_disk
0000.584610Lorem11Lorem1A00311Lorem10[[-0.028802, 0.1998700052, -0.3181200027, 0.21...
\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + } + } + ], + "source": [ + "# Example of complex custom pipe\n", + "# This pipe is expecting multiple external input features\n", + "# You must provide them when when sending query\n", + "import pandas as pd\n", + "from johnsnowlabs import nlp,medical\n", + "from pyspark.sql import SparkSession\n", + "# Dummy data\n", + "dummy_data = {\n", + " \"assertion_id\": ['1', '2', '3'],\n", + " \"label\": ['0', '1', '0'],\n", + " \"begin\": ['0', '0', '0'],\n", + " \"chunk_id\": ['1', '2', '3'],\n", + " \"entity\": [\"A\", \"B\", \"C\"],\n", + " \"chunk_num\": ['1', '2', '3'],\n", + " \"chunk\": [\"Lorem\", \"ipsum\", \"dolor\"],\n", + " \"end\": ['1', '1', '1'],\n", + " \"sentence_id\": ['1', '1', '2'],\n", + " \"text\": [\"Lorem ipsum\", \"dolor sit\", \"amet\"],\n", + " \"sent_begin\": ['0', '0', '0'],\n", + " \"sent_end\": ['3', '3', '3'],\n", + " \"task_id\": ['1', '2', '1'],\n", + " \"tkn_start\": ['0', '0', '0'],\n", + " \"tkn_end\": ['1', '1', '1']\n", + "}\n", + "\n", + "\n", + "# Create Spark DataFrame\n", + "df = spark.createDataFrame(pd.DataFrame(dummy_data))\n", + "\n", + "\n", + "document = nlp.DocumentAssembler()\\\n", + " .setInputCol(\"text\")\\\n", + " .setOutputCol(\"document\")\n", + "\n", + "chunk = nlp.Doc2Chunk()\\\n", + " .setInputCols(\"document\")\\\n", + " .setOutputCol(\"assertion_chunk\")\\\n", + " .setChunkCol(\"chunk\")\\\n", + " .setStartCol(\"tkn_start\")\\\n", + " .setStartColByTokenIndex(True)\\\n", + " .setFailOnMissing(False)\\\n", + " .setLowerCase(False)\n", + "\n", + "token = nlp.Tokenizer()\\\n", + " .setInputCols(['document'])\\\n", + " .setOutputCol('token')\\\n", + " .setContextChars([' ', '_', '/', '.', '(', ')', ':', ',', '-', ''',''', '\"', '\"', '\"', '\\'', ';', '#', '{', '}', '[', ']', '*', '!', '?', '+'])\n", + "embeddings = (\n", + " nlp.WordEmbeddingsModel.pretrained()\n", + " .setInputCols([\"document\", \"token\"])\n", + " .setOutputCol(\"embeddings\")\n", + " )\n", + "\n", + "assertionStatus = medical.AssertionDLApproach()\\\n", + " .setLabelCol(\"label\")\\\n", + " .setInputCols(\"document\", \"assertion_chunk\", \"embeddings\")\\\n", + " .setOutputCol(\"prediction\")\\\n", + " .setBatchSize(32)\\\n", + " .setDropout(0.2)\\\n", + " .setLearningRate(0.001)\\\n", + " .setEpochs(1)\\\n", + " .setValidationSplit(0.2)\\\n", + " .setStartCol(\"tkn_start\")\\\n", + " .setEndCol(\"tkn_end\")\\\n", + " .setScopeWindow([12,9])\\\n", + " .setIncludeConfidence(True)\\\n", + " .setEnableOutputLogs(True)\\\n", + " .setOutputLogsPath('../assertion/logs/mlflow/')\n", + "\n", + "\n", + "fitted_pipe = nlp.Pipeline(\n", + " stages = [\n", + " document,\n", + " chunk,\n", + " token,\n", + " embeddings,\n", + " assertionStatus]).fit(df)\n", + "\n", + "json_query = \"\"\"\n", + "{\n", + " \"dataframe_split\": {\n", + " \"columns\": [\"assertion_id\", \"label\", \"begin\", \"chunk_id\", \"entity\", \"chunk_num\", \"chunk\", \"end\", \"sentence_id\", \"text\", \"sent_begin\", \"sent_end\", \"task_id\", \"tkn_start\", \"tkn_end\"],\n", + " \"data\": [[\"1\", \"0\", \"0\", \"1\", \"A\", \"1\", \"Lorem\", \"1\", \"1\", \"Lorem\", \"0\", \"3\", \"1\", \"0\", \"1\"]]\n", + "}\n", + "}\n", + "\"\"\"\n", + "nlp.deploy_endpoint(fitted_pipe,endpoint_name='complex_custom_pipe')\n", + "nlp.query_endpoint('complex_custom_pipe',json_query,is_json_query=True)" ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": 
{}, - "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
indextexttoken
00This isThis
10This isis
21my custom pipemy
31my custom pipecustom
41my custom pipepipe
52and data for itand
62and data for itdata
72and data for itfor
82and data for itit
\n
", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "textData": null, - "type": "htmlSandbox" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "# Example of simple custom Pipeline\n", - "documentAssembler = nlp.DocumentAssembler().setInputCol(\"text\").setOutputCol(\"document\")\n", - "tokenizer = nlp.Tokenizer().setInputCols(\"document\").setOutputCol(\"token\")\n", - "fitted_pipe = nlp.Pipeline(\n", - " stages=[documentAssembler, tokenizer]).fit(\n", - " spark.createDataFrame([[\"\"]]).toDF(\"text\"))\n", - "nlp.query_and_deploy_if_missing(fitted_pipe,['This is ', 'my custom pipe', 'and data for it'],base_name='custom_pipe')\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "914e3bf8-cf56-4bb8-bcad-96413f5bb266", - "showTitle": false, - "title": "" - } - }, - "outputs": [ + }, { - "data": { - "text/html": [ - "\n", - "
Warning::Spark Session already created, some configs may not take.\n", - "Model already has been logged, skipping logging and using latest. Set re_create_model=True if you want to cre-create it\n", - "Endpoint complex_custom_pipe_ENDPOINT already exists! Set re_create_endpoint=True if you want to re-create it \n", - "Out[25]:
" + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "016ec44e-70cd-4bf1-b502-820fb303116d", + "showTitle": false, + "title": "" + }, + "id": "brJU0yu0jNUO" + }, + "source": [ + "# 4) Delete all endpoints\n", + "\n", + "# You can uncomment and run this to clean up all endpoints" ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "
Warning::Spark Session already created, some configs may not take.\nModel already has been logged, skipping logging and using latest. Set re_create_model=True if you want to cre-create it\nEndpoint complex_custom_pipe_ENDPOINT already exists! Set re_create_endpoint=True if you want to re-create it \nOut[25]:
", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "type": "html" - } - }, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
indexassertionassertion_confidenceassertion_idbeginchunkchunk_idchunk_numdocumentendentitylabelsent_beginsent_endsentence_idtask_idtexttkn_endtkn_startword_embedding_from_disk
0010.851910Lorem11Lorem1A00311Lorem10[[-0.028802, 0.1998700052, -0.3181200027, 0.21...
\n", - "
" + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "142ee3eb-7d63-4558-9399-23d115c6f6bb", + "showTitle": false, + "title": "" + }, + "id": "37XlerdQjNUO", + "outputId": "e2a472fc-ae93-4fb5-d066-43667830fede" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + } + } + ], + "source": [ + "from johnsnowlabs.auto_install.databricks.endpoints import delete_all_endpoints\n", + "# delete_all_endpoints()" ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
indexassertionassertion_confidenceassertion_idbeginchunkchunk_idchunk_numdocumentendentitylabelsent_beginsent_endsentence_idtask_idtexttkn_endtkn_startword_embedding_from_disk
0010.851910Lorem11Lorem1A00311Lorem10[[-0.028802, 0.1998700052, -0.3181200027, 0.21...
\n
", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "textData": null, - "type": "htmlSandbox" - } - }, - "output_type": "display_data" } - ], - "source": [ - "# Example of complex custom pipe\n", - "# This pipe is expecting multiple external input features \n", - "# You must provide them when when sending query\n", - "import pandas as pd\n", - "from johnsnowlabs import nlp,medical\n", - "from pyspark.sql import SparkSession\n", - "# Dummy data\n", - "dummy_data = {\n", - " \"assertion_id\": ['1', '2', '3'],\n", - " \"label\": ['0', '1', '0'],\n", - " \"begin\": ['0', '0', '0'],\n", - " \"chunk_id\": ['1', '2', '3'],\n", - " \"entity\": [\"A\", \"B\", \"C\"],\n", - " \"chunk_num\": ['1', '2', '3'],\n", - " \"chunk\": [\"Lorem\", \"ipsum\", \"dolor\"],\n", - " \"end\": ['1', '1', '1'],\n", - " \"sentence_id\": ['1', '1', '2'],\n", - " \"text\": [\"Lorem ipsum\", \"dolor sit\", \"amet\"],\n", - " \"sent_begin\": ['0', '0', '0'],\n", - " \"sent_end\": ['3', '3', '3'],\n", - " \"task_id\": ['1', '2', '1'],\n", - " \"tkn_start\": ['0', '0', '0'],\n", - " \"tkn_end\": ['1', '1', '1']\n", - "}\n", - "\n", - "\n", - "# Create Spark DataFrame\n", - "df = spark.createDataFrame(pd.DataFrame(dummy_data))\n", - "\n", - "\n", - "document = nlp.DocumentAssembler()\\\n", - " .setInputCol(\"text\")\\\n", - " .setOutputCol(\"document\")\n", - "\n", - "chunk = nlp.Doc2Chunk()\\\n", - " .setInputCols(\"document\")\\\n", - " .setOutputCol(\"assertion_chunk\")\\\n", - " .setChunkCol(\"chunk\")\\\n", - " .setStartCol(\"tkn_start\")\\\n", - " .setStartColByTokenIndex(True)\\\n", - " .setFailOnMissing(False)\\\n", - " .setLowerCase(False)\n", - "\n", - "token = nlp.Tokenizer()\\\n", - " .setInputCols(['document'])\\\n", - " .setOutputCol('token')\\\n", - " .setContextChars([' ', '_', '/', '.', '(', ')', ':', ',', '-', ''',''', '\"', '\"', '\"', '\\'', ';', '#', '{', '}', '[', ']', '*', '!', '?', '+'])\n", - "embeddings = (\n", - " nlp.WordEmbeddingsModel.pretrained()\n", - " .setInputCols([\"document\", \"token\"])\n", - " .setOutputCol(\"embeddings\")\n", - " )\n", - "\n", - "assertionStatus = medical.AssertionDLApproach()\\\n", - " .setLabelCol(\"label\")\\\n", - " .setInputCols(\"document\", \"assertion_chunk\", \"embeddings\")\\\n", - " .setOutputCol(\"prediction\")\\\n", - " .setBatchSize(32)\\\n", - " .setDropout(0.2)\\\n", - " .setLearningRate(0.001)\\\n", - " .setEpochs(1)\\\n", - " .setValidationSplit(0.2)\\\n", - " .setStartCol(\"tkn_start\")\\\n", - " .setEndCol(\"tkn_end\")\\\n", - " .setScopeWindow([12,9])\\\n", - " .setIncludeConfidence(True)\\\n", - " .setEnableOutputLogs(True)\\\n", - " .setOutputLogsPath('../assertion/logs/mlflow/')\n", - "\n", - "\n", - "fitted_pipe = nlp.Pipeline(\n", - " stages = [\n", - " document,\n", - " chunk,\n", - " token,\n", - " embeddings,\n", - " assertionStatus]).fit(df)\n", - "\n", - "json_query = \"\"\"\n", - "{\n", - " \"dataframe_split\": {\n", - " \"columns\": [\"assertion_id\", \"label\", \"begin\", \"chunk_id\", \"entity\", \"chunk_num\", \"chunk\", \"end\", \"sentence_id\", \"text\", \"sent_begin\", \"sent_end\", \"task_id\", \"tkn_start\", \"tkn_end\"],\n", - " \"data\": [[\"1\", \"0\", \"0\", \"1\", \"A\", \"1\", \"Lorem\", \"1\", \"1\", \"Lorem\", \"0\", \"3\", \"1\", \"0\", \"1\"]]\n", - "}\n", - "}\n", - "\"\"\"\n", - "nlp.query_and_deploy_if_missing(fitted_pipe,json_query,base_name='complex_custom_pipe',is_json_query=True,workload_size='Large')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - 
"application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "016ec44e-70cd-4bf1-b502-820fb303116d", - "showTitle": false, - "title": "" - } - }, - "source": [ - "# 4) Delete all endpoints\n", - "\n", - "# You can uncomment and run this to clean up all endpoints" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "142ee3eb-7d63-4558-9399-23d115c6f6bb", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "
", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "type": "html" - } - }, - "output_type": "display_data" + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "databricks_endpoints_tutorial_PUBLIC", + "widgets": {} + }, + "colab": { + "provenance": [] } - ], - "source": [ - "from johnsnowlabs.auto_install.databricks.endpoints import delete_all_endpoints\n", - "# delete_all_endpoints()" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 4 - }, - "notebookName": "databricks_endpoints_tutorial_PUBLIC_BACKUP", - "widgets": {} }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/notebooks/haystack_with_johnsnowlabs.ipynb b/notebooks/haystack_with_johnsnowlabs.ipynb new file mode 100644 index 0000000000..c6a5afaffa --- /dev/null +++ b/notebooks/haystack_with_johnsnowlabs.ipynb @@ -0,0 +1,1073 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "1eaaa772a41a4abda538965d76ddff87": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_6aaeb75f42a947aa9178130cbb28e43a", + "IPY_MODEL_15af62b24c7148e8b736d805bc800e62", + "IPY_MODEL_51ef52743f5446dfabda362253b19b1f" + ], + "layout": "IPY_MODEL_aced084f506a4cd682e8da83bdfb1e68" + } + }, + "6aaeb75f42a947aa9178130cbb28e43a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0d378bc994a6482d8b13454565176f06", + "placeholder": "​", + "style": "IPY_MODEL_9804a7ed7233493cb2a14a04c5eabe31", + "value": "Preprocessing: 100%" + } + }, + "15af62b24c7148e8b736d805bc800e62": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4182cae4c4674beb967a92754eecfc34", + "max": 723, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_488ba2172ee141569980eb6be70d225b", + "value": 723 + } + }, + 
"51ef52743f5446dfabda362253b19b1f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3dc7773eeef04a30aa38711140f1d3c7", + "placeholder": "​", + "style": "IPY_MODEL_6e431a4e46f648688f0af665fa315ac3", + "value": " 723/723 [00:17<00:00, 68.52docs/s]" + } + }, + "aced084f506a4cd682e8da83bdfb1e68": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0d378bc994a6482d8b13454565176f06": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9804a7ed7233493cb2a14a04c5eabe31": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4182cae4c4674beb967a92754eecfc34": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "488ba2172ee141569980eb6be70d225b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "3dc7773eeef04a30aa38711140f1d3c7": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6e431a4e46f648688f0af665fa315ac3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + 
"126b8af600354a48a721d26064c22ed8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_d95e4f184c684e118ba47cf119d901c0", + "IPY_MODEL_a888d53b771e47618995b2d7e1deb40d", + "IPY_MODEL_4ae36ee20c5143eabb1b63bf993e68ca" + ], + "layout": "IPY_MODEL_427a3dcdea7645feb8f3326b2bcf16ee" + } + }, + "d95e4f184c684e118ba47cf119d901c0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_89b6d8480fa14c19a16130fb36fdc924", + "placeholder": "​", + "style": "IPY_MODEL_3d577af888f84735a4add9068b549559", + "value": "Preprocessing: 100%" + } + }, + "a888d53b771e47618995b2d7e1deb40d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f002f053acbb4f0488023618371a3fff", + "max": 723, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2cd1a6ca97a243a5835815eab1d27d3c", + "value": 723 + } + }, + "4ae36ee20c5143eabb1b63bf993e68ca": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_dcee6bd3ccab4abc9b7990a20e11d2f5", + "placeholder": "​", + "style": "IPY_MODEL_ef3ef7d5c4dd4240af3a0e1c29823047", + "value": " 723/723 [00:07<00:00, 85.73docs/s]" + } + }, + "427a3dcdea7645feb8f3326b2bcf16ee": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + 
"grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "89b6d8480fa14c19a16130fb36fdc924": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3d577af888f84735a4add9068b549559": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f002f053acbb4f0488023618371a3fff": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2cd1a6ca97a243a5835815eab1d27d3c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": 
"1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "dcee6bd3ccab4abc9b7990a20e11d2f5": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ef3ef7d5c4dd4240af3a0e1c29823047": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + " \n", + "\n", + "https://haystack.deepset.ai [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/johnsnowlabs/blob/master/notebooks/haystack_with_johnsnowlabs.ipynb)\n", + "\n", + "\n", + "This tutorial showcase how to use [Johnsnowlabs Components with Langchain](https://nlp.johnsnowlabs.com/docs/en/jsl/haystack-utils) for Scalable Pre-Processing and Embedding computation on clusters\n", + "\n", + "If you want to scale this, you can re-use this code in a spark-cluster created with [nlp.install_to_databricks()](https://nlp.johnsnowlabs.com/docs/en/jsl/install_advanced#into-a-freshly-created-databricks-cluster-automatically)" + ], + "metadata": { + "id": "LIUmU42lYLMA" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "u3eDorlyQ0Sr" + }, + "outputs": [], + "source": [ + "! pip install johnsnowlabs\n", + "from johnsnowlabs import nlp\n", + "! pip install 'farm-haystack[all]'\n", + "! 
pip install tensorflow==2.14\n", + "nlp.start()\n", + "\n", + "# restart session after installing everything\n", + "import os\n", + "os.kill(os.getpid(), 9)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Download some Sample Data and Convert to Haystack Documents" + ], + "metadata": { + "id": "TlnJcqZAmrT3" + } + }, + { + "cell_type": "code", + "source": [ + "# Download some sample data we use as a mini-db\n", + "! wget https://raw.githubusercontent.com/langchain-ai/langchain/master/docs/docs/modules/state_of_the_union.txt\n", + "\n", + "from haystack import Document\n", + "def create_documents_from_file(file_path):\n", + " # Helper func for reading files as Haystack Document object\n", + " # returns a list of Document with one object for every line in file_path\n", + " documents = []\n", + " with open(file_path, 'r') as file:\n", + " for id, line in enumerate(file):\n", + " documents.append(Document(content=line.strip(), content_type=\"text\", id=id))\n", + " return documents\n" + ], + "metadata": { + "id": "25uCldigJec4", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "1e8756f1-9567-43db-dc58-700cc103f182" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--2023-11-17 03:16:53-- https://raw.githubusercontent.com/langchain-ai/langchain/master/docs/docs/modules/state_of_the_union.txt\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 39028 (38K) [text/plain]\n", + "Saving to: ‘state_of_the_union.txt.3’\n", + "\n", + "\rstate_of_the_union. 0%[ ] 0 --.-KB/s \rstate_of_the_union. 
100%[===================>] 38.11K --.-KB/s in 0.01s \n", + "\n", + "2023-11-17 03:16:53 (3.07 MB/s) - ‘state_of_the_union.txt.3’ saved [39028/39028]\n", + "\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Create a Haystack pipe\n", + "We will add a `JohnSnowLabsHaystackProcessor` and `JohnSnowLabsHaystackEmbedder` for fully distributed computation on spark-clusters.\n", + "\n", + "In this simple example we split documents and store their embeddings in the document store for RAG applications" + ], + "metadata": { + "id": "4KR1mozkmvBu" + } + }, + { + "cell_type": "code", + "source": [ + "from johnsnowlabs.llm import embedding_retrieval\n", + "from haystack.nodes import PreProcessor\n", + "from haystack import Pipeline\n", + "from haystack.document_stores import InMemoryDocumentStore\n", + "\n", + "def get_hay_jsl_pipe(model_name='en.embed_sentence.bert_base_uncased',embed_dim=512):\n", + " # small example Haystack pipeline demonstrating JohnSnowLabsHaystackProcessor\n", + "\n", + "\n", + " # JohnSnowLabsHaystackProcessor support all parameters of JSl DocumentSplitter\n", + " processor = embedding_retrieval.JohnSnowLabsHaystackProcessor(\n", + " chunk_overlap=2,\n", + " chunk_size=20,\n", + " explode_splits=True,\n", + " keep_seperators=True,\n", + " patterns_are_regex=False,\n", + " split_patterns=[\"\\n\\n\", \"\\n\", \" \", \"\"],\n", + " trim_whitespace=True,\n", + " )\n", + "\n", + " # Write some processed data to Doc store, so we can retrieve it later\n", + " document_store = InMemoryDocumentStore(embedding_dim=embed_dim)\n", + " document_store.write_documents(processor.process(create_documents_from_file(\"state_of_the_union.txt\")))\n", + "\n", + " # If you want to use GPU, make sure you ran nlp.start(hardware_target='gpu') !\n", + " retriever = embedding_retrieval.JohnSnowLabsHaystackEmbedder(\n", + " embedding_model=model_name,\n", + " document_store=document_store,\n", + " use_gpu=False,\n", + " )\n", + "\n", + " document_store.update_embeddings(retriever)\n", + "\n", + " pipe = Pipeline()\n", + " pipe.add_node(component=processor, name=\"Preprocess\", inputs=[\"Query\"])\n", + " pipe.add_node(component=retriever, name=\"Embed&Retrieve\", inputs=[\"Query\"])\n", + " return pipe\n", + "\n" + ], + "metadata": { + "id": "HAUSU4TuJp53" + }, + "execution_count": 4, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Create & Query the pipe\n", + "We will get the top K most similar results to our query.\n", + "You can use [any Sentence Embedding](https://nlp.johnsnowlabs.com/models?task=Embeddings) from John Snow Labs by passing the **nlu_reference** of any Sentence Embedder.\n" + ], + "metadata": { + "id": "mlIgbmMNnFMn" + } + }, + { + "cell_type": "code", + "source": [ + "use_pipe = get_hay_jsl_pipe('en.embed_sentence.use', embed_dim=512)\n", + "result = use_pipe.run(query=\"Who is the first lady\")\n", + "for r in result['documents']:\n", + " print(r.to_dict())" + ], + "metadata": { + "id": "Au6ZftFXKD7V", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 379, + "referenced_widgets": [ + "1eaaa772a41a4abda538965d76ddff87", + "6aaeb75f42a947aa9178130cbb28e43a", + "15af62b24c7148e8b736d805bc800e62", + "51ef52743f5446dfabda362253b19b1f", + "aced084f506a4cd682e8da83bdfb1e68", + "0d378bc994a6482d8b13454565176f06", + "9804a7ed7233493cb2a14a04c5eabe31", + "4182cae4c4674beb967a92754eecfc34", + "488ba2172ee141569980eb6be70d225b", + "3dc7773eeef04a30aa38711140f1d3c7", + "6e431a4e46f648688f0af665fa315ac3" + ] + }, + "outputId": 
"6cba71fe-b04c-46b1-d31e-2e8d40ba2c31" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Spark Session already created, some configs may not take.\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Preprocessing: 0%| | 0/723 [00:00 \n", + "https://www.langchain.com\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/johnsnowlabs/blob/master/notebooks/langchain_with_johnsnowlabs.ipynb)\n", + "\n", + "\n", + "This tutorial showcase how to use [Johnsnowlabs Components with Langchain](https://nlp.johnsnowlabs.com/docs/en/jsl/langchain-utils) for Scalable Pre-Processing and Embedding computation on clusters\n", + "\n", + "If you want to scale this, you can re-use this code in a spark-cluster created with [nlp.install_to_databricks()](https://nlp.johnsnowlabs.com/docs/en/jsl/install_advanced#into-a-freshly-created-databricks-cluster-automatically)" + ], + "id": "47d2a8ca" + }, + { + "cell_type": "markdown", + "source": [ + "# Installing dependencies & Downloading the jsl_embedder" + ], + "metadata": { + "id": "Ll5vk27UNgCD" + }, + "id": "Ll5vk27UNgCD" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GASkYNwoaW6Z" + }, + "outputs": [], + "source": [ + "! pip install johnsnowlabs\n", + "from johnsnowlabs import nlp\n", + "nlp.start()\n", + "! pip install langchain openai tiktoken faiss-cpu\n", + "\n", + "# restart session after installing evertything\n", + "import os\n", + "os.kill(os.getpid(), 9)\n" + ], + "id": "GASkYNwoaW6Z" + }, + { + "cell_type": "markdown", + "source": [ + "# Langchain based JSL-Embedder and Text Splitters\n", + "based on this [conversational_retrieval_agents tutorial](https://python.langchain.com/docs/use_cases/question_answering/conversational_retrieval_agents) building a mini RAG system" + ], + "metadata": { + "id": "amC-VYnzbtih" + }, + "id": "amC-VYnzbtih" + }, + { + "cell_type": "markdown", + "source": [ + "## Download some Sample Data" + ], + "metadata": { + "id": "ADv47UUhngtN" + }, + "id": "ADv47UUhngtN" + }, + { + "cell_type": "code", + "source": [ + "# Download some sample data we use as a mini-db\n", + "! wget https://raw.githubusercontent.com/langchain-ai/langchain/master/docs/docs/modules/state_of_the_union.txt" + ], + "metadata": { + "id": "gV6-sHwMlyHZ", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "14633f0a-df1a-4a66-f756-cb1c30997d1c" + }, + "id": "gV6-sHwMlyHZ", + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--2023-11-17 03:56:24-- https://raw.githubusercontent.com/langchain-ai/langchain/master/docs/docs/modules/state_of_the_union.txt\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 39028 (38K) [text/plain]\n", + "Saving to: ‘state_of_the_union.txt’\n", + "\n", + "state_of_the_union. 
100%[===================>] 38.11K --.-KB/s in 0.007s \n", + "\n", + "2023-11-17 03:56:25 (5.11 MB/s) - ‘state_of_the_union.txt’ saved [39028/39028]\n", + "\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Load data as Langchain Docs" + ], + "metadata": { + "id": "wLmeVOr9niNq" + }, + "id": "wLmeVOr9niNq" + }, + { + "cell_type": "code", + "source": [ + "from langchain.document_loaders import TextLoader\n", + "loader = TextLoader('/content/state_of_the_union.txt')\n", + "documents = loader.load()\n" + ], + "metadata": { + "id": "E2do2KPonk1F" + }, + "id": "E2do2KPonk1F", + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Create Pre-Processor which is connected to Spark-Cluster and **pre-processes documents** distributed" + ], + "metadata": { + "id": "FbNTjSeenpDM" + }, + "id": "FbNTjSeenpDM" + }, + { + "cell_type": "code", + "source": [ + "from johnsnowlabs.llm import embedding_retrieval\n", + "jsl_splitter = embedding_retrieval.JohnSnowLabsLangChainCharSplitter(\n", + " chunk_overlap=2,\n", + " chunk_size=20,\n", + " explode_splits=True,\n", + " keep_seperators=True,\n", + " patterns_are_regex=False,\n", + " split_patterns=[\"\\n\\n\", \"\\n\", \" \", \"\"],\n", + " trim_whitespace=True,\n", + "\n", + ")\n", + "texts = jsl_splitter.split_documents(documents)" + ], + "metadata": { + "id": "GrUZfwYOm49V", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "729dfe7c-ebbf-442b-d3dc-0408debb6df0" + }, + "id": "GrUZfwYOm49V", + "execution_count": 9, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Spark Session already created, some configs may not take.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Create Pre-Processor which is connected to Spark-Cluster and **Embeds documents** distributed" + ], + "metadata": { + "id": "Qd2GA3-nn38L" + }, + "id": "Qd2GA3-nn38L" + }, + { + "cell_type": "code", + "source": [ + "from langchain.vectorstores import FAISS\n", + "embeddings = embedding_retrieval.JohnSnowLabsLangChainEmbedder('en.embed_sentence.bert_base_uncased')\n", + "db = FAISS.from_documents(texts, embeddings)\n", + "retriever = db.as_retriever()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "eqO8Gqhdn2XA", + "outputId": "e5bd1935-4614-4d3a-cc59-2f853bd41b4c" + }, + "id": "eqO8Gqhdn2XA", + "execution_count": 8, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Spark Session already created, some configs may not take.\n", + "Warning::Spark Session already created, some configs may not take.\n", + "Warning::Spark Session already created, some configs may not take.\n", + "sent_bert_base_uncased download started this may take some time.\n", + "Approximate size to download 392.5 MB\n", + "[OK!]\n", + "Warning::Spark Session already created, some configs may not take.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Create a Tool with the Distributed Embedding Retriever" + ], + "metadata": { + "id": "8YkeV-jYoJTS" + }, + "id": "8YkeV-jYoJTS" + }, + { + "cell_type": "code", + "source": [ + "from langchain.agents.agent_toolkits import create_retriever_tool\n", + "tool = create_retriever_tool(\n", + " retriever,\n", + " \"search_state_of_union\",\n", + " \"Searches and returns documents regarding the state-of-the-union.\"\n", + ")\n", + "tools = [tool]\n" + ], + "metadata": { + "id": "YASZZUrcm5DR" + }, + "id": "YASZZUrcm5DR", + "execution_count": 5, + "outputs": [] + 
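Before handing the retriever to an agent in the next cell, it can help to sanity-check the FAISS index that was just filled with John Snow Labs embeddings. The snippet below is an illustrative sketch only: it assumes the `db` and `retriever` objects created in the cells above and uses standard LangChain calls (`similarity_search`, `get_relevant_documents`) rather than any johnsnowlabs-specific API, so it runs without an OpenAI key.

```python
# Minimal sanity check, assuming `db` (FAISS store) and `retriever` from the cells above.
# similarity_search() embeds the query with the same JSL embedder and returns the closest chunks.
hits = db.similarity_search("What did the president say about the economy?", k=3)
for doc in hits:
    print(doc.page_content)

# The retriever wraps the same index; this is what the retriever tool exposes to the agent.
print(retriever.get_relevant_documents("Who is the first lady?")[:3])
```

If the printed chunks look unrelated to the query, the embedding model or the splitter settings are worth revisiting before paying for LLM calls in the agent step.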
}, + { + "cell_type": "markdown", + "source": [ + "## Create an agent with access to the Tool" + ], + "metadata": { + "id": "305VsUk_oOy4" + }, + "id": "305VsUk_oOy4" + }, + { + "cell_type": "code", + "source": [ + "from langchain.agents.agent_toolkits import create_conversational_retrieval_agent\n", + "from langchain.chat_models import ChatOpenAI\n", + "\n", + "open_api_key = 'YOUR API KEY'\n", + "llm = ChatOpenAI(temperature = 0,openai_api_key=open_api_key)\n", + "agent_executor = create_conversational_retrieval_agent(llm, tools, verbose=True)" + ], + "metadata": { + "id": "aPijWx86m5MX" + }, + "id": "aPijWx86m5MX", + "execution_count": 12, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Query the Agent" + ], + "metadata": { + "id": "InRURraHoQvN" + }, + "id": "InRURraHoQvN" + }, + { + "cell_type": "code", + "source": [ + "result = agent_executor({\"input\": \"what did the president say about going to east of Columbus?\"})\n", + "result['output']" + ], + "metadata": { + "id": "D77Ayn29m5Sn", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 229 + }, + "outputId": "1c9aaa48-fdd2-425e-e1e9-6f74b2a97597" + }, + "id": "D77Ayn29m5Sn", + "execution_count": 7, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", + "\u001b[32;1m\u001b[1;3m\n", + "Invoking: `search_state_of_union` with `{'query': 'going to east of Columbus'}`\n", + "\n", + "\n", + "\u001b[0m\u001b[36;1m\u001b[1;3m[Document(page_content='miles east of', metadata={'source': '/content/state_of_the_union.txt'}), Document(page_content='in America.', metadata={'source': '/content/state_of_the_union.txt'}), Document(page_content='out of America.', metadata={'source': '/content/state_of_the_union.txt'}), Document(page_content='upside down.', metadata={'source': '/content/state_of_the_union.txt'})]\u001b[0m\u001b[32;1m\u001b[1;3mI'm sorry, but I couldn't find any specific information about the president's statement regarding going to the east of Columbus in the State of the Union address.\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "\"I'm sorry, but I couldn't find any specific information about the president's statement regarding going to the east of Columbus in the State of the Union address.\"" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 7 + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "AVfc4HZ_ZDYw" + }, + "id": "AVfc4HZ_ZDYw", + "execution_count": null, + "outputs": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/tests/pytest.ini b/pytest.ini similarity index 100% rename from tests/pytest.ini rename to pytest.ini diff --git a/tests/databricks/db_test_utils.py b/tests/databricks/db_test_utils.py index e4967e27e3..8c550ead30 100644 --- a/tests/databricks/db_test_utils.py +++ b/tests/databricks/db_test_utils.py @@ -2,6 
+2,7 @@ import time import pytest +from nlu.utils.environment.offline_load_utils import NLP_ref_to_NLU_ref from johnsnowlabs import nlp, settings from johnsnowlabs.auto_install.databricks.install_utils import ( @@ -11,12 +12,33 @@ wait_till_cluster_running, _get_cluster_id, ) -from johnsnowlabs.auto_install.health_checks.generate_endpoint_test import ( +from johnsnowlabs.auto_install.health_checks.generate_test import ( generate_endpoint_test, ) +from johnsnowlabs.auto_install.health_checks.generate_test import ( + generate_load_predict_test, +) from johnsnowlabs.utils.enums import DatabricksClusterStates from tests.utilsz import secrets as sct +db_cloud_node_params = pytest.mark.parametrize( + "creds, node_type", + [ + ("aws_creds", "aws_cpu_node"), + # ("azure_creds", "azure_cpu_node"), + # ("azure_creds", "azure_gpu_node_type"), + ], + indirect=["creds", "node_type"], +) +db_cloud_params = pytest.mark.parametrize( + "creds", + [ + "aws_creds", + # "azure_creds", + ], + indirect=["creds"], +) + @pytest.fixture() def azure_gpu_node_type(): @@ -88,16 +110,40 @@ def assert_job_suc(state): def run_endpoint_tests(test_cluster_id, host, token, model): # generate job.py script and submit it - assert_job_suc( - nlp.run_in_databricks( - generate_endpoint_test(model, sct.container_lic_json), - databricks_cluster_id=test_cluster_id, - databricks_host=host, - databricks_token=token, - run_name=f"endpoint_creation_test_run_{model}", - ) + job_id, job_url = nlp.run_in_databricks( + generate_endpoint_test(model, sct.container_lic_json), + databricks_cluster_id=test_cluster_id, + databricks_host=host, + databricks_token=token, + run_name=f"endpoint_creation_test_run_{model}", + return_job_url=True, + ) + try: + assert_job_suc(job_id) + except Exception as e: + return job_url, False + + return job_url, True + + +def run_load_predict_tests(test_cluster_id, host, token, model): + # generate job.py script and submit it + job_id, job_url = nlp.run_in_databricks( + generate_load_predict_test(model), + databricks_cluster_id=test_cluster_id, + databricks_host=host, + databricks_token=token, + run_name=f"load_predict_test_{model}", + return_job_url=True, ) + try: + assert_job_suc(job_id) + except Exception as e: + return job_url, False + + return job_url, True + def run_cluster_test_suite(test_cluster_id, host, token): # run test suite again a existing cluster @@ -140,7 +186,16 @@ def run_cluster_test_suite(test_cluster_id, host, token): def get_one_model_per_class(): # todo actually generate from spellbook - return ["sentiment", "ner", "spell", "bert", "elmo", "albert", "roberta"] + return [ + # "tokenize", + "sentiment", + "ner", + ] + # "bert", + # "elmo", + # "albert", + # "roberta", + # ] def delete_all_test_clusters(db_client): @@ -152,6 +207,7 @@ def delete_all_test_clusters(db_client): time.sleep(5) +# TODO add UC cluster!? def get_or_create_test_cluster( creds, node_type, n=0, runtime=None, clean_workspace=False ): @@ -160,6 +216,7 @@ def get_or_create_test_cluster( If it exists, it checks if it's running and if not, it starts it. "exists" means another cluster with same name and runtime exists. 
""" + # m5d.2xlarge # lic, host, token = creds # runtime = "9.1.x-scala2.12" if runtime is None: @@ -195,7 +252,7 @@ def get_or_create_test_cluster( else: # no cluster exists, create new one - cluster_id = nlp.install( + cluster_id = nlp.install_to_databricks( spark_version=runtime, json_license_path=lic, databricks_host=host, @@ -209,35 +266,185 @@ def get_or_create_test_cluster( return cluster_id -def subtester_thread(cluster_id, job_que, host, token, results): +test_funcs = { + "endpoint": run_endpoint_tests, + "load_predict": run_load_predict_tests, + # "cluster_test_suite": run_cluster_test_suite, +} + + +def execute_test(test_type, cluster_id, host, token, model): + if test_type in test_funcs: + return test_funcs[test_type]( + cluster_id, host, token, model, raise_on_fail=False + ) + else: + raise ValueError(f"Unknown test type {test_type}") + + +def subtester_thread(cluster_id, job_que, host, token, results, test_type): while not job_que.empty(): try: model = job_que.get_nowait() + result = {"model": model} except queue.Empty: + print("Que Empty Thread done!") return - try: - run_endpoint_tests(cluster_id, host, token, model) - print(f"{model} success!") - results[model] = "SUCCESS" - except Exception as e: - print(f"{model} failed!") - results[model] = f"FAILED: {str(e)}" - - -db_cloud_node_params = pytest.mark.parametrize( - "creds, node_type", - [ - ("aws_creds", "aws_cpu_node"), - ("azure_creds", "azure_cpu_node"), - # ("azure_creds", "azure_gpu_node_type"), - ], - indirect=["creds", "node_type"], -) -db_cloud_params = pytest.mark.parametrize( - "creds", - [ - "aws_creds", - "azure_creds", - ], - indirect=["creds"], -) + print(f"Test {model} with {test_type}") + result["job_url"], result["success"] = test_funcs[test_type]( + cluster_id, host, token, model + ) + results[model] = result + print(f"✅ {model} TEST RESULTS {results[model]} ") + + +mm_models = [ + "ner_chemd_clinical_pipeline", + "summarizer_clinical_questions_pipeline", + "summarizer_clinical_guidelines_large_pipeline", + "summarizer_clinical_jsl_augmented_pipeline", + "summarizer_clinical_laymen_onnx", + "summarizer_clinical_laymen", + "ner_profiling_biobert", + " ner_jsl_pipeline assertion_jsl_augmented", + "ner_jsl_langtest_pipeline", + "ner_living_species_pipeline", + "ner_jsl_pipeline", + "sbiobertresolve_cpt_augmented", + "sbiobertresolve_HPO", + "sbiobertresolve_icd10cm_augmented_billable_hcc", + "sbiobertresolve_loinc_augmented", + "sbiobertresolve_rxnorm_augmented", + "medication_resolver_transform_pipeline", + "sbiobertresolve_snomed_findings", + "umls_disease_syndrome_resolver_pipeline", + "umls_drug_resolver_pipeline", + "umls_drug_substance_resolver_pipeline", + "umls_major_concepts_resolver_pipeline", + "umls_clinical_findings_resolver_pipeline", + "bert_sequence_classifier_gender_biobert", + "ner_cellular_pipeline", + "summarizer_generic_jsl_pipeline", + "biogpt_chat_jsl", + "medical_qa_biogpt", + "re_test_result_date_pipeline", + "re_temporal_events_clinical_pipeline", + "ner_pathogen_pipeline", + "ner_supplement_clinical_pipeline", + "clinical_notes_qa_base_onnx", + "clinical_notes_qa_large_onnx", + "ner_nihss redl_nihss_biobert", + "multiclassifierdl_hoc", + "oncology_general_pipeline", + "ner_profiling_oncology", + "ner_oncology_response_to_treatment", + "ner_oncology_therapy", + "ner_oncology_test", + "ner_oncology_tnm", + "ner_oncology_diagnosis", + "ner_oncology_anatomy_granular", + "ner_oncology_posology_langtest_pipeline", + "ner_radiology_wip_clinical assertion_dl_radiology", + 
"ner_chexpert_pipeline", + "summarizer_radiology_pipeline", + "ner_risk_factors_biobert_pipeline", + "ner_sdoh_langtest_pipeline assertion_sdoh_wip", + "ner_profiling_sdoh assertion_sdoh_wip", + "ner_sdoh_access_to_healthcare", + "ner_sdoh_community_condition", + "ner_sdoh_demographics", + "ner_sdoh_health_behaviours_problems", + "ner_sdoh_income_social_status", + "ner_sdoh_social_environment", + "ner_sdoh_substance_usage", + "genericclassifier_sdoh_economics_binary_sbiobert_cased_mli", + "genericclassifier_sdoh_alcohol_usage_sbiobert_cased_mli", + "ner_nihss_pipeline", + "ner_vop assertion_vop_clinical_large", + "bert_sequence_classifier_vop_hcp_consult", + "bert_sequence_classifier_vop_self_report", + "bert_sequence_classifier_vop_sound_medical", + "bert_sequence_classifier_vop_side_effect", + "bert_sequence_classifier_vop_drug_side_effect", + "ner_profiling_vop", + "zero_shot_ner_roberta", + "re_zeroshot_biobert", + "distilbert_base_sequence_classifier_imdb", + "bert_sequence_classifier_emotion", + "bert_sequence_classifier_age_news", + "bert_sequence_classifier_trec_coarse", + "distilbert_base_sequence_classifier_food", + "bert_sequence_classifier_multilingual_sentiment", + "distilbert_multilingual_sequence_classifier_allocine", + "distilbert_base_sequence_classifier_toxicity", + "bert_sequence_classifier_toxicity", + "deberta_v3_xsmall_token_classifier_ontonotes", + "deberta_v3_small_token_classifier_ontonotes", + "deberta_v3_base_token_classifier_ontonotes", + "deberta_v3_large_token_classifier_ontonotes", + "bert_base_token_classifier_ontonote", + "bert_large_token_classifier_ontonote", + "bert_token_classifier_aner", + "bert_token_classifier_named_entity_recognition_nerkor_hu_hungarian", + "xlmroberta_ner_uk_ner", + "sentence_detector_dl", + "bert_token_classifier_base_chinese_ner", + "bert_token_classifier_base_turkish_cased_ner", + "roberta_qa_deepset_base_squad2", + "bert_qa_arap", + "camembert_base_qa_fquad", + "distilbert_qa_base_cased_squadv2", + "xlm_roberta_qa_xlm_roberta_qa_chaii", + "bart_large_zero_shot_classifier_mnli", + "roberta_base_zero_shot_classifier_nli", + "bert_base_cased_zero_shot_classifier_xnli", + "distilbert_base_zero_shot_classifier_uncased_mnli", + "distilbert_base_zero_shot_classifier_turkish_cased_snli", + "xlm_roberta_large_zero_shot_classifier_xnli_anli", + "opus_mt_en_zh", + "opus_mt_en_es", + "opus_mt_en_fr", + "opus_mt_en_it", + "opus_mt_es_en", + "opus_mt_fr_en", + "opus_mt_it_en", + "distilbart_xsum_12_6", + "distilbart_xsum_6_6", + "distilbart_cnn_6_6", + "t5_small", + "e5_small_v2", + "e5_base_v2", + "e5_large_v2", + "e5_small_v2_quantized", + "e5_base_v2_quantized", + "e5_large_v2_quantized", + "instructor_base", + "instructor_large", + "multi_qa_mpnet_base_cos_v1", + "all_mpnet_base_v2", +] + + +def get_mm_models(): + # TODO use actual langs and make sure all models are resolved + nlu_refs = [] + miss = [] + for m in mm_models: + lang = "en" + models = m.split(" ") + nlu_ref = "" + for model in models: + partial_ref = NLP_ref_to_NLU_ref(model, lang) + if partial_ref: + nlu_ref = nlu_ref + " " + partial_ref + else: + print(f"{m} missing!") + miss.append(m) + continue + if nlu_ref: + nlu_refs.append(nlu_ref) + + print(f"Num Missing : {len(miss)} Num Found : {len(nlu_refs)}") + print("Missing:", miss) + print("Found:", nlu_refs) + return nlu_refs diff --git a/tests/databricks/endpoint_tests.py b/tests/databricks/endpoint_tests.py index f86a84ebc3..c9fc3d65d3 100644 --- a/tests/databricks/endpoint_tests.py +++ 
b/tests/databricks/endpoint_tests.py @@ -1,52 +1,112 @@ +import json from multiprocessing import Queue from threading import Thread +from johnsnowlabs.auto_install.jsl_home import get_install_suite_from_jsl_home +from johnsnowlabs.utils.enums import JvmHardwareTarget from tests.databricks.db_test_utils import * from tests.databricks.db_test_utils import ( run_endpoint_tests, - get_one_model_per_class, get_or_create_test_cluster, subtester_thread, ) -@pytest.mark.skip(reason="WIP") +def log_and_get_failed_models(results): + retry_models = [] + for model, result in results.items(): + print(f"Model {model}: {result}") + if result["success"] is False: + retry_models.append(model) + return retry_models + + +def parallel_run( + cluster_ids, + n_parallel_jobs_per_cluster, + models_to_test, + host, + token, + results, + test_type, +): + # 3) For each cluster, start a tester-thread. + # Start an extra thread for same cluster, for every parallel job run on cluster + # Threads take jobs from the queue and run them on the cluster till completion. + job_que = Queue() + for model in models_to_test: + job_que.put(model) + threads = [] + for cluster_id in cluster_ids: + for i in range(n_parallel_jobs_per_cluster): + # Start 1 thread for every job that should run, for every cluster + t = Thread( + target=subtester_thread, + args=( + cluster_id, + job_que, + host, + token, + results, + test_type, + ), + ) + threads.append(t) + t.start() + # Wait for all threads to finish + for t in threads: + t.join() + + +# @pytest.mark.skip(reason="WIP") @db_cloud_node_params def test_endpoints_multi_cluster(creds, node_type): n_clusters = 1 - # n_parallel_jobs_per_cluster = 4 # todo add + n_parallel_jobs_per_cluster = 2 + runtime = "9.1.x-scala2.12" + lic, host, token = creds + # 1) Create clusters cluster_ids = [ - get_or_create_test_cluster(creds, node_type, i) for i in range(n_clusters) + get_or_create_test_cluster(creds, node_type, i, runtime=runtime) + for i in range(n_clusters) ] - # 2) Define job-queue - job_que = Queue() - one_model_per_class = get_one_model_per_class() - for model in one_model_per_class: - job_que.put(model) - - # Create a semaphore to limit parallelism per cluster + # 2) Define models to test + models_to_test = get_mm_models() # [:3] + models_to_test = ["tokenize"] + # one_model_per_class = get_one_model_per_class() - # 3) For each cluster, start a tester-thread. - # Threads take jobs from the queue and run them on the cluster till completion. 
- lic, host, token = aws_creds - threads = [] + # 3) Start parallel-job-cluster test results = {} - for cluster_id in cluster_ids: - t = Thread( - target=subtester_thread, args=(cluster_id, job_que, host, token, results) - ) - threads.append(t) - t.start() + # test_type = "load_predict" # 'endpoint' + test_type = "endpoint" # '' + parallel_run( + cluster_ids=cluster_ids, + n_parallel_jobs_per_cluster=n_parallel_jobs_per_cluster, + models_to_test=models_to_test, + host=host, + token=token, + results=results, + test_type=test_type, + ) - # Wait for all threads to finish - for t in threads: - t.join() + retry_models = log_and_get_failed_models(results) + print(f"Retrying {len(retry_models)} models") + # Give clusters some time to recover from any failures + time.sleep(60 * 5) - # 4) Print results - for model, result in results.items(): - print(f"Model {model}: {result}") + # run failed models again, with job-parallelism 1 but same cluster-parallelism + parallel_run( + cluster_ids=cluster_ids, + n_parallel_jobs_per_cluster=1, + models_to_test=retry_models, + host=host, + token=token, + results=results, + test_type=test_type, + ) + json.dump(results, open("results.json", "w")) # 5) Delete all clusters # for cluster_id in cluster_ids: @@ -56,5 +116,31 @@ def test_endpoints_multi_cluster(creds, node_type): @db_cloud_node_params def test_endpoint(creds, node_type): lic, host, token = creds - cluster_id = get_or_create_test_cluster(creds, node_type, 0) - run_endpoint_tests(cluster_id, host, token, "tokenize") + cluster_id = get_or_create_test_cluster(creds, node_type, 10, clean_workspace=False) + job_url, success = run_endpoint_tests(cluster_id, host, token, "tokenize") + assert success + + +@db_cloud_node_params +def test_endpoint_licensed(creds, node_type): + lic, host, token = creds + cluster_id = get_or_create_test_cluster(creds, node_type, 0, clean_workspace=True) + # run_endpoint_tests(cluster_id, host, token, "med_ner.clinical") + # run_endpoint_tests(cluster_id, host, token, "en.med_ner.clinical") + job_url, success = run_endpoint_tests(cluster_id, host, token, "tokenize") + assert success + + +""" +We want to be able to test on : +- List of models +- List of runtimes +- auto-generate benchmarks --> time for load/predicting + +# TODO +- handle spot instance reclamation https://dbc-3d4c44aa-a512.cloud.databricks.com/?o=4085846932608579#job/407841450144996/run/583312836892114 -162028-g0yy9b85 +- handle endpoint stuck/ use timeout 60mins + + +""" +# https://github.com/qdrant/qdrant-haystack/tree/master diff --git a/tests/databricks/submit_tests.py b/tests/databricks/submit_tests.py index d3b185f762..e56ebba245 100644 --- a/tests/databricks/submit_tests.py +++ b/tests/databricks/submit_tests.py @@ -105,8 +105,9 @@ def test_submit_notebook_task_to_databricks(creds, node_type): @db_cloud_node_params def test_submit_notebook_parameterized_task_to_databricks(creds, node_type): - cluster_id = get_or_create_test_cluster(creds, node_type, 0) + # cluster_id = get_or_create_test_cluster(creds, node_type, 0) lic, host, token = creds + cluster_id = "1005-134624-a6p9ji9u" nb_path = "parameterized_nb_example.ipynb" dst_path = "/Users/christian@johnsnowlabs.com/test.ipynb" @@ -125,7 +126,7 @@ def test_submit_notebook_parameterized_task_to_databricks(creds, node_type): @db_cloud_node_params def test_submit_script_parameterized_task_to_databricks(creds, node_type): - cluster_id = get_or_create_test_cluster(creds, node_type, 0) + # cluster_id = get_or_create_test_cluster(creds, node_type, 0) script = """ import 
sys @@ -133,6 +134,7 @@ def test_submit_script_parameterized_task_to_databricks(creds, node_type): """ lic, host, token = creds + cluster_id = "1005-134624-a6p9ji9u" arg1 = "My first arg" arg2 = "My second arg" diff --git a/tests/llm_frameworks/test_haystack.py b/tests/llm_frameworks/test_haystack.py new file mode 100644 index 0000000000..fd41ad1caf --- /dev/null +++ b/tests/llm_frameworks/test_haystack.py @@ -0,0 +1,62 @@ +from haystack import Document +from mlflow.models import ModelSignature, infer_signature +import os +from haystack.pipelines import RayPipeline + +# https://docs.haystack.deepset.ai/docs/pipelines#distributed-pipelines-with-ray +from haystack.nodes import EmbeddingRetriever + +from johnsnowlabs.frameworks.embedding_retrieval.haystack_node import ( + JohnSnowLabsHaystackProcessor, +) + +# os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" +from johnsnowlabs.llm import embedding_retrieval + + +def get_docs(): + return [ + Document( + content="I like apples", + content_type="text", + id=1, + ), + Document( + content="I like bananas \n and other things \n like icream \n and cats", + content_type="text", + id=2, + ), + ] + + +def test_integration(): + from haystack.nodes import PreProcessor + from haystack import Pipeline + from haystack.document_stores import InMemoryDocumentStore + + # processor = PreProcessor( + # clean_whitespace=True, + # split_by="word", + # ) + + processor = JohnSnowLabsHaystackProcessor() + + # Write some processed data to Doc store, so we can retrieve it later + document_store = InMemoryDocumentStore(embedding_dim=512) + document_store.write_documents(processor.process(get_docs())) + + # could just use EmbeddingRetriever but mehhh + retriever = embedding_retrieval.JohnSnowLabsHaystackEmbedder( + embedding_model="en.embed_sentence.bert_base_uncased", + # model_format="johnsnowlabs", + document_store=document_store, + use_gpu=False, + ) + document_store.update_embeddings(retriever) + + pipe = Pipeline() + # pipe.add_node(component=processor, name="Preprocess", inputs=["Query"]) + pipe.add_node(component=processor, name="Preprocess", inputs=["Query"]) + pipe.add_node(component=retriever, name="Embed&Retrieve", inputs=["Query"]) + result = pipe.run(documents=get_docs(), query="lol") + print(result) diff --git a/tests/llm_frameworks/test_langchain.py b/tests/llm_frameworks/test_langchain.py new file mode 100644 index 0000000000..8fcf0cdb19 --- /dev/null +++ b/tests/llm_frameworks/test_langchain.py @@ -0,0 +1,45 @@ +import pytest + +from johnsnowlabs.frameworks.embedding_retrieval import JohnSnowLabsCharSplitter + + +# https://colab.research.google.com/drive/1J7JpxIkYcOlm01otttJLD6iFLge43dZ9?usp=sharing + + +def test_integration(): + from langchain.document_loaders import TextLoader + from langchain.text_splitter import CharacterTextSplitter + from langchain.vectorstores import FAISS + from langchain.embeddings import OpenAIEmbeddings + from langchain.agents.agent_toolkits import create_retriever_tool + + from johnsnowlabs.llm import embedding_retrieval + + p = "/home/ckl/Documents/freelance/jsl/johnsnowlabs-4-real/tests/datasets/state_of_the_union.txt" + loader = TextLoader(p) + documents = loader.load() + text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) + texts = text_splitter.split_documents(documents) + embeddings = embedding_retrieval.JohnSnowLabsLangChainEmbedder( + "en.embed_sentence.bert_base_uncased" + ) # OpenAIEmbeddings() + db = FAISS.from_documents(texts, embeddings) + retriever = db.as_retriever() + tool = 
create_retriever_tool( + retriever, + "search_state_of_union", + "Searches and returns documents regarding the state-of-the-union.", + ) + tools = [tool] + + from langchain.agents.agent_toolkits import create_conversational_retrieval_agent + from langchain.chat_models import ChatOpenAI + + open_api_key = "YOUR_OPENAI_API_KEY" # placeholder, never commit a real key; see the sketch below + llm = ChatOpenAI(temperature=0, openai_api_key=open_api_key) + agent_executor = create_conversational_retrieval_agent(llm, tools, verbose=True) + + result = agent_executor( + {"input": "what did the president say about going to east of Columbus?"} + ) + result["output"]
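A follow-up on the OpenAI key above, which is shown here as a placeholder: a common pattern is to read the key from the environment and skip the test when it is absent. The helper below is a hedged sketch and not part of this change; the `OPENAI_API_KEY` variable name and the `_get_openai_key` helper are assumptions for illustration only.

```python
import os

import pytest


def _get_openai_key() -> str:
    # Hypothetical helper: pull the key from the environment instead of the source tree.
    key = os.environ.get("OPENAI_API_KEY")
    if not key:
        # Skip rather than fail when no key is configured (e.g. on CI runners without secrets).
        pytest.skip("OPENAI_API_KEY not set; skipping agent integration test")
    return key
```

With such a helper, the `open_api_key = ...` assignment in the test would become `open_api_key = _get_openai_key()`, so the integration test still runs locally for developers who export a key while never placing a secret in the repository.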