Skip to content

Commit

Permalink
[SPARKNLP-1080] AutoGGUFEmbeddings change default pretrained model
Browse files Browse the repository at this point in the history
  • Loading branch information
DevinTDHa committed Nov 2, 2024
1 parent 13c06a8 commit 7fd370f
Show file tree
Hide file tree
Showing 7 changed files with 25 additions and 27 deletions.
2 changes: 1 addition & 1 deletion docs/en/annotator_entries/AutoGGUFEmbeddings.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ val autoGGUFEmbeddings = AutoGGUFEmbeddings.pretrained()
.setOutputCol("embeddings")
```

The default model is `"nomic-embed-text-v1.5.f16.gguf"`, if no name is provided.
If no name is provided, the default model is `"nomic-embed-text-v1.5.Q8_0.gguf"`.

For available pretrained models please see the [Models Hub](https://sparknlp.org/models).

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"display_name": "sparknlp_dev",
"language": "python",
"name": "python3"
},
Expand All @@ -264,7 +264,8 @@
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3"
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,27 +36,27 @@
"name": "stdout",
"output_type": "stream",
"text": [
"--2024-11-02 13:42:45-- https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF/resolve/main/nomic-embed-text-v1.5.f16.gguf?download=true\n",
"--2024-11-02 13:42:45-- https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF/resolve/main/nomic-embed-text-v1.5.Q8_0.gguf?download=true\n",
"Resolving huggingface.co (huggingface.co)... 3.160.39.87, 3.160.39.100, 3.160.39.99, ...\n",
"Connecting to huggingface.co (huggingface.co)|3.160.39.87|:443... connected.\n",
"HTTP request sent, awaiting response... 302 Found\n",
"Location: https://cdn-lfs-us-1.hf.co/repos/19/39/19396cd98fe8b02e39b1be815db29f6b251fee34fc5d6550db0b478083fdda2f/f7af6f66802f4df86eda10fe9bbcfc75c39562bed48ef6ace719a251cf1c2fdb?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27nomic-embed-text-v1.5.f16.gguf%3B+filename%3D%22nomic-embed-text-v1.5.f16.gguf%22%3B&Expires=1730810566&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczMDgxMDU2Nn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzE5LzM5LzE5Mzk2Y2Q5OGZlOGIwMmUzOWIxYmU4MTVkYjI5ZjZiMjUxZmVlMzRmYzVkNjU1MGRiMGI0NzgwODNmZGRhMmYvZjdhZjZmNjY4MDJmNGRmODZlZGExMGZlOWJiY2ZjNzVjMzk1NjJiZWQ0OGVmNmFjZTcxOWEyNTFjZjFjMmZkYj9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=R7WUe1icdziIE4kS%7EHMcsyiySLOHJpkJ9lM2nC6EtIwPj6V12RRpjpDIyyv0%7EY9h32v2HDomyyNO6-Ry5UeIm3UjyESR9In3kmTKAqZX2zzlslTQMXDQghmLpIEQGgmh5-5RHrFgYPNxKmVICyQL1Vz9IgFQRMfdug6RBTGgmmXfLgksa9IU7TdvZcqvOb68HCdmv1hEt2U5vH4A9MF81ohMBqrvTb9389jzrlP1tZtNFb5wjNdZDmr57XIsvQRZB0ZDUIsMT1nc5QehNpWpX4jMLBSnkj1-oL9XN7%7EhAXDbB1mTH9kbrD3UUNKRm4%7ER-gVhegqsfirdSFi66sP3bg__&Key-Pair-Id=K24J24Z295AEI9 [following]\n",
"--2024-11-02 13:42:46-- https://cdn-lfs-us-1.hf.co/repos/19/39/19396cd98fe8b02e39b1be815db29f6b251fee34fc5d6550db0b478083fdda2f/f7af6f66802f4df86eda10fe9bbcfc75c39562bed48ef6ace719a251cf1c2fdb?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27nomic-embed-text-v1.5.f16.gguf%3B+filename%3D%22nomic-embed-text-v1.5.f16.gguf%22%3B&Expires=1730810566&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczMDgxMDU2Nn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzE5LzM5LzE5Mzk2Y2Q5OGZlOGIwMmUzOWIxYmU4MTVkYjI5ZjZiMjUxZmVlMzRmYzVkNjU1MGRiMGI0NzgwODNmZGRhMmYvZjdhZjZmNjY4MDJmNGRmODZlZGExMGZlOWJiY2ZjNzVjMzk1NjJiZWQ0OGVmNmFjZTcxOWEyNTFjZjFjMmZkYj9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=R7WUe1icdziIE4kS%7EHMcsyiySLOHJpkJ9lM2nC6EtIwPj6V12RRpjpDIyyv0%7EY9h32v2HDomyyNO6-Ry5UeIm3UjyESR9In3kmTKAqZX2zzlslTQMXDQghmLpIEQGgmh5-5RHrFgYPNxKmVICyQL1Vz9IgFQRMfdug6RBTGgmmXfLgksa9IU7TdvZcqvOb68HCdmv1hEt2U5vH4A9MF81ohMBqrvTb9389jzrlP1tZtNFb5wjNdZDmr57XIsvQRZB0ZDUIsMT1nc5QehNpWpX4jMLBSnkj1-oL9XN7%7EhAXDbB1mTH9kbrD3UUNKRm4%7ER-gVhegqsfirdSFi66sP3bg__&Key-Pair-Id=K24J24Z295AEI9\n",
"Location: https://cdn-lfs-us-1.hf.co/repos/19/39/19396cd98fe8b02e39b1be815db29f6b251fee34fc5d6550db0b478083fdda2f/f7af6f66802f4df86eda10fe9bbcfc75c39562bed48ef6ace719a251cf1c2fdb?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27nomic-embed-text-v1.5.Q8_0.gguf%3B+filename%3D%22nomic-embed-text-v1.5.Q8_0.gguf%22%3B&Expires=1730810566&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczMDgxMDU2Nn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzE5LzM5LzE5Mzk2Y2Q5OGZlOGIwMmUzOWIxYmU4MTVkYjI5ZjZiMjUxZmVlMzRmYzVkNjU1MGRiMGI0NzgwODNmZGRhMmYvZjdhZjZmNjY4MDJmNGRmODZlZGExMGZlOWJiY2ZjNzVjMzk1NjJiZWQ0OGVmNmFjZTcxOWEyNTFjZjFjMmZkYj9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=R7WUe1icdziIE4kS%7EHMcsyiySLOHJpkJ9lM2nC6EtIwPj6V12RRpjpDIyyv0%7EY9h32v2HDomyyNO6-Ry5UeIm3UjyESR9In3kmTKAqZX2zzlslTQMXDQghmLpIEQGgmh5-5RHrFgYPNxKmVICyQL1Vz9IgFQRMfdug6RBTGgmmXfLgksa9IU7TdvZcqvOb68HCdmv1hEt2U5vH4A9MF81ohMBqrvTb9389jzrlP1tZtNFb5wjNdZDmr57XIsvQRZB0ZDUIsMT1nc5QehNpWpX4jMLBSnkj1-oL9XN7%7EhAXDbB1mTH9kbrD3UUNKRm4%7ER-gVhegqsfirdSFi66sP3bg__&Key-Pair-Id=K24J24Z295AEI9 [following]\n",
"--2024-11-02 13:42:46-- https://cdn-lfs-us-1.hf.co/repos/19/39/19396cd98fe8b02e39b1be815db29f6b251fee34fc5d6550db0b478083fdda2f/f7af6f66802f4df86eda10fe9bbcfc75c39562bed48ef6ace719a251cf1c2fdb?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27nomic-embed-text-v1.5.Q8_0.gguf%3B+filename%3D%22nomic-embed-text-v1.5.Q8_0.gguf%22%3B&Expires=1730810566&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczMDgxMDU2Nn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzE5LzM5LzE5Mzk2Y2Q5OGZlOGIwMmUzOWIxYmU4MTVkYjI5ZjZiMjUxZmVlMzRmYzVkNjU1MGRiMGI0NzgwODNmZGRhMmYvZjdhZjZmNjY4MDJmNGRmODZlZGExMGZlOWJiY2ZjNzVjMzk1NjJiZWQ0OGVmNmFjZTcxOWEyNTFjZjFjMmZkYj9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=R7WUe1icdziIE4kS%7EHMcsyiySLOHJpkJ9lM2nC6EtIwPj6V12RRpjpDIyyv0%7EY9h32v2HDomyyNO6-Ry5UeIm3UjyESR9In3kmTKAqZX2zzlslTQMXDQghmLpIEQGgmh5-5RHrFgYPNxKmVICyQL1Vz9IgFQRMfdug6RBTGgmmXfLgksa9IU7TdvZcqvOb68HCdmv1hEt2U5vH4A9MF81ohMBqrvTb9389jzrlP1tZtNFb5wjNdZDmr57XIsvQRZB0ZDUIsMT1nc5QehNpWpX4jMLBSnkj1-oL9XN7%7EhAXDbB1mTH9kbrD3UUNKRm4%7ER-gVhegqsfirdSFi66sP3bg__&Key-Pair-Id=K24J24Z295AEI9\n",
"Resolving cdn-lfs-us-1.hf.co (cdn-lfs-us-1.hf.co)... 18.66.2.2, 18.66.2.116, 18.66.2.98, ...\n",
"Connecting to cdn-lfs-us-1.hf.co (cdn-lfs-us-1.hf.co)|18.66.2.2|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 274290560 (262M) [application/octet-stream]\n",
"Saving to: ‘nomic-embed-text-v1.5.f16.gguf’\n",
"Saving to: ‘nomic-embed-text-v1.5.Q8_0.gguf’\n",
"\n",
"nomic-embed-text-v1 100%[===================>] 261.58M 23.8MB/s in 10s \n",
"\n",
"2024-11-02 13:42:56 (24.9 MB/s) - ‘nomic-embed-text-v1.5.f16.gguf’ saved [274290560/274290560]\n",
"2024-11-02 13:42:56 (24.9 MB/s) - ‘nomic-embed-text-v1.5.Q8_0.gguf’ saved [274290560/274290560]\n",
"\n"
]
}
],
"source": [
"EXPORT_PATH = \"nomic-embed-text-v1.5.f16.gguf\"\n",
"EXPORT_PATH = \"nomic-embed-text-v1.5.Q8_0.gguf\"\n",
"! wget \"https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF/resolve/main/{EXPORT_PATH}?download=true\" -O {EXPORT_PATH}"
]
},
Expand Down Expand Up @@ -158,7 +158,7 @@
}
],
"source": [
"autoGGUFEmbeddings.write().overwrite().save(f\"nomic-embed-text-v1.5.f16.gguf_spark_nlp\")"
"autoGGUFEmbeddings.write().overwrite().save(f\"nomic-embed-text-v1.5.Q8_0.gguf_spark_nlp\")"
]
},
{
Expand Down Expand Up @@ -197,12 +197,12 @@
"text": [
"total 267872\n",
"drwxr-xr-x 2 root root 4096 Nov 2 13:48 metadata\n",
"-rwxrwxr-x 1 root root 274290560 Nov 2 13:48 nomic-embed-text-v1.5.f16.gguf\n"
"-rwxrwxr-x 1 root root 274290560 Nov 2 13:48 nomic-embed-text-v1.5.Q8_0.gguf\n"
]
}
],
"source": [
"! ls -l nomic-embed-text-v1.5.f16.gguf_spark_nlp/"
"! ls -l nomic-embed-text-v1.5.Q8_0.gguf_spark_nlp/"
]
},
{
Expand All @@ -221,11 +221,11 @@
"name": "stderr",
"output_type": "stream",
"text": [
"24/11/02 13:48:57 WARN SparkContext: The path /home/root/Workspace/scala/spark-nlp/examples/python/llama.cpp/nomic-embed-text-v1.5.f16.gguf_spark_nlp/nomic-embed-text-v1.5.f16.gguf has been added already. Overwriting of added paths is not supported in the current version.\n",
"24/11/02 13:48:57 WARN SparkContext: The path /home/root/Workspace/scala/spark-nlp/examples/python/llama.cpp/nomic-embed-text-v1.5.Q8_0.gguf_spark_nlp/nomic-embed-text-v1.5.Q8_0.gguf has been added already. Overwriting of added paths is not supported in the current version.\n",
"24/11/02 13:48:57 WARN DAGScheduler: Broadcasting large task binary with size 1028.0 KiB\n",
"24/11/02 13:48:57 WARN DAGScheduler: Broadcasting large task binary with size 1028.0 KiB\n",
"24/11/02 13:48:57 WARN DAGScheduler: Broadcasting large task binary with size 1028.0 KiB\n",
"llama_model_loader: loaded meta data with 22 key-value pairs and 112 tensors from /tmp/spark-6de50aee-1059-4698-98e2-db9d68663467/userFiles-932de0e7-9a8f-41f5-9aaf-94bb7406df74/nomic-embed-text-v1.5.f16.gguf (version GGUF V3 (latest))\n",
"llama_model_loader: loaded meta data with 22 key-value pairs and 112 tensors from /tmp/spark-6de50aee-1059-4698-98e2-db9d68663467/userFiles-932de0e7-9a8f-41f5-9aaf-94bb7406df74/nomic-embed-text-v1.5.Q8_0.gguf (version GGUF V3 (latest))\n",
"llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n",
"llama_model_loader: - kv 0: general.architecture str = nomic-bert\n",
"llama_model_loader: - kv 1: general.name str = nomic-embed-text-v1.5\n",
Expand Down Expand Up @@ -390,7 +390,7 @@
"\n",
"document_assembler = DocumentAssembler().setInputCol(\"text\").setOutputCol(\"document\")\n",
"\n",
"autoGGUFEmbeddings = AutoGGUFEmbeddings.load(\"nomic-embed-text-v1.5.f16.gguf_spark_nlp\")\n",
"autoGGUFEmbeddings = AutoGGUFEmbeddings.load(\"nomic-embed-text-v1.5.Q8_0.gguf_spark_nlp\")\n",
"\n",
"pipeline = Pipeline().setStages([document_assembler, autoGGUFEmbeddings])\n",
"\n",
Expand Down
6 changes: 3 additions & 3 deletions python/sparknlp/annotator/embeddings/auto_gguf_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ class AutoGGUFEmbeddings(AnnotatorModel, HasBatchedAnnotate):
... .setInputCols(["document"]) \\
... .setOutputCol("embeddings")
The default model is ``"nomic-embed-text-v1.5.f16.gguf"``, if no name is provided.
If no name is provided, the default model is ``"nomic-embed-text-v1.5.Q8_0.gguf"``.
For extended examples of usage, see the
`AutoGGUFEmbeddingsTest <https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/test/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddingsTest.scala>`__
Expand Down Expand Up @@ -513,13 +513,13 @@ def loadSavedModel(folder, spark_session):
return AutoGGUFEmbeddings(java_model=jModel)

@staticmethod
def pretrained(name="nomic-embed-text-v1.5.f16.gguf", lang="en", remote_loc=None):
def pretrained(name="nomic-embed-text-v1.5.Q8_0.gguf", lang="en", remote_loc=None):
"""Downloads and loads a pretrained model.
Parameters
----------
name : str, optional
Name of the pretrained model, by default "nomic-embed-text-v1.5.f16.gguf"
Name of the pretrained model, by default "nomic-embed-text-v1.5.Q8_0.gguf"
lang : str, optional
Language of the pretrained model, by default "en"
remote_loc : str, optional
Expand Down
7 changes: 2 additions & 5 deletions python/test/annotator/embeddings/auto_gguf_embeddings_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,7 @@ def setUp(self):

def runTest(self):
model = (
# AutoGGUFEmbeddings.pretrained()
AutoGGUFEmbeddings.loadSavedModel(
"models/nomic-embed-text-v1.5.f16.gguf", SparkContextForTest.spark
)
AutoGGUFEmbeddings.pretrained()
.setInputCols("document")
.setOutputCol("embeddings")
.setBatchSize(4)
Expand Down Expand Up @@ -88,7 +85,7 @@ def runTest(self):
model = (
# AutoGGUFEmbeddings.pretrained()
AutoGGUFEmbeddings.loadSavedModel(
"models/nomic-embed-text-v1.5.f16.gguf", SparkContextForTest.spark
"models/nomic-embed-text-v1.5.Q8_0.gguf", SparkContextForTest.spark
)
.setInputCols("document")
.setOutputCol("embeddings")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ import org.apache.spark.sql.SparkSession
* .setInputCols("document")
* .setOutputCol("embeddings")
* }}}
* The default model is `"nomic-embed-text-v1.5.f16.gguf"`, if no name is provided.
* If no name is provided, the default model is `"nomic-embed-text-v1.5.Q8_0.gguf"`.
*
* For available pretrained models please see the [[https://sparknlp.org/models Models Hub]].
*
Expand Down Expand Up @@ -191,7 +191,7 @@ class AutoGGUFEmbeddings(override val uid: String)
trait ReadablePretrainedAutoGGUFEmbeddings
extends ParamsAndFeaturesReadable[AutoGGUFEmbeddings]
with HasPretrained[AutoGGUFEmbeddings] {
override val defaultModelName: Some[String] = Some("nomic-embed-text-v1.5.f16.gguf")
override val defaultModelName: Some[String] = Some("nomic-embed-text-v1.5.Q8_0.gguf")
override val defaultLang: String = "en"

/** Java compliant-overrides */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ class AutoGGUFEmbeddingsTestSpec extends AnyFlatSpec {
"The sun is " //
).toDF("text").repartition(1)

// nomic-embed-text-v1.5.f16.gguf
// nomic-embed-text-v1.5.Q8_0.gguf
def model(poolingType: String): AutoGGUFEmbeddings = AutoGGUFEmbeddings
.loadSavedModel("models/nomic-embed-text-v1.5.f16.gguf", ResourceHelper.spark)
.pretrained()
.setInputCols("document")
.setOutputCol("embeddings")
.setBatchSize(4)
Expand Down

0 comments on commit 7fd370f

Please sign in to comment.