Respect the self._split param when computing embeddings for a text #1193

Merged Feb 27, 2024 · 3 commits
Changes from 2 commits
2 changes: 2 additions & 0 deletions lilac/concepts/db_concept.py
@@ -473,6 +473,8 @@ def _validate_examples(
     self, examples: List[Union[ExampleIn, Example]], type: ConceptType
   ) -> None:
     for example in examples:
+      if not example.text and not example.img:
+        raise ValueError('The example must have a text or image associated with it.')
       inferred_type = 'text' if example.text else 'unknown'
       if inferred_type != type:
         raise ValueError(f'Example type "{inferred_type}" does not match concept type "{type}".')
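For illustration, the effect of the new guard — a hypothetical sketch, not part of this PR; the import path and the label field are assumptions, only the text/img attributes are confirmed by the diff above:

# Hypothetical sketch of what _validate_examples now rejects.
from lilac.concepts.concept import ExampleIn

ok = ExampleIn(label=True, text='A positive example of the concept.')  # has text -> accepted
bad = ExampleIn(label=True)                                            # no text and no img
# Validating `bad` now fails fast with:
#   ValueError: The example must have a text or image associated with it.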
5 changes: 3 additions & 2 deletions lilac/embeddings/bge.py
@@ -16,7 +16,7 @@
 from ..signal import TextEmbeddingSignal
 from ..splitters.spacy_splitter import clustering_spacy_chunker
 from ..tasks import TaskExecutionType
-from .embedding import chunked_compute_embedding
+from .embedding import chunked_compute_embedding, identity_chunker
 from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE

 # See https://huggingface.co/spaces/mteb/leaderboard for leaderboard of models.
@@ -69,11 +69,12 @@ def compute(self, docs: list[str]) -> list[Optional[Item]]:
     # While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
     # The sentence transformer API actually does batching internally, so we pass
     # local_batch_size * 16 to allow the library to see all the chunks at once.
+    chunker = clustering_spacy_chunker if self._split else identity_chunker
     return chunked_compute_embedding(
       lambda docs: self._model.encode(docs)['dense_vecs'],
       docs,
       self.local_batch_size * 16,
-      chunker=clustering_spacy_chunker,
+      chunker=chunker,
     )

   @override
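identity_chunker is imported from lilac/embeddings/embedding.py alongside chunked_compute_embedding, but its body is not part of this diff. Presumably it returns the whole document as a single chunk, so disabling split skips sentence splitting entirely. A minimal sketch of such a chunker (the actual lilac implementation may differ):

# Hypothetical sketch of an identity chunker; not the code in embedding.py.
def identity_chunker(text: str) -> list[tuple[str, tuple[int, int]]]:
  # A single chunk covering the entire document: (chunk_text, (start_offset, end_offset)).
  return [(text, (0, len(text)))]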
7 changes: 3 additions & 4 deletions lilac/embeddings/cohere.py
@@ -9,7 +9,7 @@
 from ..signal import TextEmbeddingSignal
 from ..splitters.spacy_splitter import clustering_spacy_chunker
 from ..tasks import TaskExecutionType
-from .embedding import chunked_compute_embedding
+from .embedding import chunked_compute_embedding, identity_chunker

 if TYPE_CHECKING:
   from cohere import Client
@@ -65,6 +65,5 @@ def _embed_fn(docs: list[str]) -> list[np.ndarray]:
       ).embeddings
     ]

-    return chunked_compute_embedding(
-      _embed_fn, docs, self.local_batch_size, chunker=clustering_spacy_chunker
-    )
+    chunker = clustering_spacy_chunker if self._split else identity_chunker
+    return chunked_compute_embedding(_embed_fn, docs, self.local_batch_size, chunker=chunker)
8 changes: 5 additions & 3 deletions lilac/embeddings/gte.py
@@ -19,7 +19,7 @@
 from ..signal import TextEmbeddingSignal
 from ..splitters.spacy_splitter import clustering_spacy_chunker
 from ..tasks import TaskExecutionType
-from .embedding import chunked_compute_embedding
+from .embedding import chunked_compute_embedding, identity_chunker
 from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE, setup_model_device

 # See https://huggingface.co/spaces/mteb/leaderboard for leaderboard of models.
@@ -69,17 +69,19 @@ def compute(self, docs: list[str]) -> list[Optional[Item]]:
     # While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
     # The sentence transformer API actually does batching internally, so we pass
     # local_batch_size * 16 to allow the library to see all the chunks at once.
+    chunker = clustering_spacy_chunker if self._split else identity_chunker
     return chunked_compute_embedding(
-      self._model.encode, docs, self.local_batch_size * 16, chunker=clustering_spacy_chunker
+      self._model.encode, docs, self.local_batch_size * 16, chunker=chunker
     )

   @override
   def compute_garden(self, docs: Iterator[str]) -> Iterator[Item]:
     # Trim the docs to the max context size.

     trimmed_docs = (doc[:GTE_CONTEXT_SIZE] for doc in docs)
+    chunker = clustering_spacy_chunker if self._split else identity_chunker
     text_chunks: Iterator[tuple[int, TextChunk]] = (
-      (i, chunk) for i, doc in enumerate(trimmed_docs) for chunk in clustering_spacy_chunker(doc)
+      (i, chunk) for i, doc in enumerate(trimmed_docs) for chunk in chunker(doc)
     )
     text_chunks, text_chunks_2 = itertools.tee(text_chunks)
     chunk_texts = (chunk[0] for _, chunk in text_chunks)
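A side note on the compute_garden change above: the chunk generator has to be consumed twice — once for the raw chunk texts that get embedded and once for the (document index, span) bookkeeping — which is why it is duplicated with itertools.tee. A standalone illustration of that pattern, using a stand-in whole-document chunker:

# Illustrative sketch of the enumerate + tee pattern; not lilac code.
import itertools

def chunker(text: str) -> list[tuple[str, tuple[int, int]]]:
  return [(text, (0, len(text)))]  # stand-in: whole doc as one chunk

docs = ['first document.', 'second document, a bit longer.']
text_chunks = ((i, chunk) for i, doc in enumerate(docs) for chunk in chunker(doc))
text_chunks, text_chunks_2 = itertools.tee(text_chunks)
chunk_texts = (chunk[0] for _, chunk in text_chunks)  # the strings sent to the embedder

for (doc_index, (_, span)), text in zip(text_chunks_2, chunk_texts):
  print(doc_index, span, text)
# 0 (0, 15) first document.
# 1 (0, 30) second document, a bit longer.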
7 changes: 3 additions & 4 deletions lilac/embeddings/nomic_embed.py
@@ -14,7 +14,7 @@
 from ..signal import TextEmbeddingSignal
 from ..splitters.spacy_splitter import clustering_spacy_chunker
 from ..tasks import TaskExecutionType
-from .embedding import chunked_compute_embedding
+from .embedding import chunked_compute_embedding, identity_chunker
 from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE, setup_model_device

 # See https://huggingface.co/spaces/mteb/leaderboard for leaderboard of models.
@@ -76,9 +76,8 @@ def _encode(doc: list[str]) -> list[np.ndarray]:
     # While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
     # The sentence transformer API actually does batching internally, so we pass
     # local_batch_size * 16 to allow the library to see all the chunks at once.
-    return chunked_compute_embedding(
-      _encode, docs, self.local_batch_size * 16, chunker=clustering_spacy_chunker
-    )
+    chunker = clustering_spacy_chunker if self._split else identity_chunker
+    return chunked_compute_embedding(_encode, docs, self.local_batch_size * 16, chunker=chunker)

   @override
   def teardown(self) -> None:
7 changes: 3 additions & 4 deletions lilac/embeddings/openai.py
@@ -10,7 +10,7 @@
 from ..signal import TextEmbeddingSignal
 from ..splitters.spacy_splitter import clustering_spacy_chunker
 from ..tasks import TaskExecutionType
-from .embedding import chunked_compute_embedding
+from .embedding import chunked_compute_embedding, identity_chunker

 API_NUM_PARALLEL_REQUESTS = 10
 API_OPENAI_BATCH_SIZE = 128
@@ -92,6 +92,5 @@ def embed_fn(texts: list[str]) -> list[np.ndarray]:
       )
       return [np.array(embedding.embedding, dtype=np.float32) for embedding in response.data]

-    return chunked_compute_embedding(
-      embed_fn, docs, self.local_batch_size, chunker=clustering_spacy_chunker
-    )
+    chunker = clustering_spacy_chunker if self._split else identity_chunker
+    return chunked_compute_embedding(embed_fn, docs, self.local_batch_size, chunker=chunker)
5 changes: 3 additions & 2 deletions lilac/embeddings/sbert.py
@@ -12,7 +12,7 @@
 from ..schema import Item
 from ..signal import TextEmbeddingSignal
 from ..splitters.spacy_splitter import clustering_spacy_chunker
-from .embedding import chunked_compute_embedding
+from .embedding import chunked_compute_embedding, identity_chunker
 from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE, setup_model_device

 # The `all-mpnet-base-v2` model provides the best quality, while `all-MiniLM-L6-v2`` is 5 times
@@ -47,8 +47,9 @@ def compute(self, docs: list[str]) -> list[Optional[Item]]:
     # While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
     # The sentence transformer API actually does batching internally, so we pass
     # local_batch_size * 16 to allow the library to see all the chunks at once.
+    chunker = clustering_spacy_chunker if self._split else identity_chunker
     return chunked_compute_embedding(
-      self._model.encode, docs, self.local_batch_size * 16, chunker=clustering_spacy_chunker
+      self._model.encode, docs, self.local_batch_size * 16, chunker=chunker
     )

   @override
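Taken together, the embedding signals now honor the signal's split setting end to end instead of always splitting with the spaCy chunker. A hedged usage sketch — the SBERT class name matches lilac/embeddings/sbert.py, while the split keyword and the setup() call are assumptions based on how self._split is used above:

# Hypothetical usage sketch; names outside this diff are assumptions.
from lilac.embeddings.sbert import SBERT

signal = SBERT(split=False)  # with this PR: identity_chunker, one embedding per document
signal.setup()               # loads the sentence-transformers model
items = signal.compute(['A long document that should be embedded as a single chunk.'])

# With split=True (the default), clustering_spacy_chunker is used and each
# document may yield several chunk-level embeddings instead.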
Additional changes to a Svelte component in the web UI:
@@ -76,7 +76,7 @@
   };
   $: {
     pathToSpans = {};
-    spanPaths.forEach(sp => {
+    (spanPaths || []).forEach(sp => {
       if (row == null) return;
       let valueNodes = getValueNodes(row, sp);
       const isSpanNestedUnder = pathMatchesPrefix(sp, path);
@@ -97,7 +97,7 @@
   let spanPathToValueInfos: Record<string, SpanValueInfo[]> = {};
   $: {
     spanPathToValueInfos = {};
-    for (const spanValueInfo of spanValueInfos) {
+    for (const spanValueInfo of spanValueInfos || []) {
       const spanPathStr = serializePath(spanValueInfo.spanPath);
       if (spanPathToValueInfos[spanPathStr] == null) {
         spanPathToValueInfos[spanPathStr] = [];
@@ -206,7 +206,7 @@
   $: {
     if (model != null && editor != null) {
       let minPosition: Monaco.Position | null = null;
-      for (const renderSpan of monacoSpans) {
+      for (const renderSpan of monacoSpans || []) {
         const span = L.span(renderSpan.span)!;
         const position = model.getPositionAt(span.start);

@@ -381,7 +381,7 @@

   const conceptQuery = queryConcepts();
   $: concepts = $conceptQuery.data;
-  let conceptsInMenu: Set<string>;
+  let conceptsInMenu: Set<string> = new Set();
   let addToConceptItems: DropdownItem[] = [];

   $: {