
Commit 8054f3c

Merge branch 'langgenius:main' into main
2 parents: 0b64d5a + 989fb11

File tree

106 files changed: 1358 additions & 641 deletions


.github/workflows/style.yml

Lines changed: 27 additions & 0 deletions
@@ -82,6 +82,33 @@ jobs:
         if: steps.changed-files.outputs.any_changed == 'true'
         run: yarn run lint
 
+  docker-compose-template:
+    name: Docker Compose Template
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Check changed files
+        id: changed-files
+        uses: tj-actions/changed-files@v45
+        with:
+          files: |
+            docker/generate_docker_compose
+            docker/.env.example
+            docker/docker-compose-template.yaml
+            docker/docker-compose.yaml
+
+      - name: Generate Docker Compose
+        if: steps.changed-files.outputs.any_changed == 'true'
+        run: |
+          cd docker
+          ./generate_docker_compose
+
+      - name: Check for changes
+        if: steps.changed-files.outputs.any_changed == 'true'
+        run: git diff --exit-code
 
   superlinter:
     name: SuperLinter

api/configs/middleware/vdb/milvus_config.py

Lines changed: 6 additions & 0 deletions
@@ -33,3 +33,9 @@ class MilvusConfig(BaseSettings):
         description="Name of the Milvus database to connect to (default is 'default')",
         default="default",
     )
+
+    MILVUS_ENABLE_HYBRID_SEARCH: bool = Field(
+        description="Enable hybrid search features (requires Milvus >= 2.5.0). Set to false for compatibility with "
+        "older versions",
+        default=True,
+    )
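
The new flag defaults to true, so deployments running Milvus older than 2.5.0 have to switch it off explicitly. A minimal sketch of how the toggle might be driven from the environment, using a trimmed-down stand-in class (MilvusSettingsSketch is hypothetical) and assuming only the standard pydantic-settings parsing that MilvusConfig builds on:

import os

from pydantic import Field
from pydantic_settings import BaseSettings


class MilvusSettingsSketch(BaseSettings):
    # Trimmed-down stand-in for the real MilvusConfig, just to show the toggle.
    MILVUS_ENABLE_HYBRID_SEARCH: bool = Field(default=True)


os.environ["MILVUS_ENABLE_HYBRID_SEARCH"] = "false"  # e.g. the cluster is still on Milvus 2.4.x
print(MilvusSettingsSketch().MILVUS_ENABLE_HYBRID_SEARCH)  # expected: False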

api/controllers/console/datasets/datasets.py

Lines changed: 4 additions & 2 deletions
@@ -52,12 +52,12 @@ def get(self):
         # provider = request.args.get("provider", default="vendor")
         search = request.args.get("keyword", default=None, type=str)
         tag_ids = request.args.getlist("tag_ids")
-
+        include_all = request.args.get("include_all", default="false").lower() == "true"
         if ids:
             datasets, total = DatasetService.get_datasets_by_ids(ids, current_user.current_tenant_id)
         else:
             datasets, total = DatasetService.get_datasets(
-                page, limit, current_user.current_tenant_id, current_user, search, tag_ids
+                page, limit, current_user.current_tenant_id, current_user, search, tag_ids, include_all
             )
 
         # check embedding setting
@@ -640,6 +640,7 @@ def get(self):
                 | VectorType.MYSCALE
                 | VectorType.ORACLE
                 | VectorType.ELASTICSEARCH
+                | VectorType.ELASTICSEARCH_JA
                 | VectorType.PGVECTOR
                 | VectorType.TIDB_ON_QDRANT
                 | VectorType.LINDORM
@@ -683,6 +684,7 @@ def get(self, vector_type):
                 | VectorType.MYSCALE
                 | VectorType.ORACLE
                 | VectorType.ELASTICSEARCH
+                | VectorType.ELASTICSEARCH_JA
                 | VectorType.COUCHBASE
                 | VectorType.PGVECTOR
                 | VectorType.LINDORM

api/controllers/console/datasets/datasets_document.py

Lines changed: 2 additions & 1 deletion
@@ -257,7 +257,8 @@ def post(self, dataset_id):
         parser.add_argument("original_document_id", type=str, required=False, location="json")
         parser.add_argument("doc_form", type=str, default="text_model", required=False, nullable=False, location="json")
         parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
-
+        parser.add_argument("embedding_model", type=str, required=False, nullable=True, location="json")
+        parser.add_argument("embedding_model_provider", type=str, required=False, nullable=True, location="json")
         parser.add_argument(
             "doc_language", type=str, default="English", required=False, nullable=False, location="json"
         )

api/controllers/service_api/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -7,4 +7,4 @@
 
 from . import index
 from .app import app, audio, completion, conversation, file, message, workflow
-from .dataset import dataset, document, hit_testing, segment
+from .dataset import dataset, document, hit_testing, segment, upload_file

api/controllers/service_api/dataset/dataset.py

Lines changed: 4 additions & 1 deletion
@@ -31,8 +31,11 @@ def get(self, tenant_id):
         # provider = request.args.get("provider", default="vendor")
         search = request.args.get("keyword", default=None, type=str)
         tag_ids = request.args.getlist("tag_ids")
+        include_all = request.args.get("include_all", default="false").lower() == "true"
 
-        datasets, total = DatasetService.get_datasets(page, limit, tenant_id, current_user, search, tag_ids)
+        datasets, total = DatasetService.get_datasets(
+            page, limit, tenant_id, current_user, search, tag_ids, include_all
+        )
         # check embedding setting
         provider_manager = ProviderManager()
         configurations = provider_manager.get_configurations(tenant_id=current_user.current_tenant_id)

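The dataset list endpoint of the service API now accepts an include_all query parameter, which is forwarded to DatasetService.get_datasets. A hypothetical client call; the https://api.dify.ai host, /v1 prefix, and Bearer dataset API key are placeholders rather than part of this diff:

import requests

resp = requests.get(
    "https://api.dify.ai/v1/datasets",
    headers={"Authorization": "Bearer <dataset-api-key>"},
    params={"page": 1, "limit": 20, "include_all": "true"},
)
resp.raise_for_status()
data = resp.json()
print(data.get("total"), len(data.get("data", [])))
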
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+from werkzeug.exceptions import NotFound
+
+from controllers.service_api import api
+from controllers.service_api.wraps import (
+    DatasetApiResource,
+)
+from core.file import helpers as file_helpers
+from extensions.ext_database import db
+from models.dataset import Dataset
+from models.model import UploadFile
+from services.dataset_service import DocumentService
+
+
+class UploadFileApi(DatasetApiResource):
+    def get(self, tenant_id, dataset_id, document_id):
+        """Get upload file."""
+        # check dataset
+        dataset_id = str(dataset_id)
+        tenant_id = str(tenant_id)
+        dataset = db.session.query(Dataset).filter(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).first()
+        if not dataset:
+            raise NotFound("Dataset not found.")
+        # check document
+        document_id = str(document_id)
+        document = DocumentService.get_document(dataset.id, document_id)
+        if not document:
+            raise NotFound("Document not found.")
+        # check upload file
+        if document.data_source_type != "upload_file":
+            raise ValueError(f"Document data source type ({document.data_source_type}) is not upload_file.")
+        data_source_info = document.data_source_info_dict
+        if data_source_info and "upload_file_id" in data_source_info:
+            file_id = data_source_info["upload_file_id"]
+            upload_file = db.session.query(UploadFile).filter(UploadFile.id == file_id).first()
+            if not upload_file:
+                raise NotFound("UploadFile not found.")
+        else:
+            raise ValueError("Upload file id not found in document data source info.")
+
+        url = file_helpers.get_signed_file_url(upload_file_id=upload_file.id)
+        return {
+            "id": upload_file.id,
+            "name": upload_file.name,
+            "size": upload_file.size,
+            "extension": upload_file.extension,
+            "url": url,
+            "download_url": f"{url}&as_attachment=true",
+            "mime_type": upload_file.mime_type,
+            "created_by": upload_file.created_by,
+            "created_at": upload_file.created_at.timestamp(),
+        }, 200
+
+
+api.add_resource(UploadFileApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/upload-file")
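
The new UploadFileApi resource returns metadata and a signed URL for the file behind an "upload_file" document. A hypothetical client call against it; the https://api.dify.ai host, /v1 prefix, and Bearer dataset API key are assumptions, not taken from this commit:

import requests

dataset_id = "<dataset-uuid>"
document_id = "<document-uuid>"

resp = requests.get(
    f"https://api.dify.ai/v1/datasets/{dataset_id}/documents/{document_id}/upload-file",
    headers={"Authorization": "Bearer <dataset-api-key>"},
)
resp.raise_for_status()
info = resp.json()
print(info["name"], info["mime_type"], info["download_url"])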

api/core/indexing_runner.py

Lines changed: 14 additions & 4 deletions
@@ -530,7 +530,6 @@ def _load(
         # chunk nodes by chunk size
         indexing_start_at = time.perf_counter()
         tokens = 0
-        chunk_size = 10
         if dataset_document.doc_form != IndexType.PARENT_CHILD_INDEX:
             # create keyword index
             create_keyword_thread = threading.Thread(
@@ -539,11 +538,22 @@ def _load(
             )
             create_keyword_thread.start()
 
+        max_workers = 10
         if dataset.indexing_technique == "high_quality":
-            with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
+            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                 futures = []
-                for i in range(0, len(documents), chunk_size):
-                    chunk_documents = documents[i : i + chunk_size]
+
+                # Distribute documents into multiple groups based on the hash values of page_content
+                # This is done to prevent multiple threads from processing the same document,
+                # Thereby avoiding potential database insertion deadlocks
+                document_groups: list[list[Document]] = [[] for _ in range(max_workers)]
+                for document in documents:
+                    hash = helper.generate_text_hash(document.page_content)
+                    group_index = int(hash, 16) % max_workers
+                    document_groups[group_index].append(document)
+                for chunk_documents in document_groups:
+                    if len(chunk_documents) == 0:
+                        continue
                     futures.append(
                         executor.submit(
                             self._process_chunk,
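
The rewritten _load no longer slices documents into fixed-size chunks; it buckets them by a hash of page_content so that identical content always lands with the same worker, which avoids two threads inserting the same segment concurrently. A standalone sketch of that grouping idea, using hashlib directly instead of the project's helper.generate_text_hash (an assumption about its behaviour):

import hashlib


def group_by_content(contents: list[str], max_workers: int = 10) -> list[list[str]]:
    # Documents with identical text hash into the same bucket, so only one
    # worker ever processes a given piece of content.
    groups: list[list[str]] = [[] for _ in range(max_workers)]
    for text in contents:
        digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
        groups[int(digest, 16) % max_workers].append(text)
    return groups


groups = group_by_content(["chunk a", "chunk b", "chunk a"])
# Both "chunk a" entries end up in the same bucket and are handled by one worker.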

api/core/model_runtime/model_providers/__base/tokenizers/gpt2_tokenzier.py

Lines changed: 15 additions & 5 deletions
@@ -1,7 +1,8 @@
+import logging
 from threading import Lock
 from typing import Any
 
-import tiktoken
+logger = logging.getLogger(__name__)
 
 _tokenizer: Any = None
 _lock = Lock()
@@ -33,9 +34,18 @@ def get_encoder() -> Any:
         if _tokenizer is None:
             # Try to use tiktoken to get the tokenizer because it is faster
             #
-            _tokenizer = tiktoken.get_encoding("gpt2")
-            # base_path = abspath(__file__)
-            # gpt2_tokenizer_path = join(dirname(base_path), "gpt2")
-            # _tokenizer = TransformerGPT2Tokenizer.from_pretrained(gpt2_tokenizer_path)
+            try:
+                import tiktoken
+
+                _tokenizer = tiktoken.get_encoding("gpt2")
+            except Exception:
+                from os.path import abspath, dirname, join
+
+                from transformers import GPT2Tokenizer as TransformerGPT2Tokenizer  # type: ignore
+
+                base_path = abspath(__file__)
+                gpt2_tokenizer_path = join(dirname(base_path), "gpt2")
+                _tokenizer = TransformerGPT2Tokenizer.from_pretrained(gpt2_tokenizer_path)
+                logger.info("Fallback to Transformers' GPT-2 tokenizer from tiktoken")
 
         return _tokenizer

api/core/model_runtime/model_providers/openai_api_compatible/llm/llm.py

Lines changed: 1 addition & 4 deletions
@@ -377,10 +377,7 @@ def _generate(
             for tool in tools:
                 formatted_tools.append(helper.dump_model(PromptMessageFunction(function=tool)))
 
-            if prompt_messages[-1].role.value == "tool":
-                data["tools"] = None
-            else:
-                data["tools"] = formatted_tools
+            data["tools"] = formatted_tools
 
         if stop:
             data["stop"] = stop

api/core/model_runtime/model_providers/openrouter/llm/claude-3-5-sonnet.yaml

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@ features:
   - vision
   - tool-call
   - stream-tool-call
+  - document
 model_properties:
   mode: chat
   context_size: 200000

Lines changed: 104 additions & 0 deletions
@@ -0,0 +1,104 @@
+import json
+import logging
+from typing import Any, Optional
+
+from flask import current_app
+
+from core.rag.datasource.vdb.elasticsearch.elasticsearch_vector import (
+    ElasticSearchConfig,
+    ElasticSearchVector,
+    ElasticSearchVectorFactory,
+)
+from core.rag.datasource.vdb.field import Field
+from core.rag.datasource.vdb.vector_type import VectorType
+from core.rag.embedding.embedding_base import Embeddings
+from extensions.ext_redis import redis_client
+from models.dataset import Dataset
+
+logger = logging.getLogger(__name__)
+
+
+class ElasticSearchJaVector(ElasticSearchVector):
+    def create_collection(
+        self,
+        embeddings: list[list[float]],
+        metadatas: Optional[list[dict[Any, Any]]] = None,
+        index_params: Optional[dict] = None,
+    ):
+        lock_name = f"vector_indexing_lock_{self._collection_name}"
+        with redis_client.lock(lock_name, timeout=20):
+            collection_exist_cache_key = f"vector_indexing_{self._collection_name}"
+            if redis_client.get(collection_exist_cache_key):
+                logger.info(f"Collection {self._collection_name} already exists.")
+                return
+
+            if not self._client.indices.exists(index=self._collection_name):
+                dim = len(embeddings[0])
+                settings = {
+                    "analysis": {
+                        "analyzer": {
+                            "ja_analyzer": {
+                                "type": "custom",
+                                "char_filter": [
+                                    "icu_normalizer",
+                                    "kuromoji_iteration_mark",
+                                ],
+                                "tokenizer": "kuromoji_tokenizer",
+                                "filter": [
+                                    "kuromoji_baseform",
+                                    "kuromoji_part_of_speech",
+                                    "ja_stop",
+                                    "kuromoji_number",
+                                    "kuromoji_stemmer",
+                                ],
+                            }
+                        }
+                    }
+                }
+                mappings = {
+                    "properties": {
+                        Field.CONTENT_KEY.value: {
+                            "type": "text",
+                            "analyzer": "ja_analyzer",
+                            "search_analyzer": "ja_analyzer",
+                        },
+                        Field.VECTOR.value: {  # Make sure the dimension is correct here
+                            "type": "dense_vector",
+                            "dims": dim,
+                            "index": True,
+                            "similarity": "cosine",
+                        },
+                        Field.METADATA_KEY.value: {
+                            "type": "object",
+                            "properties": {
+                                "doc_id": {"type": "keyword"}  # Map doc_id to keyword type
+                            },
+                        },
+                    }
+                }
+                self._client.indices.create(index=self._collection_name, settings=settings, mappings=mappings)
+
+            redis_client.set(collection_exist_cache_key, 1, ex=3600)
+
+
+class ElasticSearchJaVectorFactory(ElasticSearchVectorFactory):
+    def init_vector(self, dataset: Dataset, attributes: list, embeddings: Embeddings) -> ElasticSearchJaVector:
+        if dataset.index_struct_dict:
+            class_prefix: str = dataset.index_struct_dict["vector_store"]["class_prefix"]
+            collection_name = class_prefix
+        else:
+            dataset_id = dataset.id
+            collection_name = Dataset.gen_collection_name_by_id(dataset_id)
+            dataset.index_struct = json.dumps(self.gen_index_struct_dict(VectorType.ELASTICSEARCH, collection_name))
+
+        config = current_app.config
+        return ElasticSearchJaVector(
+            index_name=collection_name,
+            config=ElasticSearchConfig(
+                host=config.get("ELASTICSEARCH_HOST", "localhost"),
+                port=config.get("ELASTICSEARCH_PORT", 9200),
+                username=config.get("ELASTICSEARCH_USERNAME", ""),
+                password=config.get("ELASTICSEARCH_PASSWORD", ""),
+            ),
+            attributes=[],
+        )
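
The ja_analyzer defined above relies on the kuromoji tokenizer and filters plus the ICU normalizer, which are provided by the analysis-kuromoji and analysis-icu Elasticsearch plugins. A hypothetical pre-flight check (not part of this commit) that a target cluster has them installed; host and credentials are placeholders:

from elasticsearch import Elasticsearch

client = Elasticsearch("http://localhost:9200", basic_auth=("elastic", "<password>"))

# `cat.plugins` lists installed plugins per node; the "component" column holds the plugin name.
installed = {row["component"] for row in client.cat.plugins(format="json")}
missing = {"analysis-kuromoji", "analysis-icu"} - installed
if missing:
    raise RuntimeError(f"ja_analyzer needs these Elasticsearch plugins: {missing}")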

api/core/rag/datasource/vdb/field.py

Lines changed: 2 additions & 0 deletions
@@ -6,6 +6,8 @@ class Field(Enum):
     METADATA_KEY = "metadata"
     GROUP_KEY = "group_id"
     VECTOR = "vector"
+    # Sparse Vector aims to support full text search
+    SPARSE_VECTOR = "sparse_vector"
     TEXT_KEY = "text"
     PRIMARY_KEY = "id"
     DOC_ID = "metadata.doc_id"
