-
Notifications
You must be signed in to change notification settings - Fork 192
Open
Labels
Description
Bug Description
Some sync jobs seem to not count indexed documents or their volume, despite the fact that documents are being ingested.
To Reproduce
- create a GitHub connector. Use a valid PAT, and configure it against the `elastic/connectors` repository. Use the below mappings
- create the below pipeline
- configure the pipeline with the below request
- Run a sync, and see 0 indexed documents in the sync job record, but see that the attached index does have documents.
- be confused
Mapping
{
"mappings": {
"properties": {
"title": { "type": "semantic_text" },
"body": { "type": "semantic_text" },
"description": { "type": "semantic_text" },
"semantic_comments": { "type": "semantic_text" },
"issue_comments": {
"properties": {
"body": {
"type": "text",
"copy_to": "semantic_comments"
},
"author": {
"properties": {
"login": { "type": "keyword" }
}
}
}
},
"reviews_comments": {
"properties": {
"body": {
"type": "text",
"copy_to": "semantic_comments"
},
"comments": {
"properties": {
"body": {
"type": "text",
"copy_to": "semantic_comments"
}
}
},
"author": { "type": "keyword" },
"state": { "type": "keyword" }
}
},
"labels_field": {
"properties": {
"name": { "type": "keyword" },
"description": { "type": "text" }
}
},
"author": {
"properties": {
"login": { "type": "keyword" }
}
},
"assignees_list": {
"properties": {
"login": { "type": "keyword" }
}
},
"requested_reviewers": {
"properties": {
"requestedReviewer": {
"properties": {
"login": { "type": "keyword" }
}
}
}
},
"defaultBranchRef": {
"properties": {
"name": { "type": "keyword" }
}
},
"primaryLanguage": {
"properties": {
"name": { "type": "keyword" }
}
},
"state": { "type": "keyword" },
"type": { "type": "keyword" },
"url": { "type": "keyword" },
"nameWithOwner": { "type": "keyword" }
}
}
}
Pipeline
{
"processors": [
{
"attachment": {
"on_failure": [
{
"append": {
"description": "Record error information",
"field": "_ingestion_errors",
"value": "Processor 'attachment' in pipeline '{{ _ingest.on_failure_pipeline }}' failed with message '{{ _ingest.on_failure_message }}'"
}
}
],
"remove_binary": false,
"field": "_attachment",
"target_field": "_extracted_attachment",
"description": "Extract text from binary attachments",
"ignore_missing": true,
"indexed_chars_field": "_attachment_indexed_chars",
"if": "ctx?._extract_binary_content == true"
}
},
{
"set": {
"ignore_empty_value": true,
"on_failure": [
{
"append": {
"description": "Record error information",
"field": "_ingestion_errors",
"value": "Processor 'set' with tag 'set_body' in pipeline '{{ _ingest.on_failure_pipeline }}' failed with message '{{ _ingest.on_failure_message }}'"
}
}
],
"field": "body",
"description": "Set any extracted text on the 'body' field",
"tag": "set_body",
"copy_from": "_extracted_attachment.content",
"if": "ctx?._extract_binary_content == true"
}
},
{
"gsub": {
"on_failure": [
{
"append": {
"description": "Record error information",
"field": "_ingestion_errors",
"value": "Processor 'gsub' with tag 'remove_replacement_chars' in pipeline '{{ _ingest.on_failure_pipeline }}' failed with message '{{ _ingest.on_failure_message }}'"
}
}
],
"field": "body",
"pattern": "�",
"description": "Remove unicode 'replacement' characters",
"ignore_missing": true,
"tag": "remove_replacement_chars",
"replacement": "",
"if": "ctx?._extract_binary_content == true"
}
},
{
"gsub": {
"on_failure": [
{
"append": {
"description": "Record error information",
"field": "_ingestion_errors",
"value": "Processor 'gsub' with tag 'remove_extra_whitespace' in pipeline '{{ _ingest.on_failure_pipeline }}' failed with message '{{ _ingest.on_failure_message }}'"
}
}
],
"field": "body",
"pattern": """\s+""",
"description": "Squish whitespace",
"ignore_missing": true,
"tag": "remove_extra_whitespace",
"replacement": " ",
"if": "ctx?._reduce_whitespace == true"
}
},
{
"trim": {
"description": "Trim leading and trailing whitespace",
"ignore_missing": true,
"on_failure": [
{
"append": {
"description": "Record error information",
"field": "_ingestion_errors",
"value": "Processor 'trim' in pipeline '{{ _ingest.on_failure_pipeline }}' failed with message '{{ _ingest.on_failure_message }}'"
}
}
],
"field": "body",
"if": "ctx?._reduce_whitespace == true"
}
},
{
"remove": {
"description": "Remove meta fields",
"ignore_missing": true,
"on_failure": [
{
"append": {
"description": "Record error information",
"field": "_ingestion_errors",
"value": "Processor 'remove' with tag 'remove_meta_fields' in pipeline '{{ _ingest.on_failure_pipeline }}' failed with message '{{ _ingest.on_failure_message }}'"
}
}
],
"tag": "remove_meta_fields",
"field": [
"_attachment",
"_attachment_indexed_chars",
"_extracted_attachment",
"_extract_binary_content",
"_reduce_whitespace",
"_run_ml_inference"
]
}
},
{
"remove": {
"field": [
"_timestamp",
"id",
"number",
"forkCount",
"stargazerCount",
"watchers",
"isArchived",
"isFork",
"visibility",
"mergedAt",
"closedAt",
"createdAt",
"name"
],
"ignore_missing": true
}
}
]
}
Set the pipeline
PUT _connector/<connector_id>/_pipeline
{
"pipeline": {
"extract_binary_content": true,
"name": "github_pipeline",
"reduce_whitespace": true,
"run_ml_inference": false
}
}
Expected behavior
The indexed document count recorded on the sync job should match the number of documents actually present in the attached index.
Screenshots
Environment
9.2.0-SNAPSHOT (main, at time of writing)
Additional context
Slack thread: https://elastic.slack.com/archives/C01795T48LQ/p1759152996129649