Skip to content

extraction.ipynb in the examples folder does not run out of the box. Also, which model is used by default for this extraction? #2688

@yogesh-n

Description

@yogesh-n

Question

'extraction.ipynb' in the examples folder does not run out of the box. Running the cell under "Using a string template" raises the error shown below.

Can anyone please help?

result = extractor.extract(
source=file_path,
template='{"bill_no": "string", "total": "float"}',
)
print(result.pages)

******************************* Error Details ******************************************************


AttributeError Traceback (most recent call last)
Cell In[9], line 1
----> 1 result = extractor.extract(
2 source=file_path,
3 template='{"bill_no": "string", "total": "float"}',
4 )
5 print(result.pages)

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\pydantic\_internal\_validate_call.py:39, in update_wrapper_attributes.<locals>.wrapper_function(*args, **kwargs)
37 @functools.wraps(wrapped)
38 def wrapper_function(*args, **kwargs):
---> 39 return wrapper(*args, **kwargs)

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\pydantic\_internal\_validate_call.py:136, in ValidateCallWrapper.__call__(self, *args, **kwargs)
133 if not self.__pydantic_complete__:
134 self._create_validators()
--> 136 res = self.__pydantic_validator__.validate_python(pydantic_core.ArgsKwargs(args, kwargs))
137 if self.__return_pydantic_validator__:
138 return self.__return_pydantic_validator__(res)

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\docling\document_extractor.py:145, in DocumentExtractor.extract(self, source, template, headers, raises_on_error, max_num_pages, max_file_size, page_range)
125 @validate_call(config=ConfigDict(strict=True))
126 def extract(
127 self,
(...)
134 page_range: PageRange = DEFAULT_PAGE_RANGE,
135 ) -> ExtractionResult:
136 all_res = self.extract_all(
137 source=[source],
138 headers=headers,
(...)
143 template=template,
144 )
--> 145 return next(all_res)

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\docling\document_extractor.py:179, in DocumentExtractor.extract_all(self, source, template, headers, raises_on_error, max_num_pages, max_file_size, page_range)
174 ext_res_iter = self._extract(
175 conv_input, raises_on_error=raises_on_error, template=template
176 )
178 had_result = False
--> 179 for ext_res in ext_res_iter:
180 had_result = True
181 if raises_on_error and ext_res.status not in {
182 ConversionStatus.SUCCESS,
183 ConversionStatus.PARTIAL_SUCCESS,
184 }:

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\docling\document_extractor.py:230, in DocumentExtractor._extract(self, conv_input, raises_on_error, template)
228 yield item
229 else:
--> 230 for item in map(
231 process_func,
232 input_batch,
233 ):
234 elapsed = time.monotonic() - start_time
235 start_time = time.monotonic()

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\docling\document_extractor.py:251, in DocumentExtractor._process_document_extraction(self, in_doc, raises_on_error, template)
247 valid = (
248 self.allowed_formats is not None and in_doc.format in self.allowed_formats
249 )
250 if valid:
--> 251 return self._execute_extraction_pipeline(
252 in_doc, raises_on_error=raises_on_error, template=template
253 )
254 else:
255 error_message = f"File format not allowed: {in_doc.file}"

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\docling\document_extractor.py:280, in DocumentExtractor._execute_extraction_pipeline(self, in_doc, raises_on_error, template)
277 else:
278 return ExtractionResult(input=in_doc, status=ConversionStatus.FAILURE)
--> 280 pipeline = self._get_pipeline(in_doc.format)
281 if pipeline is None:
282 if raises_on_error:

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\docling\document_extractor.py:311, in DocumentExtractor._get_pipeline(self, doc_format)
307 if cache_key not in self._initialized_pipelines:
308 _log.info(
309 f"Initializing extraction pipeline for {pipeline_class.__name__} with options hash {options_hash}"
310 )
--> 311 self._initialized_pipelines[cache_key] = pipeline_class(
312 pipeline_options=pipeline_options # type: ignore[arg-type]
313 )
314 else:
315 _log.debug(
316 f"Reusing cached extraction pipeline for {pipeline_class.__name__} with options hash {options_hash}"
317 )

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\docling\pipeline\extraction_vlm_pipeline.py:41, in ExtractionVlmPipeline.__init__(self, pipeline_options)
38 self.pipeline_options: VlmExtractionPipelineOptions
40 # Create VLM model instance
---> 41 self.vlm_model = NuExtractTransformersModel(
42 enabled=True,
43 artifacts_path=self.artifacts_path, # Will download automatically
44 accelerator_options=self.accelerator_options,
45 vlm_options=pipeline_options.vlm_options,
46 )

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\docling\models\vlm_models_inline\nuextract_transformers_model.py:141, in NuExtractTransformersModel.__init__(self, enabled, artifacts_path, accelerator_options, vlm_options)
138 elif (artifacts_path / repo_cache_folder).exists():
139 artifacts_path = artifacts_path / repo_cache_folder
--> 141 self.processor = AutoProcessor.from_pretrained(
142 artifacts_path,
143 trust_remote_code=vlm_options.trust_remote_code,
144 use_fast=True,
145 )
146 self.processor.tokenizer.padding_side = "left"
148 self.vlm_model = AutoModelForImageTextToText.from_pretrained(
149 artifacts_path,
150 device_map=self.device,
(...)
158 trust_remote_code=vlm_options.trust_remote_code,
159 )

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\transformers\models\auto\processing_auto.py:396, in AutoProcessor.from_pretrained(cls, pretrained_model_name_or_path, **kwargs)
392 return processor_class.from_pretrained(
393 pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
394 )
395 elif processor_class is not None:
--> 396 return processor_class.from_pretrained(
397 pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
398 )
399 # Last try: we use the PROCESSOR_MAPPING.
400 elif type(config) in PROCESSOR_MAPPING:

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\transformers\processing_utils.py:1394, in ProcessorMixin.from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, **kwargs)
1391 if token is not None:
1392 kwargs["token"] = token
-> 1394 args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
1395 processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
1396 return cls.from_args_and_dict(args, processor_dict, **kwargs)

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\transformers\processing_utils.py:1453, in ProcessorMixin._get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs)
1450 else:
1451 attribute_class = cls.get_possibly_dynamic_module(class_name)
-> 1453 args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs))
1455 return args

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\transformers\tokenization_utils_base.py:2112, in PreTrainedTokenizerBase.from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, trust_remote_code, *init_inputs, **kwargs)
2109 else:
2110 logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
-> 2112 return cls._from_pretrained(
2113 resolved_vocab_files,
2114 pretrained_model_name_or_path,
2115 init_configuration,
2116 *init_inputs,
2117 token=token,
2118 cache_dir=cache_dir,
2119 local_files_only=local_files_only,
2120 _commit_hash=commit_hash,
2121 _is_local=is_local,
2122 trust_remote_code=trust_remote_code,
2123 **kwargs,
2124 )

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\transformers\tokenization_utils_base.py:2419, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, trust_remote_code, *init_inputs, **kwargs)
2416 transformers_version = _config.get("transformers_version")
2418 if transformers_version and version.parse(transformers_version) <= version.parse("4.57.2"):
-> 2419 if _is_local and _config.model_type not in [
2420 "mistral",
2421 "mistral3",
2422 "voxstral",
2423 "ministral",
2424 "pixtral",
2425 ]:
2426 return tokenizer
2428 # Expose the fix_mistral_regex flag on the tokenizer when provided, even if no correction is applied.

AttributeError: 'dict' object has no attribute 'model_type'

AttributeError Traceback (most recent call last)
Cell In[10], line 1
----> 1 result = extractor.extract(
2 source=file_path,
3 template={
4 "bill_no": "string",
5 "total": "float",
6 },
7 )
8 print(result.pages)

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\pydantic\_internal\_validate_call.py:39, in update_wrapper_attributes.<locals>.wrapper_function(*args, **kwargs)
37 @functools.wraps(wrapped)
38 def wrapper_function(*args, **kwargs):
---> 39 return wrapper(*args, **kwargs)

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\pydantic\_internal\_validate_call.py:136, in ValidateCallWrapper.__call__(self, *args, **kwargs)
133 if not self.__pydantic_complete__:
134 self._create_validators()
--> 136 res = self.__pydantic_validator__.validate_python(pydantic_core.ArgsKwargs(args, kwargs))
137 if self.__return_pydantic_validator__:
138 return self.__return_pydantic_validator__(res)

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\docling\document_extractor.py:145, in DocumentExtractor.extract(self, source, template, headers, raises_on_error, max_num_pages, max_file_size, page_range)
...
2425 ]:
2426 return tokenizer
2428 # Expose the fix_mistral_regex flag on the tokenizer when provided, even if no correction is applied.

AttributeError: 'dict' object has no attribute 'model_type'
Output is truncated. View as a scrollable element or open in a text editor. Adjust cell output settings...

Metadata

Metadata

Assignees

No one assigned

    Labels

    question: Further information is requested

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions