extraction.ipynb in examples folder does not run out of the box. Also what model is used by default for this extraction pls.

### Question

`extraction.ipynb` in the examples folder does not run out of the box. Running the cell with the code "Using a string template" gives the error shown below:

Can anyone please help?

result = extractor.extract(
    source=file_path,
    template='{"bill_no": "string", "total": "float"}',
)
print(result.pages)

******************************* Error Details ****************************************************** 

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[9], [line 1](vscode-notebook-cell:?execution_count=9&line=1)
----> [1](vscode-notebook-cell:?execution_count=9&line=1) result = extractor.extract(
      2     source=file_path,
      3     template='{"bill_no": "string", "total": "float"}',
      4 )
      5 print(result.pages)

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\pydantic\_internal\_validate_call.py:39, in update_wrapper_attributes.<locals>.wrapper_function(*args, **kwargs)
     37 @functools.wraps(wrapped)
     38 def wrapper_function(*args, **kwargs):
---> [39](file:///C:/Users/ynars/miniconda3/envs/py310/lib/site-packages/pydantic/_internal/_validate_call.py:39)     return wrapper(*args, **kwargs)

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\pydantic\_internal\_validate_call.py:136, in ValidateCallWrapper.__call__(self, *args, **kwargs)
    133 if not self.__pydantic_complete__:
    134     self._create_validators()
--> [136](file:///C:/Users/ynars/miniconda3/envs/py310/lib/site-packages/pydantic/_internal/_validate_call.py:136) res = self.__pydantic_validator__.validate_python(pydantic_core.ArgsKwargs(args, kwargs))
    137 if self.__return_pydantic_validator__:
    138     return self.__return_pydantic_validator__(res)

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\docling\document_extractor.py:145, in DocumentExtractor.extract(self, source, template, headers, raises_on_error, max_num_pages, max_file_size, page_range)
    125 @validate_call(config=ConfigDict(strict=True))
    126 def extract(
    127     self,
   (...)
    134     page_range: PageRange = DEFAULT_PAGE_RANGE,
    135 ) -> ExtractionResult:
    136     all_res = self.extract_all(
    137         source=[source],
    138         headers=headers,
   (...)
    143         template=template,
    144     )
--> [145](file:///C:/Users/ynars/miniconda3/envs/py310/lib/site-packages/docling/document_extractor.py:145)     return next(all_res)

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\docling\document_extractor.py:179, in DocumentExtractor.extract_all(self, source, template, headers, raises_on_error, max_num_pages, max_file_size, page_range)
    174 ext_res_iter = self._extract(
    175     conv_input, raises_on_error=raises_on_error, template=template
    176 )
    178 had_result = False
--> [179](file:///C:/Users/ynars/miniconda3/envs/py310/lib/site-packages/docling/document_extractor.py:179) for ext_res in ext_res_iter:
    180     had_result = True
    181     if raises_on_error and ext_res.status not in {
    182         ConversionStatus.SUCCESS,
    183         ConversionStatus.PARTIAL_SUCCESS,
    184     }:

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\docling\document_extractor.py:230, in DocumentExtractor._extract(self, conv_input, raises_on_error, template)
    228             yield item
    229 else:
--> [230](file:///C:/Users/ynars/miniconda3/envs/py310/lib/site-packages/docling/document_extractor.py:230)     for item in map(
    231         process_func,
    232         input_batch,
    233     ):
    234         elapsed = time.monotonic() - start_time
    235         start_time = time.monotonic()

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\docling\document_extractor.py:251, in DocumentExtractor._process_document_extraction(self, in_doc, raises_on_error, template)
    247 valid = (
    248     self.allowed_formats is not None and in_doc.format in self.allowed_formats
    249 )
    250 if valid:
--> [251](file:///C:/Users/ynars/miniconda3/envs/py310/lib/site-packages/docling/document_extractor.py:251)     return self._execute_extraction_pipeline(
    252         in_doc, raises_on_error=raises_on_error, template=template
    253     )
    254 else:
    255     error_message = f"File format not allowed: {in_doc.file}"

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\docling\document_extractor.py:280, in DocumentExtractor._execute_extraction_pipeline(self, in_doc, raises_on_error, template)
    277     else:
    278         return ExtractionResult(input=in_doc, status=ConversionStatus.FAILURE)
--> [280](file:///C:/Users/ynars/miniconda3/envs/py310/lib/site-packages/docling/document_extractor.py:280) pipeline = self._get_pipeline(in_doc.format)
    281 if pipeline is None:
    282     if raises_on_error:

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\docling\document_extractor.py:311, in DocumentExtractor._get_pipeline(self, doc_format)
    307 if cache_key not in self._initialized_pipelines:
    308     _log.info(
    309         f"Initializing extraction pipeline for {pipeline_class.__name__} with options hash {options_hash}"
    310     )
--> [311](file:///C:/Users/ynars/miniconda3/envs/py310/lib/site-packages/docling/document_extractor.py:311)     self._initialized_pipelines[cache_key] = pipeline_class(
    312         pipeline_options=pipeline_options  # type: ignore[arg-type]
    313     )
    314 else:
    315     _log.debug(
    316         f"Reusing cached extraction pipeline for {pipeline_class.__name__} with options hash {options_hash}"
    317     )

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\docling\pipeline\extraction_vlm_pipeline.py:41, in ExtractionVlmPipeline.__init__(self, pipeline_options)
     38 self.pipeline_options: VlmExtractionPipelineOptions
     40 # Create VLM model instance
---> [41](file:///C:/Users/ynars/miniconda3/envs/py310/lib/site-packages/docling/pipeline/extraction_vlm_pipeline.py:41) self.vlm_model = NuExtractTransformersModel(
     42     enabled=True,
     43     artifacts_path=self.artifacts_path,  # Will download automatically
     44     accelerator_options=self.accelerator_options,
     45     vlm_options=pipeline_options.vlm_options,
     46 )

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\docling\models\vlm_models_inline\nuextract_transformers_model.py:141, in NuExtractTransformersModel.__init__(self, enabled, artifacts_path, accelerator_options, vlm_options)
    138 elif (artifacts_path / repo_cache_folder).exists():
    139     artifacts_path = artifacts_path / repo_cache_folder
--> [141](file:///C:/Users/ynars/miniconda3/envs/py310/lib/site-packages/docling/models/vlm_models_inline/nuextract_transformers_model.py:141) self.processor = AutoProcessor.from_pretrained(
    142     artifacts_path,
    143     trust_remote_code=vlm_options.trust_remote_code,
    144     use_fast=True,
    145 )
    146 self.processor.tokenizer.padding_side = "left"
    148 self.vlm_model = AutoModelForImageTextToText.from_pretrained(
    149     artifacts_path,
    150     device_map=self.device,
   (...)
    158     trust_remote_code=vlm_options.trust_remote_code,
    159 )

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\transformers\models\auto\processing_auto.py:396, in AutoProcessor.from_pretrained(cls, pretrained_model_name_or_path, **kwargs)
    392     return processor_class.from_pretrained(
    393         pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
    394     )
    395 elif processor_class is not None:
--> [396](file:///C:/Users/ynars/miniconda3/envs/py310/lib/site-packages/transformers/models/auto/processing_auto.py:396)     return processor_class.from_pretrained(
    397         pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
    398     )
    399 # Last try: we use the PROCESSOR_MAPPING.
    400 elif type(config) in PROCESSOR_MAPPING:

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\transformers\processing_utils.py:1394, in ProcessorMixin.from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, **kwargs)
   1391 if token is not None:
   1392     kwargs["token"] = token
-> [1394](file:///C:/Users/ynars/miniconda3/envs/py310/lib/site-packages/transformers/processing_utils.py:1394) args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
   1395 processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
   1396 return cls.from_args_and_dict(args, processor_dict, **kwargs)

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\transformers\processing_utils.py:1453, in ProcessorMixin._get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs)
   1450     else:
   1451         attribute_class = cls.get_possibly_dynamic_module(class_name)
-> [1453](file:///C:/Users/ynars/miniconda3/envs/py310/lib/site-packages/transformers/processing_utils.py:1453)     args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs))
   1455 return args

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\transformers\tokenization_utils_base.py:2112, in PreTrainedTokenizerBase.from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, trust_remote_code, *init_inputs, **kwargs)
   2109     else:
   2110         logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
-> [2112](file:///C:/Users/ynars/miniconda3/envs/py310/lib/site-packages/transformers/tokenization_utils_base.py:2112) return cls._from_pretrained(
   2113     resolved_vocab_files,
   2114     pretrained_model_name_or_path,
   2115     init_configuration,
   2116     *init_inputs,
   2117     token=token,
   2118     cache_dir=cache_dir,
   2119     local_files_only=local_files_only,
   2120     _commit_hash=commit_hash,
   2121     _is_local=is_local,
   2122     trust_remote_code=trust_remote_code,
   2123     **kwargs,
   2124 )

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\transformers\tokenization_utils_base.py:2419, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, trust_remote_code, *init_inputs, **kwargs)
   2416     transformers_version = _config.get("transformers_version")
   2418     if transformers_version and version.parse(transformers_version) <= version.parse("4.57.2"):
-> [2419](file:///C:/Users/ynars/miniconda3/envs/py310/lib/site-packages/transformers/tokenization_utils_base.py:2419)         if _is_local and _config.model_type not in [
   2420             "mistral",
   2421             "mistral3",
   2422             "voxstral",
   2423             "ministral",
   2424             "pixtral",
   2425         ]:
   2426             return tokenizer
   2428 # Expose the `fix_mistral_regex` flag on the tokenizer when provided, even if no correction is applied.

AttributeError: 'dict' object has no attribute 'model_type'
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[10], [line 1](vscode-notebook-cell:?execution_count=10&line=1)
----> [1](vscode-notebook-cell:?execution_count=10&line=1) result = extractor.extract(
      2     source=file_path,
      3     template={
      4         "bill_no": "string",
      5         "total": "float",
      6     },
      7 )
      8 print(result.pages)

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\pydantic\_internal\_validate_call.py:39, in update_wrapper_attributes.<locals>.wrapper_function(*args, **kwargs)
     37 @functools.wraps(wrapped)
     38 def wrapper_function(*args, **kwargs):
---> [39](file:///C:/Users/ynars/miniconda3/envs/py310/lib/site-packages/pydantic/_internal/_validate_call.py:39)     return wrapper(*args, **kwargs)

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\pydantic\_internal\_validate_call.py:136, in ValidateCallWrapper.__call__(self, *args, **kwargs)
    133 if not self.__pydantic_complete__:
    134     self._create_validators()
--> [136](file:///C:/Users/ynars/miniconda3/envs/py310/lib/site-packages/pydantic/_internal/_validate_call.py:136) res = self.__pydantic_validator__.validate_python(pydantic_core.ArgsKwargs(args, kwargs))
    137 if self.__return_pydantic_validator__:
    138     return self.__return_pydantic_validator__(res)

File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\docling\document_extractor.py:145, in DocumentExtractor.extract(self, source, template, headers, raises_on_error, max_num_pages, max_file_size, page_range)
...
   2425         ]:
   2426             return tokenizer
   2428 # Expose the `fix_mistral_regex` flag on the tokenizer when provided, even if no correction is applied.

AttributeError: 'dict' object has no attribute 'model_type'
Output is truncated. View as a [scrollable element](command:cellOutput.enableScrolling?08452939-6418-43de-87ee-21b32635ea6b) or open in a [text editor](command:workbench.action.openLargeOutput?08452939-6418-43de-87ee-21b32635ea6b). Adjust cell output [settings](command:workbench.action.openSettings?%5B%22%40tag%3AnotebookOutputLayout%22%5D)...
---------------------------------------------------------------------------

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

extraction.ipynb in examples folder does not run out of the box. Also what model is used by default for this extraction pls. #2688

Question

AttributeError: 'dict' object has no attribute 'model_type'

AttributeError: 'dict' object has no attribute 'model_type'
Output is truncated. View as a scrollable element or open in a text editor. Adjust cell output settings...

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

extraction.ipynb in examples folder does not run out of the box. Also what model is used by default for this extraction pls. #2688

Description

Question

AttributeError: 'dict' object has no attribute 'model_type'

AttributeError: 'dict' object has no attribute 'model_type' Output is truncated. View as a scrollable element or open in a text editor. Adjust cell output settings...

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions

AttributeError: 'dict' object has no attribute 'model_type'
Output is truncated. View as a scrollable element or open in a text editor. Adjust cell output settings...