Description
Question
The 'extraction.ipynb' notebook in the examples folder does not run out of the box. Running the cell under "Using a string template" gives the error below.
Can anyone please help?
result = extractor.extract(
source=file_path,
template='{"bill_no": "string", "total": "float"}',
)
print(result.pages)
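For context, the extractor comes from an earlier notebook cell; reconstructed from the traceback below, the setup looks roughly like this (the zero-argument constructor and the file name are my placeholders, not copied from the notebook):

from docling.document_extractor import DocumentExtractor

extractor = DocumentExtractor()     # assumption: default options, as in the example
file_path = "sample_invoice.pdf"    # placeholder for the sample document the notebook uses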
******************************* Error Details ******************************************************
AttributeError Traceback (most recent call last)
Cell In[9], line 1
----> 1 result = extractor.extract(
2 source=file_path,
3 template='{"bill_no": "string", "total": "float"}',
4 )
5 print(result.pages)
File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\pydantic\_internal\_validate_call.py:39, in update_wrapper_attributes.<locals>.wrapper_function(*args, **kwargs)
37 @functools.wraps(wrapped)
38 def wrapper_function(*args, **kwargs):
---> 39 return wrapper(*args, **kwargs)
File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\pydantic\_internal\_validate_call.py:136, in ValidateCallWrapper.__call__(self, *args, **kwargs)
133 if not self.__pydantic_complete__:
134 self._create_validators()
--> 136 res = self.__pydantic_validator__.validate_python(pydantic_core.ArgsKwargs(args, kwargs))
137 if self.__return_pydantic_validator__:
138 return self.__return_pydantic_validator__(res)
File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\docling\document_extractor.py:145, in DocumentExtractor.extract(self, source, template, headers, raises_on_error, max_num_pages, max_file_size, page_range)
125 @validate_call(config=ConfigDict(strict=True))
126 def extract(
127 self,
(...)
134 page_range: PageRange = DEFAULT_PAGE_RANGE,
135 ) -> ExtractionResult:
136 all_res = self.extract_all(
137 source=[source],
138 headers=headers,
(...)
143 template=template,
144 )
--> 145 return next(all_res)
File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\docling\document_extractor.py:179, in DocumentExtractor.extract_all(self, source, template, headers, raises_on_error, max_num_pages, max_file_size, page_range)
174 ext_res_iter = self._extract(
175 conv_input, raises_on_error=raises_on_error, template=template
176 )
178 had_result = False
--> 179 for ext_res in ext_res_iter:
180 had_result = True
181 if raises_on_error and ext_res.status not in {
182 ConversionStatus.SUCCESS,
183 ConversionStatus.PARTIAL_SUCCESS,
184 }:
File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\docling\document_extractor.py:230, in DocumentExtractor._extract(self, conv_input, raises_on_error, template)
228 yield item
229 else:
--> 230 for item in map(
231 process_func,
232 input_batch,
233 ):
234 elapsed = time.monotonic() - start_time
235 start_time = time.monotonic()
File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\docling\document_extractor.py:251, in DocumentExtractor._process_document_extraction(self, in_doc, raises_on_error, template)
247 valid = (
248 self.allowed_formats is not None and in_doc.format in self.allowed_formats
249 )
250 if valid:
--> 251 return self._execute_extraction_pipeline(
252 in_doc, raises_on_error=raises_on_error, template=template
253 )
254 else:
255 error_message = f"File format not allowed: {in_doc.file}"
File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\docling\document_extractor.py:280, in DocumentExtractor._execute_extraction_pipeline(self, in_doc, raises_on_error, template)
277 else:
278 return ExtractionResult(input=in_doc, status=ConversionStatus.FAILURE)
--> 280 pipeline = self._get_pipeline(in_doc.format)
281 if pipeline is None:
282 if raises_on_error:
File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\docling\document_extractor.py:311, in DocumentExtractor._get_pipeline(self, doc_format)
307 if cache_key not in self._initialized_pipelines:
308 _log.info(
309 f"Initializing extraction pipeline for {pipeline_class.name} with options hash {options_hash}"
310 )
--> 311 self._initialized_pipelines[cache_key] = pipeline_class(
312 pipeline_options=pipeline_options # type: ignore[arg-type]
313 )
314 else:
315 _log.debug(
316 f"Reusing cached extraction pipeline for {pipeline_class.name} with options hash {options_hash}"
317 )
File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\docling\pipeline\extraction_vlm_pipeline.py:41, in ExtractionVlmPipeline.__init__(self, pipeline_options)
38 self.pipeline_options: VlmExtractionPipelineOptions
40 # Create VLM model instance
---> 41 self.vlm_model = NuExtractTransformersModel(
42 enabled=True,
43 artifacts_path=self.artifacts_path, # Will download automatically
44 accelerator_options=self.accelerator_options,
45 vlm_options=pipeline_options.vlm_options,
46 )
File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\docling\models\vlm_models_inline\nuextract_transformers_model.py:141, in NuExtractTransformersModel.__init__(self, enabled, artifacts_path, accelerator_options, vlm_options)
138 elif (artifacts_path / repo_cache_folder).exists():
139 artifacts_path = artifacts_path / repo_cache_folder
--> 141 self.processor = AutoProcessor.from_pretrained(
142 artifacts_path,
143 trust_remote_code=vlm_options.trust_remote_code,
144 use_fast=True,
145 )
146 self.processor.tokenizer.padding_side = "left"
148 self.vlm_model = AutoModelForImageTextToText.from_pretrained(
149 artifacts_path,
150 device_map=self.device,
(...)
158 trust_remote_code=vlm_options.trust_remote_code,
159 )
File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\transformers\models\auto\processing_auto.py:396, in AutoProcessor.from_pretrained(cls, pretrained_model_name_or_path, **kwargs)
392 return processor_class.from_pretrained(
393 pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
394 )
395 elif processor_class is not None:
--> 396 return processor_class.from_pretrained(
397 pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
398 )
399 # Last try: we use the PROCESSOR_MAPPING.
400 elif type(config) in PROCESSOR_MAPPING:
File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\transformers\processing_utils.py:1394, in ProcessorMixin.from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, **kwargs)
1391 if token is not None:
1392 kwargs["token"] = token
-> 1394 args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
1395 processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
1396 return cls.from_args_and_dict(args, processor_dict, **kwargs)
File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\transformers\processing_utils.py:1453, in ProcessorMixin._get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs)
1450 else:
1451 attribute_class = cls.get_possibly_dynamic_module(class_name)
-> 1453 args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs))
1455 return args
File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\transformers\tokenization_utils_base.py:2112, in PreTrainedTokenizerBase.from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, trust_remote_code, *init_inputs, **kwargs)
2109 else:
2110 logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
-> 2112 return cls._from_pretrained(
2113 resolved_vocab_files,
2114 pretrained_model_name_or_path,
2115 init_configuration,
2116 *init_inputs,
2117 token=token,
2118 cache_dir=cache_dir,
2119 local_files_only=local_files_only,
2120 _commit_hash=commit_hash,
2121 _is_local=is_local,
2122 trust_remote_code=trust_remote_code,
2123 **kwargs,
2124 )
File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\transformers\tokenization_utils_base.py:2419, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, trust_remote_code, *init_inputs, **kwargs)
2416 transformers_version = _config.get("transformers_version")
2418 if transformers_version and version.parse(transformers_version) <= version.parse("4.57.2"):
-> 2419 if _is_local and _config.model_type not in [
2420 "mistral",
2421 "mistral3",
2422 "voxstral",
2423 "ministral",
2424 "pixtral",
2425 ]:
2426 return tokenizer
2428 # Expose the fix_mistral_regex flag on the tokenizer when provided, even if no correction is applied.
AttributeError: 'dict' object has no attribute 'model_type'
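For what it is worth, the last frame hints that the failure happens inside transformers rather than docling: _config supports .get() two lines earlier, so it is a plain dict, and the attribute-style access _config.model_type then raises. A minimal sketch of that failure mode (the dict contents here are made up):

# assumption: simplified stand-in for the _config object at tokenization_utils_base.py:2419
_config = {"transformers_version": "4.57.1"}

try:
    _config.model_type  # dicts only support _config["model_type"], not attribute access
except AttributeError as exc:
    print(exc)  # prints: 'dict' object has no attribute 'model_type'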
The same error occurs in the next cell, which passes the template as a dict:
AttributeError Traceback (most recent call last)
Cell In[10], line 1
----> 1 result = extractor.extract(
2 source=file_path,
3 template={
4 "bill_no": "string",
5 "total": "float",
6 },
7 )
8 print(result.pages)
File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\pydantic\_internal\_validate_call.py:39, in update_wrapper_attributes.<locals>.wrapper_function(*args, **kwargs)
37 @functools.wraps(wrapped)
38 def wrapper_function(*args, **kwargs):
---> 39 return wrapper(*args, **kwargs)
File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\pydantic\_internal\_validate_call.py:136, in ValidateCallWrapper.__call__(self, *args, **kwargs)
133 if not self.__pydantic_complete__:
134 self._create_validators()
--> 136 res = self.__pydantic_validator__.validate_python(pydantic_core.ArgsKwargs(args, kwargs))
137 if self.__return_pydantic_validator__:
138 return self.__return_pydantic_validator__(res)
File c:\Users\ynars\miniconda3\envs\py310\lib\site-packages\docling\document_extractor.py:145, in DocumentExtractor.extract(self, source, template, headers, raises_on_error, max_num_pages, max_file_size, page_range)
...
2425 ]:
2426 return tokenizer
2428 # Expose the fix_mistral_regex flag on the tokenizer when provided, even if no correction is applied.
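Both the string template and the dict template fail at the same place inside the transformers tokenizer loading, so the template format does not seem to matter. In case it helps, here is a quick snippet (not part of the notebook) to report the versions involved:

from importlib.metadata import version

# print the installed versions of the packages that appear in the traceback
for pkg in ("docling", "transformers", "pydantic"):
    print(pkg, version(pkg))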