- 강의 질문
- AI TECH
Docling에서 PDF 파일을 변환할 때 발생하는 오류에 대한 질문입니다.
from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType
FILE_PATH = "data/Ba_zi_the_four_pillars_of_destiny.pdf"
loader = DoclingLoader(
file_path=FILE_PATH,
export_type=ExportType.MARKDOWN,
)
docs = loader.load()
이와 같은 코드에서 유니코드 디코드 오류(UnicodeDecodeError)가 발생하는데, 이 PDF 파일은 Docling으로 처리하기에 무리가 있는 걸까요? 오류 내용은 다음과 같습니다.
Encountered an error during conversion of document 70d2dbfdb2f210cf61d5611790938adbf3eb09086f1efd8480c7656752e85d7e:
Traceback (most recent call last):
File "c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\pipeline\base_pipeline.py", line 160, in _build_document
for p in pipeline_pages: # Must exhaust!
File "c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\pipeline\base_pipeline.py", line 126, in _apply_on_pages
yield from page_batch
File "c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\models\page_assemble_model.py", line 69, in __call__
for page in page_batch:
File "c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\models\table_structure_model.py", line 181, in __call__
for page in page_batch:
File "c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\models\layout_model.py", line 146, in __call__
for page in page_batch:
File "c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\models\easyocr_model.py", line 134, in __call__
for page in page_batch:
File "c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\models\page_preprocessing_model.py", line 27, in __call__
for page in page_batch:
File "c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\pipeline\standard_pdf_pipeline.py", line 175, in initialize_page
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\backend\docling_parse_v4_backend.py", line 175, in load_page
self.dp_doc.get_page(
File "c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling_parse\pdf_parser.py", line 124, in get_page
doc_dict = self._parser.parse_pdf_from_key_on_page(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb9 in position 1: invalid start byte
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
Cell In[10], line 12
5 FILE_PATH = "data/Ba_zi_the_four_pillars_of_destiny.pdf"
7 loader = DoclingLoader(
8 file_path=FILE_PATH,
9 export_type=ExportType.MARKDOWN,
10 )
---> 12 docs = loader.load()
File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\langchain_core\document_loaders\base.py:32, in BaseLoader.load(self)
30 def load(self) -> list[Document]:
31 """Load data into Document objects."""
---> 32 return list(self.lazy_load())
File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\langchain_docling\loader.py:117, in DoclingLoader.lazy_load(self)
115 """Lazy load documents."""
116 for file_path in self._file_paths:
--> 117 conv_res = self._converter.convert(
118 source=file_path,
119 **self._convert_kwargs,
120 )
121 dl_doc = conv_res.document
122 if self._export_type == ExportType.MARKDOWN:
File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\pydantic\_internal\_validate_call.py:39, in update_wrapper_attributes.<locals>.wrapper_function(*args, **kwargs)
37 @functools.wraps(wrapped)
38 def wrapper_function(*args, **kwargs):
---> 39 return wrapper(*args, **kwargs)
File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\pydantic\_internal\_validate_call.py:136, in ValidateCallWrapper.__call__(self, *args, **kwargs)
133 if not self.__pydantic_complete__:
134 self._create_validators()
--> 136 res = self.__pydantic_validator__.validate_python(pydantic_core.ArgsKwargs(args, kwargs))
137 if self.__return_pydantic_validator__:
138 return self.__return_pydantic_validator__(res)
File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\document_converter.py:220, in DocumentConverter.convert(self, source, headers, raises_on_error, max_num_pages, max_file_size, page_range)
202 @validate_call(config=ConfigDict(strict=True))
203 def convert(
204 self,
(...) 210 page_range: PageRange = DEFAULT_PAGE_RANGE,
211 ) -> ConversionResult:
212 all_res = self.convert_all(
213 source=[source],
214 raises_on_error=raises_on_error,
(...) 218 page_range=page_range,
219 )
--> 220 return next(all_res)
File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\document_converter.py:243, in DocumentConverter.convert_all(self, source, headers, raises_on_error, max_num_pages, max_file_size, page_range)
240 conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
242 had_result = False
--> 243 for conv_res in conv_res_iter:
244 had_result = True
245 if raises_on_error and conv_res.status not in {
246 ConversionStatus.SUCCESS,
247 ConversionStatus.PARTIAL_SUCCESS,
248 }:
File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\document_converter.py:278, in DocumentConverter._convert(self, conv_input, raises_on_error)
269 _log.info("Going to convert document batch...")
271 # parallel processing only within input_batch
272 # with ThreadPoolExecutor(
273 # max_workers=settings.perf.doc_batch_concurrency
274 # ) as pool:
275 # yield from pool.map(self.process_document, input_batch)
276 # Note: PDF backends are not thread-safe, thread pool usage was disabled.
--> 278 for item in map(
279 partial(self._process_document, raises_on_error=raises_on_error),
280 input_batch,
281 ):
282 elapsed = time.monotonic() - start_time
283 start_time = time.monotonic()
File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\document_converter.py:324, in DocumentConverter._process_document(self, in_doc, raises_on_error)
320 valid = (
321 self.allowed_formats is not None and in_doc.format in self.allowed_formats
322 )
323 if valid:
--> 324 conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
325 else:
326 error_message = f"File format not allowed: {in_doc.file}"
File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\document_converter.py:347, in DocumentConverter._execute_pipeline(self, in_doc, raises_on_error)
345 pipeline = self._get_pipeline(in_doc.format)
346 if pipeline is not None:
--> 347 conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
348 else:
349 if raises_on_error:
File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\pipeline\base_pipeline.py:54, in BasePipeline.execute(self, in_doc, raises_on_error)
52 conv_res.status = ConversionStatus.FAILURE
53 if raises_on_error:
---> 54 raise e
55 finally:
56 self._unload(conv_res)
File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\pipeline\base_pipeline.py:46, in BasePipeline.execute(self, in_doc, raises_on_error)
40 try:
41 with TimeRecorder(
42 conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT
43 ):
44 # These steps are building and assembling the structure of the
45 # output DoclingDocument.
---> 46 conv_res = self._build_document(conv_res)
47 conv_res = self._assemble_document(conv_res)
48 # From this stage, all operations should rely only on conv_res.output
File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\pipeline\base_pipeline.py:194, in PaginatedPipeline._build_document(self, conv_res)
187 trace = "\n".join(
188 traceback.format_exception(type(e), e, e.__traceback__)
189 )
190 _log.warning(
191 f"Encountered an error during conversion of document {conv_res.input.document_hash}:\n"
192 f"{trace}"
193 )
--> 194 raise e
196 return conv_res
File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\pipeline\base_pipeline.py:160, in PaginatedPipeline._build_document(self, conv_res)
157 # 2. Run pipeline stages
158 pipeline_pages = self._apply_on_pages(conv_res, init_pages)
--> 160 for p in pipeline_pages: # Must exhaust!
161 # Cleanup cached images
162 if not self.keep_images:
163 p._image_cache = {}
File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\pipeline\base_pipeline.py:126, in PaginatedPipeline._apply_on_pages(self, conv_res, page_batch)
123 for model in self.build_pipe:
124 page_batch = model(conv_res, page_batch)
--> 126 yield from page_batch
File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\models\page_assemble_model.py:69, in PageAssembleModel.__call__(self, conv_res, page_batch)
66 def __call__(
67 self, conv_res: ConversionResult, page_batch: Iterable[Page]
68 ) -> Iterable[Page]:
---> 69 for page in page_batch:
70 assert page._backend is not None
71 if not page._backend.is_valid():
File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\models\table_structure_model.py:181, in TableStructureModel.__call__(self, conv_res, page_batch)
178 yield from page_batch
179 return
--> 181 for page in page_batch:
182 assert page._backend is not None
183 if not page._backend.is_valid():
File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\models\layout_model.py:146, in LayoutModel.__call__(self, conv_res, page_batch)
143 def __call__(
144 self, conv_res: ConversionResult, page_batch: Iterable[Page]
145 ) -> Iterable[Page]:
--> 146 for page in page_batch:
147 assert page._backend is not None
148 if not page._backend.is_valid():
File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\models\easyocr_model.py:134, in EasyOcrModel.__call__(self, conv_res, page_batch)
131 yield from page_batch
132 return
--> 134 for page in page_batch:
135 assert page._backend is not None
136 if not page._backend.is_valid():
File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\models\page_preprocessing_model.py:27, in PagePreprocessingModel.__call__(self, conv_res, page_batch)
24 def __call__(
25 self, conv_res: ConversionResult, page_batch: Iterable[Page]
26 ) -> Iterable[Page]:
---> 27 for page in page_batch:
28 assert page._backend is not None
29 if not page._backend.is_valid():
File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\pipeline\standard_pdf_pipeline.py:175, in StandardPdfPipeline.initialize_page(self, conv_res, page)
173 def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
174 with TimeRecorder(conv_res, "page_init"):
--> 175 page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
176 if page._backend is not None and page._backend.is_valid():
177 page.size = page._backend.get_size()
File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\backend\docling_parse_v4_backend.py:175, in DoclingParseV4DocumentBackend.load_page(self, page_no, create_words, create_textlines)
170 def load_page(
171 self, page_no: int, create_words: bool = True, create_textlines: bool = True
172 ) -> DoclingParseV4PageBackend:
173 with pypdfium2_lock:
174 return DoclingParseV4PageBackend(
--> 175 self.dp_doc.get_page(
176 page_no + 1,
177 create_words=create_words,
178 create_textlines=create_textlines,
179 ),
180 self._pdoc[page_no],
181 )
File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling_parse\pdf_parser.py:124, in PdfDocument.get_page(self, page_no, create_words, create_textlines)
122 else:
123 if 1 <= page_no <= self.number_of_pages():
--> 124 doc_dict = self._parser.parse_pdf_from_key_on_page(
125 key=self._key,
126 page=page_no - 1,
127 page_boundary=self._boundary_type,
128 do_sanitization=False,
129 )
130 for pi, page in enumerate(
131 doc_dict["pages"]
132 ): # only one page is expected
133 self._pages[page_no] = self._to_segmented_page(
134 page=page["original"],
135 create_words=create_words,
136 create_textlines=create_textlines,
137 ) # put on cache
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb9 in position 1: invalid start byte