root layout

패스트캠퍼스

  1. 강의 질문
  2. AI TECH

docling에서 PDF 파일 로드 시 발생하는 UnicodeDecodeError 질문입니다.

2025.05.01 04:02 작성

from langchain_docling import DoclingLoader

from langchain_docling.loader import ExportType



FILE_PATH = "data/Ba_zi_the_four_pillars_of_destiny.pdf"



loader = DoclingLoader(

    file_path=FILE_PATH,

    export_type=ExportType.MARKDOWN,

)



docs = loader.load()


이와 같은 코드에서 유니코드 디코드 오류(UnicodeDecodeError)가 발생하는데, PDF 파일은 docling으로 처리하기에 무리가 있는 걸까요? 오류 내용은 다음과 같습니다.


Encountered an error during conversion of document 70d2dbfdb2f210cf61d5611790938adbf3eb09086f1efd8480c7656752e85d7e:

Traceback (most recent call last):

File "c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\pipeline\base_pipeline.py", line 160, in _build_document

for p in pipeline_pages: # Must exhaust!

File "c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\pipeline\base_pipeline.py", line 126, in _apply_on_pages

yield from page_batch

File "c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\models\page_assemble_model.py", line 69, in __call__

for page in page_batch:

File "c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\models\table_structure_model.py", line 181, in __call__

for page in page_batch:

File "c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\models\layout_model.py", line 146, in __call__

for page in page_batch:

File "c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\models\easyocr_model.py", line 134, in __call__

for page in page_batch:

File "c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\models\page_preprocessing_model.py", line 27, in __call__

for page in page_batch:

File "c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\pipeline\standard_pdf_pipeline.py", line 175, in initialize_page

page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore

^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

File "c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\backend\docling_parse_v4_backend.py", line 175, in load_page

self.dp_doc.get_page(

File "c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling_parse\pdf_parser.py", line 124, in get_page

doc_dict = self._parser.parse_pdf_from_key_on_page(

^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb9 in position 1: invalid start byte


---------------------------------------------------------------------------

UnicodeDecodeError Traceback (most recent call last)

Cell In[10], line 12

5 FILE_PATH = "data/Ba_zi_the_four_pillars_of_destiny.pdf"

7 loader = DoclingLoader(

8 file_path=FILE_PATH,

9 export_type=ExportType.MARKDOWN,

10 )

---> 12 docs = loader.load()

File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\langchain_core\document_loaders\base.py:32, in BaseLoader.load(self)

30 def load(self) -> list[Document]:

31 """Load data into Document objects."""

---> 32 return list(self.lazy_load())

File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\langchain_docling\loader.py:117, in DoclingLoader.lazy_load(self)

115 """Lazy load documents."""

116 for file_path in self._file_paths:

--> 117 conv_res = self._converter.convert(

118 source=file_path,

119 **self._convert_kwargs,

120 )

121 dl_doc = conv_res.document

122 if self._export_type == ExportType.MARKDOWN:

File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\pydantic\_internal\_validate_call.py:39, in update_wrapper_attributes.<locals>.wrapper_function(*args, **kwargs)

37 @functools.wraps(wrapped)

38 def wrapper_function(*args, **kwargs):

---> 39 return wrapper(*args, **kwargs)

File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\pydantic\_internal\_validate_call.py:136, in ValidateCallWrapper.__call__(self, *args, **kwargs)

133 if not self.__pydantic_complete__:

134 self._create_validators()

--> 136 res = self.__pydantic_validator__.validate_python(pydantic_core.ArgsKwargs(args, kwargs))

137 if self.__return_pydantic_validator__:

138 return self.__return_pydantic_validator__(res)

File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\document_converter.py:220, in DocumentConverter.convert(self, source, headers, raises_on_error, max_num_pages, max_file_size, page_range)

202 @validate_call(config=ConfigDict(strict=True))

203 def convert(

204 self,

(...) 210 page_range: PageRange = DEFAULT_PAGE_RANGE,

211 ) -> ConversionResult:

212 all_res = self.convert_all(

213 source=[source],

214 raises_on_error=raises_on_error,

(...) 218 page_range=page_range,

219 )

--> 220 return next(all_res)

File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\document_converter.py:243, in DocumentConverter.convert_all(self, source, headers, raises_on_error, max_num_pages, max_file_size, page_range)

240 conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)

242 had_result = False

--> 243 for conv_res in conv_res_iter:

244 had_result = True

245 if raises_on_error and conv_res.status not in {

246 ConversionStatus.SUCCESS,

247 ConversionStatus.PARTIAL_SUCCESS,

248 }:

File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\document_converter.py:278, in DocumentConverter._convert(self, conv_input, raises_on_error)

269 _log.info("Going to convert document batch...")

271 # parallel processing only within input_batch

272 # with ThreadPoolExecutor(

273 # max_workers=settings.perf.doc_batch_concurrency

274 # ) as pool:

275 # yield from pool.map(self.process_document, input_batch)

276 # Note: PDF backends are not thread-safe, thread pool usage was disabled.

--> 278 for item in map(

279 partial(self._process_document, raises_on_error=raises_on_error),

280 input_batch,

281 ):

282 elapsed = time.monotonic() - start_time

283 start_time = time.monotonic()

File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\document_converter.py:324, in DocumentConverter._process_document(self, in_doc, raises_on_error)

320 valid = (

321 self.allowed_formats is not None and in_doc.format in self.allowed_formats

322 )

323 if valid:

--> 324 conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)

325 else:

326 error_message = f"File format not allowed: {in_doc.file}"

File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\document_converter.py:347, in DocumentConverter._execute_pipeline(self, in_doc, raises_on_error)

345 pipeline = self._get_pipeline(in_doc.format)

346 if pipeline is not None:

--> 347 conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)

348 else:

349 if raises_on_error:

File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\pipeline\base_pipeline.py:54, in BasePipeline.execute(self, in_doc, raises_on_error)

52 conv_res.status = ConversionStatus.FAILURE

53 if raises_on_error:

---> 54 raise e

55 finally:

56 self._unload(conv_res)

File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\pipeline\base_pipeline.py:46, in BasePipeline.execute(self, in_doc, raises_on_error)

40 try:

41 with TimeRecorder(

42 conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT

43 ):

44 # These steps are building and assembling the structure of the

45 # output DoclingDocument.

---> 46 conv_res = self._build_document(conv_res)

47 conv_res = self._assemble_document(conv_res)

48 # From this stage, all operations should rely only on conv_res.output

File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\pipeline\base_pipeline.py:194, in PaginatedPipeline._build_document(self, conv_res)

187 trace = "\n".join(

188 traceback.format_exception(type(e), e, e.__traceback__)

189 )

190 _log.warning(

191 f"Encountered an error during conversion of document {conv_res.input.document_hash}:\n"

192 f"{trace}"

193 )

--> 194 raise e

196 return conv_res

File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\pipeline\base_pipeline.py:160, in PaginatedPipeline._build_document(self, conv_res)

157 # 2. Run pipeline stages

158 pipeline_pages = self._apply_on_pages(conv_res, init_pages)

--> 160 for p in pipeline_pages: # Must exhaust!

161 # Cleanup cached images

162 if not self.keep_images:

163 p._image_cache = {}

File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\pipeline\base_pipeline.py:126, in PaginatedPipeline._apply_on_pages(self, conv_res, page_batch)

123 for model in self.build_pipe:

124 page_batch = model(conv_res, page_batch)

--> 126 yield from page_batch

File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\models\page_assemble_model.py:69, in PageAssembleModel.__call__(self, conv_res, page_batch)

66 def __call__(

67 self, conv_res: ConversionResult, page_batch: Iterable[Page]

68 ) -> Iterable[Page]:

---> 69 for page in page_batch:

70 assert page._backend is not None

71 if not page._backend.is_valid():

File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\models\table_structure_model.py:181, in TableStructureModel.__call__(self, conv_res, page_batch)

178 yield from page_batch

179 return

--> 181 for page in page_batch:

182 assert page._backend is not None

183 if not page._backend.is_valid():

File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\models\layout_model.py:146, in LayoutModel.__call__(self, conv_res, page_batch)

143 def __call__(

144 self, conv_res: ConversionResult, page_batch: Iterable[Page]

145 ) -> Iterable[Page]:

--> 146 for page in page_batch:

147 assert page._backend is not None

148 if not page._backend.is_valid():

File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\models\easyocr_model.py:134, in EasyOcrModel.__call__(self, conv_res, page_batch)

131 yield from page_batch

132 return

--> 134 for page in page_batch:

135 assert page._backend is not None

136 if not page._backend.is_valid():

File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\models\page_preprocessing_model.py:27, in PagePreprocessingModel.__call__(self, conv_res, page_batch)

24 def __call__(

25 self, conv_res: ConversionResult, page_batch: Iterable[Page]

26 ) -> Iterable[Page]:

---> 27 for page in page_batch:

28 assert page._backend is not None

29 if not page._backend.is_valid():

File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\pipeline\standard_pdf_pipeline.py:175, in StandardPdfPipeline.initialize_page(self, conv_res, page)

173 def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:

174 with TimeRecorder(conv_res, "page_init"):

--> 175 page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore

176 if page._backend is not None and page._backend.is_valid():

177 page.size = page._backend.get_size()

File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling\backend\docling_parse_v4_backend.py:175, in DoclingParseV4DocumentBackend.load_page(self, page_no, create_words, create_textlines)

170 def load_page(

171 self, page_no: int, create_words: bool = True, create_textlines: bool = True

172 ) -> DoclingParseV4PageBackend:

173 with pypdfium2_lock:

174 return DoclingParseV4PageBackend(

--> 175 self.dp_doc.get_page(

176 page_no + 1,

177 create_words=create_words,

178 create_textlines=create_textlines,

179 ),

180 self._pdoc[page_no],

181 )

File c:\Users\skyop\JaehoNote_2\.venv\Lib\site-packages\docling_parse\pdf_parser.py:124, in PdfDocument.get_page(self, page_no, create_words, create_textlines)

122 else:

123 if 1 <= page_no <= self.number_of_pages():

--> 124 doc_dict = self._parser.parse_pdf_from_key_on_page(

125 key=self._key,

126 page=page_no - 1,

127 page_boundary=self._boundary_type,

128 do_sanitization=False,

129 )

130 for pi, page in enumerate(

131 doc_dict["pages"]

132 ): # only one page is expected

133 self._pages[page_no] = self._to_segmented_page(

134 page=page["original"],

135 create_words=create_words,

136 create_textlines=create_textlines,

137 ) # put on cache

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb9 in position 1: invalid start byte


답변 

연관 질문

커뮤니티 질문보기