root layout

패스트캠퍼스

  1. 강의 질문
  2. 인공지능

PDF 문서 전처리 실습 중 오류 발생.

2025.03.27 16:17 작성

parse_yaml_path = os.path.join(root_dir, 'config', 'parse', 'simple_pdf.yaml')

parser.start_parsing(parse_yaml_path)


위 코드를 실행하면 다음과 같은 오류가 발생합니다.


---------------------------------------------------------------------------

RemoteTraceback Traceback (most recent call last)

RemoteTraceback:

"""

Traceback (most recent call last):

File "C:\Users\skyop\anaconda3\Lib\multiprocessing\pool.py", line 125, in worker

result = (True, func(*args, **kwds))

^^^^^^^^^^^^^^^^^^^

File "C:\Users\skyop\anaconda3\Lib\multiprocessing\pool.py", line 51, in starmapstar

return list(itertools.starmap(args[0], args[1]))

^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

File "c:\Users\skyop\JaehoNote\AutoRAG\Lib\site-packages\autorag\data\parse\langchain_parse.py", line 58, in langchain_parse_pure

documents = parse_instance.load()

^^^^^^^^^^^^^^^^^^^^^

File "c:\Users\skyop\JaehoNote\AutoRAG\Lib\site-packages\langchain_core\document_loaders\base.py", line 32, in load

return list(self.lazy_load())

^^^^^^^^^^^^^^^^^^^^^^

File "c:\Users\skyop\JaehoNote\AutoRAG\Lib\site-packages\langchain_community\document_loaders\pdf.py", line 682, in lazy_load

yield from self.parser.lazy_parse(blob)

File "c:\Users\skyop\JaehoNote\AutoRAG\Lib\site-packages\langchain_community\document_loaders\parsers\pdf.py", line 793, in lazy_parse

metadata=_validate_metadata(doc_metadata),

^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

File "c:\Users\skyop\JaehoNote\AutoRAG\Lib\site-packages\langchain_community\document_loaders\parsers\pdf.py", line 140, in _validate_metadata

raise ValueError("The PDF parser must valorize the standard metadata.")

ValueError: The PDF parser must valorize the standard metadata.

"""

The above exception was the direct cause of the following exception:

ValueError Traceback (most recent call last)

Cell In[6], line 2

1 parse_yaml_path = os.path.join(root_dir, 'config', 'parse', 'simple_pdf.yaml')

----> 2 parser.start_parsing(parse_yaml_path)

File c:\Users\skyop\JaehoNote\AutoRAG\Lib\site-packages\autorag\parser.py:30, in Parser.start_parsing(self, yaml_path, all_files)

27 input_modules, input_params = get_param_combinations(modules)

29 logger.info("Parsing Start...")

---> 30 run_parser(

31 modules=input_modules,

32 module_params=input_params,

33 data_path_glob=self.data_path_glob,

34 project_dir=self.project_dir,

35 all_files=all_files,

36 )

37 logger.info("Parsing Done!")

File c:\Users\skyop\JaehoNote\AutoRAG\Lib\site-packages\autorag\data\parse\run.py:88, in run_parser(modules, module_params, data_path_glob, project_dir, all_files)

85 modules.extend(add_modules)

86 module_params.extend(add_params)

---> 88 results, execution_times = zip(

89 *map(

90 lambda x: measure_speed(x[0], data_path_glob=data_path_glob, **x[1]),

91 zip(modules, module_params),

92 )

93 )

94 average_times = list(map(lambda x: x / len(results[0]), execution_times))

96 # save results to parquet files

File c:\Users\skyop\JaehoNote\AutoRAG\Lib\site-packages\autorag\data\parse\run.py:90, in run_parser.<locals>.<lambda>(x)

85 modules.extend(add_modules)

86 module_params.extend(add_params)

88 results, execution_times = zip(

89 *map(

---> 90 lambda x: measure_speed(x[0], data_path_glob=data_path_glob, **x[1]),

91 zip(modules, module_params),

92 )

93 )

94 average_times = list(map(lambda x: x / len(results[0]), execution_times))

96 # save results to parquet files

File c:\Users\skyop\JaehoNote\AutoRAG\Lib\site-packages\autorag\strategy.py:14, in measure_speed(func, *args, **kwargs)

10 """

11 Method for measuring execution speed of the function.

12 """

13 start_time = time.time()

---> 14 result = func(*args, **kwargs)

15 end_time = time.time()

16 return result, end_time - start_time

File c:\Users\skyop\JaehoNote\AutoRAG\Lib\site-packages\autorag\utils\util.py:72, in result_to_dataframe.<locals>.decorator_result_to_dataframe.<locals>.wrapper(*args, **kwargs)

70 @functools.wraps(func)

71 def wrapper(*args, **kwargs) -> pd.DataFrame:

---> 72 results = func(*args, **kwargs)

73 if len(column_names) == 1:

74 df_input = {column_names[0]: results}

File c:\Users\skyop\JaehoNote\AutoRAG\Lib\site-packages\autorag\data\parse\base.py:61, in parser_node.<locals>.wrapper(data_path_glob, file_type, parse_method, **kwargs)

57 result = func(

58 data_path_list=data_paths, parse_method=parse_method, **kwargs

59 )

60 else:

---> 61 result = func(

62 data_path_list=data_paths, parse_method=parse_method, **kwargs

63 )

64 elif func.__name__ in ["clova_ocr", "llama_parse", "table_hybrid_parse"]:

65 result = func(data_path_list=data_paths, **kwargs)

File c:\Users\skyop\JaehoNote\AutoRAG\Lib\site-packages\autorag\data\parse\langchain_parse.py:30, in langchain_parse(data_path_list, parse_method, **kwargs)

28 # Execute parallel processing

29 with mp.Pool(num_workers) as pool:

---> 30 results = pool.starmap(

31 langchain_parse_pure,

32 [(data_path, parse_method, kwargs) for data_path in data_path_list],

33 )

35 texts, path, pages = (list(chain.from_iterable(item)) for item in zip(*results))

37 return texts, path, pages

File ~\anaconda3\Lib\multiprocessing\pool.py:375, in Pool.starmap(self, func, iterable, chunksize)

369 def starmap(self, func, iterable, chunksize=None):

370 '''

371 Like `map()` method but the elements of the `iterable` are expected to

372 be iterables as well and will be unpacked as arguments. Hence

373 `func` and (a, b) becomes func(a, b).

374 '''

--> 375 return self._map_async(func, iterable, starmapstar, chunksize).get()

File ~\anaconda3\Lib\multiprocessing\pool.py:774, in ApplyResult.get(self, timeout)

772 return self._value

773 else:

--> 774 raise self._value

ValueError: The PDF parser must valorize the standard metadata.


답변 

연관 질문

커뮤니티 질문보기