- 강의 질문
- 인공지능
PDF 문서 전처리 실습 중 오류가 발생했습니다.
parse_yaml_path = os.path.join(root_dir, 'config', 'parse', 'simple_pdf.yaml')
parser.start_parsing(parse_yaml_path)
위 코드를 실행하면 다음과 같은 오류가 발생합니다.
---------------------------------------------------------------------------
RemoteTraceback Traceback (most recent call last)
RemoteTraceback:
"""
Traceback (most recent call last):
File "C:\Users\skyop\anaconda3\Lib\multiprocessing\pool.py", line 125, in worker
result = (True, func(*args, **kwds))
^^^^^^^^^^^^^^^^^^^
File "C:\Users\skyop\anaconda3\Lib\multiprocessing\pool.py", line 51, in starmapstar
return list(itertools.starmap(args[0], args[1]))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "c:\Users\skyop\JaehoNote\AutoRAG\Lib\site-packages\autorag\data\parse\langchain_parse.py", line 58, in langchain_parse_pure
documents = parse_instance.load()
^^^^^^^^^^^^^^^^^^^^^
File "c:\Users\skyop\JaehoNote\AutoRAG\Lib\site-packages\langchain_core\document_loaders\base.py", line 32, in load
return list(self.lazy_load())
^^^^^^^^^^^^^^^^^^^^^^
File "c:\Users\skyop\JaehoNote\AutoRAG\Lib\site-packages\langchain_community\document_loaders\pdf.py", line 682, in lazy_load
yield from self.parser.lazy_parse(blob)
File "c:\Users\skyop\JaehoNote\AutoRAG\Lib\site-packages\langchain_community\document_loaders\parsers\pdf.py", line 793, in lazy_parse
metadata=_validate_metadata(doc_metadata),
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "c:\Users\skyop\JaehoNote\AutoRAG\Lib\site-packages\langchain_community\document_loaders\parsers\pdf.py", line 140, in _validate_metadata
raise ValueError("The PDF parser must valorize the standard metadata.")
ValueError: The PDF parser must valorize the standard metadata.
"""
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
Cell In[6], line 2
1 parse_yaml_path = os.path.join(root_dir, 'config', 'parse', 'simple_pdf.yaml')
----> 2 parser.start_parsing(parse_yaml_path)
File c:\Users\skyop\JaehoNote\AutoRAG\Lib\site-packages\autorag\parser.py:30, in Parser.start_parsing(self, yaml_path, all_files)
27 input_modules, input_params = get_param_combinations(modules)
29 logger.info("Parsing Start...")
---> 30 run_parser(
31 modules=input_modules,
32 module_params=input_params,
33 data_path_glob=self.data_path_glob,
34 project_dir=self.project_dir,
35 all_files=all_files,
36 )
37 logger.info("Parsing Done!")
File c:\Users\skyop\JaehoNote\AutoRAG\Lib\site-packages\autorag\data\parse\run.py:88, in run_parser(modules, module_params, data_path_glob, project_dir, all_files)
85 modules.extend(add_modules)
86 module_params.extend(add_params)
---> 88 results, execution_times = zip(
89 *map(
90 lambda x: measure_speed(x[0], data_path_glob=data_path_glob, **x[1]),
91 zip(modules, module_params),
92 )
93 )
94 average_times = list(map(lambda x: x / len(results[0]), execution_times))
96 # save results to parquet files
File c:\Users\skyop\JaehoNote\AutoRAG\Lib\site-packages\autorag\data\parse\run.py:90, in run_parser.<locals>.<lambda>(x)
85 modules.extend(add_modules)
86 module_params.extend(add_params)
88 results, execution_times = zip(
89 *map(
---> 90 lambda x: measure_speed(x[0], data_path_glob=data_path_glob, **x[1]),
91 zip(modules, module_params),
92 )
93 )
94 average_times = list(map(lambda x: x / len(results[0]), execution_times))
96 # save results to parquet files
File c:\Users\skyop\JaehoNote\AutoRAG\Lib\site-packages\autorag\strategy.py:14, in measure_speed(func, *args, **kwargs)
10 """
11 Method for measuring execution speed of the function.
12 """
13 start_time = time.time()
---> 14 result = func(*args, **kwargs)
15 end_time = time.time()
16 return result, end_time - start_time
File c:\Users\skyop\JaehoNote\AutoRAG\Lib\site-packages\autorag\utils\util.py:72, in result_to_dataframe.<locals>.decorator_result_to_dataframe.<locals>.wrapper(*args, **kwargs)
70 @functools.wraps(func)
71 def wrapper(*args, **kwargs) -> pd.DataFrame:
---> 72 results = func(*args, **kwargs)
73 if len(column_names) == 1:
74 df_input = {column_names[0]: results}
File c:\Users\skyop\JaehoNote\AutoRAG\Lib\site-packages\autorag\data\parse\base.py:61, in parser_node.<locals>.wrapper(data_path_glob, file_type, parse_method, **kwargs)
57 result = func(
58 data_path_list=data_paths, parse_method=parse_method, **kwargs
59 )
60 else:
---> 61 result = func(
62 data_path_list=data_paths, parse_method=parse_method, **kwargs
63 )
64 elif func.__name__ in ["clova_ocr", "llama_parse", "table_hybrid_parse"]:
65 result = func(data_path_list=data_paths, **kwargs)
File c:\Users\skyop\JaehoNote\AutoRAG\Lib\site-packages\autorag\data\parse\langchain_parse.py:30, in langchain_parse(data_path_list, parse_method, **kwargs)
28 # Execute parallel processing
29 with mp.Pool(num_workers) as pool:
---> 30 results = pool.starmap(
31 langchain_parse_pure,
32 [(data_path, parse_method, kwargs) for data_path in data_path_list],
33 )
35 texts, path, pages = (list(chain.from_iterable(item)) for item in zip(*results))
37 return texts, path, pages
File ~\anaconda3\Lib\multiprocessing\pool.py:375, in Pool.starmap(self, func, iterable, chunksize)
369 def starmap(self, func, iterable, chunksize=None):
370 '''
371 Like `map()` method but the elements of the `iterable` are expected to
372 be iterables as well and will be unpacked as arguments. Hence
373 `func` and (a, b) becomes func(a, b).
374 '''
--> 375 return self._map_async(func, iterable, starmapstar, chunksize).get()
File ~\anaconda3\Lib\multiprocessing\pool.py:774, in ApplyResult.get(self, timeout)
772 return self._value
773 else:
--> 774 raise self._value
ValueError: The PDF parser must valorize the standard metadata.