import os
import re
import time
import pickle
def preProcessNote(file, path_dir):
    """Read one note file and return its text with the delimited header removed.

    The first two lines that start with ``---`` are treated as the opening
    and closing delimiters; those two lines and everything between them are
    dropped. Every newline and tab character is then stripped from the
    remaining text.

    Args:
        file: Name of the note file, relative to ``path_dir``.
        path_dir: Directory that contains the note file.

    Returns:
        The cleaned note text as a single string.

    Raises:
        FileNotFoundError: The file does not exist.
        PermissionError: The file cannot be read.
        UnicodeDecodeError: The file is not valid UTF-8.
        ValueError: Fewer than two ``---`` delimiter lines were found.
        OSError: Any other OS-level failure while reading the file.
        Exception: Any other unexpected failure.
    """
    # Built before the try block so every handler below can safely refer to it.
    file_path = os.path.join(path_dir, file)
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        # Find the opening and closing delimiter lines.
        start_idx = None
        end_idx = None
        for idx, line in enumerate(lines):
            if line.startswith('---'):
                if start_idx is None:
                    start_idx = idx
                else:
                    end_idx = idx
                    break
        if start_idx is None or end_idx is None:
            raise ValueError(f"Start or end delimiter not found in file: {file_path}")

        # Drop the delimited section, delimiters included.
        note = ''.join(lines[:start_idx] + lines[end_idx + 1:])

        # Clean the text: strip all newlines and tabs.
        return re.sub(r'[\n\t]+', '', note)
    except FileNotFoundError as e:
        raise FileNotFoundError(f"File not found: {file_path}") from e
    except PermissionError as e:
        raise PermissionError(f"Permission denied for file: {file_path}") from e
    except UnicodeDecodeError as e:
        # UnicodeDecodeError takes exactly (encoding, object, start, end,
        # reason); building it from a single message raises TypeError, so the
        # original details are forwarded and the file context goes in `reason`.
        raise UnicodeDecodeError(
            e.encoding,
            e.object,
            e.start,
            e.end,
            f"{e.reason}. Error decoding file: {file_path}. Ensure the file is UTF-8 encoded.",
        ) from e
    except ValueError as e:
        raise ValueError(f"Processing error in file {file_path}: {str(e)}") from e
    except OSError as e:
        raise OSError(f"OS error occurred while processing file: {file_path}") from e
    except Exception as e:
        raise Exception(f"Unexpected error while processing file: {file_path}") from e
def preProcessNotes(path_dir="data/md", file_ext="md", output_dir="output", output_filename="notes.pkl"):
    """Pre-process every matching note in a directory and pickle the results.

    Files in ``path_dir`` whose names end with ``.{file_ext}`` are passed to
    ``preProcessNote`` in sorted name order. A file that fails to process is
    reported on stdout and skipped; it does not abort the run. The list of
    cleaned notes is written to ``output_dir/output_filename`` with pickle.

    Args:
        path_dir: Directory containing the note files.
        file_ext: File extension (without the dot) used to select notes.
        output_dir: Directory for the pickle file; created if missing.
        output_filename: Name of the pickle file.

    Returns:
        The list of cleaned note strings, one per successfully processed file.

    Raises:
        FileNotFoundError: ``path_dir`` does not exist.
        PermissionError: ``path_dir`` cannot be listed.
        OSError: Another OS-level failure while listing ``path_dir``, or
            ``output_dir`` cannot be created (including when that path
            already exists but is not a directory).
        Exception: The pickle file cannot be written, or an unexpected
            failure occurred while listing files.
    """
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()

    try:
        # Collect the note files. os.listdir returns entries in arbitrary
        # order, so sort to keep the order of the pickled list reproducible.
        file_list = sorted(
            name for name in os.listdir(path_dir) if name.endswith(f'.{file_ext}')
        )
    except FileNotFoundError as e:
        raise FileNotFoundError(f"Directory not found: {path_dir}") from e
    except PermissionError as e:
        raise PermissionError(f"Permission denied for directory: {path_dir}") from e
    except OSError as e:
        raise OSError(f"OS error occurred while accessing directory: {path_dir}") from e
    except Exception as e:
        raise Exception(f"Unexpected error while listing files in directory: {path_dir}") from e

    # Create the output directory. exist_ok avoids the check-then-create race
    # and still raises if the path exists but is not a directory.
    try:
        os.makedirs(output_dir, exist_ok=True)
    except OSError as e:
        raise OSError(f"Failed to create output directory: {output_dir}") from e

    notes = []
    for file in file_list:
        try:
            notes.append(preProcessNote(file, path_dir))
        except Exception as e:
            # Best effort: report the failing file and continue with the rest.
            print(f"Error occurred while processing {file}: {e}")

    # Save the notes list as a pickle file.
    output_file_path = os.path.join(output_dir, output_filename)
    try:
        with open(output_file_path, 'wb') as pkl_file:
            pickle.dump(notes, pkl_file)
    except Exception as e:
        raise Exception(f"Failed to save notes to pickle file: {output_file_path}") from e

    elapsed_time = time.perf_counter() - start_time
    print(f"Processing completed successfully in {elapsed_time:.2f} seconds. Notes saved to {output_file_path}")
    return notes