Purpose and Background
- As someone who writes every day, I want to produce more posts by drafting only the structure of a piece first, letting an LLM fill in the rest, and then revising the result.
Code
01. Collecting and preprocessing my Obsidian notes
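For reference, an Obsidian note typically opens with a YAML front matter block fenced by --- lines; the code below locates that block and strips it, keeping only the body. A minimal illustration of the expected input (the field names here are made up):

---
title: example-note
tags: [statistics, blog]
---
The actual body of the note starts here...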
import os
import re
import time
import pickle

def preProcessNote(file, path_dir):
    try:
        file_path = os.path.join(path_dir, file)
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        # Locate the front matter delimiters to remove
        start_idx = None
        end_idx = None
        for idx, line in enumerate(lines):
            if re.match('---', line):
                if start_idx is None:
                    start_idx = idx
                else:
                    end_idx = idx
                    break
        if start_idx is None or end_idx is None:
            raise ValueError(f"Start or end delimiter not found in file: {file_path}")
        # Drop the front matter block
        lines = lines[:start_idx] + lines[end_idx+1:]
        note = ''.join(lines)
        # Clean the text: collapse newlines and tabs into a single space
        # (replacing with '' would glue adjacent words together)
        note = re.sub(r'[\n\t]+', ' ', note)
        return note
    except FileNotFoundError as e:
        raise FileNotFoundError(f"File not found: {file_path}") from e
    except PermissionError as e:
        raise PermissionError(f"Permission denied for file: {file_path}") from e
    except UnicodeDecodeError as e:
        # UnicodeDecodeError requires five positional arguments, so re-raise
        # with the original fields rather than a bare message string
        raise UnicodeDecodeError(e.encoding, e.object, e.start, e.end,
                                 f"Error decoding file: {file_path}. Ensure the file is UTF-8 encoded.") from e
    except ValueError as e:
        raise ValueError(f"Processing error in file {file_path}: {str(e)}") from e
    except OSError as e:
        raise OSError(f"OS error occurred while processing file: {file_path}") from e
    except Exception as e:
        raise Exception(f"Unexpected error while processing file: {file_path}") from e
def preProcessNotes(path_dir="data/md", file_ext="md", output_dir="output", output_filename="notes.pkl"):
    start_time = time.time()  # record the start time
    try:
        # Collect the list of files with the target extension
        file_list = os.listdir(path_dir)
        file_list = [file for file in file_list if file.endswith(f'.{file_ext}')]
    except FileNotFoundError as e:
        raise FileNotFoundError(f"Directory not found: {path_dir}") from e
    except PermissionError as e:
        raise PermissionError(f"Permission denied for directory: {path_dir}") from e
    except OSError as e:
        raise OSError(f"OS error occurred while accessing directory: {path_dir}") from e
    except Exception as e:
        raise Exception(f"Unexpected error while listing files in directory: {path_dir}") from e
    # Create the output directory if it does not exist
    if not os.path.exists(output_dir):
        try:
            os.makedirs(output_dir)
        except OSError as e:
            raise OSError(f"Failed to create output directory: {output_dir}") from e
    notes = []
    for file in file_list:
        try:
            note = preProcessNote(file, path_dir)
            notes.append(note)
        except Exception as e:
            print(f"Error occurred while processing {file}: {e}")
    # Save the notes list as a pickle file
    output_file_path = os.path.join(output_dir, output_filename)
    try:
        with open(output_file_path, 'wb') as pkl_file:
            pickle.dump(notes, pkl_file)
    except Exception as e:
        raise Exception(f"Failed to save notes to pickle file: {output_file_path}") from e
    end_time = time.time()  # record the end time
    elapsed_time = end_time - start_time  # compute the elapsed time
    print(f"Processing completed successfully in {elapsed_time:.2f} seconds. Notes saved to {output_file_path}")
    return notes
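A minimal way to run the step above, assuming the notes live under the default data/md directory:

# Run the preprocessing step; the paths here are the function defaults
notes = preProcessNotes(path_dir="data/md", file_ext="md",
                        output_dir="output", output_filename="notes.pkl")
print(f"{len(notes)} notes preprocessed")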
02. Building the LangChain pipeline
import pickle
import os
from langchain.schema import Document
from langchain_community.embeddings import OllamaEmbeddings
from langchain_chroma import Chroma

notes = pickle.load(open('output/notes.pkl', 'rb'))
# Wrap each preprocessed note in a Document object
documents = [Document(page_content=content) for content in notes]
embeddings = OllamaEmbeddings(model="llama3.1:latest")
output_dir = "db"
# Create the output directory if it does not exist
if not os.path.exists(output_dir):
    try:
        os.makedirs(output_dir)
    except OSError as e:
        raise OSError(f"Failed to create output directory: {output_dir}") from e
collection_name = "obsidian"
# Persist the documents to ChromaDB
try:
    vectordb = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        collection_name=collection_name,  # name the collection explicitly
        persist_directory=output_dir
    )
    print("Data saved successfully")
except ValueError as e:
    print(f"Error occurred: {e}")
except Exception as e:
    print(f"Unexpected error occurred: {e}")
# Define the retriever over the vector store
retriever = vectordb.as_retriever(search_type='mmr', search_kwargs={'k': 1})
# Fetch the most relevant document for a sample query ("What is a prior distribution?")
retriever.get_relevant_documents('사전분포란?')
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI
from langchain_experimental.llms.ollama_functions import OllamaFunctions
from operator import itemgetter

# This is the prompt I used.
# It takes the retrieved documents as {context} and a user-provided {topic}.
template = """Mimic the writing style in the context:
{context} and produce a blog on the topic in Korean.
The number of characters should be more than 2000.
Topic: {topic}
"""
prompt = ChatPromptTemplate.from_template(template)
model = OllamaFunctions(model="llama3.1:latest", format="json")
# model = ChatOpenAI(api_key="key")
# Use LangChain LCEL to supply the prompt and generate output
chain = (
    {
        "context": itemgetter("topic") | retriever,
        "topic": itemgetter("topic"),
    }
    | prompt
    | model
    | StrOutputParser()
)
# Run the chain
chain.invoke({"topic": "Airflow"})
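One thing to note: format="json" constrains the Ollama model to emit JSON, which is an awkward fit for free-form blog text. A plain chat model may suit this chain better; a minimal sketch of the swap, assuming langchain_community is installed and the same llama3.1 model is pulled locally:

from langchain_community.chat_models import ChatOllama

# Free-form generation without the JSON output constraint
model = ChatOllama(model="llama3.1:latest")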
Points to fix
- Judging by test queries against the VectorDB, the embeddings do not seem to capture Korean well enough → the embedding model needs to be replaced
- The VectorDB is persisted as sqlite, but an error occurs when reloading the data (a sketch of the fix follows the error message below)
- Remaining warnings also need to be addressed
ValueError: Expected collection name that (1) contains 3-63 characters, (2) starts and ends with an alphanumeric character, (3) otherwise contains only alphanumeric characters, underscores or hyphens (-), (4) contains no two consecutive periods (..) and (5) is not a valid IPv4 address, got ./db/chroma.sqlite3
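The ValueError above suggests the sqlite file path ended up where Chroma expects a collection name. A minimal sketch of reloading the persisted collection, assuming the "obsidian" collection in the "db" directory from step 02; the multilingual bge-m3 embedding shown in the comment is one untested candidate for the model replacement, not a confirmed choice:

from langchain_community.embeddings import OllamaEmbeddings
from langchain_chroma import Chroma

# Pass the collection name and directory separately; do not pass
# the path to chroma.sqlite3 itself as the collection name
embeddings = OllamaEmbeddings(model="llama3.1:latest")  # candidate swap: "bge-m3" (assumption, untested)
vectordb = Chroma(
    collection_name="obsidian",
    embedding_function=embeddings,
    persist_directory="db",
)
retriever = vectordb.as_retriever(search_type='mmr', search_kwargs={'k': 1})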