Purpose and Background
- As someone who writes every day, I want to produce more posts by drafting only the structure of a piece first, letting an LLM fill in the rest, and then revising the result.
Code
01. Collecting and preprocessing my Obsidian notes
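For reference, an Obsidian note typically opens with a YAML front matter block fenced by --- lines; the code below locates that block and strips it, keeping only the body. A minimal illustration of the expected input (the field names here are made up):

---
title: example-note
tags: [statistics, blog]
---
The actual body of the note starts here...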
import os
import re
import time
import pickle

def preProcessNote(file, path_dir):
    try:
        file_path = os.path.join(path_dir, file)
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        # Locate the front matter delimiters to remove
        start_idx = None
        end_idx = None
        for idx, line in enumerate(lines):
            if re.match('---', line):
                if start_idx is None:
                    start_idx = idx
                else:
                    end_idx = idx
                    break
        if start_idx is None or end_idx is None:
            raise ValueError(f"Start or end delimiter not found in file: {file_path}")
        # Drop the front matter block
        lines = lines[:start_idx] + lines[end_idx+1:]
        note = ''.join(lines)
        # Clean the text: collapse newlines and tabs into a single space
        # (replacing with '' would glue adjacent words together)
        note = re.sub(r'[\n\t]+', ' ', note)
        return note
    except FileNotFoundError as e:
        raise FileNotFoundError(f"File not found: {file_path}") from e
    except PermissionError as e:
        raise PermissionError(f"Permission denied for file: {file_path}") from e
    except UnicodeDecodeError as e:
        # UnicodeDecodeError requires five positional arguments, so re-raise
        # with the original fields rather than a bare message string
        raise UnicodeDecodeError(e.encoding, e.object, e.start, e.end,
                                 f"Error decoding file: {file_path}. Ensure the file is UTF-8 encoded.") from e
    except ValueError as e:
        raise ValueError(f"Processing error in file {file_path}: {str(e)}") from e
    except OSError as e:
        raise OSError(f"OS error occurred while processing file: {file_path}") from e
    except Exception as e:
        raise Exception(f"Unexpected error while processing file: {file_path}") from e
def preProcessNotes(path_dir="data/md", file_ext="md", output_dir="output", output_filename="notes.pkl"):
    start_time = time.time()  # record the start time
    try:
        # Collect the list of files with the target extension
        file_list = os.listdir(path_dir)
        file_list = [file for file in file_list if file.endswith(f'.{file_ext}')]
    except FileNotFoundError as e:
        raise FileNotFoundError(f"Directory not found: {path_dir}") from e
    except PermissionError as e:
        raise PermissionError(f"Permission denied for directory: {path_dir}") from e
    except OSError as e:
        raise OSError(f"OS error occurred while accessing directory: {path_dir}") from e
    except Exception as e:
        raise Exception(f"Unexpected error while listing files in directory: {path_dir}") from e
    # Create the output directory if it does not exist
    if not os.path.exists(output_dir):
        try:
            os.makedirs(output_dir)
        except OSError as e:
            raise OSError(f"Failed to create output directory: {output_dir}") from e
    notes = []
    for file in file_list:
        try:
            note = preProcessNote(file, path_dir)
            notes.append(note)
        except Exception as e:
            print(f"Error occurred while processing {file}: {e}")
    # Save the notes list as a pickle file
    output_file_path = os.path.join(output_dir, output_filename)
    try:
        with open(output_file_path, 'wb') as pkl_file:
            pickle.dump(notes, pkl_file)
    except Exception as e:
        raise Exception(f"Failed to save notes to pickle file: {output_file_path}") from e
    end_time = time.time()  # record the end time
    elapsed_time = end_time - start_time  # compute the elapsed time
    print(f"Processing completed successfully in {elapsed_time:.2f} seconds. Notes saved to {output_file_path}")
    return notes
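A minimal way to run the step above, assuming the notes live under the default data/md directory:

# Run the preprocessing step; the paths here are the function defaults
notes = preProcessNotes(path_dir="data/md", file_ext="md",
                        output_dir="output", output_filename="notes.pkl")
print(f"{len(notes)} notes preprocessed")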
02. Building the LangChain pipeline
import pickle
import os
from langchain.schema import Document
from langchain_community.embeddings import OllamaEmbeddings
from langchain_chroma import Chroma

notes = pickle.load(open('output/notes.pkl', 'rb'))
# Wrap each preprocessed note in a Document object
documents = [Document(page_content=content) for content in notes]
embeddings = OllamaEmbeddings(model="llama3.1:latest")
output_dir = "db"
# Create the output directory if it does not exist
if not os.path.exists(output_dir):
    try:
        os.makedirs(output_dir)
    except OSError as e:
        raise OSError(f"Failed to create output directory: {output_dir}") from e
collection_name = "obsidian"
# Persist the documents to ChromaDB
try:
    vectordb = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        collection_name=collection_name,  # name the collection explicitly
        persist_directory=output_dir
    )
    print("Data saved successfully")
except ValueError as e:
    print(f"Error occurred: {e}")
except Exception as e:
    print(f"Unexpected error occurred: {e}")
# Define the retriever over the vector store
retriever = vectordb.as_retriever(search_type='mmr', search_kwargs={'k': 1})
# Fetch the most relevant document for a sample query ("What is a prior distribution?")
retriever.get_relevant_documents('사전분포란?')
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI
from langchain_experimental.llms.ollama_functions import OllamaFunctions
from operator import itemgetter

# This is the prompt I used.
# It takes the retrieved documents as {context} and a user-provided {topic}.
template = """Mimic the writing style in the context:
{context} and produce a blog on the topic in Korean.
The number of characters should be more than 2000.
Topic: {topic}
"""
prompt = ChatPromptTemplate.from_template(template)
model = OllamaFunctions(model="llama3.1:latest", format="json")
# model = ChatOpenAI(api_key="key")
# Use LangChain LCEL to supply the prompt and generate output
chain = (
    {
        "context": itemgetter("topic") | retriever,
        "topic": itemgetter("topic"),
    }
    | prompt
    | model
    | StrOutputParser()
)
# Run the chain
chain.invoke({"topic": "Airflow"})
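One thing to note: format="json" constrains the Ollama model to emit JSON, which is an awkward fit for free-form blog text. A plain chat model may suit this chain better; a minimal sketch of the swap, assuming langchain_community is installed and the same llama3.1 model is pulled locally:

from langchain_community.chat_models import ChatOllama

# Free-form generation without the JSON output constraint
model = ChatOllama(model="llama3.1:latest")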
Points to fix
- Judging by test queries against the VectorDB, the embeddings do not seem to capture Korean well enough → the embedding model needs to be replaced
- The VectorDB is persisted as sqlite, but an error occurs when reloading the data (a sketch of the fix follows the error message below)
- Remaining warnings also need to be addressed
ValueError: Expected collection name that (1) contains 3-63 characters, (2) starts and ends with an alphanumeric character, (3) otherwise contains only alphanumeric characters, underscores or hyphens (-), (4) contains no two consecutive periods (..) and (5) is not a valid IPv4 address, got ./db/chroma.sqlite3
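The ValueError above suggests the sqlite file path ended up where Chroma expects a collection name. A minimal sketch of reloading the persisted collection, assuming the "obsidian" collection in the "db" directory from step 02; the multilingual bge-m3 embedding shown in the comment is one untested candidate for the model replacement, not a confirmed choice:

from langchain_community.embeddings import OllamaEmbeddings
from langchain_chroma import Chroma

# Pass the collection name and directory separately; do not pass
# the path to chroma.sqlite3 itself as the collection name
embeddings = OllamaEmbeddings(model="llama3.1:latest")  # candidate swap: "bge-m3" (assumption, untested)
vectordb = Chroma(
    collection_name="obsidian",
    embedding_function=embeddings,
    persist_directory="db",
)
retriever = vectordb.as_retriever(search_type='mmr', search_kwargs={'k': 1})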