def clean(text: str) -> str:
    """Remove lone Hangul jamo, listed special characters and digits.

    The intent is to keep only complete Hangul syllables and English
    letters. Three removals are applied in order:

    1. runs of standalone Hangul jamo (lone consonants/vowels, e.g. "ㅋㅋ"),
    2. every character in a fixed list of punctuation/symbols, which
       includes the ASCII space,
    3. runs of ASCII digits.

    Leading and trailing whitespace is then stripped.

    Symbol removal is list-based: a character that is not in the list
    (for example ``~`` or ``_``) is left untouched.

    NOTE(review): a later ``def clean`` in this file rebinds the same name,
    so this version is shadowed once the whole file has executed.

    :param text: input string
    :return: the cleaned string
    :example
        '안녕하세요 hello 123 #@!' -> '안녕하세요hello'
    """
    # Raw strings keep the backslashes intact for the regex engine and avoid
    # the invalid-escape-sequence warnings that "\:" / "\(" style escapes
    # trigger in ordinary string literals.
    jamo_pattern = r"[ㄱ-ㅎㅏ-ㅣ]+"
    # Inside a character class only "[" and "]" need escaping here; the
    # leading "-" is literal because it is the first character of the class.
    special_pattern = r"""[-=+,#/:$. @*"※&%ㆍ』‘|()\[\]<>`'…》!?]"""
    digit_pattern = r"[0-9]+"

    text = re.sub(jamo_pattern, "", text)
    text = re.sub(special_pattern, "", text)
    text = re.sub(digit_pattern, "", text)
    return text.strip()
def clean(text: str) -> str:
    """Keep only whitespace, Hangul syllables, English letters and digits.

    Every character that is not a complete Hangul syllable (가-힣), an ASCII
    letter, an ASCII digit or whitespace is deleted; standalone Hangul jamo
    (lone consonants/vowels such as "ㅋㅋ") are deleted as well. Leading and
    trailing whitespace is then stripped. Interior whitespace is left as is,
    so removing a symbol between two spaces leaves both spaces behind.

    :param text: input string
    :return: the cleaned string
    :example
        '안녕하세요 hello 123 #@!' -> '안녕하세요 hello 123'
    """
    # Lone jamo (ㄱ-ㅎ, ㅏ-ㅣ) are deliberately absent from the whitelist, so
    # they are dropped in the same pass as every other disallowed character
    # instead of being kept first and stripped by a second substitution.
    text = re.sub(r"[^가-힣a-zA-Z0-9\s]", "", text)
    return text.strip()
# Extract only the key morphemes
def detect_point_keyword(pos_tag: List[Tuple]) -> List[str]:
    """Return the words whose part-of-speech tag is in the kept set.

    Each element of ``pos_tag`` is unpacked as a ``(word, tag)`` pair. A word
    is kept, in input order, when its tag is one of
    "NNP", "NNG", "IC", "VV", "VA" or "NP".

    NOTE(review): the tags look like a Korean morphological-analyser tagset
    (nouns, interjection, verb, adjective, pronoun) -- confirm which tagger
    produces them.

    :param pos_tag: iterable of (word, tag) pairs
    :return: list of the words whose tag matched
    """
    kept_tags = ("NNP", "NNG", "IC", "VV", "VA", "NP")
    return [token for token, tag in pos_tag if tag in kept_tags]
# Remove duplicates
# Report the size of `train`, drop rows with a repeated 'sentence' value,
# then report the size again so the number of removed rows is visible.
# NOTE(review): `train` is defined outside this view; the call signature
# suggests a pandas DataFrame with a 'sentence' column -- confirm.
print(f'중복 제거 전 train length: {len(train)}')
# inplace=True mutates `train` itself rather than returning a copy;
# ignore_index=True requests a renumbered index after the drop.
# Presumably the first occurrence of each sentence is kept (pandas default
# keep='first') -- verify if the choice of surviving row matters.
train.drop_duplicates(subset=['sentence'], inplace=True, ignore_index=True)
print(f'중복 제거 후 train length: {len(train)}\n')
def categorize_time(dt):
    """Map the hour of ``dt`` to a Korean time-of-day label.

    Reads ``dt.hour`` and returns:

    * "아침" (morning) for 6 <= hour < 12
    * "점심" (midday)  for 12 <= hour < 18
    * "저녁" (evening) for 18 <= hour < 24
    * "새벽" (dawn)    for any other value, i.e. hours 0-5

    :param dt: object exposing an ``hour`` attribute (e.g. a datetime)
    :return: one of the four label strings above
    """
    hour_of_day = dt.hour
    # Half-open [start, stop) bands, checked in order; anything that falls
    # outside all of them gets the fallback label.
    bands = (
        (6, 12, "아침"),
        (12, 18, "점심"),
        (18, 24, "저녁"),
    )
    for start, stop, label in bands:
        if start <= hour_of_day < stop:
            return label
    return "새벽"
def is_weekday(dt):
    """Label ``dt`` as a weekday or a weekend day.

    Calls ``dt.weekday()`` and returns "주중" (weekday) when the result is
    below 5, otherwise "주말" (weekend). Despite the ``is_`` prefix, the
    return value is a label string, not a bool.

    NOTE(review): assumes ``dt`` follows the ``datetime.weekday()``
    convention (Monday = 0 ... Sunday = 6) -- confirm with callers.

    :param dt: object exposing a ``weekday()`` method (e.g. a datetime)
    :return: "주중" or "주말"
    """
    return "주중" if dt.weekday() < 5 else "주말"