# Data Preprocessing and Analyzer Selection
!pip install konlpy pandas

import functools
import multiprocessing as mp
import re

import pandas as pd
from konlpy.tag import Kkma
# Read the complete news CSV into memory as one DataFrame.
# NOTE(review): df_filtered is not referenced anywhere else in the visible
# code -- confirm it is used elsewhere before keeping this extra full read.
df_filtered = pd.read_csv(
    "news_data.csv",
    encoding='UTF8',
    engine='python',
)
# Stop words removed from the tokenizer output.  Entries are reproduced as
# written; duplicates simply collapse inside the frozenset.
# NOTE(review): every entry is English, several contain spaces, and 'I' is
# upper-case although the text is lower-cased before tokenizing, so many of
# them are unlikely to ever match a token -- confirm this is the intended list.
_STOPWORDS = frozenset((
    'this', 'before', 'at', 'more', 'this', 'to', 'and', 'and', 'is', 'of', 'number', 'inside', 'etc', 'some', 'one', 'previous', 'to', 'when', 'case',
    'thing', 'thing', 'people', 'two', 'my', 'and', 'well', 'became', 'after', 'initial', 'yes', 'back', 'became', 'not', 'main',
    'we', 'related', 'above', 'after', 'case', 'today', 'time', 'this', 'afternoon', 'month', 'ongoing', 'all', 'next', 'before', 'interest', 'meanwhile',
    'decade', 'instead', 'however', 'here', 'period', 'previous year', 'about', 'next year', 'middle', 'that', 'during that time', 'now', 'everywhere', 'fenta', 'fentanyl', 'rest',
    'where', 'yunhee', '2nd', 'sys', 'basis', 'here', 'today', 'this', 'about', 'of course', 'each other', 'I',
    'confirm', 'person', 'last year', 'this year', 'target', 'process', 'current', 'among', 'gram', 'as much as', 'part', 'fact',
    'last month', 'morning', 'compared to', 'preparation', 'at that time', 'applicable', 'self', 'ultimately', 'during', 'area', 'one', 'now', 'past', 'billion won', 'simultaneous', 'considerable', 'various', 'consider',
    'direct', 'historical', 'etc to', 'surrounding', 'last year', 'yonhap news', 'below', 'occupy', 'eventually', 'billion', 'type', 'existing', 'itself', 'cases', 'previous day', 'what',
    'daily', 'during', 'department', 'day', 'this month', 'hour minute', 'people', 'name', 'grade', 'manager', 'months', 'thousand cases', 'manager', 'director', 'yearly', 'through', 'timing', 'pro', 'about',
    'all', 'part', 'month day', 'around time', 'yaba', 'unit', 'sessions', 'mr kim', 'who', 'republic', 'last', 'number is', 'whether', 'mr kim', 'less than', 'only', 'dozens', 'during that time', 'like', 'first', 'wrong', 'era', 'someone',
    'about', 'if', 'self', 'years new', 'around', 'year', 'park', 'nth', 'separate', 'later', 'about ten thousand', 'mr lee', 'minimum', 'maximum', 'once', 'places',
    'likewise', 'body', 'once', 'do', 'near', 'education', 'recent', 'incident', 'official', 'situation', 'ten thousand won', 'region', 'include', 'whole', 'degree', 'condition', 'last year',
))

# Ordered (pattern, replacement) pairs applied to the lower-cased text.
_CLEANUP_STEPS = (
    # E-mail addresses become the placeholder word 'email'.
    (re.compile(r'\b[\w\.-]+@[\w\.-]+\.\w{2,}\b'), 'email'),
    # Text enclosed in square brackets.
    (re.compile(r'\[.*?\]'), ''),
    # Everything from '▶' up to the end of that line.
    (re.compile(r'▶.*'), ''),
    # Innermost parenthesised text.
    (re.compile(r'\([^()]*\)'), ''),
    # HTML tags.
    (re.compile(r'<[^>]+>'), ''),
    # URLs.
    (re.compile(r'http\S+'), ''),
    # Digit runs that have whitespace on both sides.
    (re.compile(r'(?<=\s)\d+(?=\s)'), ' '),
    # Any character that is not an ASCII letter/digit, Hangul, a CJK
    # ideograph range, or whitespace.  This also turns quotes and periods into
    # spaces, so no separate quote/period spacing step is needed.
    (re.compile(r'[^a-zA-Z0-9ㄱ-ㅎㅏ-ㅣ가-힣\u2E80-\u2FD5\u3190-\u319F\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF\uFE30-\uFE4F\s]'), ' '),
)


@functools.lru_cache(maxsize=1)
def _get_kkma():
    """Return one Kkma tagger per process, created on first use."""
    return Kkma()


def _clean_text(text):
    """Lower-case ``text`` and strip the noise patterns; return the cleaned string."""
    cleaned = text.lower()
    for pattern, replacement in _CLEANUP_STEPS:
        cleaned = pattern.sub(replacement, cleaned)

    if '기자' in cleaned:
        # Keep only what precedes the last '기자' ("reporter"), then drop the
        # final remaining word (presumably the reporter's name).
        # NOTE(review): the trailing-word removal is applied only when '기자'
        # is present -- confirm that matches the intended nesting.
        cleaned = cleaned.rsplit('기자', 1)[0]
        cleaned = re.sub(r'\S+\s*$', '', cleaned)

    # Collapse whitespace runs and trim both ends.
    return re.sub(r'\s+', ' ', cleaned).strip()


def preprocess_text(text):
    """Clean a raw text value and return its filtered Kkma tokens.

    Args:
        text: Value to process.  Anything that is not a ``str`` (for example
            a missing value) yields an empty list.

    Returns:
        A list of morpheme strings whose Kkma tag starts with 'N' or 'V'
        (presumably nouns plus verbs/adjectives -- verify against the Kkma
        tag table) and that are not in ``_STOPWORDS``.  Text that is empty
        after cleaning yields an empty list without invoking the tagger.
    """
    if not isinstance(text, str):
        return []

    cleaned = _clean_text(text)
    if not cleaned:
        return []

    tagged = _get_kkma().pos(cleaned)
    # The 'V' prefix already covers 'VA', so two prefixes are sufficient.
    return [
        word
        for word, tag in tagged
        if tag.startswith(('N', 'V')) and word not in _STOPWORDS
    ]
def process_chunk(chunk):
    """Add tokenized versions of the 'title' and 'content' columns to a chunk.

    Every value of each source column is passed through ``preprocess_text``
    and the results are stored in a new column on the same frame.

    Args:
        chunk: DataFrame holding 'title' and 'content' columns; it is
            modified in place.

    Returns:
        The same ``chunk`` object, now carrying 'title_tokenized' and
        'content_tokenized'.
    """
    column_pairs = (
        ('title', 'title_tokenized'),
        ('content', 'content_tokenized'),
    )
    for source_col, target_col in column_pairs:
        chunk[target_col] = chunk[source_col].apply(preprocess_text)
    return chunk
# Number of CSV rows read into each chunk handed to a worker.
chunksize = 1000

# Run the pipeline only in the launching process.  Under the "spawn" start
# method every worker re-imports this module, and an unguarded Pool creation
# at import time would try to start new pools from inside the workers.
if __name__ == "__main__":
    # One worker per CPU; the context manager shuts the pool down on exit,
    # including when an exception is raised.
    with mp.Pool(mp.cpu_count()) as pool:
        # Read the input in chunks of `chunksize` rows and tokenize the
        # chunks in parallel.
        with pd.read_csv("news_data.csv", encoding='UTF8', engine='python', chunksize=chunksize) as reader:
            processed_chunks = pool.map(process_chunk, reader)

    # Combine the processed chunks into a single DataFrame.
    processed_data = pd.concat(processed_chunks, ignore_index=True)

    # Write the combined result, overwriting any existing output file.
    processed_data.to_csv("final_combined.csv", mode='w', encoding='UTF8', index=False)