Punishment or prevention?: A machine learning analysis
of drug-related news coverage across cultures

#2 Data Preprocessing

Data Preprocessing and Analyzer Selection

  • After collecting Naver News data, unnecessary symbols and trailing advertisements were removed from the text.
  • Kkma Analyzer Selection: Despite its slower processing time and lower accuracy with informal language, Kkma was chosen for its ability to extract compound nouns. Given the project's flexible timeline and the formal nature of news articles, we determined Kkma would yield more meaningful results.
  • Parts of Speech Selection: Nouns, adjectives, and verbs were extracted, because together they capture more semantic content for embedding and topic modeling than nouns alone.
!pip install konlpy pandas
import pandas as pd
import re
from konlpy.tag import Kkma
import multiprocessing as mp
# Read the complete news CSV into memory as a single DataFrame.
# NOTE(review): df_filtered is not referenced by the visible code below, which
# re-reads the same file in chunks -- confirm it is still needed.
df_filtered = pd.read_csv(
    "news_data.csv",
    encoding='UTF8',
    engine='python',
)
# Tokens dropped from the analyzer output. A frozenset gives O(1) membership
# tests and is built once instead of on every call (duplicates collapse).
# NOTE(review): the entries are English while the tokenizer is a Korean
# analyzer -- confirm the list matches the corpus language.
_STOPWORDS = frozenset([
    'this', 'before', 'at', 'more', 'this', 'to', 'and', 'and', 'is', 'of', 'number', 'inside', 'etc', 'some', 'one', 'previous', 'to', 'when', 'case',
    'thing', 'thing', 'people', 'two', 'my', 'and', 'well', 'became', 'after', 'initial', 'yes', 'back', 'became', 'not', 'main',
    'we', 'related', 'above', 'after', 'case', 'today', 'time', 'this', 'afternoon', 'month', 'ongoing', 'all', 'next', 'before', 'interest', 'meanwhile',
    'decade', 'instead', 'however', 'here', 'period', 'previous year', 'about', 'next year', 'middle', 'that', 'during that time', 'now', 'everywhere', 'fenta', 'fentanyl', 'rest',
    'where', 'yunhee', '2nd', 'sys', 'basis', 'here', 'today', 'this', 'about', 'of course', 'each other', 'I',
    'confirm', 'person', 'last year', 'this year', 'target', 'process', 'current', 'among', 'gram', 'as much as', 'part', 'fact',
    'last month', 'morning', 'compared to', 'preparation', 'at that time', 'applicable', 'self', 'ultimately', 'during', 'area', 'one', 'now', 'past', 'billion won', 'simultaneous', 'considerable', 'various', 'consider',
    'direct', 'historical', 'etc to', 'surrounding', 'last year', 'yonhap news', 'below', 'occupy', 'eventually', 'billion', 'type', 'existing', 'itself', 'cases', 'previous day', 'what',
    'daily', 'during', 'department', 'day', 'this month', 'hour minute', 'people', 'name', 'grade', 'manager', 'months', 'thousand cases', 'manager', 'director', 'yearly', 'through', 'timing', 'pro', 'about',
    'all', 'part', 'month day', 'around time', 'yaba', 'unit', 'sessions', 'mr kim', 'who', 'republic', 'last', 'number is', 'whether', 'mr kim', 'less than', 'only', 'dozens', 'during that time', 'like', 'first', 'wrong', 'era', 'someone',
    'about', 'if', 'self', 'years new', 'around', 'year', 'park', 'nth', 'separate', 'later', 'about ten thousand', 'mr lee', 'minimum', 'maximum', 'once', 'places',
    'likewise', 'body', 'once', 'do', 'near', 'education', 'recent', 'incident', 'official', 'situation', 'ten thousand won', 'region', 'include', 'whole', 'degree', 'condition', 'last year',
])

# Ordered (pattern, replacement) cleaning rules, compiled once at import time.
# The order matters: markup and URLs are stripped before the catch-all
# special-character rule turns their delimiters into spaces.
_CLEANING_RULES = (
    # Email addresses -> the literal token 'email'
    (re.compile(r'\b[\w\.-]+@[\w\.-]+\.\w{2,}\b'), 'email'),
    # Bracketed tags such as '[...]'
    (re.compile(r'\[.*?\]'), ''),
    # Everything from '▶' to the end of that line
    (re.compile(r'▶.*'), ''),
    # Innermost parenthesised text
    (re.compile(r'\([^()]*\)'), ''),
    # HTML tags and URLs
    (re.compile(r'<[^>]+>'), ''),
    (re.compile(r'http\S+'), ''),
    # Numbers that stand alone between whitespace
    (re.compile(r'(?<=\s)\d+(?=\s)'), ' '),
    # Put a space before a quote that is glued between two non-space characters
    (re.compile(r'(?<=\S)([\'"])(?=\S)'), r' \1'),
    # Anything that is not a Latin letter, digit, Hangul, CJK or whitespace
    (re.compile(r'[^a-zA-Z0-9ㄱ-ㅎㅏ-ㅣ가-힣\u2E80-\u2FD5\u3190-\u319F\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF\uFE30-\uFE4F\s]'), ' '),
    # Space after a period; the rule above already removed periods, so this
    # is kept only to mirror the original rule sequence.
    (re.compile(r'\.(?=\S)'), '. '),
)

_TRAILING_WORD = re.compile(r'\S+\s*$')
_WHITESPACE = re.compile(r'\s+')

# Lazily created analyzer, reused across calls within one process.
_kkma = None


def _get_kkma():
    """Return this process's Kkma analyzer, creating it on first use."""
    global _kkma
    if _kkma is None:
        _kkma = Kkma()
    return _kkma


def _clean_text(text):
    """Lowercase *text*, apply the cleaning rules and cut the reporter byline."""
    text = text.lower()
    for pattern, replacement in _CLEANING_RULES:
        text = pattern.sub(replacement, text)

    # Byline heuristic: drop everything from the LAST '기자' (reporter) onward,
    # plus the word immediately before it.
    # NOTE(review): that word is presumably the reporter's name -- confirm.
    if '기자' in text:
        text = text.rsplit('기자', 1)[0]
        text = _TRAILING_WORD.sub('', text)

    # Collapse whitespace runs and trim both ends
    return _WHITESPACE.sub(' ', text).strip()


def preprocess_text(text):
    """Clean a news title/body and return its content-word tokens.

    The text is lowercased, stripped of emails, bracketed tags, '▶' trailers,
    parenthesised text, HTML tags, URLs, standalone numbers and special
    characters, and truncated at the reporter byline. The remainder is
    analysed with Kkma; morphemes whose tag starts with 'N' or 'V' are kept,
    and stop words are removed.

    Args:
        text: The raw text. Any non-string value (e.g. NaN from a missing
            CSV cell) is treated as empty.

    Returns:
        A list of token strings; an empty list when *text* is not a string
        or nothing is left after cleaning.
    """
    if not isinstance(text, str):
        return []

    cleaned = _clean_text(text)
    if not cleaned:
        # Nothing left to analyse; skip the analyzer call entirely.
        return []

    tagged = _get_kkma().pos(cleaned)

    # ('N', 'V') matches the same tags as the former ('N', 'V', 'VA'):
    # every 'VA...' tag already starts with 'V'.
    return [
        word
        for word, tag in tagged
        if tag.startswith(('N', 'V')) and word not in _STOPWORDS
    ]
def process_chunk(chunk):
    """Tokenize the 'title' and 'content' columns of one DataFrame chunk.

    Adds 'title_tokenized' and 'content_tokenized' columns to *chunk* in
    place, each holding the result of preprocess_text for that row, and
    returns the same chunk.
    """
    column_pairs = (
        ('title', 'title_tokenized'),
        ('content', 'content_tokenized'),
    )
    for source_column, target_column in column_pairs:
        chunk[target_column] = chunk[source_column].apply(preprocess_text)
    return chunk
# Number of CSV rows handed to a worker process at a time
chunksize = 1000

# The __main__ guard keeps worker processes from re-running this pipeline when
# the multiprocessing start method re-imports the main module (spawn).
if __name__ == "__main__":
    # One worker per CPU core. The context manager shuts the pool down once
    # the work is done, so no worker processes are left behind.
    with mp.Pool(mp.cpu_count()) as pool:
        # Read the input file in chunks and tokenize the chunks in parallel.
        # pool.map blocks until every chunk is processed and keeps input order.
        with pd.read_csv("news_data.csv", encoding='UTF8', engine='python', chunksize=chunksize) as reader:
            processed_chunks = pool.map(process_chunk, reader)

    # Combine processed chunks into a single DataFrame
    processed_data = pd.concat(processed_chunks, ignore_index=True)

    # Write the processed data to the output file
    processed_data.to_csv("final_combined.csv", mode='w', encoding='UTF8', index=False)