Punishment or prevention?: A machine learning analysis
of drug-related news coverage across cultures

#1 Data collection through crawling

News articles matching the keyword ‘마약’ (drugs) and published between May 2022 and April 2023 were collected from Naver News using Selenium and ChromeDriver.
For each article, the crawler also collected the reader reaction counts, the comments, and the comment count, so that public response could be analyzed.
While other sources like BigKinds were available, Naver News was chosen for its comprehensive coverage across multiple news agencies and its position as Korea’s leading portal site.

import requests
import bs4
from selenium.webdriver import Chrome
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import pandas as pd
import datetime
import re
from selenium.common.exceptions import NoSuchElementException
def create_url(page_number, timestamp):
    """Build the Naver news search URL for the '마약' query.

    The URL carries the fixed search window ``ds=2022.05.01`` /
    ``de=2023.04.31`` and puts ``page_number`` into the ``start`` parameter.

    Args:
        page_number: Value for the ``start`` query parameter.
        timestamp: Falsy to keep the default window. Otherwise a string whose
            first 10 characters are a ``YYYY.MM.DD`` date; that date replaces
            the ``ds`` value.

    Returns:
        The search URL as a string.

    Note:
        Only ``ds`` is rewritten; the ``from20220501`` part of the ``nso``
        parameter is left unchanged. When ``timestamp`` is given and
        ``page_number`` is 3991 or more, ``start`` is forced back to 1.
    """
    search_url = (
        "https://search.naver.com/search.naver?where=news&sm=tab_pge"
        "&query=%EB%A7%88%EC%95%BD&sort=2&photo=3&field=0&pd=3"
        "&ds=2022.05.01&de=2023.04.31&mynews=0&office_type=0"
        "&office_section_code=0&news_office_checked="
        "&nso=so:r,p:from20220501to20230431,a:all"
        f"&start={page_number}"
    )

    # Nothing to resume from: the default window is used as-is
    if not timestamp:
        return search_url

    # Move the window start to the date at the head of the timestamp
    resume_date = timestamp[:10]
    search_url = re.sub(r'(?<=&ds=)\d{4}\.\d{2}\.\d{2}', resume_date, search_url)

    # Past this offset the result list restarts from its first entry
    if page_number >= 3991:
        search_url = search_url.replace(f"start={page_number}", "start=1")

    return search_url

# ---------------------------------------------------------------------------
# Crawl the search results page by page, open every article that has the
# targeted link, scrape it, and write the collected rows to CSV in batches.
# ---------------------------------------------------------------------------

# Reaction categories read from each article page; every key becomes a CSV
# column and is matched against the class 'u_likeit_list <key>'.
REACTION_KEYS = [
    'good', 'warm', 'sad', 'angry', 'want', 'cheer', 'congrats', 'expect',
    'surprise', 'useful', 'wow', 'touched', 'analytical', 'recommend',
]


def _text_or_none(elem):
    """Return ``elem.text``, or ``None`` when the element was not found."""
    return elem.text if elem else None


def _parse_article(one_page):
    """Extract one article's fields from its parsed page.

    Args:
        one_page: ``BeautifulSoup`` tree of the article page.

    Returns:
        A dict with the keys ``time``, ``title``, ``content``, ``media``,
        ``name``, one key per entry of ``REACTION_KEYS``, ``sum`` (set to 0
        here), ``comments`` and ``comment_count``. Any field whose element is
        missing from the page is ``None``.
    """
    media_elem = one_page.find('img', class_='media_end_head_top_logo_img light_type')

    data_dict = {
        'time': _text_or_none(one_page.find(
            'span', class_='media_end_head_info_datestamp_time _ARTICLE_DATE_TIME')),
        'title': _text_or_none(one_page.find('div', class_='media_end_head_title')),
        'content': _text_or_none(one_page.find('div', class_='go_trans _article_content')),
        # .get() so a logo image without an alt text yields None, not a KeyError
        'media': media_elem.get('alt') if media_elem else None,
        'name': _text_or_none(one_page.find('em', class_='media_end_head_journalist_name')),
    }

    for key in REACTION_KEYS:
        reaction_elem = one_page.find('li', class_=f'u_likeit_list {key}')
        # The count span is looked up only inside this reaction's list item
        count_elem = (
            reaction_elem.find('span', class_='u_likeit_list_count _count')
            if reaction_elem else None
        )
        data_dict[key] = _text_or_none(count_elem)

    data_dict['sum'] = 0

    # One "nick: text" line per comment box that has both parts
    comment_list = []
    for comment_elem in one_page.find_all('div', class_='u_cbox_comment_box'):
        comment_id = comment_elem.find('span', class_='u_cbox_nick')
        comment_text_elem = comment_elem.find('span', class_='u_cbox_contents')
        if comment_id and comment_text_elem:
            comment_list.append(f"{comment_id.text}: {comment_text_elem.text}")

    data_dict['comments'] = '\n'.join(comment_list)
    data_dict['comment_count'] = _text_or_none(one_page.find('span', class_='u_cbox_count'))
    return data_dict


def _save_batch(rows, last_timestamp, suffix=''):
    """Write ``rows`` to ``news_data_<date><suffix>.csv`` and return the DataFrame.

    Args:
        rows: List of article dicts.
        last_timestamp: Timestamp text of the last scraped article; its first
            10 characters (``YYYY.MM.DD``) name the file. ``None`` falls back
            to the label ``unknown-date``.
        suffix: Extra text inserted before ``.csv``.
    """
    label = last_timestamp[:10].replace('.', '-') if last_timestamp else 'unknown-date'
    frame = pd.DataFrame(rows)
    frame.to_csv(f"news_data_{label}{suffix}.csv", index=False)
    return frame


# NOTE(review): recent Selenium 4 releases no longer accept the driver path as
# a positional argument (a Service object is used instead) - confirm against
# the installed Selenium version.
driver = Chrome('Chromedriver')

time.sleep(2)
url = create_url(1, None)
driver.get(url)

dict_list = []

time.sleep(2)

i = 0
end_date = datetime.datetime.strptime("2023.05.01", "%Y.%m.%d")

# Timestamp text of the most recently scraped article. Initialised here so the
# batch check below cannot hit an unbound name when no article was scraped yet.
timestamp = None

try:
    while True:

        # Result slots on the current page are addressed as sp_nws<num> .. sp_nws<num+9>
        num = 1 + 10 * i
        for k in range(num, num + 10):
            count_path = f'//*[@id="sp_nws{k}"]/div/div/div[1]/div[2]/a[2]'

            try:
                count = driver.find_element(By.XPATH, count_path)
            except NoSuchElementException:
                # NOTE(review): a missing link in the first slot abandons the
                # whole page, including later slots that may have the link -
                # confirm this is intended.
                if k == num:
                    break
                continue

            count.click()
            time.sleep(2)

            # The click is expected to open the article in a second window
            driver.switch_to.window(driver.window_handles[1])

            time.sleep(2)

            # Explicit parser: without it bs4 picks whichever parser happens to
            # be installed and emits a warning.
            one_page = bs4.BeautifulSoup(driver.page_source, 'html.parser')

            # The page source is captured, so the article window can be closed
            driver.close()
            driver.switch_to.window(driver.window_handles[0])

            data_dict = _parse_article(one_page)
            timestamp = data_dict['time']

            # Keep the article only if at least one reaction count is a plain
            # digit string; 'sum' is the total of those counts.
            numeric_counts = [
                int(data_dict[key]) for key in REACTION_KEYS
                if data_dict[key] is not None and data_dict[key].isdigit()
            ]
            if numeric_counts:
                data_dict['sum'] = sum(numeric_counts)
                dict_list.append(data_dict)

        # Advance to the next results page. A missing button is treated as the
        # end of the results instead of crashing with unsaved rows.
        count_path = '//*[@id="main_pack"]/div[2]/div/a[2]/i'
        try:
            count = driver.find_element(By.XPATH, count_path)
        except NoSuchElementException:
            break
        count.click()
        time.sleep(3)

        if timestamp is None:
            i += 1
            continue

        reached_end = datetime.datetime.strptime(timestamp[:10], "%Y.%m.%d") >= end_date

        if num + 10 >= 400 or reached_end:
            # Batch is full (or the end date is reached): write it out
            df = _save_batch(dict_list, timestamp)
            dict_list = []

            if reached_end:
                break

            # Restart the search from the date of the last scraped article.
            # NOTE(review): articles of that date already saved in this batch
            # may be returned again - de-duplicate downstream if needed.
            i = 0
            url = create_url(1, timestamp)
            driver.get(url)
            time.sleep(2)
        else:
            i += 1
finally:
    # Rows collected since the last batch file would otherwise be lost on a
    # crash, an interrupt, or when the results simply run out. The distinct
    # suffix keeps this file from overwriting a batch saved for the same date.
    if dict_list:
        df = _save_batch(dict_list, timestamp, suffix='_partial')
        dict_list = []