News articles matching the search keyword ‘마약’ (“drugs”) and published between May 2022 and April 2023 were collected from Naver News using Selenium and ChromeDriver.
The crawl captured each article together with its reader comments, comment count, and reaction counts, so that the public response could be analyzed.
While other sources like BigKinds were available, Naver News was chosen for its comprehensive coverage across multiple news agencies and its position as Korea’s leading portal site.
import datetime
import re
import time

import bs4
import pandas as pd
import requests
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
def create_url(page_number, timestamp):
    """Build the Naver News search URL used by the crawl.

    The keyword, sort order and end date are fixed; only the start date and
    the ``start`` query parameter vary.

    Args:
        page_number: Value for the ``start`` query parameter. Values of 3991
            or more are replaced by 1.
        timestamp: String whose first ten characters are taken as the new
            start date (expected form ``YYYY.MM.DD``), or a falsy value to
            keep the default start date ``2022.05.01``.

    Returns:
        The complete search URL as a string.
    """
    # The start date appears twice in the URL: dotted in `ds=` and compact in
    # the `nso=...p:from<date>to<date>` filter.  Both are derived from the
    # same value so the two filters can never disagree after a restart.
    start_date = timestamp[:10] if timestamp else "2022.05.01"
    compact_start = start_date.replace(".", "")

    start = 1 if page_number >= 3991 else page_number

    # NOTE(review): the end date 2023.04.31 is not a real calendar day (April
    # has 30 days).  It is kept unchanged because how Naver interprets it is
    # not visible from here -- confirm before "correcting" it.
    return (
        "https://search.naver.com/search.naver?where=news&sm=tab_pge"
        "&query=%EB%A7%88%EC%95%BD&sort=2&photo=3&field=0&pd=3"
        f"&ds={start_date}&de=2023.04.31"
        "&mynews=0&office_type=0&office_section_code=0&news_office_checked="
        f"&nso=so:r,p:from{compact_start}to20230431,a:all"
        f"&start={start}"
    )
# ---------------------------------------------------------------------------
# Crawl script: walks the search result pages, opens every result that has a
# second link in its info row, scrapes the article page that opens in a new
# window, and writes the collected rows to CSV in batches.
# ---------------------------------------------------------------------------

# Reaction names scraped from each article.  Each one is both the class suffix
# of an `li.u_likeit_list` item and the output column name.
REACTION_KEYS = (
    'good', 'warm', 'sad', 'angry', 'want', 'cheer', 'congrats', 'expect',
    'surprise', 'useful', 'wow', 'touched', 'analytical', 'recommend',
)
RESULTS_PER_PAGE = 10
# Once this many results have been paged through, the batch is saved and the
# search is restarted from the date of the last article seen.
BATCH_RESULT_LIMIT = 400
NEXT_PAGE_XPATH = '//*[@id="main_pack"]/div[2]/div/a[2]/i'


def _text_or_none(element):
    """Return ``element.text``, or None when the lookup found nothing."""
    return element.text if element is not None else None


def _parse_article(html):
    """Extract one article's fields from the HTML of its article page.

    Returns a dict with the keys ``time``, ``title``, ``content``, ``media``,
    ``name``, one key per entry of ``REACTION_KEYS``, ``sum`` (initialised to
    0), ``comments`` and ``comment_count``.  Fields whose element is missing
    are None; ``comments`` is an empty string when no comment was found.
    """
    # An explicit parser keeps the result independent of which optional
    # parser libraries happen to be installed.
    page = bs4.BeautifulSoup(html, 'html.parser')

    logo = page.find('img', class_='media_end_head_top_logo_img light_type')
    record = {
        'time': _text_or_none(page.find(
            'span',
            class_='media_end_head_info_datestamp_time _ARTICLE_DATE_TIME')),
        'title': _text_or_none(
            page.find('div', class_='media_end_head_title')),
        'content': _text_or_none(
            page.find('div', class_='go_trans _article_content')),
        'media': logo.get('alt') if logo is not None else None,
        'name': _text_or_none(
            page.find('em', class_='media_end_head_journalist_name')),
    }

    for key in REACTION_KEYS:
        item = page.find('li', class_=f'u_likeit_list {key}')
        # The list item can exist without its count span; treat that the same
        # as a missing reaction instead of failing on None.
        counter = None
        if item is not None:
            counter = item.find('span', class_='u_likeit_list_count _count')
        record[key] = _text_or_none(counter)

    record['sum'] = 0

    comment_lines = []
    for box in page.find_all('div', class_='u_cbox_comment_box'):
        nick = box.find('span', class_='u_cbox_nick')
        body = box.find('span', class_='u_cbox_contents')
        if nick and body:
            comment_lines.append(f"{nick.text}: {body.text}")
    record['comments'] = '\n'.join(comment_lines)
    record['comment_count'] = _text_or_none(
        page.find('span', class_='u_cbox_count'))
    return record


def _sum_reactions(record):
    """Return ``(total, found)`` for the reaction counts in *record*.

    ``total`` is the sum of every reaction value that is numeric; ``found``
    is True when at least one such value exists.
    """
    total = 0
    found = False
    for key in REACTION_KEYS:
        raw = record[key]
        if raw is None:
            continue
        # Accept values written with thousands separators or padding.
        digits = raw.replace(',', '').strip()
        if digits.isdigit():
            total += int(digits)
            found = True
    return total, found


def _save_batch(records, timestamp):
    """Write *records* to ``news_data_<date>.csv``; do nothing if empty.

    The date part is the first ten characters of *timestamp* with dots
    replaced by dashes, or ``undated`` when no timestamp is known.
    """
    if not records:
        return
    date_part = timestamp[:10].replace('.', '-') if timestamp else 'undated'
    csv_filename = f"news_data_{date_part}.csv"
    pd.DataFrame(records).to_csv(csv_filename, index=False)


# NOTE(review): passing the driver path positionally is kept as in the
# original; newer Selenium releases may expect a Service object instead --
# confirm against the installed version.
driver = Chrome('Chromedriver')
try:
    time.sleep(2)
    url = create_url(1, None)
    driver.get(url)
    dict_list = []
    time.sleep(2)
    i = 0
    end_date = datetime.datetime.strptime("2023.05.01", "%Y.%m.%d")
    # Timestamp of the most recent article that had one.  Defined up front so
    # the batch check below works even if no article has been scraped yet.
    timestamp = None

    while True:
        num = 1 + RESULTS_PER_PAGE * i
        for k in range(num, num + RESULTS_PER_PAGE):
            link_path = f'//*[@id="sp_nws{k}"]/div/div/div[1]/div[2]/a[2]'
            try:
                link = driver.find_element(By.XPATH, link_path)
            except NoSuchElementException:
                # NOTE(review): a missing link on the first result skips the
                # whole page, whether the page is empty or only that result
                # lacks the link -- confirm this is intended.
                if k == num:
                    break
                continue

            link.click()
            time.sleep(2)
            # The article opens in a second window: read it, then close it
            # and return to the result list.
            driver.switch_to.window(driver.window_handles[1])
            time.sleep(2)
            html = driver.page_source
            driver.close()
            driver.switch_to.window(driver.window_handles[0])

            data_dict = _parse_article(html)
            if data_dict['time'] is not None:
                timestamp = data_dict['time']

            data_dict['sum'], should_append = _sum_reactions(data_dict)
            # Articles without any numeric reaction count are not recorded.
            if should_append:
                dict_list.append(data_dict)

        try:
            next_button = driver.find_element(By.XPATH, NEXT_PAGE_XPATH)
        except NoSuchElementException:
            # No further result page: keep what was collected and stop
            # instead of crashing with the batch unsaved.
            _save_batch(dict_list, timestamp)
            dict_list = []
            break
        next_button.click()
        time.sleep(3)

        batch_full = num + RESULTS_PER_PAGE >= BATCH_RESULT_LIMIT
        reached_end = False
        if timestamp is not None:
            # Assumes the timestamp text starts with YYYY.MM.DD; strptime
            # raises ValueError otherwise.
            article_date = datetime.datetime.strptime(
                timestamp[:10], "%Y.%m.%d")
            reached_end = article_date >= end_date

        if timestamp is not None and (batch_full or reached_end):
            _save_batch(dict_list, timestamp)
            dict_list = []
            if reached_end:
                break
            # Restart the search from the date of the last article seen.
            # NOTE(review): that day is searched again, so rows can repeat
            # across CSV files -- de-duplicate downstream if needed.
            i = 0
            url = create_url(1, timestamp)
            driver.get(url)
            time.sleep(2)
        else:
            i += 1
finally:
    # Always shut the browser down, including when the crawl fails midway.
    driver.quit()