Comparison between Groups with Comments and without Comments
!pip install gensim
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
# Load the modified data
data = pd.read_csv("final_combined.csv")

# Remove commas from the comment_count column (only needed when the raw export
# stores counts with thousands separators)
# data['comment_count'] = data['comment_count'].str.replace(',', '')

# Normalise comment_count to int: convert to float first so missing values
# (NaN) survive the cast, replace them with 0, then cast to int.
# The result is assigned back instead of calling fillna(..., inplace=True) on
# the column selection, which is a chained in-place operation that may act on
# a temporary object rather than on `data`.
data['comment_count'] = data['comment_count'].astype(float).fillna(0).astype(int)

# Split into articles with no comments and articles with at least one comment
data_without = data[data['comment_count'] == 0]
data_with = data[data['comment_count'] >= 1]

# Save the classified data to separate CSV files
data_without.to_csv("data_without_comments4.csv", index=False)
data_with.to_csv("data_with_comments4.csv", index=False)

print(len(data_without))
print(len(data_with))
The Word2Vec analysis later in this notebook identifies keywords emphasized more in Group B than in Group A.
It calculates keyword similarity scores and mean word embeddings for each group to find terms that are more prominent in Group B.
While differences exist, they were deemed too small or too weakly supported to be considered significant characteristics.
# Reload both groups from the CSV files written by the split step
data_without, data_with = (
    pd.read_csv(csv_path)
    for csv_path in ("data_without_comments4.csv", "data_with_comments4.csv")
)
# Clean text function: Remove non-Korean characters
def text_cleaning(text):
    """Remove everything except Hangul characters and spaces from *text*.

    Args:
        text: Raw title string. A non-string value (for example the NaN that
            pandas yields for an empty CSV cell) is treated as empty text.

    Returns:
        *text* with every run of characters other than spaces, Hangul jamo
        (ㄱ-ㅣ) and Hangul syllables (가-힣) removed; ``''`` for non-strings.
    """
    if not isinstance(text, str):
        return ''
    hangul = re.compile('[^ ㄱ-ㅣ가-힣]+')
    return hangul.sub('', text)
def split_text(text):
    """Split *text* on runs of whitespace and return the resulting tokens."""
    return text.split()
# Clean the tokenized titles and split them into token lists, group by group
for frame in (data_without, data_with):
    frame['title_cleaned'] = frame['title_tokenized'].apply(text_cleaning)
    frame['processed'] = frame['title_cleaned'].apply(split_text)

# One gensim dictionary per group, then the matching bag-of-words corpus
dictionary_data_without = Dictionary(data_without['processed'])
dictionary_data_with = Dictionary(data_with['processed'])
corpus_data_without = list(map(dictionary_data_without.doc2bow, data_without['processed']))
corpus_data_with = list(map(dictionary_data_with.doc2bow, data_with['processed']))

# Candidate topic counts: 4 through 19 (range() excludes the stop value 20)
topic_range = range(4, 20)
# Compute perplexity
def compute_perplexity(dictionary, corpus, num_topics):
    """Train an LDA model and return its log-perplexity on *corpus*.

    Args:
        dictionary: Token-id mapping passed to ``LdaModel`` as ``id2word``.
        corpus: Bag-of-words corpus used both for training and evaluation.
        num_topics: Number of topics to fit.

    Returns:
        The value of ``LdaModel.log_perplexity(corpus)``.
        NOTE(review): per the gensim documentation this is a per-word
        likelihood bound rather than the perplexity itself
        (perplexity = 2 ** -bound) — confirm that reporting the raw bound
        under the name "perplexity" is intended.
    """
    model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=5, random_state=42)
    return model.log_perplexity(corpus)
# Compute coherence score
def compute_coherence_score(dictionary, corpus, tokens, num_topics):
    """Train an LDA model and return its ``c_v`` coherence score.

    Args:
        dictionary: Token-id mapping passed to ``LdaModel`` as ``id2word``
            and to ``CoherenceModel`` as ``dictionary``.
        corpus: Bag-of-words corpus the model is trained on.
        tokens: Tokenized documents passed to ``CoherenceModel`` as ``texts``.
        num_topics: Number of topics to fit.

    Returns:
        The result of ``CoherenceModel.get_coherence()`` for the fitted model.
    """
    model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=5, random_state=42)
    coherence_model = CoherenceModel(model=model, texts=tokens, dictionary=dictionary, coherence='c_v')
    return coherence_model.get_coherence()
# Calculate scores with progress tracking
def calculate_scores(num_topics, dictionary, corpus, tokens):
    """Fit one LDA model and report its log-perplexity and coherence.

    The model is trained once and both metrics are read from it. Training a
    second model with the same corpus, settings and ``random_state=42`` for
    the coherence score would only repeat the same (expensive) fit.

    Args:
        num_topics: Number of topics to fit.
        dictionary: Token-id mapping for the corpus.
        corpus: Bag-of-words corpus used for training and for the
            log-perplexity evaluation.
        tokens: Tokenized documents used for the ``c_v`` coherence score.

    Returns:
        A ``(perplexity, coherence)`` tuple, where ``perplexity`` is the value
        of ``LdaModel.log_perplexity(corpus)``. A progress line is printed as
        a side effect.
    """
    model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=5, random_state=42)
    perplexity = model.log_perplexity(corpus)
    coherence = CoherenceModel(
        model=model, texts=tokens, dictionary=dictionary, coherence='c_v'
    ).get_coherence()
    print(f"Completed: {num_topics} Topics - Perplexity: {perplexity}, Coherence: {coherence}")
    return perplexity, coherence
# Plot scores for each dataset
plot_inputs = (
    ('data_without', dictionary_data_without, corpus_data_without, data_without['processed'], "Group A(0 comments)"),
    ('data_with', dictionary_data_with, corpus_data_with, data_with['processed'], "Group B(1 or more comments)"),
)
for dataset_name, dictionary, corpus, tokens, xlabel in plot_inputs:
    # One (perplexity, coherence) pair per candidate topic count
    scores = [calculate_scores(num_topics, dictionary, corpus, tokens) for num_topics in topic_range]
    perplexity_scores, coherence_scores = zip(*scores)

    # Two y-axes over a shared x-axis: perplexity in red, coherence in blue.
    # NOTE(review): the x values are topic counts, but the x label shows the
    # group description — confirm this labelling is intended.
    fig, ax1 = plt.subplots()
    ax1.set_title(dataset_name)
    ax1.set_xlabel(xlabel)
    ax2 = ax1.twinx()
    for axis, label, values, colour in (
        (ax1, "Perplexity", perplexity_scores, "tab:red"),
        (ax2, "Coherence Score", coherence_scores, "tab:blue"),
    ):
        axis.set_ylabel(label, color=colour)
        axis.plot(topic_range, values, color=colour)
        axis.tick_params(axis="y", labelcolor=colour)

    fig.tight_layout()
    plt.show()

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
# Train the LDA models (8 topics for the no-comment group, 13 for the
# with-comment group)
lda_data_without = LdaModel(corpus_data_without, id2word=dictionary_data_without, num_topics=8, passes=20, random_state=42)
lda_data_with = LdaModel(corpus_data_with, id2word=dictionary_data_with, num_topics=13, passes=20, random_state=42)

# Prepare the pyLDAvis data for each model
vis_data_without = gensimvis.prepare(lda_data_without, corpus_data_without, dictionary_data_without, mds='mmds', n_jobs=1)
vis_data_with = gensimvis.prepare(lda_data_with, corpus_data_with, dictionary_data_with, mds='mmds', n_jobs=1)

# NOTE(review): only the no-comment group's view is displayed here;
# vis_data_with is prepared but not displayed in the visible source.
pyLDAvis.display(vis_data_without)












# Print every topic of each model. The topic count is read from the model
# itself so this stays in sync with the num_topics used at training time.
for topic_num, topic_keywords in lda_data_without.print_topics(num_topics=lda_data_without.num_topics):
    print(f"{topic_num} : {topic_keywords}")

for topic_num, topic_keywords in lda_data_with.print_topics(num_topics=lda_data_with.num_topics):
    print(f"{topic_num} : {topic_keywords}")

# Next step: Comparing groups using TF-IDF technique only
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import LdaModel, TfidfModel
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
# Fit one TF-IDF model per group on that group's bag-of-words corpus
tfidf_data_without, tfidf_data_with = (
    TfidfModel(corpus_data_without),
    TfidfModel(corpus_data_with),
)

# Re-weight each corpus with its own group's TF-IDF model
corpus_tfidf_data_without = tfidf_data_without[corpus_data_without]
corpus_tfidf_data_with = tfidf_data_with[corpus_data_with]
# Find the top N keywords based on average TF-IDF scores
def get_top_N_keywords(tfidf_corpus, dictionary, N):
    """Return the *N* terms with the highest average TF-IDF score.

    Args:
        tfidf_corpus: Sized iterable of documents, each an iterable of
            ``(term_id, tfidf_score)`` pairs.
        dictionary: Mapping from term id to term; ``len(dictionary)`` must
            cover every term id that occurs in the corpus.
        N: Number of keywords to return.

    Returns:
        A list of ``(term, average_score)`` tuples, highest score first. The
        average is taken over all documents, so a document that lacks a term
        contributes 0 for it. The list is empty when ``N <= 0``.
    """
    if N <= 0:
        # A slice of [-0:] would select every term instead of none.
        return []

    avg_tfidf = np.zeros(len(dictionary))
    for doc in tfidf_corpus:
        for term_id, tfidf_score in doc:
            avg_tfidf[term_id] += tfidf_score

    num_docs = len(tfidf_corpus)
    if num_docs:
        # Skip the division for an empty corpus: 0 / 0 would fill the scores
        # with NaN.
        avg_tfidf /= num_docs

    top_N_indices = avg_tfidf.argsort()[-N:][::-1]
    return [(dictionary[int(i)], avg_tfidf[i]) for i in top_N_indices]
# Find the top N keywords for each dataset
N = 10
top_N_keywords_data_without = get_top_N_keywords(corpus_tfidf_data_without, dictionary_data_without, N)
top_N_keywords_data_with = get_top_N_keywords(corpus_tfidf_data_with, dictionary_data_with, N)

# Compare the top N keywords between the two datasets (the label is built
# from N so it stays correct if N is changed)
print(f"Top {N} keywords in A그룹(0 comments):", top_N_keywords_data_without)
print(f"Top {N} keywords in B그룹(1 or more comments):", top_N_keywords_data_with)

# Results show top 10 TF-IDF keywords from both groups. Similar keywords
# appear across groups (e.g., Gangnam drug drink incident), reflecting similar
# headlines during specific reporting periods.
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
import numpy as np
# Build a single dictionary over the tokens of both groups so that term ids
# are shared between them
merged_dictionary = Dictionary(documents=[*data_without['processed'], *data_with['processed']])

# Re-encode each group's documents as bag-of-words under the shared ids
corpus_data_without_merged = list(map(merged_dictionary.doc2bow, data_without['processed']))
corpus_data_with_merged = list(map(merged_dictionary.doc2bow, data_with['processed']))

# Refit the TF-IDF models on the re-encoded corpora.
# NOTE(review): each group still gets its own TfidfModel, so the weights are
# fitted per group — confirm that comparing scores across groups on that
# basis is intended.
tfidf_data_without = TfidfModel(corpus_data_without_merged)
tfidf_data_with = TfidfModel(corpus_data_with_merged)

# TF-IDF representation of each re-encoded corpus
corpus_tfidf_data_without_merged = tfidf_data_without[corpus_data_without_merged]
corpus_tfidf_data_with_merged = tfidf_data_with[corpus_data_with_merged]
# Calculate the average TF-IDF scores for each term in the given corpus
def calculate_avg_tfidf(tfidf_corpus, dictionary):
    """Return the per-term TF-IDF score averaged over all documents.

    Args:
        tfidf_corpus: Sized iterable of documents, each an iterable of
            ``(term_id, tfidf_score)`` pairs.
        dictionary: Sized term-id mapping; ``len(dictionary)`` sets the
            length of the result and must cover every term id in the corpus.

    Returns:
        A NumPy array indexed by term id. Each entry is the sum of that
        term's scores divided by the number of documents, so documents that
        lack the term count as 0. All zeros for an empty corpus.
    """
    avg_tfidf = np.zeros(len(dictionary))
    for doc in tfidf_corpus:
        for term_id, tfidf_score in doc:
            avg_tfidf[term_id] += tfidf_score

    num_docs = len(tfidf_corpus)
    if num_docs:
        # Skip the division for an empty corpus: 0 / 0 would yield NaN.
        avg_tfidf /= num_docs
    return avg_tfidf
# Calculate average TF-IDF scores for each dataset
avg_tfidf_data_without = calculate_avg_tfidf(corpus_tfidf_data_without_merged, merged_dictionary)
avg_tfidf_data_with = calculate_avg_tfidf(corpus_tfidf_data_with_merged, merged_dictionary)

# Difference in average TF-IDF scores (data_with minus data_without)
tfidf_diff = avg_tfidf_data_with - avg_tfidf_data_without

# Sort the terms by descending difference (terms scoring higher in data_with first)
sorted_indices = np.argsort(tfidf_diff)[::-1]

# Print the terms with their average TF-IDF scores in both groups and their difference
print(f"{'Term':<40}{'Group A':<10}{'Group B':<10}{'Difference':<10}")
print("-" * 50)
for i in sorted_indices[:40]:  # Display the top 40 terms
    term = merged_dictionary[i]
    group_a_score = avg_tfidf_data_without[i]
    group_b_score = avg_tfidf_data_with[i]
    diff = tfidf_diff[i]
    print(f"{term:<40}{group_a_score:<10.4f}{group_b_score:<10.4f}{diff:<10.4f}")

# TF-IDF score comparison identified terms exclusive to Group B (articles with
# comments, zero value in Group A).
Analysis revealed 5 key patterns in high-engagement articles:
1. Political/drug news (e.g., Han Dong-hoon, Chun Doo-hwan cases)
2. Specific drug names (methamphetamine, marijuana, new synthetic drugs)
3. Local incident details (Gangnam drink case, specific locations)
4. Distribution channels (delivery, supply, part-time workers)
5. Sensational language (surge, all-out war, mass arrests)
Next step: To avoid logical leaps that could occur if these patterns appear in both groups, we will now examine terms that are distinctively characteristic of Group A (articles without comments).
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
import numpy as np
# Build a single dictionary over the tokens of both groups so that term ids
# are shared between them
merged_dictionary = Dictionary(documents=[*data_without['processed'], *data_with['processed']])

# Re-encode each group's documents as bag-of-words under the shared ids
corpus_data_without_merged = list(map(merged_dictionary.doc2bow, data_without['processed']))
corpus_data_with_merged = list(map(merged_dictionary.doc2bow, data_with['processed']))

# Refit one TF-IDF model per group on the re-encoded corpora
tfidf_data_without = TfidfModel(corpus_data_without_merged)
tfidf_data_with = TfidfModel(corpus_data_with_merged)

# TF-IDF representation of each re-encoded corpus
corpus_tfidf_data_without_merged = tfidf_data_without[corpus_data_without_merged]
corpus_tfidf_data_with_merged = tfidf_data_with[corpus_data_with_merged]
# Calculate the average TF-IDF scores for each term in the given corpus
def calculate_avg_tfidf(tfidf_corpus, dictionary):
    """Return the per-term TF-IDF score averaged over all documents.

    Args:
        tfidf_corpus: Sized iterable of documents, each an iterable of
            ``(term_id, tfidf_score)`` pairs.
        dictionary: Sized term-id mapping; ``len(dictionary)`` sets the
            length of the result and must cover every term id in the corpus.

    Returns:
        A NumPy array indexed by term id. Each entry is the sum of that
        term's scores divided by the number of documents, so documents that
        lack the term count as 0. All zeros for an empty corpus.
    """
    avg_tfidf = np.zeros(len(dictionary))
    for doc in tfidf_corpus:
        for term_id, tfidf_score in doc:
            avg_tfidf[term_id] += tfidf_score

    num_docs = len(tfidf_corpus)
    if num_docs:
        # Skip the division for an empty corpus: 0 / 0 would yield NaN.
        avg_tfidf /= num_docs
    return avg_tfidf
# Calculate average TF-IDF scores for each dataset
avg_tfidf_data_without = calculate_avg_tfidf(corpus_tfidf_data_without_merged, merged_dictionary)
avg_tfidf_data_with = calculate_avg_tfidf(corpus_tfidf_data_with_merged, merged_dictionary)

# Difference in average TF-IDF scores (data_without minus data_with)
tfidf_diff = avg_tfidf_data_without - avg_tfidf_data_with

# Sort the terms by descending difference (terms scoring higher in data_without first)
sorted_indices = np.argsort(tfidf_diff)[::-1]

# Print the terms with their average TF-IDF scores in both groups and their difference
print(f"{'Term':<40}{'Group A':<10}{'Group B':<10}{'Difference':<10}")
print("-" * 50)
for i in sorted_indices[:40]:  # Display the top 40 terms
    term = merged_dictionary[i]
    group_a_score = avg_tfidf_data_without[i]
    group_b_score = avg_tfidf_data_with[i]
    diff = tfidf_diff[i]
    print(f"{term:<40}{group_a_score:<10.4f}{group_b_score:<10.4f}{diff:<10.4f}")

# Searched for counterexamples to Group B's five characteristics in Group A.
# While the term ‘강남구(Gangnam-gu)’ could potentially counter Group B’s third
# characteristic (specific incidents) as referencing the Gangnam drink
# incident, its absence in Group B makes it unsuitable for comparison. No
# other counterexamples were found.
Next step: After completing our TF-IDF analysis, we will explore keyword characteristics using Word2Vec for additional insights.
Word2Vec: a neural-network technique that converts words to vectors based on their context, capturing semantic relationships between words that appear in similar contexts.
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
# Pool the tokenized titles of both groups into one training corpus
combined_text = [*data_without['processed'], *data_with['processed']]

# Fit a single Word2Vec model on the pooled corpus
word2vec_settings = dict(vector_size=100, window=5, min_count=1, workers=4)
model = Word2Vec(sentences=combined_text, **word2vec_settings)
# Calculate the average word embeddings for each article in both groups
def average_word_embeddings(text, word2vec_model):
    """Return the mean embedding of the tokens in *text*.

    Args:
        text: Iterable of tokens.
        word2vec_model: Object exposing ``wv`` (supports ``token in wv`` and
            ``wv[token]``) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``word2vec_model.wv``. If none are found, a zero vector of length
        ``word2vec_model.vector_size``.
    """
    keyed_vectors = word2vec_model.wv
    embeddings = [keyed_vectors[word] for word in text if word in keyed_vectors]
    if not embeddings:
        # Size the fallback from the model that was passed in, not from a
        # module-level `model` that may be absent or a different model.
        return np.zeros((word2vec_model.vector_size,))
    return np.mean(embeddings, axis=0)
# One averaged embedding per article in each group
data_without['avg_word_embeddings'] = data_without['processed'].apply(average_word_embeddings, word2vec_model=model)
data_with['avg_word_embeddings'] = data_with['processed'].apply(average_word_embeddings, word2vec_model=model)

# Calculate the average word embeddings for both groups
group_a_avg = np.mean(np.vstack(data_without['avg_word_embeddings']), axis=0)
group_b_avg = np.mean(np.vstack(data_with['avg_word_embeddings']), axis=0)

# Get all unique keywords from both groups. They are sorted because set
# iteration order is arbitrary, which would make the row order (and the order
# of tied scores below) differ between runs.
keywords = sorted({word for text in combined_text for word in text})
keyword_vectors = np.vstack([model.wv[keyword] for keyword in keywords])

# Score each keyword against each group's average embedding.
# NOTE(review): these are raw dot products, not cosine similarities, so they
# also depend on vector length — confirm this is the intended measure.
keyword_scores_a = np.dot(keyword_vectors, group_a_avg)
keyword_scores_b = np.dot(keyword_vectors, group_b_avg)

# Difference in scores between the two groups for each keyword (B minus A)
keyword_diffs = keyword_scores_b - keyword_scores_a

# Sort the keywords by descending difference (higher in data_with first)
sorted_indices = np.argsort(keyword_diffs)[::-1]

# Print the keywords with their scores in both groups and their difference
print(f"{'Term':<40}{'Group A':<10}{'Group B':<10}{'Difference':<10}")
print("-" * 50)
for i in sorted_indices[:40]:  # Display the top 40 terms
    term = keywords[i]
    group_a_score = keyword_scores_a[i]
    group_b_score = keyword_scores_b[i]
    diff = keyword_diffs[i]
    print(f"{term:<40}{group_a_score:<10.4f}{group_b_score:<10.4f}{diff:<10.4f}")

# Since word2vec finds semantic similarities by analyzing contextual flow, no
# significant differences were observed.
Next step: Frequency analysis and word cloud visualization
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
# Configure a Korean font for matplotlib text.
from pathlib import Path

font_path = Path("c:/Windows/Fonts/malgun.ttf")
if font_path.exists():
    font_name = font_manager.FontProperties(fname=str(font_path)).get_name()
    rc('font', family=font_name)
else:
    # The path above only exists on Windows; elsewhere keep matplotlib's
    # current font instead of failing before any plot is drawn.
    print(f"Font file not found: {font_path} - keeping the current matplotlib font")
# # Create a function to clean tokens
# def clean_token(token):
# return token.replace("[", "").replace("]", "").replace("'", "").strip()
def word_graph(ax, cnt, group_name, max_words=10):
    """Draw a horizontal bar chart of the most frequent words on *ax*.

    Args:
        ax: Axes-like object providing ``barh`` and ``set_title``.
        cnt: Mapping from word to count.
        group_name: Used as the chart title and in the printed summary.
        max_words: Maximum number of words to show; the highest counts are
            kept, listed in ascending order of count.

    Returns:
        None. Prints the selected ``(word, count)`` pairs and draws on *ax*.
        With no words to show, only the title is set.
    """
    sorted_w = sorted(cnt.items(), key=lambda kv: kv[1])
    # Slice from an explicit start index: sorted_w[-max_words:] would return
    # every word when max_words is 0.
    top_words = sorted_w[max(len(sorted_w) - max_words, 0):] if max_words > 0 else []
    print(f"{group_name}:\n", top_words)
    ax.set_title(group_name)
    if not top_words:
        # zip(*[]) yields nothing to unpack into words and counts.
        return
    w, n = zip(*top_words)
    ax.barh(range(len(w)), n, tick_label=w)
# Text-cleaning function: removes every character other than Hangul.
def clean_token(text):
    """Remove everything except Hangul characters and spaces from *text*.

    Args:
        text: Token or title string. A non-string value (for example NaN) is
            treated as empty text.

    Returns:
        *text* with every run of characters other than spaces, Hangul jamo
        (ㄱ-ㅣ) and Hangul syllables (가-힣) removed; ``''`` for non-strings.
    """
    if not isinstance(text, str):
        return ''
    # Keep only Hangul by deleting everything the pattern matches.
    hangul = re.compile('[^ ㄱ-ㅣ가-힣]+')
    return hangul.sub('', text)
def split_text(text):
    """Split *text* on runs of whitespace and return the resulting tokens."""
    return text.split()
from collections import Counter

# Apply the text cleaning and tokenization to the in-memory frames.
for frame in (data_without, data_with):
    frame['title_cleaned'] = frame['title_tokenized'].apply(text_cleaning)
    frame['processed'] = frame['title_cleaned'].apply(split_text)

fig, axes = plt.subplots(2, 1, figsize=(10, 12))

# Group A
data_A = pd.read_csv('data_without_comments4.csv')
data_A['title_cleaned'] = data_A['title_tokenized'].apply(text_cleaning)
data_A['processed'] = data_A['title_cleaned'].apply(split_text)
content_tokens_A = data_A['processed']
# Count every non-empty cleaned token. Counter keeps first-seen order, so
# equal counts are ranked the same way as with a plain dict.
tokens_cnt_A = Counter(
    cleaned_token
    for tokens in content_tokens_A
    for cleaned_token in map(clean_token, tokens)
    if cleaned_token
)
word_graph(axes[0], tokens_cnt_A, "Group A", max_words=20)

# Group B
data_B = pd.read_csv('data_with_comments4.csv')
data_B['title_cleaned'] = data_B['title_tokenized'].apply(text_cleaning)
data_B['processed'] = data_B['title_cleaned'].apply(split_text)
content_tokens_B = data_B['processed']
tokens_cnt_B = Counter(
    cleaned_token
    for tokens in content_tokens_B
    for cleaned_token in map(clean_token, tokens)
    if cleaned_token
)
word_graph(axes[1], tokens_cnt_B, "Group B", max_words=20)

plt.tight_layout()
plt.show()

# Since word2vec finds semantic similarities by analyzing contextual flow, no
# significant differences were observed.
# NOTE(review): this sentence repeats the conclusion of the Word2Vec section
# and may be misplaced after the frequency plots — confirm.
Next step: Frequency analysis and word cloud visualization