import pandas as pd

# Load the combined article data.
data = pd.read_csv("final_combined.csv")

# If comment_count was exported as text with thousands separators
# (e.g. "1,234"), strip them before the numeric conversion:
# data['comment_count'] = data['comment_count'].str.replace(',', '')

# Normalise comment_count to integers, treating missing values as 0.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the column selection: the in-place form operates
# on an intermediate object and is not guaranteed to update `data`.
data['comment_count'] = data['comment_count'].astype(float).fillna(0).astype(int)

# Group A: articles without comments. Group B: articles with at least one.
data_without = data[data['comment_count'] == 0]
data_with = data[data['comment_count'] >= 1]

# Save the classified data to separate CSV files.
data_without.to_csv("data_without_comments.csv", index=False)
data_with.to_csv("data_with_comments.csv", index=False)

# Report the size of each group.
print(len(data_without))
print(len(data_with))

import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import matplotlib.pyplot as plt
from gensim.models.coherencemodel import CoherenceModel
import re
# Reload both groups from their CSV files
data_without, data_with = (
    pd.read_csv(csv_path)
    for csv_path in ("data_without_comments.csv", "data_with_comments.csv")
)
# Matches every run of characters that is neither a space nor Hangul
# (the jamo range ㄱ-ㅣ or the syllable range 가-힣). Compiled once instead
# of on every call.
_NON_HANGUL = re.compile(r'[^ ㄱ-ㅣ가-힣]+')


# Text cleaning function: Removes all characters except Korean
def text_cleaning(text):
    """Strip everything except Hangul characters and spaces from *text*.

    Args:
        text: The string to clean. Non-string values (for example the NaN
            pandas yields for an empty CSV cell) are treated as empty text.

    Returns:
        The input with every non-Hangul, non-space character removed.
        Spaces are kept untouched, so consecutive spaces may remain.
    """
    if not isinstance(text, str):
        return ''
    return _NON_HANGUL.sub('', text)
def split_text(text):
    """Split *text* on runs of whitespace and return the list of tokens."""
    return text.split()
# Clean the pre-tokenised text of each group, then split it into token lists
for frame in (data_without, data_with):
    frame['content_cleaned'] = frame['content_tokenized'].apply(text_cleaning)
    frame['processed'] = frame['content_cleaned'].apply(split_text)

# Build a separate dictionary and bag-of-words corpus for each group
dictionary_data_without = Dictionary(data_without['processed'])
corpus_data_without = [
    dictionary_data_without.doc2bow(tokens) for tokens in data_without['processed']
]

dictionary_data_with = Dictionary(data_with['processed'])
corpus_data_with = [
    dictionary_data_with.doc2bow(tokens) for tokens in data_with['processed']
]

# Candidate topic counts to evaluate: 4 through 19
topic_range = range(4, 20)
# Compute perplexity
def compute_perplexity(dictionary, corpus, num_topics):
    """Train an LDA model and return its log-perplexity on *corpus*.

    Args:
        dictionary: Token mapping handed to the model as ``id2word``.
        corpus: Bag-of-words corpus used for both training and scoring.
        num_topics: Number of topics to fit.

    Returns:
        ``LdaModel.log_perplexity(corpus)`` for a model trained with
        20 passes and ``random_state=42``.
    """
    # NOTE(review): gensim describes log_perplexity as a per-word likelihood
    # bound rather than the perplexity itself — confirm how it should be read.
    lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=20, random_state=42)
    return lda.log_perplexity(corpus)
# Compute coherence score
def compute_coherence_score(dictionary, corpus, tokens, num_topics):
    """Train an LDA model and return its ``c_v`` coherence score.

    Args:
        dictionary: Token mapping handed to the model as ``id2word`` and to
            the coherence model as ``dictionary``.
        corpus: Bag-of-words corpus the LDA model is trained on.
        tokens: Tokenised documents passed to ``CoherenceModel`` as ``texts``.
        num_topics: Number of topics to fit.

    Returns:
        The value of ``CoherenceModel.get_coherence()`` for a model trained
        with 20 passes and ``random_state=42``.
    """
    lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=20, random_state=42)
    scorer = CoherenceModel(model=lda, texts=tokens, dictionary=dictionary, coherence='c_v')
    return scorer.get_coherence()
# Calculate scores while displaying progress
def calculate_scores(num_topics, dictionary, corpus, tokens):
    """Return ``(perplexity, coherence)`` for an LDA model with *num_topics* topics.

    The model is trained once and both metrics are read from it. Training
    two separate models with the same corpus, settings and
    ``random_state=42`` only repeated the same seeded fit.

    Args:
        num_topics: Number of topics to fit.
        dictionary: Token mapping used as ``id2word`` and for coherence.
        corpus: Bag-of-words corpus used for training and log-perplexity.
        tokens: Tokenised documents used for the ``c_v`` coherence score.

    Returns:
        A ``(log_perplexity, coherence)`` tuple. A progress line with both
        values is printed as a side effect.
    """
    lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=20, random_state=42)
    perplexity = lda.log_perplexity(corpus)
    coherence = CoherenceModel(
        model=lda, texts=tokens, dictionary=dictionary, coherence='c_v'
    ).get_coherence()
    print(f"Completed: {num_topics} Topics - Perplexity: {perplexity}, Coherence: {coherence}")
    return perplexity, coherence
# Calculate and plot scores for each dataset
for dataset_name, dictionary, corpus, tokens, xlabel in [
    ('data_without', dictionary_data_without, corpus_data_without, data_without['processed'], "Group A(0 comments)"),
    ('data_with', dictionary_data_with, corpus_data_with, data_with['processed'], "Group B(1 or more comments)"),
]:
    # Calculate scores for every candidate topic count
    scores = [calculate_scores(num_topics, dictionary, corpus, tokens) for num_topics in topic_range]
    # Separate the (perplexity, coherence) pairs into two series
    perplexity_scores, coherence_scores = zip(*scores)

    # Left axis: perplexity (red). The x axis carries the topic counts and
    # is labelled with the group description.
    fig, ax1 = plt.subplots()
    ax1.set_title(dataset_name)
    ax1.set_xlabel(xlabel)
    ax1.set_ylabel("Perplexity", color="tab:red")
    ax1.plot(topic_range, perplexity_scores, color="tab:red")
    ax1.tick_params(axis="y", labelcolor="tab:red")

    # Right axis, sharing x: coherence (blue)
    ax2 = ax1.twinx()
    ax2.set_ylabel("Coherence Score", color="tab:blue")
    ax2.plot(topic_range, coherence_scores, color="tab:blue")
    ax2.tick_params(axis="y", labelcolor="tab:blue")

    fig.tight_layout()
    plt.show()

# Train the LDA models
lda_data_without = LdaModel(corpus_data_without, id2word=dictionary_data_without, num_topics=9, passes=20, random_state=42)
lda_data_with = LdaModel(corpus_data_with, id2word=dictionary_data_with, num_topics=12, passes=20, random_state=42)

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Prepare the pyLDAvis data for each model
vis_data_without = gensimvis.prepare(lda_data_without, corpus_data_without, dictionary_data_without, mds='mmds', n_jobs=1)
vis_data_with = gensimvis.prepare(lda_data_with, corpus_data_with, dictionary_data_with, mds='mmds', n_jobs=1)

# Print the keyword summary of every topic of the group without comments
for topic_num, topic_keywords in lda_data_without.print_topics(num_topics=9):
    print(f"{topic_num} : {topic_keywords}")

# Print the keyword summary of every topic of the group with comments
for topic_num, topic_keywords in lda_data_with.print_topics(num_topics=12):
    print(f"{topic_num} : {topic_keywords}")

# NOTE(review): presumably this was the last expression of a notebook cell,
# which is what renders the visualisation — verify when run as a script.
pyLDAvis.display(vis_data_without)









# Print the keyword summary of every topic of the group with comments
for topic_num, topic_keywords in lda_data_with.print_topics(num_topics=12):
    print(f"{topic_num} : {topic_keywords}")

pyLDAvis.display(vis_data_with)











import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
import numpy as np
# Build one dictionary over the documents of both groups so term ids are shared
merged_dictionary = Dictionary(
    documents=[*data_without['processed'], *data_with['processed']]
)

# Re-encode each group as bag-of-words using the shared dictionary
corpus_data_without_merged = list(map(merged_dictionary.doc2bow, data_without['processed']))
corpus_data_with_merged = list(map(merged_dictionary.doc2bow, data_with['processed']))

# Fit one TF-IDF model per group and apply it to that group's own corpus
tfidf_data_without = TfidfModel(corpus_data_without_merged)
corpus_tfidf_data_without_merged = tfidf_data_without[corpus_data_without_merged]

tfidf_data_with = TfidfModel(corpus_data_with_merged)
corpus_tfidf_data_with_merged = tfidf_data_with[corpus_data_with_merged]
# Calculate the average TF-IDF scores for each term in the given corpus
def calculate_avg_tfidf(tfidf_corpus, dictionary):
    """Return the mean TF-IDF weight of every term across *tfidf_corpus*.

    Args:
        tfidf_corpus: Iterable of documents, each an iterable of
            ``(term_id, tfidf_score)`` pairs.
        dictionary: Sized vocabulary container; only ``len()`` is used, to
            size the result.

    Returns:
        A NumPy array of length ``len(dictionary)``. Entry ``i`` is the sum
        of term ``i``'s scores divided by the number of documents, so
        documents that lack the term count as 0. An empty corpus yields all
        zeros rather than a 0/0 division.
    """
    totals = np.zeros(len(dictionary))
    # Count documents while iterating so the corpus need not support len().
    doc_count = 0
    for doc in tfidf_corpus:
        doc_count += 1
        for term_id, tfidf_score in doc:
            totals[term_id] += tfidf_score
    if doc_count:
        totals /= doc_count
    return totals
# Calculate average TF-IDF scores for each dataset
avg_tfidf_data_without = calculate_avg_tfidf(corpus_tfidf_data_without_merged, merged_dictionary)
avg_tfidf_data_with = calculate_avg_tfidf(corpus_tfidf_data_with_merged, merged_dictionary)

# Per-term difference in average TF-IDF: positive means higher in data_with
tfidf_diff = avg_tfidf_data_with - avg_tfidf_data_without

# Term ids ordered from the largest to the smallest difference
sorted_indices = np.argsort(tfidf_diff)[::-1]

# Print the terms with their average TF-IDF scores in both groups and their difference
print(f"{'Term':<40}{'Group A':<10}{'Group B':<10}{'Difference':<10}")
print("-" * 50)
for i in sorted_indices[:40]:  # Display the top 40 terms
    term = merged_dictionary[i]
    group_a_score = avg_tfidf_data_without[i]
    group_b_score = avg_tfidf_data_with[i]
    diff = tfidf_diff[i]
    print(f"{term:<40}{group_a_score:<10.4f}{group_b_score:<10.4f}{diff:<10.4f}")

import pandas as pd
import numpy as np
from gensim.models import Word2Vec
# Pool the token lists of both groups into one training set
combined_text = [*data_without['processed'], *data_with['processed']]

# Fit a single Word2Vec model on the pooled documents
model = Word2Vec(
    sentences=combined_text,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
# Calculate the average word embeddings for each article in both groups
def average_word_embeddings(text, word2vec_model):
    """Return the mean embedding of the words of *text* known to the model.

    Args:
        text: Iterable of tokens.
        word2vec_model: Object exposing ``wv`` (supports ``in`` and indexing
            by word) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``word2vec_model.wv``, or a zero vector of length ``vector_size``
        when no token is found.
    """
    vectors = word2vec_model.wv
    found = [vectors[word] for word in text if word in vectors]
    if not found:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(found, axis=0)
# Mean embedding of every article in each group
data_without['avg_word_embeddings'] = data_without['processed'].apply(average_word_embeddings, word2vec_model=model)
data_with['avg_word_embeddings'] = data_with['processed'].apply(average_word_embeddings, word2vec_model=model)

# Calculate the average word embeddings for both groups
group_a_avg = np.mean(np.vstack(data_without['avg_word_embeddings']), axis=0)
group_b_avg = np.mean(np.vstack(data_with['avg_word_embeddings']), axis=0)

# Get all unique keywords from both groups
keywords = list({word for text in combined_text for word in text})
keyword_vectors = np.vstack([model.wv[keyword] for keyword in keywords])

# Score every keyword against each group mean (plain dot product)
keyword_scores_a = np.dot(keyword_vectors, group_a_avg)
keyword_scores_b = np.dot(keyword_vectors, group_b_avg)

# Per-keyword difference in score: positive means higher for data_with
keyword_diffs = keyword_scores_b - keyword_scores_a

# Keyword positions ordered from the largest to the smallest difference
sorted_indices = np.argsort(keyword_diffs)[::-1]

# Print the keywords with their similarity scores in both groups and their difference
print(f"{'Term':<40}{'Group A':<10}{'Group B':<10}{'Difference':<10}")
print("-" * 50)
for i in sorted_indices[:40]:  # Display the top 40 terms
    term = keywords[i]
    group_a_score = keyword_scores_a[i]
    group_b_score = keyword_scores_b[i]
    diff = keyword_diffs[i]
    print(f"{term:<40}{group_a_score:<10.4f}{group_b_score:<10.4f}{diff:<10.4f}")

import pandas as pd
# Load the combined article data.
data = pd.read_csv("final_combined.csv")

# If comment_count was exported as text with thousands separators
# (e.g. "1,234"), strip them before the numeric conversion:
# data['comment_count'] = data['comment_count'].str.replace(',', '')

# Normalise comment_count to integers, treating missing values as 0.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the column selection: the in-place form operates
# on an intermediate object and is not guaranteed to update `data`.
data['comment_count'] = data['comment_count'].astype(float).fillna(0).astype(int)

# Group A: 1 to 9 comments. Group B: 10 or more comments.
# Articles with no comments fall into neither group. The variable and file
# names are kept from the earlier split.
data_without = data[(data['comment_count'] >= 1) & (data['comment_count'] < 10)]
data_with = data[data['comment_count'] >= 10]

# Save the classified data to separate CSV files.
data_without.to_csv("data_without_comments.csv", index=False)
data_with.to_csv("data_with_comments.csv", index=False)

# Report the size of each group.
print(len(data_without))
print(len(data_with))

import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import matplotlib.pyplot as plt
from gensim.models.coherencemodel import CoherenceModel
import re
# Reload both groups from their CSV files
data_without, data_with = (
    pd.read_csv(csv_path)
    for csv_path in ("data_without_comments.csv", "data_with_comments.csv")
)
# Matches every run of characters that is neither a space nor Hangul
# (the jamo range ㄱ-ㅣ or the syllable range 가-힣). Compiled once instead
# of on every call.
_NON_HANGUL = re.compile(r'[^ ㄱ-ㅣ가-힣]+')


# Text cleaning function: Remove all non-Korean characters
def text_cleaning(text):
    """Strip everything except Hangul characters and spaces from *text*.

    Args:
        text: The string to clean. Non-string values (for example the NaN
            pandas yields for an empty CSV cell) are treated as empty text.

    Returns:
        The input with every non-Hangul, non-space character removed.
        Spaces are kept untouched, so consecutive spaces may remain.
    """
    if not isinstance(text, str):
        return ''
    return _NON_HANGUL.sub('', text)
def split_text(text):
    """Split *text* on runs of whitespace and return the list of tokens."""
    return text.split()
# Clean the pre-tokenised text of each group, then split it into token lists
for frame in (data_without, data_with):
    frame['content_cleaned'] = frame['content_tokenized'].apply(text_cleaning)
    frame['processed'] = frame['content_cleaned'].apply(split_text)

# Build a separate dictionary and bag-of-words corpus for each group
dictionary_data_without = Dictionary(data_without['processed'])
corpus_data_without = [
    dictionary_data_without.doc2bow(tokens) for tokens in data_without['processed']
]

dictionary_data_with = Dictionary(data_with['processed'])
corpus_data_with = [
    dictionary_data_with.doc2bow(tokens) for tokens in data_with['processed']
]

# Candidate topic counts to evaluate: 4 through 19
topic_range = range(4, 20)
# Compute perplexity
def compute_perplexity(dictionary, corpus, num_topics):
    """Train an LDA model and return its log-perplexity on *corpus*.

    Args:
        dictionary: Token mapping handed to the model as ``id2word``.
        corpus: Bag-of-words corpus used for both training and scoring.
        num_topics: Number of topics to fit.

    Returns:
        ``LdaModel.log_perplexity(corpus)`` for a model trained with
        20 passes and ``random_state=42``.
    """
    # NOTE(review): gensim describes log_perplexity as a per-word likelihood
    # bound rather than the perplexity itself — confirm how it should be read.
    lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=20, random_state=42)
    return lda.log_perplexity(corpus)
# Compute coherence score
def compute_coherence_score(dictionary, corpus, tokens, num_topics):
    """Train an LDA model and return its ``c_v`` coherence score.

    Args:
        dictionary: Token mapping handed to the model as ``id2word`` and to
            the coherence model as ``dictionary``.
        corpus: Bag-of-words corpus the LDA model is trained on.
        tokens: Tokenised documents passed to ``CoherenceModel`` as ``texts``.
        num_topics: Number of topics to fit.

    Returns:
        The value of ``CoherenceModel.get_coherence()`` for a model trained
        with 20 passes and ``random_state=42``.
    """
    lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=20, random_state=42)
    scorer = CoherenceModel(model=lda, texts=tokens, dictionary=dictionary, coherence='c_v')
    return scorer.get_coherence()
# Calculate scores while displaying progress
def calculate_scores(num_topics, dictionary, corpus, tokens):
    """Return ``(perplexity, coherence)`` for an LDA model with *num_topics* topics.

    The model is trained once and both metrics are read from it. Training
    two separate models with the same corpus, settings and
    ``random_state=42`` only repeated the same seeded fit.

    Args:
        num_topics: Number of topics to fit.
        dictionary: Token mapping used as ``id2word`` and for coherence.
        corpus: Bag-of-words corpus used for training and log-perplexity.
        tokens: Tokenised documents used for the ``c_v`` coherence score.

    Returns:
        A ``(log_perplexity, coherence)`` tuple. A progress line with both
        values is printed as a side effect.
    """
    lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=20, random_state=42)
    perplexity = lda.log_perplexity(corpus)
    coherence = CoherenceModel(
        model=lda, texts=tokens, dictionary=dictionary, coherence='c_v'
    ).get_coherence()
    print(f"Completed: {num_topics} Topics - Perplexity: {perplexity}, Coherence: {coherence}")
    return perplexity, coherence
# Calculate and plot scores for each dataset
for dataset_name, dictionary, corpus, tokens, xlabel in [
    ('data_without', dictionary_data_without, corpus_data_without, data_without['processed'], "Group A(1 ~ 9 comments)"),
    ('data_with', dictionary_data_with, corpus_data_with, data_with['processed'], "Group B(10 or more comments)"),
]:
    # Calculate scores for every candidate topic count
    scores = [calculate_scores(num_topics, dictionary, corpus, tokens) for num_topics in topic_range]
    # Separate the (perplexity, coherence) pairs into two series
    perplexity_scores, coherence_scores = zip(*scores)

    # Left axis: perplexity (red). The x axis carries the topic counts and
    # is labelled with the group description.
    fig, ax1 = plt.subplots()
    ax1.set_title(dataset_name)
    ax1.set_xlabel(xlabel)
    ax1.set_ylabel("Perplexity", color="tab:red")
    ax1.plot(topic_range, perplexity_scores, color="tab:red")
    ax1.tick_params(axis="y", labelcolor="tab:red")

    # Right axis, sharing x: coherence (blue)
    ax2 = ax1.twinx()
    ax2.set_ylabel("Coherence Score", color="tab:blue")
    ax2.plot(topic_range, coherence_scores, color="tab:blue")
    ax2.tick_params(axis="y", labelcolor="tab:blue")

    fig.tight_layout()
    plt.show()

# Based on the perplexity and coherence calculations (20 passes, random state 42), 12 topics were determined to be optimal for both Group A and Group B.
# Train the LDA models
# NOTE(review): the accompanying notes give 12 topics for both groups, but
# Group B is trained with 11 topics here — confirm which value is intended.
lda_data_without = LdaModel(corpus_data_without, id2word=dictionary_data_without, num_topics=12, passes=20, random_state=42)
lda_data_with = LdaModel(corpus_data_with, id2word=dictionary_data_with, num_topics=11, passes=20, random_state=42)

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Prepare the pyLDAvis data for each model
vis_data_without = gensimvis.prepare(lda_data_without, corpus_data_without, dictionary_data_without, mds='mmds', n_jobs=1)
vis_data_with = gensimvis.prepare(lda_data_with, corpus_data_with, dictionary_data_with, mds='mmds', n_jobs=1)

# Print the keyword summary of every Group A topic
for topic_num, topic_keywords in lda_data_without.print_topics(num_topics=12):
    print(f"{topic_num} : {topic_keywords}")

# Print the keyword summary of every Group B topic
for topic_num, topic_keywords in lda_data_with.print_topics(num_topics=11):
    print(f"{topic_num} : {topic_keywords}")

# NOTE(review): presumably each display call was the last expression of its
# own notebook cell, which is what renders it — verify when run as a script.
pyLDAvis.display(vis_data_without)

pyLDAvis.display(vis_data_with)











# Distinguishing topics between the low-comment and high-comment groups proved difficult.
# Therefore, TF-IDF was used to compare the characteristic keywords of the two groups.
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
import numpy as np
# Build one dictionary over the documents of both groups so term ids are shared
merged_dictionary = Dictionary(
    documents=[*data_without['processed'], *data_with['processed']]
)

# Re-encode each group as bag-of-words using the shared dictionary
corpus_data_without_merged = list(map(merged_dictionary.doc2bow, data_without['processed']))
corpus_data_with_merged = list(map(merged_dictionary.doc2bow, data_with['processed']))

# Fit one TF-IDF model per group and apply it to that group's own corpus
tfidf_data_without = TfidfModel(corpus_data_without_merged)
corpus_tfidf_data_without_merged = tfidf_data_without[corpus_data_without_merged]

tfidf_data_with = TfidfModel(corpus_data_with_merged)
corpus_tfidf_data_with_merged = tfidf_data_with[corpus_data_with_merged]
# Calculate the average TF-IDF scores for each term in the given corpus
def calculate_avg_tfidf(tfidf_corpus, dictionary):
    """Return the mean TF-IDF weight of every term across *tfidf_corpus*.

    Args:
        tfidf_corpus: Iterable of documents, each an iterable of
            ``(term_id, tfidf_score)`` pairs.
        dictionary: Sized vocabulary container; only ``len()`` is used, to
            size the result.

    Returns:
        A NumPy array of length ``len(dictionary)``. Entry ``i`` is the sum
        of term ``i``'s scores divided by the number of documents, so
        documents that lack the term count as 0. An empty corpus yields all
        zeros rather than a 0/0 division.
    """
    totals = np.zeros(len(dictionary))
    # Count documents while iterating so the corpus need not support len().
    doc_count = 0
    for doc in tfidf_corpus:
        doc_count += 1
        for term_id, tfidf_score in doc:
            totals[term_id] += tfidf_score
    if doc_count:
        totals /= doc_count
    return totals
# Calculate average TF-IDF scores for each dataset
avg_tfidf_data_without = calculate_avg_tfidf(corpus_tfidf_data_without_merged, merged_dictionary)
avg_tfidf_data_with = calculate_avg_tfidf(corpus_tfidf_data_with_merged, merged_dictionary)

# Per-term difference in average TF-IDF: positive means higher in data_with
tfidf_diff = avg_tfidf_data_with - avg_tfidf_data_without

# Term ids ordered from the largest to the smallest difference
sorted_indices = np.argsort(tfidf_diff)[::-1]

# Print the terms with their average TF-IDF scores in both groups and their difference
print(f"{'Term':<40}{'Group A':<10}{'Group B':<10}{'Difference':<10}")
print("-" * 50)
for i in sorted_indices[:40]:  # Display the top 40 terms
    term = merged_dictionary[i]
    group_a_score = avg_tfidf_data_without[i]
    group_b_score = avg_tfidf_data_with[i]
    diff = tfidf_diff[i]
    print(f"{term:<40}{group_a_score:<10.4f}{group_b_score:<10.4f}{diff:<10.4f}")

# - The code computes the difference in average TF-IDF score between the two datasets and ranks the top 40 keywords by the B - A difference to show Group B's distinctive terms.
# - The analysis shows that Group B's keywords (e.g. '한동훈') indicate political content, while terms like '학원가', '강남', and '마약음료' suggest high engagement with a specific incident.
# - Because the differences were minimal, a different approach was needed.
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
# Pool the token lists of both groups into one training set
combined_text = [*data_without['processed'], *data_with['processed']]

# Fit a single Word2Vec model on the pooled documents
model = Word2Vec(
    sentences=combined_text,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
# Calculate the average word embeddings for each article in both groups
def average_word_embeddings(text, word2vec_model):
    """Return the mean embedding of the words of *text* known to the model.

    Args:
        text: Iterable of tokens.
        word2vec_model: Object exposing ``wv`` (supports ``in`` and indexing
            by word) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``word2vec_model.wv``, or a zero vector of length ``vector_size``
        when no token is found.
    """
    vectors = word2vec_model.wv
    found = [vectors[word] for word in text if word in vectors]
    if not found:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(found, axis=0)
# Mean embedding of every article in each group
data_without['avg_word_embeddings'] = data_without['processed'].apply(average_word_embeddings, word2vec_model=model)
data_with['avg_word_embeddings'] = data_with['processed'].apply(average_word_embeddings, word2vec_model=model)

# Calculate the average word embeddings for both groups
group_a_avg = np.mean(np.vstack(data_without['avg_word_embeddings']), axis=0)
group_b_avg = np.mean(np.vstack(data_with['avg_word_embeddings']), axis=0)

# Get all unique keywords from both groups
keywords = list({word for text in combined_text for word in text})
keyword_vectors = np.vstack([model.wv[keyword] for keyword in keywords])

# Score every keyword against each group mean (plain dot product)
keyword_scores_a = np.dot(keyword_vectors, group_a_avg)
keyword_scores_b = np.dot(keyword_vectors, group_b_avg)

# Per-keyword difference in score: positive means higher for data_with
keyword_diffs = keyword_scores_b - keyword_scores_a

# Keyword positions ordered from the largest to the smallest difference
sorted_indices = np.argsort(keyword_diffs)[::-1]

# Print the keywords with their similarity scores in both groups and their difference
print(f"{'Term':<40}{'Group A':<10}{'Group B':<10}{'Difference':<10}")
print("-" * 50)
for i in sorted_indices[:40]:  # Display the top 40 terms
    term = keywords[i]
    group_a_score = keyword_scores_a[i]
    group_b_score = keyword_scores_b[i]
    diff = keyword_diffs[i]
    print(f"{term:<40}{group_a_score:<10.4f}{group_b_score:<10.4f}{diff:<10.4f}")

# The code above uses Word2Vec to identify keywords emphasized more in group B than in group A.
# It computes the mean word embedding of each group and each keyword's similarity score against both means, to identify terms that are more prominent in group B.
# While differences exist, they were deemed too small or insufficiently supported to be considered significant characteristics.
Data preprocessing and rationale for Kkma analyzer and POS selection: