Blog Scraping and Analysis¶
Importing Necessary Dependencies¶
from warnings import filterwarnings
filterwarnings("ignore")
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords as nltk_stopwords
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from string import punctuation
from bs4 import BeautifulSoup
from datetime import datetime
import seaborn as sns
import pandas as pd
import numpy as np
import requests
import random
import docx
import nltk
import io
import os
import re
Scraping and Output Generation Section¶
Reading the Input File¶
input_df = pd.read_excel(r'Dataset/Input.xlsx')
input_df.head()
|   | URL_ID | URL |
|---|---|---|
| 0 | 37 | https://insights.blackcoffer.com/ai-in-healthc... |
| 1 | 38 | https://insights.blackcoffer.com/what-if-the-c... |
| 2 | 39 | https://insights.blackcoffer.com/what-jobs-wil... |
| 3 | 40 | https://insights.blackcoffer.com/will-machine-... |
| 4 | 41 | https://insights.blackcoffer.com/will-ai-repla... |
input_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   URL_ID  114 non-null    int64
 1   URL     114 non-null    object
dtypes: int64(1), object(1)
memory usage: 1.9+ KB
Scraping HTML and Parsing it to get just the Title and Content¶
output_dir = "Scraped Blogs"
os.makedirs(output_dir, exist_ok=True)
for index, row in input_df.iterrows():
    url = row['URL']
    url_id = row['URL_ID']
    try:
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        title = soup.title.string.replace(' | Blackcoffer Insights', '')
        if title != "Page not found":
            # Removing the specified title elements from the HTML tree
            title_classes = ['tdb-entry-crumb', 'tdb-bred-no-url-last', 'tdb-title-text']
            for title_class in title_classes:
                title_elements = soup.find_all(True, {'class': title_class})
                for title_element in title_elements:
                    title_element.decompose()
            # Removing the div elements that we don't want from the HTML tree
            div_classes = [
                'td_block_wrap tdb_single_next_prev tdi_139 td-animation-stack td-pb-border-top td_block_template_1',
                'tdb-author-box td_block_wrap tdb_single_author_box tdi_140 tdb-content-vert-top td-pb-border-top td_block_template_1',
                'td_block_wrap tdb_single_author tdi_124 td-pb-border-top td_block_template_1 tdb-post-meta',
                'td_block_wrap tdb_single_date tdi_125 td-pb-border-top td_block_template_1 tdb-post-meta',
                'td_block_wrap tdb_single_comments_count tdi_126 td-pb-border-top td_block_template_1 tdb-post-meta',
                'td_block_wrap tdb_single_post_views tdi_127 td-pb-border-top td_block_template_1 tdb-post-meta'
            ]
            for div_class in div_classes:
                div_elements = soup.find_all('div', {'class': div_class})
                for div_element in div_elements:
                    div_element.decompose()
            # Removing the tag-list ul element from the HTML tree
            ul_element = soup.find('ul', {'class': 'tdb-tags'})
            if ul_element:
                ul_element.decompose()
            # Removing the preformatted block from the HTML tree
            preformatted_block = soup.find('pre', {'class': 'wp-block-preformatted'})
            if preformatted_block:
                preformatted_block.decompose()
            # Extracting the article body; fall back to the older theme layout if the first selector misses
            content = ''
            content_div = soup.find('div', {'id': 'tdb-autoload-article'})
            if content_div:
                for div in content_div.find_all('div', {'class': 'tdb-block-inner td-fix-index'}):
                    content += div.text + '\n'
            else:
                content_div = soup.find('div', {'class': 'td-post-content tagdiv-type'})
                if content_div:
                    content = content_div.text
            output_file_path = os.path.join(output_dir, f"{url_id}.txt")
            with open(output_file_path, 'w', encoding='utf-8') as f:
                f.write(f"Title: {title}\n\nContent:\n{content.strip()}")
    except requests.exceptions.RequestException as e:
        print(f"Error processing URL_ID {url_id}: {e}")
Creating Stopwords, Positive and Negative Dictionary¶
master_dict_folder = r"Dataset/MasterDictionary"
stop_words_folder = r"Dataset/StopWords"
positive_words = set()
negative_words = set()
stop_words = set()
def read_words_from_docx(file_path):
    doc = docx.Document(file_path)
    words = {paragraph.text.strip().lower() for paragraph in doc.paragraphs if paragraph.text.strip()}
    return words
positive_file_path = os.path.join(master_dict_folder, "positive-words.docx")
negative_file_path = os.path.join(master_dict_folder, "negative-words.docx")
positive_words = read_words_from_docx(positive_file_path)
negative_words = read_words_from_docx(negative_file_path)
for filename in os.listdir(stop_words_folder):
    if filename.endswith(".docx"):
        stop_words_file_path = os.path.join(stop_words_folder, filename)
        stop_words.update(read_words_from_docx(stop_words_file_path))
nltk_stop_words = set(nltk_stopwords.words('english'))
stop_words.update(nltk_stop_words)
stop_words.update(ENGLISH_STOP_WORDS)
print(f"Positive Words({len(positive_words)}):{list(positive_words)[:7]}")
print(f"Negative Words({len(negative_words)}):{list(negative_words)[:7]}")
print(f"Stop Words({len(stop_words)}): {list(stop_words)[:7]}")
Positive Words(2006):['jolly', 'richness', 'easy', 'admire', 'suave', 'propitious', 'enchant']
Negative Words(4783):['overdo', 'insidiously', 'fuss', 'clique', 'avariciously', 'chintzy', 'hazardous']
Stop Words(12809): ['randolph', 'jolly', 'georgeann', 'herrmann', 'cardwell', 'levasseur', 'krueger']
Sentiment Scores Computation¶
def compute_sentiment_scores(text):
    text = ''.join(char for char in text if char not in punctuation)
    words = word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha() and word.lower() not in stop_words]
    positive_score = sum(1 for word in words if word in positive_words)
    negative_score = sum(1 for word in words if word in negative_words)
    polarity_score = round((positive_score - negative_score) / ((positive_score + negative_score) + 0.000001), 2)
    subjectivity_score = round((positive_score + negative_score) / ((len(words)) + 0.000001), 2)
    return positive_score, negative_score, polarity_score, subjectivity_score
input_dir = "Scraped Blogs"
sentiment_scores = {}
for file_name in os.listdir(input_dir):
    url_id = os.path.splitext(file_name)[0]
    with open(os.path.join(input_dir, file_name), 'r', encoding='utf-8') as f:
        content = f.read()
    positive_score, negative_score, polarity_score, subjectivity_score = compute_sentiment_scores(content)
    sentiment_scores[url_id] = {
        'positive_score': positive_score,
        'negative_score': negative_score,
        'polarity_score': polarity_score,
        'subjectivity_score': subjectivity_score
    }
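As a quick, hedged sanity check of the scoring function, the snippet below runs compute_sentiment_scores on a made-up sentence; the exact counts depend on which words happen to be in the loaded MasterDictionary and stop-word sets, so the comment only indicates what to expect.

# Toy check of compute_sentiment_scores (illustrative only): the positive count should pick up
# words like "good"/"brilliant" and the negative count words like "harmful", but the exact
# numbers depend on the dictionaries loaded above.
sample_text = "AI can be a good and brilliant assistant, but careless use can be harmful."
print(compute_sentiment_scores(sample_text))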
Textual Variables Computation¶
def count_syllables(word):
    count = 0
    vowels = "aeiouAEIOU"
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith("es") or word.endswith("ed"):
        count -= 1
    if count == 0:
        count += 1
    return count
def compute_variables(text):
    sentences = sent_tokenize(text)
    text = ''.join(char for char in text if char not in punctuation)
    stop_words = set(nltk_stopwords.words('english'))
    words = [word.lower() for word in word_tokenize(text) if word.isalpha() and word.lower() not in stop_words]
    avg_sentence_length = avg_number_of_words = int(round(len(words) / len(sentences), 0))
    num_syllables = 0
    num_complex_words = 0
    num_chars = 0
    for word in words:
        num_syllables_in_word = count_syllables(word)
        if num_syllables_in_word > 2:
            num_complex_words += 1
        num_syllables += num_syllables_in_word
        num_chars += len(word)
    complex_word_count = num_complex_words
    total_word_count = len(words)
    percentage_complex_words = round((complex_word_count / total_word_count) * 100, 2)
    avg_syllable_per_word = int(round(num_syllables / total_word_count, 0))
    avg_word_length = int(round(num_chars / total_word_count, 0))
    fog_index = round(0.4 * (avg_sentence_length + percentage_complex_words), 2)
    personal_pronouns_pattern = r'\bI\b|\bwe\b|\bmy\b|\bours\b|\bus\b(?!A)'
    personal_pronouns_count = len(re.findall(personal_pronouns_pattern, text, re.IGNORECASE))
    return avg_sentence_length, percentage_complex_words, fog_index, avg_number_of_words, complex_word_count,\
           total_word_count, avg_syllable_per_word, personal_pronouns_count, avg_word_length
input_dir = "Scraped Blogs"
text_analysis_vars = {}
for file_name in os.listdir(input_dir):
    url_id = os.path.splitext(file_name)[0]
    with open(os.path.join(input_dir, file_name), 'r', encoding='utf-8') as f:
        content = f.read()
    avg_sentence_length, percentage_complex_words, fog_index, avg_number_of_words, complex_word_count,\
        total_word_count, avg_syllable_per_word, personal_pronouns_count, avg_word_length = compute_variables(content)
    text_analysis_vars[url_id] = {
        'avg_sentence_length': avg_sentence_length,
        'percentage_of_complex_words': percentage_complex_words,
        'fog_index': fog_index,
        'avg_number_of_words': avg_number_of_words,
        'complex_word_count': complex_word_count,
        'word_count': total_word_count,
        'syllable_per_word': avg_syllable_per_word,
        'personal_pronouns': personal_pronouns_count,
        'avg_word_length': avg_word_length
    }
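Before merging the variables into the input dataframe, here is a small sanity check of the vowel-group syllable heuristic defined above; the sample words are arbitrary, and the heuristic intentionally trades accuracy for simplicity (a silent trailing "e" still counts, while trailing "es"/"ed" always subtracts one).

# Illustrative check of count_syllables (sample words are arbitrary).
for w in ['machine', 'intelligence', 'automated', 'data']:
    print(w, count_syllables(w))
# Expected heuristic counts: machine -> 3, intelligence -> 5, automated -> 3, data -> 2
# (note "machine" is overcounted because of its silent trailing "e").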
Merging Variables with the Input Dataframe¶
sentiment_scores_df = pd.DataFrame.from_dict(sentiment_scores, orient='index').reset_index()
sentiment_scores_df = sentiment_scores_df.rename(columns={'index': 'URL_ID'})
sentiment_scores_df['URL_ID'] = sentiment_scores_df['URL_ID'].astype('int64')
text_analysis_vars_df = pd.DataFrame.from_dict(text_analysis_vars, orient='index').reset_index()
text_analysis_vars_df = text_analysis_vars_df.rename(columns={'index': 'URL_ID'})
text_analysis_vars_df['URL_ID'] = text_analysis_vars_df['URL_ID'].astype('int64')
output_df = input_df.merge(sentiment_scores_df, on='URL_ID', how='inner')
output_df = output_df.merge(text_analysis_vars_df, on='URL_ID', how='inner')
output_df.columns = ['URL_ID'] + [col.upper().replace('_', ' ') for col in output_df.columns if col != 'URL_ID']
output_df.head(3)
|   | URL_ID | URL | POSITIVE SCORE | NEGATIVE SCORE | POLARITY SCORE | SUBJECTIVITY SCORE | AVG SENTENCE LENGTH | PERCENTAGE OF COMPLEX WORDS | FOG INDEX | AVG NUMBER OF WORDS | COMPLEX WORD COUNT | WORD COUNT | SYLLABLE PER WORD | PERSONAL PRONOUNS | AVG WORD LENGTH |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 37 | https://insights.blackcoffer.com/ai-in-healthc... | 65 | 32 | 0.34 | 0.10 | 15 | 38.87 | 21.55 | 15 | 440 | 1132 | 2 | 1 | 7 |
| 1 | 38 | https://insights.blackcoffer.com/what-if-the-c... | 57 | 37 | 0.21 | 0.18 | 9 | 27.49 | 14.60 | 9 | 204 | 742 | 2 | 6 | 7 |
| 2 | 39 | https://insights.blackcoffer.com/what-jobs-wil... | 65 | 35 | 0.30 | 0.12 | 12 | 40.87 | 21.15 | 12 | 405 | 991 | 2 | 3 | 7 |
output_df.to_excel('Output Data.xlsx', engine ='xlsxwriter', sheet_name = 'Output', index = False)
Analysis Section¶
Some Additional Feature Engineering¶
# For convenience, convert the column names back to lowercase snake_case for the analysis below
output_df.columns = [col.lower().replace(' ', '_') for col in output_df.columns]
output_df.head(3)
|   | url_id | url | positive_score | negative_score | polarity_score | subjectivity_score | avg_sentence_length | percentage_of_complex_words | fog_index | avg_number_of_words | complex_word_count | word_count | syllable_per_word | personal_pronouns | avg_word_length |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 37 | https://insights.blackcoffer.com/ai-in-healthc... | 65 | 32 | 0.34 | 0.10 | 15 | 38.87 | 21.55 | 15 | 440 | 1132 | 2 | 1 | 7 |
| 1 | 38 | https://insights.blackcoffer.com/what-if-the-c... | 57 | 37 | 0.21 | 0.18 | 9 | 27.49 | 14.60 | 9 | 204 | 742 | 2 | 6 | 7 |
| 2 | 39 | https://insights.blackcoffer.com/what-jobs-wil... | 65 | 35 | 0.30 | 0.12 | 12 | 40.87 | 21.15 | 12 | 405 | 991 | 2 | 3 | 7 |
Let's drop one of avg_sentence_length and avg_number_of_words, as both convey the same information; the quick check below confirms the two columns are identical.
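A minimal verification sketch, using the columns created above:

# Both columns were assigned from the same expression in compute_variables, so this should print True.
print((output_df['avg_sentence_length'] == output_df['avg_number_of_words']).all())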
output_df.drop('avg_sentence_length', inplace = True, axis = 1)
input_dir = "Scraped Blogs"
output_df.insert(2, 'title', '')
for file_name in os.listdir(input_dir):
    file_path = os.path.join(input_dir, file_name)
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    match = re.search(r'Title:\s*(.+)', content)
    if match:
        title = match.group(1)
        url_id = int(os.path.splitext(file_name)[0])
        output_df.loc[output_df['url_id'] == url_id, 'title'] = title
output_df.head(3)
|   | url_id | url | title | positive_score | negative_score | polarity_score | subjectivity_score | percentage_of_complex_words | fog_index | avg_number_of_words | complex_word_count | word_count | syllable_per_word | personal_pronouns | avg_word_length |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 37 | https://insights.blackcoffer.com/ai-in-healthc... | AI in healthcare to Improve Patient Outcomes | 65 | 32 | 0.34 | 0.10 | 38.87 | 21.55 | 15 | 440 | 1132 | 2 | 1 | 7 |
| 1 | 38 | https://insights.blackcoffer.com/what-if-the-c... | What if the Creation is Taking Over the Creator? | 57 | 37 | 0.21 | 0.18 | 27.49 | 14.60 | 9 | 204 | 742 | 2 | 6 | 7 |
| 2 | 39 | https://insights.blackcoffer.com/what-jobs-wil... | What Jobs Will Robots Take From Humans in The ... | 65 | 35 | 0.30 | 0.12 | 40.87 | 21.15 | 12 | 405 | 991 | 2 | 3 | 7 |
output_df.title.isnull().sum()
0
Summary Statistics¶
output_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 111 entries, 0 to 110
Data columns (total 15 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   url_id                       111 non-null    int64
 1   url                          111 non-null    object
 2   title                        111 non-null    object
 3   positive_score               111 non-null    int64
 4   negative_score               111 non-null    int64
 5   polarity_score               111 non-null    float64
 6   subjectivity_score           111 non-null    float64
 7   percentage_of_complex_words  111 non-null    float64
 8   fog_index                    111 non-null    float64
 9   avg_number_of_words          111 non-null    int64
 10  complex_word_count           111 non-null    int64
 11  word_count                   111 non-null    int64
 12  syllable_per_word            111 non-null    int64
 13  personal_pronouns            111 non-null    int64
 14  avg_word_length              111 non-null    int64
dtypes: float64(4), int64(9), object(2)
memory usage: 13.9+ KB
output_df.url_id = output_df.url_id.astype('O')
output_df.describe(include=np.number)
|   | positive_score | negative_score | polarity_score | subjectivity_score | percentage_of_complex_words | fog_index | avg_number_of_words | complex_word_count | word_count | syllable_per_word | personal_pronouns | avg_word_length |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 111.000000 | 111.000000 | 111.000000 | 111.000000 | 111.000000 | 111.000000 | 111.000000 | 111.000000 | 111.000000 | 111.000000 | 111.000000 | 111.000000 |
| mean | 28.189189 | 30.684685 | 0.016216 | 0.122342 | 31.765946 | 18.029369 | 13.306306 | 202.126126 | 634.864865 | 2.018018 | 7.261261 | 6.837838 |
| std | 17.404552 | 22.092361 | 0.435650 | 0.042532 | 5.868901 | 4.389579 | 9.265569 | 103.907749 | 315.956455 | 0.133620 | 8.510648 | 0.496177 |
| min | 0.000000 | 0.000000 | -1.000000 | 0.030000 | 15.830000 | 10.490000 | 6.000000 | 21.000000 | 91.000000 | 2.000000 | 0.000000 | 5.000000 |
| 25% | 16.000000 | 13.500000 | -0.325000 | 0.100000 | 28.620000 | 15.780000 | 10.000000 | 131.000000 | 399.500000 | 2.000000 | 2.000000 | 7.000000 |
| 50% | 25.000000 | 26.000000 | -0.040000 | 0.110000 | 31.720000 | 17.540000 | 12.000000 | 189.000000 | 600.000000 | 2.000000 | 4.000000 | 7.000000 |
| 75% | 36.000000 | 43.000000 | 0.380000 | 0.150000 | 34.475000 | 19.875000 | 14.000000 | 286.500000 | 890.000000 | 2.000000 | 10.000000 | 7.000000 |
| max | 85.000000 | 93.000000 | 1.000000 | 0.250000 | 46.810000 | 53.280000 | 101.000000 | 440.000000 | 1998.000000 | 3.000000 | 46.000000 | 8.000000 |
Insights from Summary Statistics:¶
Measure | Range | Mean | Standard Deviation | Insights |
---|---|---|---|---|
Positive Score | 0 to 85 | 28.19 | 17.40 | The average positive score is 28.19, indicating a moderate level of positive sentiment in the data. |
Negative Score | 0 to 93 | 30.68 | 22.09 | The average negative score is 30.68, indicating a moderate level of negative sentiment. |
Polarity Score | -1 to 1 | 0.02 | 0.44 | The average polarity score is 0.02, suggesting a slight positive bias. |
Subjectivity Score | 0.03 to 0.25 | 0.12 | 0.04 | The data shows an average subjectivity score of 0.12, indicating a relatively objective tone. Scores range from 0.03 (objective) to 0.25 (subjective). |
Percentage of Complex Words | 15.83 % to 46.81 % | 31.77 | 5.87 | On average, around 31.77% of words are complex. |
Fog Index | 10.49 to 53.28 | 18.03 | 4.39 | The average Fog Index is 18.03, indicating that the articles are written at a moderately complex reading level overall (a quick consistency check follows this table). |
Average number of Words | 6 to 101 | 13.31 | 9.27 | The average number of words per sentence is 13.31, with a wide variation from 6 to 101 words suggesting potential outliers. |
Complex Word Count | 21 to 440 | 202.13 | 103.91 | The average count of complex words is 202.13. |
Word Count | 91 to 1998 | 634.86 | 315.96 | The average word count is 634.86, but the minimum and maximum sit at opposite extremes, with a large standard deviation of roughly 316. |
Syllable Per Word | 2 to 3 | 2.02 | 0.13 | The average syllables per word is 2.02, suggesting that most words are 2-syllable words. |
Personal Pronouns | 0 to 46 | 7.26 | 8.51 | Articles use an average of 7.26 personal pronouns; the maximum of 46 points to an unusually subjective article. |
Average Word Length | 5 to 8 | 6.84 | 0.50 | The average word length is 6.84 characters. Word lengths range from 5 to 8 characters. |
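As a quick consistency check on the Fog Index column: since FOG INDEX = 0.4 * (average sentence length + percentage of complex words) and the mean is linear, the column means should reproduce the reported mean Fog Index up to rounding. A minimal sketch using the dataframe above:

# Mean fog index should be ~0.4 * (mean sentence length + mean % complex words), up to per-article rounding.
reconstructed = 0.4 * (output_df['avg_number_of_words'].mean() + output_df['percentage_of_complex_words'].mean())
print(round(reconstructed, 2), round(output_df['fog_index'].mean(), 2))   # both come out around 18.03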
Insights:
- Sentiment Analysis: Positive and negative scores both span a moderate range, indicating a mix of positive and negative sentiment; the polarity score is slightly positive on average.
- Subjectivity and Complexity: The subjectivity score suggests that the content tends to be objective. The percentage of complex words and Fog index both indicate a moderate level of textual complexity.
- Text Length: The average number of words per sentence varies significantly across entries, suggesting diversity in sentence lengths.
- Word Characteristics: Most words have around 2 syllables, and the average word length is approximately 6.84 characters.
- Personal Pronouns: Entries use an average of 7.26 personal pronouns, which might indicate a moderate level of subjectivity.
- Range and Variation: Understanding the ranges helps identify outliers and assess data distribution.
Let's now dive deeper and look at the distributions of the sentiment features and the textual ones.
Understanding the Distribution of Variables¶
sentiment_features = ['positive_score', 'negative_score', 'polarity_score', 'subjectivity_score']
num_plots = len(sentiment_features)
num_rows = (num_plots + 1) // 2 # Number of rows for subplots
fig, axes = plt.subplots(num_rows, 2, figsize=(12, 5 * num_rows))
for i, feature in enumerate(sentiment_features):
    row = i // 2
    col = i % 2
    ax = axes[row, col]
    sns.histplot(output_df[feature], ax=ax, bins=20, color=random.choice(['green', 'red', 'blue', 'purple', 'orange', 'grey']))
    ax.set_xlabel(feature.replace('_', ' ').title())
    ax.set_ylabel("Frequency")
# Remove any empty subplots
if num_plots < num_rows * 2:
    fig.delaxes(axes[num_rows-1, 1])
plt.tight_layout()
plt.show()
Insights:
Positive scores are close to normally distributed, which is a good sign that most articles stayed moderately positive.

Negative scores tend to be right-skewed, meaning most articles have low negative scores, which is also a good trend. What is concerning is that roughly 30-40 articles appear to be overly negative.

Polarity mostly stays in the mid region, reflecting a balance between positive and negative sentiment, while a handful of articles (4 or more) sit close to 1, i.e. highly positive articles.

Subjectivity scores show that the articles are mostly objective in nature, meaning the positive and negative counts are small relative to the total number of words, possibly because only a few dictionary words actually appear in the articles. We might have to expand the dictionaries to see more pronounced trends; a small sketch of how that could be done follows.
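A hedged sketch of how the dictionaries could be expanded; the extra words below are purely illustrative placeholders, not entries from the MasterDictionary files:

# Illustrative only: merge extra lexicon entries into the existing sets before rescoring.
extra_positive = {'groundbreaking', 'seamless', 'robust'}      # hypothetical additions
extra_negative = {'glitchy', 'overhyped', 'fragile'}           # hypothetical additions
positive_words.update(extra_positive)
negative_words.update(extra_negative)
# The articles would then need to be rescored with compute_sentiment_scores().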
textual_features = ['complex_word_count', 'percentage_of_complex_words', 'fog_index', 'avg_number_of_words', 'word_count', 'syllable_per_word', 'personal_pronouns', 'avg_word_length']
num_plots = len(textual_features)
num_rows = (num_plots + 1) // 2 # Number of rows for subplots
fig, axes = plt.subplots(num_rows, 2, figsize=(12, 5 * num_rows))
for i, feature in enumerate(textual_features):
    row = i // 2
    col = i % 2
    ax = axes[row, col]
    sns.histplot(output_df[feature], ax=ax, bins=20, color='darkblue')
    ax.set_xlabel(feature.replace('_', ' ').title())
    ax.set_ylabel("Frequency")
# Remove any empty subplots
if num_plots < num_rows * 2:
    fig.delaxes(axes[num_rows-1, 1])
plt.tight_layout()
plt.show()
Insights:
Complex word count: It is close to normally distributed, meaning most articles contain a moderate number of complex words.

Percentage of complex words: This tells the same story as the complex word count. Most articles cluster around the centre, so the share of complex words is moderate for the majority of articles.

Fog index: The fog index has an outlier, which makes the distribution look slightly right-skewed. If that outlier is removed, the distribution shows that most articles are of moderate reading complexity.

Average number of words: There are two outliers, one around 40 and another around 100 words per sentence, likely caused by inconsistent punctuation making the sentence tokenizer treat 3-4 lines as a single sentence. Ignoring those, most articles average roughly 10 words per sentence. (The sketch after this list pulls out the outlying articles.)

Word count: There is an outlier here as well, an article of close to 2,000 words that is noticeably more verbose, almost twice as long as the next-longest article. Most articles fall between 300 and 1,000 words, with a fair number spread across the wider 100-1,250 range.

Syllables per word: One or two articles average three syllables per word; the rest average two.

Personal pronouns: The personal-pronoun count per article is strongly right-skewed, with 4-5 outliers at the right extreme. Even with those removed, the distribution would remain right-skewed. Roughly 35% of articles use around 2-3 personal pronouns, about 20% use 4-5, 15% use 5-6, 13% use 7-8, and so on, so a fair share of articles carry subjective opinions.

Average word length: Most articles average 6-7 characters per word, with a few outliers at 5 or 8 characters, so the average word length overall sits between 5 and 8 characters.
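To back up the outlier observations above, here is a minimal sketch that lists the articles driving the right tails (it only reuses columns already present in output_df):

# Articles with the longest average sentences and the highest word counts, plus the pronoun distribution.
cols = ['url_id', 'title', 'avg_number_of_words', 'word_count', 'personal_pronouns']
print(output_df.nlargest(3, 'avg_number_of_words')[cols])    # sentence-length outliers
print(output_df.nlargest(3, 'word_count')[cols])             # most verbose articles
print(output_df['personal_pronouns'].value_counts().head())  # most common pronoun counts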
Correlation Analysis¶
num_cols = [col for col in output_df.columns if col not in ['url_id', 'url', 'title']]
corr_matrix = output_df[num_cols].corr()
fig = plt.figure(figsize = (9, 4))
sns.heatmap(corr_matrix, cmap = 'Reds')
<Axes: >
Although the heatmap gives us some information, it is hard to take away anything solid from this visual alone, so let's dig a little deeper and do a bit more work here.
# Defining the ranges for different correlation labels
ranges = {
    'Highly positively correlated': (0.7, 1.0),
    'Low Positive Correlation': (0.3, 0.7),
    'No Correlation': (-0.3, 0.3),
    'Low Negative Correlation': (-0.7, -0.3),
    'Highly Negatively Correlated': (-1.0, -0.7)
}
labeled_corr_matrix = pd.DataFrame(index=corr_matrix.index, columns=corr_matrix.columns)
for i in range(corr_matrix.shape[0]):
    for j in range(corr_matrix.shape[1]):
        # Getting the value of the current cell
        value = corr_matrix.iloc[i, j]
        # Finding the label for the current value (the diagonal value of exactly 1.0 falls outside
        # every half-open range, so self-correlations stay unlabeled)
        label = None
        for k, v in ranges.items():
            if v[0] <= value < v[1]:
                label = k
                break
        # Setting the label for the current cell in the labeled correlation matrix
        labeled_corr_matrix.iloc[i, j] = label
labeled_corr_matrix = labeled_corr_matrix.fillna('')
label_lists = {
    'Highly positively correlated': [],
    'Low Positive Correlation': [],
    'No Correlation': [],
    'Low Negative Correlation': [],
    'Highly Negatively Correlated': []
}
for label in label_lists.keys():
    pairs = np.where(labeled_corr_matrix == label)  # Finding all pairs of columns that have this label
    for i in range(len(pairs[0])):  # Iterating over each pair of columns
        col1 = labeled_corr_matrix.columns[pairs[0][i]]  # Getting the names of the columns in this pair
        col2 = labeled_corr_matrix.index[pairs[1][i]]
        if (col2, col1) not in label_lists[label]:
            label_lists[label].append((col1, col2))  # Adding this pair of columns to the list for this label
for label, lst in label_lists.items():
    print(f"{label}:\n")
    for pair in lst:
        print(f" {pair}\n")
Highly positively correlated:
  ('positive_score', 'complex_word_count') ('percentage_of_complex_words', 'avg_word_length') ('fog_index', 'avg_number_of_words') ('complex_word_count', 'word_count')

Low Positive Correlation:
  ('positive_score', 'polarity_score') ('positive_score', 'word_count') ('negative_score', 'subjectivity_score') ('negative_score', 'complex_word_count') ('negative_score', 'word_count') ('subjectivity_score', 'personal_pronouns') ('percentage_of_complex_words', 'fog_index') ('percentage_of_complex_words', 'complex_word_count') ('fog_index', 'avg_word_length') ('complex_word_count', 'avg_word_length') ('syllable_per_word', 'avg_word_length')

No Correlation:
  ('positive_score', 'negative_score') ('positive_score', 'subjectivity_score') ('positive_score', 'percentage_of_complex_words') ('positive_score', 'fog_index') ('positive_score', 'avg_number_of_words') ('positive_score', 'syllable_per_word') ('positive_score', 'personal_pronouns') ('positive_score', 'avg_word_length') ('negative_score', 'percentage_of_complex_words') ('negative_score', 'fog_index') ('negative_score', 'avg_number_of_words') ('negative_score', 'syllable_per_word') ('negative_score', 'personal_pronouns') ('negative_score', 'avg_word_length') ('polarity_score', 'percentage_of_complex_words') ('polarity_score', 'fog_index') ('polarity_score', 'avg_number_of_words') ('polarity_score', 'complex_word_count') ('polarity_score', 'word_count') ('polarity_score', 'syllable_per_word') ('polarity_score', 'personal_pronouns') ('polarity_score', 'avg_word_length') ('subjectivity_score', 'percentage_of_complex_words') ('subjectivity_score', 'fog_index') ('subjectivity_score', 'avg_number_of_words') ('subjectivity_score', 'complex_word_count') ('subjectivity_score', 'word_count') ('subjectivity_score', 'syllable_per_word') ('subjectivity_score', 'avg_word_length') ('percentage_of_complex_words', 'avg_number_of_words') ('percentage_of_complex_words', 'word_count') ('percentage_of_complex_words', 'syllable_per_word') ('fog_index', 'complex_word_count') ('fog_index', 'word_count') ('fog_index', 'syllable_per_word') ('fog_index', 'personal_pronouns') ('avg_number_of_words', 'complex_word_count') ('avg_number_of_words', 'word_count') ('avg_number_of_words', 'syllable_per_word') ('avg_number_of_words', 'personal_pronouns') ('avg_number_of_words', 'avg_word_length') ('complex_word_count', 'syllable_per_word') ('complex_word_count', 'personal_pronouns') ('word_count', 'syllable_per_word') ('word_count', 'personal_pronouns') ('word_count', 'avg_word_length') ('syllable_per_word', 'personal_pronouns')

Low Negative Correlation:
  ('negative_score', 'polarity_score') ('polarity_score', 'subjectivity_score') ('percentage_of_complex_words', 'personal_pronouns') ('personal_pronouns', 'avg_word_length')

Highly Negatively Correlated:
Positive Trends:
- Higher complex word count is associated with more positive scores.
- Articles with a greater percentage of complex words tend to have longer average word lengths.
- As fog index increases, average number of words also increases.
- Positive scores have a moderate positive correlation with polarity scores.
Negative Trends:
- Negative scores show a moderate positive correlation with complex word count.
- Negative scores have a moderate positive correlation with word count.
- Negative scores are mildly negatively correlated with polarity scores and mildly positively correlated with subjectivity scores.
- Higher percentage of complex words is associated with lower use of personal pronouns.
No Clear Trends:
- There is no strong correlation between positive and negative scores.
- No distinct correlation between polarity scores and the complexity measures; polarity and subjectivity show only a mild negative correlation.
- Complex word count shows no clear relationship with syllables per word or personal pronouns.
- No notable link between negative scores and the complexity ratios (percentage of complex words and fog index).
- No distinct correlation between complexity features and subjectivity scores.
Miscellaneous:
- Subjectivity scores and personal pronouns show a low positive correlation.
- Fog index and average word length have a moderate positive correlation.
- Complex word count and average word length have a moderate positive correlation.
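For readers who prefer numbers to labels, a small sketch that ranks the unique off-diagonal pairs of corr_matrix by absolute correlation (it reuses only the objects defined above):

# Rank unique column pairs by absolute correlation strength (upper triangle only, diagonal excluded).
pairs = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)).stack()
top_pairs = pairs.reindex(pairs.abs().sort_values(ascending=False).index)
print(top_pairs.head(10))   # the top entries should line up with the "Highly positively correlated" pairs above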
Word Cloud Based Analysis¶
top_positive_url_ids = output_df.nlargest(10, 'positive_score')['url_id'].tolist()
top_negative_url_ids = output_df.nlargest(10, 'negative_score')['url_id'].tolist()
def get_content_from_file(url_id):
    file_path = os.path.join('Scraped Blogs', f'{url_id}.txt')
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    return content
positive_content = ' '.join([get_content_from_file(url_id) for url_id in top_positive_url_ids])
negative_content = ' '.join([get_content_from_file(url_id) for url_id in top_negative_url_ids])
print('\n\n')
positive_wordcloud = WordCloud(width=1000, height=500, background_color='white').generate(positive_content)
plt.figure(figsize=(10, 5))
plt.imshow(positive_wordcloud, interpolation='bilinear')
plt.title('Word Cloud for Top 10 Positive Articles', y=1.1)
plt.axis('off')
plt.show()
print('\n\n')
negative_wordcloud = WordCloud(width=1000, height=500, background_color='black').generate(negative_content)
plt.figure(figsize=(10, 5))
plt.imshow(negative_wordcloud, interpolation='bilinear')
plt.title('Word Cloud for Top 10 Negative Articles', y=1.1)
plt.axis('off')
plt.show()
Insights:
The top 10 positive articles are associated with words like AI, Human, Data, Machine Learning, Job, Work, Artificial Intelligence, Information, Algorithm, Skill, and Computer.

The top 10 negative articles feature words like People, COVID, India, Pandemic, Government, Time, Financial, Economic, Video Game, China, Crisis, Impact, Health, Disease, and Virus.

Both look quite reasonable given the period, around 2021, when most of these articles appear to have been published.
all_titles = ' '.join(output_df['title'])
titles_wordcloud = WordCloud(width=1000, height=500, background_color='yellow').generate(all_titles)
plt.figure(figsize=(10, 5))
plt.imshow(titles_wordcloud, interpolation='bilinear')
plt.title('Word Cloud for All Titles', y=1.1)
plt.axis('off')
plt.show()
Insights:
Most articles were written on topics related to COVID and the coronavirus, and on job-related trends such as AI, ML, and other career themes.

There are also a few articles related to Finance, Marketing, and Healthcare.