Blog Scraping and Analysis¶

Importing Necessary Dependencies¶

In [1]:
from warnings import filterwarnings
filterwarnings("ignore")

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords as nltk_stopwords
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from string import punctuation
from bs4 import BeautifulSoup
from datetime import datetime
import seaborn as sns
import pandas as pd
import numpy as np
import requests
import random
import docx
import nltk
import io
import os
import re

Scraping and Output Generation Section¶

Reading the Input File¶

In [2]:
input_df = pd.read_excel(r'Dataset/Input.xlsx')
input_df.head()
Out[2]:
URL_ID URL
0 37 https://insights.blackcoffer.com/ai-in-healthc...
1 38 https://insights.blackcoffer.com/what-if-the-c...
2 39 https://insights.blackcoffer.com/what-jobs-wil...
3 40 https://insights.blackcoffer.com/will-machine-...
4 41 https://insights.blackcoffer.com/will-ai-repla...
In [3]:
input_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   URL_ID  114 non-null    int64 
 1   URL     114 non-null    object
dtypes: int64(1), object(1)
memory usage: 1.9+ KB

Scraping HTML and Parsing it to get just the Title and Content¶

In [4]:
output_dir = "Scraped Blogs"
os.makedirs(output_dir, exist_ok=True)

for index, row in input_df.iterrows():
    url = row['URL']
    url_id = row['URL_ID']

    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        title = soup.title.string.replace(' | Blackcoffer Insights', '')

        if title != "Page not found":

            # Removing the specified title elements from the HTML tree
            title_classes = ['tdb-entry-crumb', 'tdb-bred-no-url-last', 'tdb-title-text']
            for title_class in title_classes:
                title_elements = soup.find_all(True, {'class': title_class})
                for title_element in title_elements:
                    title_element.decompose()
            
            # Removing the div elements that we don't want from the HTML tree
            div_classes = [
                'td_block_wrap tdb_single_next_prev tdi_139 td-animation-stack td-pb-border-top td_block_template_1',
                'tdb-author-box td_block_wrap tdb_single_author_box tdi_140 tdb-content-vert-top td-pb-border-top td_block_template_1',
                'td_block_wrap tdb_single_author tdi_124 td-pb-border-top td_block_template_1 tdb-post-meta',
                'td_block_wrap tdb_single_date tdi_125 td-pb-border-top td_block_template_1 tdb-post-meta',
                'td_block_wrap tdb_single_comments_count tdi_126 td-pb-border-top td_block_template_1 tdb-post-meta',
                'td_block_wrap tdb_single_post_views tdi_127 td-pb-border-top td_block_template_1 tdb-post-meta'
            ]
            for div_class in div_classes:
                div_elements = soup.find_all('div', {'class': div_class})
                for div_element in div_elements:
                    div_element.decompose()
            
            # Removing the ul element from the HTML tree
            ul_element = soup.find('ul', {'class': 'tdb-tags'})
            if ul_element:
                ul_element.decompose()
            
            # Removing the preformatted block from the HTML tree
            preformatted_block = soup.find('pre', {'class': 'wp-block-preformatted'})
            if preformatted_block:
                preformatted_block.decompose()
            
            # Initialising content up front so it is always defined, even if neither container is found
            content = ''

            content_div = soup.find('div', {'id': 'tdb-autoload-article'})
            if content_div:
                for div in content_div.find_all('div', {'class': 'tdb-block-inner td-fix-index'}):
                    content += div.text + '\n'
            else:
                content_div = soup.find('div', {'class': 'td-post-content tagdiv-type'})
                if content_div:
                    content = content_div.text
        
            output_file_path = os.path.join(output_dir, f"{url_id}.txt")
            
            with open(output_file_path, 'w', encoding='utf-8') as f:
                f.write(f"Title: {title}\n\nContent:\n{content.strip()}")
            
    except requests.exceptions.RequestException as e:
        print(f"Error processing URL_ID {url_id}: {e}")

Creating the Stopword Set and the Positive/Negative Dictionaries¶

In [5]:
master_dict_folder = r"Dataset/MasterDictionary"
stop_words_folder = r"Dataset/StopWords"

positive_words = set()
negative_words = set()
stop_words = set()

def read_words_from_docx(file_path):
    doc = docx.Document(file_path)
    words = {paragraph.text.strip().lower() for paragraph in doc.paragraphs if paragraph.text.strip()}
    return words

positive_file_path = os.path.join(master_dict_folder, "positive-words.docx")
negative_file_path = os.path.join(master_dict_folder, "negative-words.docx")

positive_words = read_words_from_docx(positive_file_path)
negative_words = read_words_from_docx(negative_file_path)

for filename in os.listdir(stop_words_folder):
    if filename.endswith(".docx"):
        stop_words_file_path = os.path.join(stop_words_folder, filename)
        stop_words.update(read_words_from_docx(stop_words_file_path))

nltk_stop_words = set(nltk_stopwords.words('english'))
stop_words.update(nltk_stop_words)
stop_words.update(ENGLISH_STOP_WORDS)

print(f"Positive Words({len(positive_words)}):{list(positive_words)[:7]}")
print(f"Negative Words({len(negative_words)}):{list(negative_words)[:7]}")
print(f"Stop Words({len(stop_words)}): {list(stop_words)[:7]}")
Positive Words(2006):['jolly', 'richness', 'easy', 'admire', 'suave', 'propitious', 'enchant']
Negative Words(4783):['overdo', 'insidiously', 'fuss', 'clique', 'avariciously', 'chintzy', 'hazardous']
Stop Words(12809): ['randolph', 'jolly', 'georgeann', 'herrmann', 'cardwell', 'levasseur', 'krueger']

Sentiment Scores Computation¶

In [6]:
def compute_sentiment_scores(text):

    text = ''.join(char for char in text if char not in punctuation)
    
    words = word_tokenize(text)
    
    words = [word.lower() for word in words if word.isalpha() and word.lower() not in stop_words]
    
    positive_score = sum(1 for word in words if word in positive_words)
    negative_score = sum(1 for word in words if word in negative_words)
    
    polarity_score = round((positive_score - negative_score) / ((positive_score + negative_score) + 0.000001), 2)
    
    subjectivity_score = round((positive_score + negative_score) / ((len(words)) + 0.000001), 2)
    
    return positive_score, negative_score, polarity_score, subjectivity_score
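
As a quick, supplementary sanity check (this cell is not part of the original notebook), the scoring function can be run on a short made-up sentence; the exact counts depend on the dictionaries loaded above.

In [ ]:
# Hypothetical example sentence -- not taken from the scraped articles
sample_text = "The team delivered a brilliant, reliable product, despite a painful and risky delay."

pos, neg, pol, subj = compute_sentiment_scores(sample_text)

# polarity     = (pos - neg) / (pos + neg + 0.000001)           -> bounded between -1 and +1
# subjectivity = (pos + neg) / (cleaned word count + 0.000001)  -> bounded between 0 and 1
print(f"Positive: {pos}, Negative: {neg}, Polarity: {pol}, Subjectivity: {subj}")
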
In [7]:
input_dir = "Scraped Blogs"

sentiment_scores = {}

for file_name in os.listdir(input_dir):
    url_id = os.path.splitext(file_name)[0]
    
    with open(os.path.join(input_dir, file_name), 'r', encoding='utf-8') as f:
        content = f.read()
    
    positive_score, negative_score, polarity_score, subjectivity_score = compute_sentiment_scores(content)
    
    sentiment_scores[url_id] = {
        'positive_score': positive_score,
        'negative_score': negative_score,
        'polarity_score': polarity_score,
        'subjectivity_score': subjectivity_score
    }

Textual Variables Computation¶

In [8]:
def count_syllables(word):

    count = 0
    vowels = "aeiouAEIOU"
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    
    if word.endswith("es") or word.endswith("ed"):
        count -= 1
    if count == 0:
        count += 1
    
    return count
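
A brief illustration of the heuristic (added here; not in the original notebook): vowel groups are counted, trailing "es"/"ed" endings subtract one, and every word is credited with at least one syllable.

In [ ]:
# Illustrative words only -- chosen to exercise each branch of the heuristic
for word in ["data", "machine", "automated", "analyses", "strength"]:
    print(f"{word}: {count_syllables(word)} syllable(s)")
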
In [9]:
def compute_variables(text):

    sentences = sent_tokenize(text)
    text = ''.join(char for char in text if char not in punctuation)
    
    stop_words = set(nltk_stopwords.words('english'))
    words = [word.lower() for word in word_tokenize(text) if word.isalpha() and word.lower() not in stop_words]
    
    avg_sentence_length = avg_number_of_words = int(round(len(words) / len(sentences), 0))

    num_syllables = 0
    num_complex_words = 0
    num_chars = 0
    
    for word in words:
        num_syllables_in_word = count_syllables(word)
        if num_syllables_in_word > 2:
            num_complex_words += 1
        
        num_syllables += num_syllables_in_word
        num_chars += len(word)

    complex_word_count = num_complex_words
    total_word_count = len(words)
    percentage_complex_words = round((complex_word_count / total_word_count) * 100, 2)
    avg_syllable_per_word = int(round(num_syllables / total_word_count, 0))
    avg_word_length = int(round(num_chars / total_word_count, 0))
    
    fog_index = round(0.4 * (avg_sentence_length + percentage_complex_words), 2)
    
    personal_pronouns_pattern = r'\bI\b|\bwe\b|\bmy\b|\bours\b|\bus\b(?!A)'
    personal_pronouns_count = len(re.findall(personal_pronouns_pattern, text, re.IGNORECASE))
    
    return avg_sentence_length, percentage_complex_words, fog_index, avg_number_of_words, complex_word_count,\
            total_word_count, avg_syllable_per_word, personal_pronouns_count, avg_word_length
In [10]:
input_dir = "Scraped Blogs"

text_analysis_vars = {}

for file_name in os.listdir(input_dir):

    url_id = os.path.splitext(file_name)[0]
    
    with open(os.path.join(input_dir, file_name), 'r', encoding='utf-8') as f:
        content = f.read()
    
    avg_sentence_length, percentage_complex_words, fog_index, avg_number_of_words, complex_word_count,\
            total_word_count, avg_syllable_per_word, personal_pronouns_count, avg_word_length = compute_variables(content)
    
    text_analysis_vars[url_id] = {
                                'avg_sentence_length': avg_sentence_length,
                                'percentage_of_complex_words': percentage_complex_words,
                                'fog_index': fog_index,
                                'avg_number_of_words': avg_number_of_words,
                                'complex_word_count': complex_word_count,
                                'word_count': total_word_count,
                                'syllable_per_word': avg_syllable_per_word,
                                'personal_pronouns': personal_pronouns_count,
                                'avg_word_length': avg_word_length
                                }

Merging Variables with the Input Dataframe¶

In [11]:
sentiment_scores_df = pd.DataFrame.from_dict(sentiment_scores, orient='index').reset_index()
sentiment_scores_df = sentiment_scores_df.rename(columns={'index': 'URL_ID'})
sentiment_scores_df['URL_ID'] = sentiment_scores_df['URL_ID'].astype('int64')

text_analysis_vars_df = pd.DataFrame.from_dict(text_analysis_vars, orient='index').reset_index()
text_analysis_vars_df = text_analysis_vars_df.rename(columns={'index': 'URL_ID'})
text_analysis_vars_df['URL_ID'] = text_analysis_vars_df['URL_ID'].astype('int64')
In [12]:
output_df = input_df.merge(sentiment_scores_df, on='URL_ID', how='inner')
output_df = output_df.merge(text_analysis_vars_df, on='URL_ID', how='inner')

output_df.columns = ['URL_ID'] + [col.upper().replace('_', ' ') for col in output_df.columns if col != 'URL_ID']

output_df.head(3)
Out[12]:
URL_ID URL POSITIVE SCORE NEGATIVE SCORE POLARITY SCORE SUBJECTIVITY SCORE AVG SENTENCE LENGTH PERCENTAGE OF COMPLEX WORDS FOG INDEX AVG NUMBER OF WORDS COMPLEX WORD COUNT WORD COUNT SYLLABLE PER WORD PERSONAL PRONOUNS AVG WORD LENGTH
0 37 https://insights.blackcoffer.com/ai-in-healthc... 65 32 0.34 0.10 15 38.87 21.55 15 440 1132 2 1 7
1 38 https://insights.blackcoffer.com/what-if-the-c... 57 37 0.21 0.18 9 27.49 14.60 9 204 742 2 6 7
2 39 https://insights.blackcoffer.com/what-jobs-wil... 65 35 0.30 0.12 12 40.87 21.15 12 405 991 2 3 7
In [13]:
output_df.to_excel('Output Data.xlsx', engine='xlsxwriter', sheet_name='Output', index=False)

Analysis Section¶

Some Additional Feature Engineering¶

In [14]:
# For convenience, reverting the column names back to their original snake_case form

output_df.columns = [col.lower().replace(' ', '_') for col in output_df.columns]

output_df.head(3)
Out[14]:
url_id url positive_score negative_score polarity_score subjectivity_score avg_sentence_length percentage_of_complex_words fog_index avg_number_of_words complex_word_count word_count syllable_per_word personal_pronouns avg_word_length
0 37 https://insights.blackcoffer.com/ai-in-healthc... 65 32 0.34 0.10 15 38.87 21.55 15 440 1132 2 1 7
1 38 https://insights.blackcoffer.com/what-if-the-c... 57 37 0.21 0.18 9 27.49 14.60 9 204 742 2 6 7
2 39 https://insights.blackcoffer.com/what-jobs-wil... 65 35 0.30 0.12 12 40.87 21.15 12 405 991 2 3 7

Let's drop one of avg_sentence_length and avg_number_of_words, since both carry the same information.

In [15]:
output_df.drop('avg_sentence_length', axis=1, inplace=True)
In [16]:
input_dir = "Scraped Blogs"

output_df.insert(2, 'title', '')

for file_name in os.listdir(input_dir):
    file_path = os.path.join(input_dir, file_name)
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        
        match = re.search(r'Title:\s*(.+)', content)
        if match:
            title = match.group(1)
            url_id = int(os.path.splitext(file_name)[0])
            output_df.loc[output_df['url_id'] == url_id, 'title'] = title
            
output_df.head(3)
Out[16]:
url_id url title positive_score negative_score polarity_score subjectivity_score percentage_of_complex_words fog_index avg_number_of_words complex_word_count word_count syllable_per_word personal_pronouns avg_word_length
0 37 https://insights.blackcoffer.com/ai-in-healthc... AI in healthcare to Improve Patient Outcomes 65 32 0.34 0.10 38.87 21.55 15 440 1132 2 1 7
1 38 https://insights.blackcoffer.com/what-if-the-c... What if the Creation is Taking Over the Creator? 57 37 0.21 0.18 27.49 14.60 9 204 742 2 6 7
2 39 https://insights.blackcoffer.com/what-jobs-wil... What Jobs Will Robots Take From Humans in The ... 65 35 0.30 0.12 40.87 21.15 12 405 991 2 3 7
In [17]:
output_df.title.isnull().sum()
Out[17]:
0

Summary Statistics¶

In [18]:
output_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 111 entries, 0 to 110
Data columns (total 15 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   url_id                       111 non-null    int64  
 1   url                          111 non-null    object 
 2   title                        111 non-null    object 
 3   positive_score               111 non-null    int64  
 4   negative_score               111 non-null    int64  
 5   polarity_score               111 non-null    float64
 6   subjectivity_score           111 non-null    float64
 7   percentage_of_complex_words  111 non-null    float64
 8   fog_index                    111 non-null    float64
 9   avg_number_of_words          111 non-null    int64  
 10  complex_word_count           111 non-null    int64  
 11  word_count                   111 non-null    int64  
 12  syllable_per_word            111 non-null    int64  
 13  personal_pronouns            111 non-null    int64  
 14  avg_word_length              111 non-null    int64  
dtypes: float64(4), int64(9), object(2)
memory usage: 13.9+ KB
In [19]:
output_df.url_id = output_df.url_id.astype('O')
In [20]:
output_df.describe(include=np.number)
Out[20]:
positive_score negative_score polarity_score subjectivity_score percentage_of_complex_words fog_index avg_number_of_words complex_word_count word_count syllable_per_word personal_pronouns avg_word_length
count 111.000000 111.000000 111.000000 111.000000 111.000000 111.000000 111.000000 111.000000 111.000000 111.000000 111.000000 111.000000
mean 28.189189 30.684685 0.016216 0.122342 31.765946 18.029369 13.306306 202.126126 634.864865 2.018018 7.261261 6.837838
std 17.404552 22.092361 0.435650 0.042532 5.868901 4.389579 9.265569 103.907749 315.956455 0.133620 8.510648 0.496177
min 0.000000 0.000000 -1.000000 0.030000 15.830000 10.490000 6.000000 21.000000 91.000000 2.000000 0.000000 5.000000
25% 16.000000 13.500000 -0.325000 0.100000 28.620000 15.780000 10.000000 131.000000 399.500000 2.000000 2.000000 7.000000
50% 25.000000 26.000000 -0.040000 0.110000 31.720000 17.540000 12.000000 189.000000 600.000000 2.000000 4.000000 7.000000
75% 36.000000 43.000000 0.380000 0.150000 34.475000 19.875000 14.000000 286.500000 890.000000 2.000000 10.000000 7.000000
max 85.000000 93.000000 1.000000 0.250000 46.810000 53.280000 101.000000 440.000000 1998.000000 3.000000 46.000000 8.000000

Insights from Summary Statistics:¶

| Measure | Range | Mean | Standard Deviation | Insights |
|---|---|---|---|---|
| Positive Score | 0 to 85 | 28.19 | 17.40 | The average positive score is 28.19, indicating a moderate level of positive sentiment in the data. |
| Negative Score | 0 to 93 | 30.68 | 22.09 | The average negative score is 30.68, indicating a moderate level of negative sentiment. |
| Polarity Score | -1 to 1 | 0.02 | 0.44 | The average polarity score is 0.02, suggesting a slight positive bias. |
| Subjectivity Score | 0.03 to 0.25 | 0.12 | 0.04 | The average subjectivity score is 0.12, indicating a relatively objective tone; scores range from 0.03 (objective) to 0.25 (more subjective). |
| Percentage of Complex Words | 15.83% to 46.81% | 31.77 | 5.87 | On average, around 31.77% of words are complex. |
| Fog Index | 10.49 to 53.28 | 18.03 | 4.39 | The average Fog Index is 18.03, indicating that the articles are, on the whole, written at a moderately complex reading level. |
| Average Number of Words | 6 to 101 | 13.31 | 9.27 | The average number of words per sentence is 13.31, with a wide spread from 6 to 101 words, suggesting potential outliers. |
| Complex Word Count | 21 to 440 | 202.13 | 103.91 | The average count of complex words per article is 202.13. |
| Word Count | 91 to 1998 | 634.86 | 315.96 | The average word count is 634.86, but the minimum and maximum sit at two extremes, with a large standard deviation of roughly 316. |
| Syllables per Word | 2 to 3 | 2.02 | 0.13 | The average is 2.02 syllables per word, suggesting that most words have two syllables. |
| Personal Pronouns | 0 to 46 | 7.26 | 8.51 | Articles use 7.26 personal pronouns on average; the maximum of 46 points to an unusually subjective article. |
| Average Word Length | 5 to 8 | 6.84 | 0.50 | The average word length is 6.84 characters, with word lengths ranging from 5 to 8 characters. |

Insights:

  1. Sentiment Analysis: The positive and negative scores are moderately distributed, indicating a mix of positive and negative sentiment. Polarity score is slightly positive on average.
  2. Subjectivity and Complexity: The subjectivity score suggests that the content tends to be objective. The percentage of complex words and Fog index both indicate a moderate level of textual complexity.
  3. Text Length: The average number of words per sentence varies significantly across entries, suggesting diversity in sentence lengths.
  4. Word Characteristics: Most words have around 2 syllables, and the average word length is approximately 6.84 characters.
  5. Personal Pronouns: Entries use an average of 7.26 personal pronouns, which might indicate a moderate level of subjectivity.
  6. Range and Variation: Understanding the ranges helps identify outliers and assess the spread of the data; a quick IQR-based check is sketched below.
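
As a supplement (not part of the original notebook), a minimal IQR-based check can flag which articles fall outside 1.5 × IQR on any numeric feature:

In [ ]:
numeric_cols = output_df.select_dtypes(include=np.number).columns

q1 = output_df[numeric_cols].quantile(0.25)
q3 = output_df[numeric_cols].quantile(0.75)
iqr = q3 - q1

# Flag a row if any feature lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
outside = (output_df[numeric_cols] < (q1 - 1.5 * iqr)) | (output_df[numeric_cols] > (q3 + 1.5 * iqr))
flagged = output_df.loc[outside.any(axis=1), ['url_id'] + list(numeric_cols)]

print(f"{len(flagged)} articles flagged as potential outliers on at least one feature")
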

Let's now dive deeper and look at the distributions of the sentiment features and the textual ones.

Understanding the Distribution of Variables¶

In [21]:
sentiment_features = ['positive_score', 'negative_score', 'polarity_score', 'subjectivity_score']

num_plots = len(sentiment_features)

num_rows = (num_plots + 1) // 2  # Number of rows for subplots

fig, axes = plt.subplots(num_rows, 2, figsize=(12, 5 * num_rows))

for i, feature in enumerate(sentiment_features):

    row = i // 2
    col = i % 2
    ax = axes[row, col]

    sns.histplot(output_df[feature], ax=ax, bins=20, color=random.choice(['green', 'red', 'blue', 'purple', 'orange', 'grey']))
    ax.set_xlabel(feature.replace('_', ' ').title())
    ax.set_ylabel("Frequency")

# Remove any empty subplots
if num_plots < num_rows * 2:
    fig.delaxes(axes[num_rows-1, 1])

plt.tight_layout()
plt.show()
[Image: Histograms showing the distribution of each sentiment feature]

Insights:

  • Positive scores are close to normally distributed, which is a good sign that most articles stayed moderately positive.

  • Negative scores tend to be right-skewed, meaning most articles have low negative scores, which is also a healthy trend. What is concerning, however, is that roughly 30-40 articles appear to be strongly negative.

  • Polarity mostly stays in the middle of the range, striking a balance between positive and negative sentiment, while a handful of articles (4 or so) sit close to 1, i.e. highly positive.

  • Subjectivity scores show that the articles are largely objective: the positive and negative word counts are small relative to the total number of words. This may simply be because only a few dictionary words occur in the articles, so expanding the dictionaries could reveal more meaningful trends (the coverage check below looks at this).
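
The coverage hypothesis from the last bullet can be checked directly. A small supplementary sketch (not in the original notebook): measure what share of the corpus vocabulary, cleaned the same way as in compute_sentiment_scores, appears in either dictionary.

In [ ]:
# Build the corpus vocabulary using the same cleaning steps as compute_sentiment_scores
vocab = set()
for file_name in os.listdir("Scraped Blogs"):
    with open(os.path.join("Scraped Blogs", file_name), 'r', encoding='utf-8') as f:
        cleaned = ''.join(char for char in f.read() if char not in punctuation)
    vocab.update(word.lower() for word in word_tokenize(cleaned)
                 if word.isalpha() and word.lower() not in stop_words)

covered = vocab & (positive_words | negative_words)
print(f"Distinct words in the corpus: {len(vocab)}")
print(f"Covered by either dictionary: {len(covered)} ({len(covered) / len(vocab):.1%})")
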

In [22]:
textual_features = ['complex_word_count', 'percentage_of_complex_words', 'fog_index', 'avg_number_of_words', 'word_count', 'syllable_per_word', 'personal_pronouns', 'avg_word_length']

num_plots = len(textual_features)

num_rows = (num_plots + 1) // 2  # Number of rows for subplots

fig, axes = plt.subplots(num_rows, 2, figsize=(12, 5 * num_rows))

for i, feature in enumerate(textual_features):

    row = i // 2
    col = i % 2
    ax = axes[row, col]

    sns.histplot(output_df[feature], ax=ax, bins=20, color='darkblue')
    ax.set_xlabel(feature.replace('_', ' ').title())
    ax.set_ylabel("Frequency")

# Remove any empty subplots
if num_plots < num_rows * 2:
    fig.delaxes(axes[num_rows-1, 1])

plt.tight_layout()
plt.show()
[Image: Histograms showing the distribution of each textual feature]

Insights:

Complex word count: Close to normally distributed, meaning most articles contain a moderate number of complex words.

Percentage of complex words: Tells the same story as the raw complex word count. Most articles sit near the centre of the distribution, i.e. the share of complex words is moderate for the majority of articles.

Fog index: The fog index has an outlier that makes the distribution look slightly right-skewed. Removing that outlier, the picture is that most articles have a moderate reading complexity.

Average no. of words: There are two outliers, one at around 40 and another at around 100 words per sentence, which may be due to inconsistent punctuation causing the sentence tokenizer to treat 3-4 lines as a single sentence (the sketch after these notes pulls up the suspect articles). Excluding those, most articles average roughly 10 words per sentence.

Word count: There is an outlier here as well, at close to 2,000 words per article, almost twice as long as the next-longest article. Most articles fall between 300 and 1,000 words, with a fair number of articles spread across the wider 100-1,250 range.

Syllables per word: One or two articles average three syllables per word; the rest average two.

Personal pronouns: The per-article pronoun count is strongly right-skewed, with 4-5 outliers at the far right. Even after removing them, the distribution would stay right-skewed: roughly 35% of articles use 2-3 personal pronouns, around 20% use 4-5, about 15% use 5-6, about 13% use 7-8, and so on. So a fair share of articles carry subjective opinions.

Average word length: Most articles average 6-7 characters per word, with a few outliers at 5 or 8 characters, so the overall range is 5-8 characters per word.
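
To follow up on the tokenisation concern under "Average no. of words" (a supplementary sketch, not part of the original notebook), the two articles with the highest words-per-sentence can be pulled up and their longest detected sentences inspected:

In [ ]:
suspects = output_df.nlargest(2, 'avg_number_of_words')[['url_id', 'avg_number_of_words']]
print(suspects)

for url_id in suspects['url_id']:
    with open(os.path.join("Scraped Blogs", f"{url_id}.txt"), 'r', encoding='utf-8') as f:
        sentences = sent_tokenize(f.read())
    longest = max(sentences, key=len)
    # A very long "sentence" usually means missing full stops merged several lines into one
    print(f"URL_ID {url_id}: longest detected sentence has {len(longest.split())} words")
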

Correlation Analysis¶

In [23]:
num_cols = [col for col in output_df.columns if col not in ['url_id', 'url', 'title']]
In [24]:
corr_matrix = output_df[num_cols].corr()

fig = plt.figure(figsize = (9, 4))

sns.heatmap(corr_matrix, cmap = 'Reds')
Out[24]:
<Axes: >
[Image: Correlation heatmap of the numeric features]

Although the heatmap conveys some information, it is hard to draw solid conclusions from the visual alone. So let's dig a little deeper and label each correlation explicitly.

In [25]:
# Defining the ranges for different correlation labels
ranges = {
    'Highly positively correlated': (0.7, 1.0),
    'Low Positive Correlation': (0.3, 0.7),
    'No Correlation': (-0.3, 0.3),
    'Low Negative Correlation': (-0.7, -0.3),
    'Highly Negatively Correlated': (-1.0, -0.7)
}

labeled_corr_matrix = pd.DataFrame(index=corr_matrix.index, columns=corr_matrix.columns)

for i in range(corr_matrix.shape[0]):
    for j in range(corr_matrix.shape[1]):
        # Getting the value of the current cell
        value = corr_matrix.iloc[i, j]
        
        # Finding the label for the current value
        label = None
        for k, v in ranges.items():
            if v[0] <= value < v[1]:
                label = k
                break
        
        # Setting the label for the current cell in the labeled correlation matrix
        labeled_corr_matrix.iloc[i, j] = label
In [26]:
labeled_corr_matrix = labeled_corr_matrix.fillna('')

label_lists = {
    'Highly positively correlated': [],
    'Low Positive Correlation': [],
    'No Correlation': [],
    'Low Negative Correlation': [],
    'Highly Negatively Correlated': []
}

for label in label_lists.keys():
    
    if label in label_lists:                                       # Checking if the current label is in the label_lists dictionary
        pairs = np.where(labeled_corr_matrix == label)             # Finding all pairs of columns that have this label
        
        for i in range(len(pairs[0])):                             # Iterating over each pair of columns
            
            col1 = labeled_corr_matrix.columns[pairs[0][i]]        # Getting the names of the columns in this pair
            col2 = labeled_corr_matrix.index[pairs[1][i]]
            
            if (col2, col1) not in label_lists[label]:
                label_lists[label].append((col1, col2))            # Adding this pair of columns to the list for this label


for label, lst in label_lists.items():
    print(f"{label}:\n")
    for pair in lst:
        print(f"  {pair}\n")
Highly positively correlated:

  ('positive_score', 'complex_word_count')

  ('percentage_of_complex_words', 'avg_word_length')

  ('fog_index', 'avg_number_of_words')

  ('complex_word_count', 'word_count')

Low Positive Correlation:

  ('positive_score', 'polarity_score')

  ('positive_score', 'word_count')

  ('negative_score', 'subjectivity_score')

  ('negative_score', 'complex_word_count')

  ('negative_score', 'word_count')

  ('subjectivity_score', 'personal_pronouns')

  ('percentage_of_complex_words', 'fog_index')

  ('percentage_of_complex_words', 'complex_word_count')

  ('fog_index', 'avg_word_length')

  ('complex_word_count', 'avg_word_length')

  ('syllable_per_word', 'avg_word_length')

No Correlation:

  ('positive_score', 'negative_score')

  ('positive_score', 'subjectivity_score')

  ('positive_score', 'percentage_of_complex_words')

  ('positive_score', 'fog_index')

  ('positive_score', 'avg_number_of_words')

  ('positive_score', 'syllable_per_word')

  ('positive_score', 'personal_pronouns')

  ('positive_score', 'avg_word_length')

  ('negative_score', 'percentage_of_complex_words')

  ('negative_score', 'fog_index')

  ('negative_score', 'avg_number_of_words')

  ('negative_score', 'syllable_per_word')

  ('negative_score', 'personal_pronouns')

  ('negative_score', 'avg_word_length')

  ('polarity_score', 'percentage_of_complex_words')

  ('polarity_score', 'fog_index')

  ('polarity_score', 'avg_number_of_words')

  ('polarity_score', 'complex_word_count')

  ('polarity_score', 'word_count')

  ('polarity_score', 'syllable_per_word')

  ('polarity_score', 'personal_pronouns')

  ('polarity_score', 'avg_word_length')

  ('subjectivity_score', 'percentage_of_complex_words')

  ('subjectivity_score', 'fog_index')

  ('subjectivity_score', 'avg_number_of_words')

  ('subjectivity_score', 'complex_word_count')

  ('subjectivity_score', 'word_count')

  ('subjectivity_score', 'syllable_per_word')

  ('subjectivity_score', 'avg_word_length')

  ('percentage_of_complex_words', 'avg_number_of_words')

  ('percentage_of_complex_words', 'word_count')

  ('percentage_of_complex_words', 'syllable_per_word')

  ('fog_index', 'complex_word_count')

  ('fog_index', 'word_count')

  ('fog_index', 'syllable_per_word')

  ('fog_index', 'personal_pronouns')

  ('avg_number_of_words', 'complex_word_count')

  ('avg_number_of_words', 'word_count')

  ('avg_number_of_words', 'syllable_per_word')

  ('avg_number_of_words', 'personal_pronouns')

  ('avg_number_of_words', 'avg_word_length')

  ('complex_word_count', 'syllable_per_word')

  ('complex_word_count', 'personal_pronouns')

  ('word_count', 'syllable_per_word')

  ('word_count', 'personal_pronouns')

  ('word_count', 'avg_word_length')

  ('syllable_per_word', 'personal_pronouns')

Low Negative Correlation:

  ('negative_score', 'polarity_score')

  ('polarity_score', 'subjectivity_score')

  ('percentage_of_complex_words', 'personal_pronouns')

  ('personal_pronouns', 'avg_word_length')

Highly Negatively Correlated:

Positive Trends:

  1. Higher complex word count is associated with more positive scores.
  2. Articles with a greater percentage of complex words tend to have longer average word lengths.
  3. As fog index increases, average number of words also increases.
  4. Positive scores have a moderate positive correlation with polarity scores.

Negative Trends:

  1. Negative scores show a moderate positive correlation with complex word count.
  2. Negative scores have a moderate positive correlation with word count.
  3. Negative scores are mildly negatively correlated with polarity scores, and mildly positively correlated with subjectivity scores.
  4. Higher percentage of complex words is associated with lower use of personal pronouns.

No Clear Trends:

  1. There is no strong correlation between positive and negative scores.
  2. No distinct correlation between polarity scores and complexity or subjectivity.
  3. Complex word count and personal pronouns don't show a clear relationship.
  4. No notable link between the fog index or the percentage of complex words and negative scores.
  5. No distinct correlation between complexity features and subjectivity scores.

Miscellaneous:

  1. Subjectivity scores and personal pronouns show a low positive correlation.
  2. Fog index and average word length have a moderate positive correlation.
  3. Complex word count and average word length have a moderate positive correlation.
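
These groupings can be cross-checked against the raw numbers. A small supplementary cell (not in the original notebook) unstacks the upper triangle of the correlation matrix and sorts the strongest pairs by absolute value:

In [ ]:
# Keep only the upper triangle (excluding the diagonal) to avoid duplicate pairs
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

strongest_pairs = upper.stack().sort_values(key=abs, ascending=False)
print(strongest_pairs.head(10).round(2))
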

Word Cloud Based Analysis¶

In [27]:
top_positive_url_ids = output_df.nlargest(10, 'positive_score')['url_id'].tolist()

top_negative_url_ids = output_df.nlargest(10, 'negative_score')['url_id'].tolist()

def get_content_from_file(url_id):
    file_path = os.path.join('Scraped Blogs', f'{url_id}.txt')
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    return content

positive_content = ' '.join([get_content_from_file(url_id) for url_id in top_positive_url_ids])

negative_content = ' '.join([get_content_from_file(url_id) for url_id in top_negative_url_ids])

print('\n\n')
positive_wordcloud = WordCloud(width=1000, height=500, background_color='white').generate(positive_content)
plt.figure(figsize=(10, 5))
plt.imshow(positive_wordcloud, interpolation='bilinear')
plt.title('Word Cloud for Top 10 Positive Articles', y=1.1)
plt.axis('off')
plt.show()

print('\n\n')

negative_wordcloud = WordCloud(width=1000, height=500, background_color='black').generate(negative_content)
plt.figure(figsize=(10, 5))
plt.imshow(negative_wordcloud, interpolation='bilinear')
plt.title('Word Cloud for Top 10 Negative Articles', y=1.1)
plt.axis('off')
plt.show()

[Image: Word cloud for the top 10 positive articles]

[Image: Word cloud for the top 10 negative articles]

Insights:

  • The top 10 positive articles are associated with words like AI, Human, Data, Machine Learning, Job, Work, Artificial Intelligence, Information, Algorithm, Skill and Computer.

  • The top 10 negative articles feature words like People, COVID, India, Pandemic, Government, Time, Financial, Economic, Video Game, China, Crisis, Impact, Health, Disease and Virus.

Both seem reasonable given the period, around 2021, when most of these articles appear to have been published.

In [28]:
all_titles = ' '.join(output_df['title'])

titles_wordcloud = WordCloud(width=1000, height=500, background_color='yellow').generate(all_titles)
plt.figure(figsize=(10, 5))
plt.imshow(titles_wordcloud, interpolation='bilinear')
plt.title('Word Cloud for All Titles', y=1.1)
plt.axis('off')
plt.show()
[Image: Word cloud of all article titles]

Insights:

  • Most articles are written on topics related to COVID / the coronavirus, and on job-related trends such as AI, ML and other career topics.

  • There are a few articles related to Finance, Marketing and Healthcare as well.