Blog Scraping and Analysis¶

Importing Necessary Dependencies¶

In [1]:
from warnings import filterwarnings
filterwarnings("ignore")

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords as nltk_stopwords
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from string import punctuation
from bs4 import BeautifulSoup
from datetime import datetime
import seaborn as sns
import pandas as pd
import numpy as np
import requests
import random
import docx
import nltk
import io
import os
import re

Scraping and Output Generation Section¶

Reading the Input File¶

In [2]:
input_df = pd.read_excel(r'Dataset/Input.xlsx')
input_df.head()
Out[2]:
URL_ID URL
0 37 https://insights.blackcoffer.com/ai-in-healthc...
1 38 https://insights.blackcoffer.com/what-if-the-c...
2 39 https://insights.blackcoffer.com/what-jobs-wil...
3 40 https://insights.blackcoffer.com/will-machine-...
4 41 https://insights.blackcoffer.com/will-ai-repla...
In [3]:
input_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   URL_ID  114 non-null    int64 
 1   URL     114 non-null    object
dtypes: int64(1), object(1)
memory usage: 1.9+ KB

Scraping HTML and Parsing it to get just the Title and Content¶

In [4]:
output_dir = "Scraped Blogs"
os.makedirs(output_dir, exist_ok=True)

for index, row in input_df.iterrows():
    url = row['URL']
    url_id = row['URL_ID']

    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        title = soup.title.string.replace(' | Blackcoffer Insights', '')

        if title != "Page not found":

            # Removing the specified title elements from the HTML tree
            title_classes = ['tdb-entry-crumb', 'tdb-bred-no-url-last', 'tdb-title-text']
            for title_class in title_classes:
                title_elements = soup.find_all(True, {'class': title_class})
                for title_element in title_elements:
                    title_element.decompose()
            
            # Removing the div elements that we don't want from the HTML tree
            div_classes = [
                'td_block_wrap tdb_single_next_prev tdi_139 td-animation-stack td-pb-border-top td_block_template_1',
                'tdb-author-box td_block_wrap tdb_single_author_box tdi_140 tdb-content-vert-top td-pb-border-top td_block_template_1',
                'td_block_wrap tdb_single_author tdi_124 td-pb-border-top td_block_template_1 tdb-post-meta',
                'td_block_wrap tdb_single_date tdi_125 td-pb-border-top td_block_template_1 tdb-post-meta',
                'td_block_wrap tdb_single_comments_count tdi_126 td-pb-border-top td_block_template_1 tdb-post-meta',
                'td_block_wrap tdb_single_post_views tdi_127 td-pb-border-top td_block_template_1 tdb-post-meta'
            ]
            for div_class in div_classes:
                div_elements = soup.find_all('div', {'class': div_class})
                for div_element in div_elements:
                    div_element.decompose()
            
            # Removing the ul element from the HTML tree
            ul_element = soup.find('ul', {'class': 'tdb-tags'})
            if ul_element:
                ul_element.decompose()
            
            # Removing the preformatted block from the HTML tree
            preformatted_block = soup.find('pre', {'class': 'wp-block-preformatted'})
            if preformatted_block:
                preformatted_block.decompose()
            
            # Initialising content up front so it is always defined, even if neither container is found
            content = ''

            content_div = soup.find('div', {'id': 'tdb-autoload-article'})
            if content_div:
                for div in content_div.find_all('div', {'class': 'tdb-block-inner td-fix-index'}):
                    content += div.text + '\n'
            else:
                content_div = soup.find('div', {'class': 'td-post-content tagdiv-type'})
                if content_div:
                    content = content_div.text
        
            output_file_path = os.path.join(output_dir, f"{url_id}.txt")
            
            with open(output_file_path, 'w', encoding='utf-8') as f:
                f.write(f"Title: {title}\n\nContent:\n{content.strip()}")
            
    except requests.exceptions.RequestException as e:
        print(f"Error processing URL_ID {url_id}: {e}")

Creating the Stopword Set and the Positive/Negative Dictionaries¶

In [5]:
master_dict_folder = r"Dataset/MasterDictionary"
stop_words_folder = r"Dataset/StopWords"

positive_words = set()
negative_words = set()
stop_words = set()

def read_words_from_docx(file_path):
    doc = docx.Document(file_path)
    words = {paragraph.text.strip().lower() for paragraph in doc.paragraphs if paragraph.text.strip()}
    return words

positive_file_path = os.path.join(master_dict_folder, "positive-words.docx")
negative_file_path = os.path.join(master_dict_folder, "negative-words.docx")

positive_words = read_words_from_docx(positive_file_path)
negative_words = read_words_from_docx(negative_file_path)

for filename in os.listdir(stop_words_folder):
    if filename.endswith(".docx"):
        stop_words_file_path = os.path.join(stop_words_folder, filename)
        stop_words.update(read_words_from_docx(stop_words_file_path))

nltk_stop_words = set(nltk_stopwords.words('english'))
stop_words.update(nltk_stop_words)
stop_words.update(ENGLISH_STOP_WORDS)

print(f"Positive Words({len(positive_words)}):{list(positive_words)[:7]}")
print(f"Negative Words({len(negative_words)}):{list(negative_words)[:7]}")
print(f"Stop Words({len(stop_words)}): {list(stop_words)[:7]}")
Positive Words(2006):['jolly', 'richness', 'easy', 'admire', 'suave', 'propitious', 'enchant']
Negative Words(4783):['overdo', 'insidiously', 'fuss', 'clique', 'avariciously', 'chintzy', 'hazardous']
Stop Words(12809): ['randolph', 'jolly', 'georgeann', 'herrmann', 'cardwell', 'levasseur', 'krueger']

Sentiment Scores Computation¶

In [6]:
def compute_sentiment_scores(text):

    text = ''.join(char for char in text if char not in punctuation)
    
    words = word_tokenize(text)
    
    words = [word.lower() for word in words if word.isalpha() and word.lower() not in stop_words]
    
    positive_score = sum(1 for word in words if word in positive_words)
    negative_score = sum(1 for word in words if word in negative_words)
    
    polarity_score = round((positive_score - negative_score) / ((positive_score + negative_score) + 0.000001), 2)
    
    subjectivity_score = round((positive_score + negative_score) / ((len(words)) + 0.000001), 2)
    
    return positive_score, negative_score, polarity_score, subjectivity_score
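
As a quick, supplementary sanity check (this cell is not part of the original notebook), the scoring function can be run on a short made-up sentence; the exact counts depend on the dictionaries loaded above.

In [ ]:
# Hypothetical example sentence -- not taken from the scraped articles
sample_text = "The team delivered a brilliant, reliable product, despite a painful and risky delay."

pos, neg, pol, subj = compute_sentiment_scores(sample_text)

# polarity     = (pos - neg) / (pos + neg + 0.000001)           -> bounded between -1 and +1
# subjectivity = (pos + neg) / (cleaned word count + 0.000001)  -> bounded between 0 and 1
print(f"Positive: {pos}, Negative: {neg}, Polarity: {pol}, Subjectivity: {subj}")
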
In [7]:
input_dir = "Scraped Blogs"

sentiment_scores = {}

for file_name in os.listdir(input_dir):
    url_id = os.path.splitext(file_name)[0]
    
    with open(os.path.join(input_dir, file_name), 'r', encoding='utf-8') as f:
        content = f.read()
    
    positive_score, negative_score, polarity_score, subjectivity_score = compute_sentiment_scores(content)
    
    sentiment_scores[url_id] = {
        'positive_score': positive_score,
        'negative_score': negative_score,
        'polarity_score': polarity_score,
        'subjectivity_score': subjectivity_score
    }

Textual Variables Computation¶

In [8]:
def count_syllables(word):

    count = 0
    vowels = "aeiouAEIOU"
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    
    if word.endswith("es") or word.endswith("ed"):
        count -= 1
    if count == 0:
        count += 1
    
    return count
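
A brief illustration of the heuristic (added here; not in the original notebook): vowel groups are counted, trailing "es"/"ed" endings subtract one, and every word is credited with at least one syllable.

In [ ]:
# Illustrative words only -- chosen to exercise each branch of the heuristic
for word in ["data", "machine", "automated", "analyses", "strength"]:
    print(f"{word}: {count_syllables(word)} syllable(s)")
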
In [9]:
def compute_variables(text):

    sentences = sent_tokenize(text)
    text = ''.join(char for char in text if char not in punctuation)
    
    stop_words = set(nltk_stopwords.words('english'))
    words = [word.lower() for word in word_tokenize(text) if word.isalpha() and word.lower() not in stop_words]
    
    avg_sentence_length = avg_number_of_words = int(round(len(words) / len(sentences), 0))

    num_syllables = 0
    num_complex_words = 0
    num_chars = 0
    
    for word in words:
        num_syllables_in_word = count_syllables(word)
        if num_syllables_in_word > 2:
            num_complex_words += 1
        
        num_syllables += num_syllables_in_word
        num_chars += len(word)

    complex_word_count = num_complex_words
    total_word_count = len(words)
    percentage_complex_words = round((complex_word_count / total_word_count) * 100, 2)
    avg_syllable_per_word = int(round(num_syllables / total_word_count, 0))
    avg_word_length = int(round(num_chars / total_word_count, 0))
    
    fog_index = round(0.4 * (avg_sentence_length + percentage_complex_words), 2)
    
    personal_pronouns_pattern = r'\bI\b|\bwe\b|\bmy\b|\bours\b|\bus\b(?!A)'
    personal_pronouns_count = len(re.findall(personal_pronouns_pattern, text, re.IGNORECASE))
    
    return avg_sentence_length, percentage_complex_words, fog_index, avg_number_of_words, complex_word_count,\
            total_word_count, avg_syllable_per_word, personal_pronouns_count, avg_word_length
In [10]:
input_dir = "Scraped Blogs"

text_analysis_vars = {}

for file_name in os.listdir(input_dir):

    url_id = os.path.splitext(file_name)[0]
    
    with open(os.path.join(input_dir, file_name), 'r', encoding='utf-8') as f:
        content = f.read()
    
    avg_sentence_length, percentage_complex_words, fog_index, avg_number_of_words, complex_word_count,\
            total_word_count, avg_syllable_per_word, personal_pronouns_count, avg_word_length = compute_variables(content)
    
    text_analysis_vars[url_id] = {
                                'avg_sentence_length': avg_sentence_length,
                                'percentage_of_complex_words': percentage_complex_words,
                                'fog_index': fog_index,
                                'avg_number_of_words': avg_number_of_words,
                                'complex_word_count': complex_word_count,
                                'word_count': total_word_count,
                                'syllable_per_word': avg_syllable_per_word,
                                'personal_pronouns': personal_pronouns_count,
                                'avg_word_length': avg_word_length
                                }

Merging Variables with the Input Dataframe¶

In [11]:
sentiment_scores_df = pd.DataFrame.from_dict(sentiment_scores, orient='index').reset_index()
sentiment_scores_df = sentiment_scores_df.rename(columns={'index': 'URL_ID'})
sentiment_scores_df['URL_ID'] = sentiment_scores_df['URL_ID'].astype('int64')

text_analysis_vars_df = pd.DataFrame.from_dict(text_analysis_vars, orient='index').reset_index()
text_analysis_vars_df = text_analysis_vars_df.rename(columns={'index': 'URL_ID'})
text_analysis_vars_df['URL_ID'] = text_analysis_vars_df['URL_ID'].astype('int64')
In [12]:
output_df = input_df.merge(sentiment_scores_df, on='URL_ID', how='inner')
output_df = output_df.merge(text_analysis_vars_df, on='URL_ID', how='inner')

output_df.columns = ['URL_ID'] + [col.upper().replace('_', ' ') for col in output_df.columns if col != 'URL_ID']

output_df.head(3)
Out[12]:
URL_ID URL POSITIVE SCORE NEGATIVE SCORE POLARITY SCORE SUBJECTIVITY SCORE AVG SENTENCE LENGTH PERCENTAGE OF COMPLEX WORDS FOG INDEX AVG NUMBER OF WORDS COMPLEX WORD COUNT WORD COUNT SYLLABLE PER WORD PERSONAL PRONOUNS AVG WORD LENGTH
0 37 https://insights.blackcoffer.com/ai-in-healthc... 65 32 0.34 0.10 15 38.87 21.55 15 440 1132 2 1 7
1 38 https://insights.blackcoffer.com/what-if-the-c... 57 37 0.21 0.18 9 27.49 14.60 9 204 742 2 6 7
2 39 https://insights.blackcoffer.com/what-jobs-wil... 65 35 0.30 0.12 12 40.87 21.15 12 405 991 2 3 7
In [13]:
output_df.to_excel('Output Data.xlsx', engine='xlsxwriter', sheet_name='Output', index=False)

Analysis Section¶

Some Additional Feature Engineering¶

In [14]:
# For convenience, reverting the column names back to their original snake_case form

output_df.columns = [col.lower().replace(' ', '_') for col in output_df.columns]

output_df.head(3)
Out[14]:
url_id url positive_score negative_score polarity_score subjectivity_score avg_sentence_length percentage_of_complex_words fog_index avg_number_of_words complex_word_count word_count syllable_per_word personal_pronouns avg_word_length
0 37 https://insights.blackcoffer.com/ai-in-healthc... 65 32 0.34 0.10 15 38.87 21.55 15 440 1132 2 1 7
1 38 https://insights.blackcoffer.com/what-if-the-c... 57 37 0.21 0.18 9 27.49 14.60 9 204 742 2 6 7
2 39 https://insights.blackcoffer.com/what-jobs-wil... 65 35 0.30 0.12 12 40.87 21.15 12 405 991 2 3 7

Let's drop one of avg_sentence_length and avg_number_of_words, since both carry the same information.

In [15]:
output_df.drop('avg_sentence_length', axis=1, inplace=True)
In [16]:
input_dir = "Scraped Blogs"

output_df.insert(2, 'title', '')

for file_name in os.listdir(input_dir):
    file_path = os.path.join(input_dir, file_name)
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        
        match = re.search(r'Title:\s*(.+)', content)
        if match:
            title = match.group(1)
            url_id = int(os.path.splitext(file_name)[0])
            output_df.loc[output_df['url_id'] == url_id, 'title'] = title
            
output_df.head(3)
Out[16]:
url_id url title positive_score negative_score polarity_score subjectivity_score percentage_of_complex_words fog_index avg_number_of_words complex_word_count word_count syllable_per_word personal_pronouns avg_word_length
0 37 https://insights.blackcoffer.com/ai-in-healthc... AI in healthcare to Improve Patient Outcomes 65 32 0.34 0.10 38.87 21.55 15 440 1132 2 1 7
1 38 https://insights.blackcoffer.com/what-if-the-c... What if the Creation is Taking Over the Creator? 57 37 0.21 0.18 27.49 14.60 9 204 742 2 6 7
2 39 https://insights.blackcoffer.com/what-jobs-wil... What Jobs Will Robots Take From Humans in The ... 65 35 0.30 0.12 40.87 21.15 12 405 991 2 3 7
In [17]:
output_df.title.isnull().sum()
Out[17]:
0

Summary Statistics¶

In [18]:
output_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 111 entries, 0 to 110
Data columns (total 15 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   url_id                       111 non-null    int64  
 1   url                          111 non-null    object 
 2   title                        111 non-null    object 
 3   positive_score               111 non-null    int64  
 4   negative_score               111 non-null    int64  
 5   polarity_score               111 non-null    float64
 6   subjectivity_score           111 non-null    float64
 7   percentage_of_complex_words  111 non-null    float64
 8   fog_index                    111 non-null    float64
 9   avg_number_of_words          111 non-null    int64  
 10  complex_word_count           111 non-null    int64  
 11  word_count                   111 non-null    int64  
 12  syllable_per_word            111 non-null    int64  
 13  personal_pronouns            111 non-null    int64  
 14  avg_word_length              111 non-null    int64  
dtypes: float64(4), int64(9), object(2)
memory usage: 13.9+ KB
In [19]:
output_df.url_id = output_df.url_id.astype('O')
In [20]:
output_df.describe(include=np.number)
Out[20]:
positive_score negative_score polarity_score subjectivity_score percentage_of_complex_words fog_index avg_number_of_words complex_word_count word_count syllable_per_word personal_pronouns avg_word_length
count 111.000000 111.000000 111.000000 111.000000 111.000000 111.000000 111.000000 111.000000 111.000000 111.000000 111.000000 111.000000
mean 28.189189 30.684685 0.016216 0.122342 31.765946 18.029369 13.306306 202.126126 634.864865 2.018018 7.261261 6.837838
std 17.404552 22.092361 0.435650 0.042532 5.868901 4.389579 9.265569 103.907749 315.956455 0.133620 8.510648 0.496177
min 0.000000 0.000000 -1.000000 0.030000 15.830000 10.490000 6.000000 21.000000 91.000000 2.000000 0.000000 5.000000
25% 16.000000 13.500000 -0.325000 0.100000 28.620000 15.780000 10.000000 131.000000 399.500000 2.000000 2.000000 7.000000
50% 25.000000 26.000000 -0.040000 0.110000 31.720000 17.540000 12.000000 189.000000 600.000000 2.000000 4.000000 7.000000
75% 36.000000 43.000000 0.380000 0.150000 34.475000 19.875000 14.000000 286.500000 890.000000 2.000000 10.000000 7.000000
max 85.000000 93.000000 1.000000 0.250000 46.810000 53.280000 101.000000 440.000000 1998.000000 3.000000 46.000000 8.000000

Insights from Summary Statistics:¶

| Measure | Range | Mean | Standard Deviation | Insights |
|---|---|---|---|---|
| Positive Score | 0 to 85 | 28.19 | 17.40 | The average positive score is 28.19, indicating a moderate level of positive sentiment in the data. |
| Negative Score | 0 to 93 | 30.68 | 22.09 | The average negative score is 30.68, indicating a moderate level of negative sentiment. |
| Polarity Score | -1 to 1 | 0.02 | 0.44 | The average polarity score is 0.02, suggesting a slight positive bias. |
| Subjectivity Score | 0.03 to 0.25 | 0.12 | 0.04 | The average subjectivity score is 0.12, indicating a relatively objective tone; scores range from 0.03 (objective) to 0.25 (more subjective). |
| Percentage of Complex Words | 15.83% to 46.81% | 31.77 | 5.87 | On average, around 31.77% of words are complex. |
| Fog Index | 10.49 to 53.28 | 18.03 | 4.39 | The average Fog Index is 18.03, indicating that the articles are, on the whole, written at a moderately complex reading level. |
| Average Number of Words | 6 to 101 | 13.31 | 9.27 | The average number of words per sentence is 13.31, with a wide spread from 6 to 101 words, suggesting potential outliers. |
| Complex Word Count | 21 to 440 | 202.13 | 103.91 | The average count of complex words per article is 202.13. |
| Word Count | 91 to 1998 | 634.86 | 315.96 | The average word count is 634.86, but the minimum and maximum sit at two extremes, with a large standard deviation of roughly 316. |
| Syllables per Word | 2 to 3 | 2.02 | 0.13 | The average is 2.02 syllables per word, suggesting that most words have two syllables. |
| Personal Pronouns | 0 to 46 | 7.26 | 8.51 | Articles use 7.26 personal pronouns on average; the maximum of 46 points to an unusually subjective article. |
| Average Word Length | 5 to 8 | 6.84 | 0.50 | The average word length is 6.84 characters, with word lengths ranging from 5 to 8 characters. |

Insights:

  1. Sentiment Analysis: The positive and negative scores are moderately distributed, indicating a mix of positive and negative sentiment. Polarity score is slightly positive on average.
  2. Subjectivity and Complexity: The subjectivity score suggests that the content tends to be objective. The percentage of complex words and Fog index both indicate a moderate level of textual complexity.
  3. Text Length: The average number of words per sentence varies significantly across entries, suggesting diversity in sentence lengths.
  4. Word Characteristics: Most words have around 2 syllables, and the average word length is approximately 6.84 characters.
  5. Personal Pronouns: Entries use an average of 7.26 personal pronouns, which might indicate a moderate level of subjectivity.
  6. Range and Variation: Understanding the ranges helps identify outliers and assess the spread of the data; a quick IQR-based check is sketched below.
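
As a supplement (not part of the original notebook), a minimal IQR-based check can flag which articles fall outside 1.5 × IQR on any numeric feature:

In [ ]:
numeric_cols = output_df.select_dtypes(include=np.number).columns

q1 = output_df[numeric_cols].quantile(0.25)
q3 = output_df[numeric_cols].quantile(0.75)
iqr = q3 - q1

# Flag a row if any feature lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
outside = (output_df[numeric_cols] < (q1 - 1.5 * iqr)) | (output_df[numeric_cols] > (q3 + 1.5 * iqr))
flagged = output_df.loc[outside.any(axis=1), ['url_id'] + list(numeric_cols)]

print(f"{len(flagged)} articles flagged as potential outliers on at least one feature")
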

Let's now dive deeper and look at the distributions of the sentiment features and the textual ones.

Understanding the Distribution of Variables¶

In [21]:
sentiment_features = ['positive_score', 'negative_score', 'polarity_score', 'subjectivity_score']

num_plots = len(sentiment_features)

num_rows = (num_plots + 1) // 2  # Number of rows for subplots

fig, axes = plt.subplots(num_rows, 2, figsize=(12, 5 * num_rows))

for i, feature in enumerate(sentiment_features):

    row = i // 2
    col = i % 2
    ax = axes[row, col]

    sns.histplot(output_df[feature], ax=ax, bins=20, color=random.choice(['green', 'red', 'blue', 'purple', 'orange', 'grey']))
    ax.set_xlabel(feature.replace('_', ' ').title())
    ax.set_ylabel("Frequency")

# Remove any empty subplots
if num_plots < num_rows * 2:
    fig.delaxes(axes[num_rows-1, 1])

plt.tight_layout()
plt.show()
[Image: Histograms showing the distribution of each sentiment feature]

Insights:

  • Positive scores are close to normally distributed, which is a good sign that most articles stayed moderately positive.

  • Negative scores tend to be right-skewed, meaning most articles have low negative scores, which is also a healthy trend. What is concerning, however, is that roughly 30-40 articles appear to be strongly negative.

  • Polarity mostly stays in the middle of the range, striking a balance between positive and negative sentiment, while a handful of articles (4 or so) sit close to 1, i.e. highly positive.

  • Subjectivity scores show that the articles are largely objective: the positive and negative word counts are small relative to the total number of words. This may simply be because only a few dictionary words occur in the articles, so expanding the dictionaries could reveal more meaningful trends (the coverage check below looks at this).
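
The coverage hypothesis from the last bullet can be checked directly. A small supplementary sketch (not in the original notebook): measure what share of the corpus vocabulary, cleaned the same way as in compute_sentiment_scores, appears in either dictionary.

In [ ]:
# Build the corpus vocabulary using the same cleaning steps as compute_sentiment_scores
vocab = set()
for file_name in os.listdir("Scraped Blogs"):
    with open(os.path.join("Scraped Blogs", file_name), 'r', encoding='utf-8') as f:
        cleaned = ''.join(char for char in f.read() if char not in punctuation)
    vocab.update(word.lower() for word in word_tokenize(cleaned)
                 if word.isalpha() and word.lower() not in stop_words)

covered = vocab & (positive_words | negative_words)
print(f"Distinct words in the corpus: {len(vocab)}")
print(f"Covered by either dictionary: {len(covered)} ({len(covered) / len(vocab):.1%})")
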

In [22]:
textual_features = ['complex_word_count', 'percentage_of_complex_words', 'fog_index', 'avg_number_of_words', 'word_count', 'syllable_per_word', 'personal_pronouns', 'avg_word_length']

num_plots = len(textual_features)

num_rows = (num_plots + 1) // 2  # Number of rows for subplots

fig, axes = plt.subplots(num_rows, 2, figsize=(12, 5 * num_rows))

for i, feature in enumerate(textual_features):

    row = i // 2
    col = i % 2
    ax = axes[row, col]

    sns.histplot(output_df[feature], ax=ax, bins=20, color='darkblue')
    ax.set_xlabel(feature.replace('_', ' ').title())
    ax.set_ylabel("Frequency")

# Remove any empty subplots
if num_plots < num_rows * 2:
    fig.delaxes(axes[num_rows-1, 1])

plt.tight_layout()
plt.show()
[Image: Histograms showing the distribution of each textual feature]

Insights:

Complex word count: Close to normally distributed, meaning most articles contain a moderate number of complex words.

Percentage of complex words: Tells the same story as the raw complex word count. Most articles sit near the centre of the distribution, i.e. the share of complex words is moderate for the majority of articles.

Fog index: The fog index has an outlier that makes the distribution look slightly right-skewed. Removing that outlier, the picture is that most articles have a moderate reading complexity.

Average no. of words: There are two outliers, one at around 40 and another at around 100 words per sentence, which may be due to inconsistent punctuation causing the sentence tokenizer to treat 3-4 lines as a single sentence (the sketch after these notes pulls up the suspect articles). Excluding those, most articles average roughly 10 words per sentence.

Word count: There is an outlier here as well, at close to 2,000 words per article, almost twice as long as the next-longest article. Most articles fall between 300 and 1,000 words, with a fair number of articles spread across the wider 100-1,250 range.

Syllables per word: One or two articles average three syllables per word; the rest average two.

Personal pronouns: The per-article pronoun count is strongly right-skewed, with 4-5 outliers at the far right. Even after removing them, the distribution would stay right-skewed: roughly 35% of articles use 2-3 personal pronouns, around 20% use 4-5, about 15% use 5-6, about 13% use 7-8, and so on. So a fair share of articles carry subjective opinions.

Average word length: Most articles average 6-7 characters per word, with a few outliers at 5 or 8 characters, so the overall range is 5-8 characters per word.
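
To follow up on the tokenisation concern under "Average no. of words" (a supplementary sketch, not part of the original notebook), the two articles with the highest words-per-sentence can be pulled up and their longest detected sentences inspected:

In [ ]:
suspects = output_df.nlargest(2, 'avg_number_of_words')[['url_id', 'avg_number_of_words']]
print(suspects)

for url_id in suspects['url_id']:
    with open(os.path.join("Scraped Blogs", f"{url_id}.txt"), 'r', encoding='utf-8') as f:
        sentences = sent_tokenize(f.read())
    longest = max(sentences, key=len)
    # A very long "sentence" usually means missing full stops merged several lines into one
    print(f"URL_ID {url_id}: longest detected sentence has {len(longest.split())} words")
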

Correlation Analysis¶

In [23]:
num_cols = [col for col in output_df.columns if col not in ['url_id', 'url', 'title']]
In [24]:
corr_matrix = output_df[num_cols].corr()

fig = plt.figure(figsize = (9, 4))

sns.heatmap(corr_matrix, cmap = 'Reds')
Out[24]:
<Axes: >
[Image: Correlation heatmap of the numeric features]

Although the heatmap conveys some information, it is hard to draw solid conclusions from the visual alone. So let's dig a little deeper and label each correlation explicitly.

In [25]:
# Defining the ranges for different correlation labels
ranges = {
    'Highly positively correlated': (0.7, 1.0),
    'Low Positive Correlation': (0.3, 0.7),
    'No Correlation': (-0.3, 0.3),
    'Low Negative Correlation': (-0.7, -0.3),
    'Highly Negatively Correlated': (-1.0, -0.7)
}

labeled_corr_matrix = pd.DataFrame(index=corr_matrix.index, columns=corr_matrix.columns)

for i in range(corr_matrix.shape[0]):
    for j in range(corr_matrix.shape[1]):
        # Getting the value of the current cell
        value = corr_matrix.iloc[i, j]
        
        # Finding the label for the current value
        label = None
        for k, v in ranges.items():
            if v[0] <= value < v[1]:
                label = k
                break
        
        # Setting the label for the current cell in the labeled correlation matrix
        labeled_corr_matrix.iloc[i, j] = label
In [26]:
labeled_corr_matrix = labeled_corr_matrix.fillna('')

label_lists = {
    'Highly positively correlated': [],
    'Low Positive Correlation': [],
    'No Correlation': [],
    'Low Negative Correlation': [],
    'Highly Negatively Correlated': []
}

for label in label_lists.keys():
    
    if label in label_lists:                                       # Checking if the current label is in the label_lists dictionary
        pairs = np.where(labeled_corr_matrix == label)             # Finding all pairs of columns that have this label
        
        for i in range(len(pairs[0])):                             # Iterating over each pair of columns
            
            col1 = labeled_corr_matrix.columns[pairs[0][i]]        # Getting the names of the columns in this pair
            col2 = labeled_corr_matrix.index[pairs[1][i]]
            
            if (col2, col1) not in label_lists[label]:
                label_lists[label].append((col1, col2))            # Adding this pair of columns to the list for this label


for label, lst in label_lists.items():
    print(f"{label}:\n")
    for pair in lst:
        print(f"  {pair}\n")
Highly positively correlated:

  ('positive_score', 'complex_word_count')

  ('percentage_of_complex_words', 'avg_word_length')

  ('fog_index', 'avg_number_of_words')

  ('complex_word_count', 'word_count')

Low Positive Correlation:

  ('positive_score', 'polarity_score')

  ('positive_score', 'word_count')

  ('negative_score', 'subjectivity_score')

  ('negative_score', 'complex_word_count')

  ('negative_score', 'word_count')

  ('subjectivity_score', 'personal_pronouns')

  ('percentage_of_complex_words', 'fog_index')

  ('percentage_of_complex_words', 'complex_word_count')

  ('fog_index', 'avg_word_length')

  ('complex_word_count', 'avg_word_length')

  ('syllable_per_word', 'avg_word_length')

No Correlation:

  ('positive_score', 'negative_score')

  ('positive_score', 'subjectivity_score')

  ('positive_score', 'percentage_of_complex_words')

  ('positive_score', 'fog_index')

  ('positive_score', 'avg_number_of_words')

  ('positive_score', 'syllable_per_word')

  ('positive_score', 'personal_pronouns')

  ('positive_score', 'avg_word_length')

  ('negative_score', 'percentage_of_complex_words')

  ('negative_score', 'fog_index')

  ('negative_score', 'avg_number_of_words')

  ('negative_score', 'syllable_per_word')

  ('negative_score', 'personal_pronouns')

  ('negative_score', 'avg_word_length')

  ('polarity_score', 'percentage_of_complex_words')

  ('polarity_score', 'fog_index')

  ('polarity_score', 'avg_number_of_words')

  ('polarity_score', 'complex_word_count')

  ('polarity_score', 'word_count')

  ('polarity_score', 'syllable_per_word')

  ('polarity_score', 'personal_pronouns')

  ('polarity_score', 'avg_word_length')

  ('subjectivity_score', 'percentage_of_complex_words')

  ('subjectivity_score', 'fog_index')

  ('subjectivity_score', 'avg_number_of_words')

  ('subjectivity_score', 'complex_word_count')

  ('subjectivity_score', 'word_count')

  ('subjectivity_score', 'syllable_per_word')

  ('subjectivity_score', 'avg_word_length')

  ('percentage_of_complex_words', 'avg_number_of_words')

  ('percentage_of_complex_words', 'word_count')

  ('percentage_of_complex_words', 'syllable_per_word')

  ('fog_index', 'complex_word_count')

  ('fog_index', 'word_count')

  ('fog_index', 'syllable_per_word')

  ('fog_index', 'personal_pronouns')

  ('avg_number_of_words', 'complex_word_count')

  ('avg_number_of_words', 'word_count')

  ('avg_number_of_words', 'syllable_per_word')

  ('avg_number_of_words', 'personal_pronouns')

  ('avg_number_of_words', 'avg_word_length')

  ('complex_word_count', 'syllable_per_word')

  ('complex_word_count', 'personal_pronouns')

  ('word_count', 'syllable_per_word')

  ('word_count', 'personal_pronouns')

  ('word_count', 'avg_word_length')

  ('syllable_per_word', 'personal_pronouns')

Low Negative Correlation:

  ('negative_score', 'polarity_score')

  ('polarity_score', 'subjectivity_score')

  ('percentage_of_complex_words', 'personal_pronouns')

  ('personal_pronouns', 'avg_word_length')

Highly Negatively Correlated:

Positive Trends:

  1. Higher complex word count is associated with more positive scores.
  2. Articles with a greater percentage of complex words tend to have longer average word lengths.
  3. As fog index increases, average number of words also increases.
  4. Positive scores have a moderate positive correlation with polarity scores.

Negative Trends:

  1. Negative scores show a moderate positive correlation with complex word count.
  2. Negative scores have a moderate positive correlation with word count.
  3. Negative scores are mildly negatively correlated with polarity scores, and mildly positively correlated with subjectivity scores.
  4. Higher percentage of complex words is associated with lower use of personal pronouns.

No Clear Trends:

  1. There is no strong correlation between positive and negative scores.
  2. No distinct correlation between polarity scores and complexity or subjectivity.
  3. Complex word count and personal pronouns don't show a clear relationship.
  4. No notable link between the fog index or the percentage of complex words and negative scores.
  5. No distinct correlation between complexity features and subjectivity scores.

Miscellaneous:

  1. Subjectivity scores and personal pronouns show a low positive correlation.
  2. Fog index and average word length have a moderate positive correlation.
  3. Complex word count and average word length have a moderate positive correlation.
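
These groupings can be cross-checked against the raw numbers. A small supplementary cell (not in the original notebook) unstacks the upper triangle of the correlation matrix and sorts the strongest pairs by absolute value:

In [ ]:
# Keep only the upper triangle (excluding the diagonal) to avoid duplicate pairs
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

strongest_pairs = upper.stack().sort_values(key=abs, ascending=False)
print(strongest_pairs.head(10).round(2))
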

Word Cloud Based Analysis¶

In [27]:
top_positive_url_ids = output_df.nlargest(10, 'positive_score')['url_id'].tolist()

top_negative_url_ids = output_df.nlargest(10, 'negative_score')['url_id'].tolist()

def get_content_from_file(url_id):
    file_path = os.path.join('Scraped Blogs', f'{url_id}.txt')
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    return content

positive_content = ' '.join([get_content_from_file(url_id) for url_id in top_positive_url_ids])

negative_content = ' '.join([get_content_from_file(url_id) for url_id in top_negative_url_ids])

print('\n\n')
positive_wordcloud = WordCloud(width=1000, height=500, background_color='white').generate(positive_content)
plt.figure(figsize=(10, 5))
plt.imshow(positive_wordcloud, interpolation='bilinear')
plt.title('Word Cloud for Top 10 Positive Articles', y=1.1)
plt.axis('off')
plt.show()

print('\n\n')

negative_wordcloud = WordCloud(width=1000, height=500, background_color='black').generate(negative_content)
plt.figure(figsize=(10, 5))
plt.imshow(negative_wordcloud, interpolation='bilinear')
plt.title('Word Cloud for Top 10 Negative Articles', y=1.1)
plt.axis('off')
plt.show()

[Image: Word cloud for the top 10 positive articles]

[Image: Word cloud for the top 10 negative articles]

Insights:

  • The top 10 positive articles are associated with words like AI, Human, Data, Machine Learning, Job, Work, Artificial Intelligence, Information, Algorithm, Skill and Computer.

  • The top 10 negative articles feature words like People, COVID, India, Pandemic, Government, Time, Financial, Economic, Video Game, China, Crisis, Impact, Health, Disease and Virus.

Both seem reasonable given the period, around 2021, when most of these articles appear to have been published.

In [28]:
all_titles = ' '.join(output_df['title'])

titles_wordcloud = WordCloud(width=1000, height=500, background_color='yellow').generate(all_titles)
plt.figure(figsize=(10, 5))
plt.imshow(titles_wordcloud, interpolation='bilinear')
plt.title('Word Cloud for All Titles', y=1.1)
plt.axis('off')
plt.show()
[Image: Word cloud of all article titles]

Insights:

  • Most articles are written on topics related to COVID / the coronavirus, and on job-related trends such as AI, ML and other career topics.

  • There are a few articles related to Finance, Marketing and Healthcare as well.