import pandas as pd
import numpy as np
# from pprint import pprint
import datetime
import textwrap
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import praw # Reddit API wrapper
import nltk # Natural Language Toolkit (NLTK)
# nltk.download() # grab datasets and models
Data mining
I use `praw` to scrape (all) Reddit posts which mention "Novo Nordisk" (or "NovoNordisk"). Why Reddit? Because their API (`praw`) is easy to use and ✨free✨ (unlike Twitter's, thanks Elon).
As always, it's important to keep in mind where the data is coming from when interpreting any insights derived from it. For example, it should be noted that Reddit users may be a somewhat peculiar community, one that does not represent general "public opinion" very well.
But just for the fun of it, let's take a look at what redditors think of Novo Nordisk...
# Read credentials file (client_id on line 2, client_secret on line 4)
with open('RedditApp_credentials') as f:
    lines = f.readlines()

# Create Reddit client ([:-1] drops the trailing newline)
reddit = praw.Reddit(client_id=lines[1][:-1],
                     client_secret=lines[3][:-1],
                     user_agent='SereDef')
# Small helper: convert a submission's UNIX timestamp to a datetime
def get_date(submission):
    return datetime.datetime.fromtimestamp(submission.created)
# Initiate a set { this way I don't get duplicates when running multiple times }
headlines = set()

# Iterate over "all" subreddits searching for the company name. Note: limit=None means up to 1000 headlines
for sub in reddit.subreddit('all').search(r'Novo Nordisk|NovoNordisk',
                                          limit=None, syntax='cloudsearch',
                                          sort='relevance', time_filter='all'):
    headlines.add((str(sub.title), str(sub.selftext), get_date(sub),
                   str(sub.subreddit), str(sub.author)))

print(len(headlines), 'posts found.')
244 posts found.
# Store in dataframe
hl = pd.DataFrame(headlines).rename(columns={0:'title_orig',1:'text_orig',2:'date',3:'subreddit',4:'author'})
hl.head()
| | title_orig | text_orig | date | subreddit | author |
|---|---|---|---|---|---|
| 0 | Pharma Giant Novo Nordisk To Buy Obesity Drug ... | | 2023-08-10 16:44:31 | Semaglutide | The-Techie |
| 1 | Novo Nordisk Moves to Stop Businesses From Sel... | | 2023-06-22 01:50:09 | IndustrialPharmacy | anonymous-shad0w |
| 2 | [Business] - Novo Nordisk sues clinics alleged... | | 2023-06-20 19:22:01 | AutoNewspaper | AutoNewspaperAdmin |
| 3 | Weight-Loss Drug Wegovy Launches In U.K. As Sh... | | 2023-09-04 12:24:41 | IndustrialPharmacy | anonymous-shad0w |
| 4 | 'Much More Needs to Be Done,' Says Sanders as ... | | 2023-03-15 12:14:12 | politics | newnemo |
# Some headlines are reposted multiple times
# hl.title_orig.value_counts()

# Order by date and drop duplicates (keeping the earliest headline)
hl = hl.sort_values('date').reset_index()
hl = hl.drop_duplicates('title_orig').drop('index', axis='columns')

print(hl.shape[0], 'headlines still in set.')
201 headlines still in set.
from googletrans import Translator
translator = Translator()

# New columns: detected language, English title, English text
hl['language'] = hl['title'] = hl['text'] = ''; count = 0

for i in hl.index:
    # Detect the title language; translate to English if needed
    lang = translator.detect(hl.loc[i, 'title_orig'])
    hl.loc[i, 'language'] = lang.lang
    if lang.lang != 'en':  # print(lang.lang)
        trans = translator.translate(hl.loc[i, 'title_orig'], dest='en').text
        count += 1
    else:
        trans = hl.loc[i, 'title_orig']
    hl.loc[i, 'title'] = trans
    # Same for the post text, skipping empty strings and (while at it) links
    if (hl.loc[i, 'text_orig'] != '') and (hl.loc[i, 'text_orig'][:4] != 'http'):
        lang_sub = translator.detect(hl.loc[i, 'text_orig'])
        if lang_sub.lang != 'en':
            trans_sub = translator.translate(hl.loc[i, 'text_orig'], dest='en').text
        else:
            trans_sub = hl.loc[i, 'text_orig']
        hl.loc[i, 'text'] = trans_sub

print(count, 'headlines translated.')
hl.language.value_counts()
33 headlines translated.
language
en    168
da     17
de      6
no      3
bg      2
sv      2
el      1
fi      1
tr      1
Name: count, dtype: int64
# > `TODO` **data input**: include comments as well. This takes quite a bit longer so I'll tune it later...

# keywz = ['Novo Nordisk', 'NovoNordisk']
# comments = set()
# for comment in reddit.subreddit('all').stream.comments():
#     cbody = comment.body
#     if any(keyword in cbody for keyword in keywz):
#         comments.add((cbody, get_date(comment), str(comment.subreddit)))
# print(len(comments), 'comments extracted.')
Sentiment extraction
I use a rule-based sentiment analyzer from NLTK (VADER). The model attaches a positive or negative rating to certain words (paying attention to negation where present). This tends to work fine, and has the advantage of being simple and extremely fast, but it has some weaknesses (see the sketch after this list):

- Longer sentences usually contain more neutral words, so the overall polarity score tends towards neutral as well
- Sarcasm is often misinterpreted (and so are emojis)
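To make these failure modes concrete, here is a minimal sketch (the sentences are made up for illustration, and the exact scores depend on the VADER lexicon version):

```python
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia_demo = SentimentIntensityAnalyzer()

# Dilution: the same positive phrase, padded with neutral words,
# shifts the neg/neu/pos proportions heavily towards 'neu'
print(sia_demo.polarity_scores('The results are great!'))
print(sia_demo.polarity_scores('The company stated on Tuesday, in its quarterly '
                               'report to shareholders, that the results are great'))

# Sarcasm: 'love' is a positive word, so this sarcastic complaint scores positive
print(sia_demo.polarity_scores('Oh I just love paying this much for insulin'))
```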
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Create Sentiment Intensity Analyzer (SIA)
sia = SentimentIntensityAnalyzer()

def analyse_sentiment(col, df=hl):
    results = []
    for line in df[col]:
        pol_score = sia.polarity_scores(line)  # get the sentiment
        pol_score[col] = line
        results.append(pol_score)  # append sentiment dictionary to results list
    # pprint(results[:2], width=100)
    return pd.DataFrame.from_records(results)  # returns a dataframe

res_headlines = analyse_sentiment('title')
res_text = analyse_sentiment('text')
# > `TODO` **sentiment analysis model**: try less parsimonious models...
The sentiment scoring yields 4 columns:

- `neg`, `neu` and `pos`: the sentiment score percentage of each category
- `compound`: a single sentiment score ranging from -1 (extremely negative) to 1 (extremely positive)
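For example, a single call to the analyzer created above returns all four scores at once (the headline is made up; exact numbers will vary):

```python
print(sia.polarity_scores('Novo Nordisk reports very positive trial results'))
# -> a dict of the form {'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...}
#    where neg + neu + pos sums to ~1.0 and compound lies in [-1, 1]
```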
# Bind together the sentiment scores from headlines and from text, and combine them for a more robust total score
rs = pd.concat([res_headlines, res_text[['compound', 'text']]], axis=1)
# rs now holds two 'compound' columns (one per source), so rs['compound']
# selects both and the row-wise sum adds them together
rs['tot_score'] = rs['compound'].sum(axis=1)

# Also append date, subreddit and author name
rs[['date', 'subreddit', 'author']] = hl[['date', 'subreddit', 'author']]
rs.head()
| | neg | neu | pos | compound | title | compound | text | tot_score | date | subreddit | author |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.208 | 0.792 | 0.000 | -0.2732 | Novo Nordisk to cut insulin prices in the U.S. | 0.0000 | | -0.2732 | 2019-09-06 19:41:05 | UpliftingNews | AldoTheeApache |
| 1 | 0.000 | 0.905 | 0.095 | 0.2500 | Reminder that Eli Lilly, Novo Nordisk, and San... | -0.6842 | I was asked to make a quick post about how the... | -0.4342 | 2019-10-28 18:20:20 | diabetes | cat_attack_ |
| 2 | 0.098 | 0.748 | 0.154 | 0.2960 | TIL Novo Nordisk provides free insulin for peo... | 0.0000 | | 0.2960 | 2020-03-26 20:28:47 | diabetes | cascer1 |
| 3 | 0.101 | 0.692 | 0.208 | 0.4019 | Novo Nordisk is offering free insulin for thos... | 0.0000 | | 0.4019 | 2020-05-13 22:29:12 | freebies | Sweetguy88 |
| 4 | 0.127 | 0.873 | 0.000 | -0.4939 | Novo Nordisk Foundation donates USD 335 millio... | 0.0000 | [New international research center to drive f... | -0.4939 | 2021-12-17 09:57:44 | diabetes | broogernavn |
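A side note on the `tot_score` line above: after the `pd.concat`, `rs` contains two columns both named `compound` (one from the headlines, one from the text), so `rs['compound']` returns both and the row-wise sum adds them. A minimal sketch of this pandas behaviour:

```python
# Two frames sharing a column name
a = pd.DataFrame({'compound': [0.5, -0.2]})
b = pd.DataFrame({'compound': [0.1, -0.3]})
both = pd.concat([a, b], axis=1)  # 'both' now has two 'compound' columns

print(both['compound'].sum(axis=1))  # 0.6 and -0.5: per-row totals across the duplicates
```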
Data visualization
# Plot overall sentiment
f, ax = plt.subplots(figsize=(15, 3))
ax = sns.kdeplot(rs.tot_score, color='silver', fill=True, lw=2)

# Make it prettier
ax.set_xticks(ticks=np.arange(-1, 1.5, 0.5), labels=['Negative', '◂-------', 'Neutral', '-------▸', 'Positive'])
ax.set_xlabel('Sentiment score', fontsize=15, fontweight='bold', labelpad=10)
ax.tick_params(axis='x', labelsize=12, width=0)
ax.set_yticks([]); ax.set_ylabel('Density', fontsize=12, labelpad=10)
ax.set_title('Sentiment distribution', fontsize=20, fontweight='bold', loc='left', color='royalblue', pad=15);
The scores seem somewhat symmetric around 0 (neutral values), but there is a visible little bump / skew towards positive sentiment.
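A quick numeric check can put a number on that skew; a minimal sketch that just prints summary statistics of the total scores:

```python
# Summary statistics of the total sentiment score
print('mean:  ', round(rs.tot_score.mean(), 3))
print('median:', round(rs.tot_score.median(), 3))
print('share of positive scores:', round((rs.tot_score > 0).mean(), 3))
```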
Let's see if the date of posts matters. Below I plot the sentiment scores against time, adding a few temporal marks (e.g., the history of FDA approvals for Wegovy, which I got from here) to see if they help explain any temporal trends...

NOTE: Hover over the datapoints to read the headlines they represent, their date, and the subreddit where they were posted.
# Interactive plot
fig = go.Figure(data=go.Scatter(x=rs.date, y=rs.tot_score, mode='markers',
                                marker=dict(size=10, symbol='circle', color=rs.tot_score,
                                            colorscale=[[0, 'crimson'], [0.5, 'silver'], [1.0, 'blue']],
                                            showscale=True, opacity=.5),
                                # Add hover box
                                hovertext=rs.title.apply(lambda t: "<br>".join(textwrap.wrap(t, width=100))),
                                text=rs.subreddit, hoverlabel=dict(namelength=-1),
                                hovertemplate="""<b>%{hovertext}</b> <br> Date: %{x} <br> Subreddit: %{text} <extra></extra>"""))

# Add some time marks
def time_stamp(y, m, d, text):
    text = text + ' '
    fig.add_vline(datetime.datetime(y, m, d).timestamp() * 1000,  # plotly expects milliseconds
                  line_dash="dash", line_color='black', line_width=1,
                  annotation_text=text, annotation_position='top left', annotation_align='right')

time_stamp(2021, 6, 4, 'FDA <br>approval <br>(1)')
# FDA approves Wegovy (semaglutide) to treat adults with obesity
time_stamp(2022, 12, 23, 'FDA approval (2)')
# FDA approves once-weekly Wegovy injection for the treatment of obesity in teens aged 12 years and older
time_stamp(2023, 8, 8, 'SELECT trial pub.')
# Novo Nordisk A/S: semaglutide 2.4 mg reduces the risk of major adverse cardiovascular events by 20% in adults with overweight or obesity in the SELECT trial

# Make it pretty
fig.update_yaxes(title_text='<b>Sentiment score</b>', range=[-1.5, 1.5], mirror=True, ticks='outside', showline=True, linecolor='black', gridcolor='lightgrey')
fig.update_xaxes(title_text='Date', mirror=True, ticks='outside', showline=True, linecolor='black', gridcolor='lightgrey')
fig.update_layout(plot_bgcolor='whitesmoke', width=1300, height=400, margin=dict(l=10, r=10, t=25, b=10))
There does not seem to be a temporal trend towards positive or negative values; the topic only seems to get more popular after the FDA approvals and the publication of certain trial results.
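One way to double-check the absence of a trend is a rank correlation between posting time and sentiment score. A minimal sketch, assuming `scipy` is available:

```python
from scipy.stats import spearmanr

# Spearman rank correlation between posting time (as ns timestamps) and sentiment;
# a coefficient near 0 supports the "no temporal trend" reading
rho, pval = spearmanr(rs.date.astype('int64'), rs.tot_score)
print(f'Spearman rho = {rho:.3f} (p = {pval:.3f})')
```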
Let's look for subreddit- or redditor-specific trends.

NOTE: Hover over the data points to read the headlines they represent and their Subreddit/Redditor. Hover over the boxplots to get mean, median and range values. The number of headlines collected from each Subreddit/Redditor is noted at the bottom.
# subr = rs.subreddit.value_counts() # how many headlines per subreddit?

def find_active(level):
    rs[f'{level}_counts'] = rs.groupby([level])['title'].transform('count')  # count how many headlines per level
    active_sub = rs.loc[rs[f'{level}_counts'] > 1, ]  # exclude those with only one headline
    active_sub = active_sub.sort_values(by=f'{level}_counts', ascending=False)  # sort
    name = level.capitalize(); by = 'subreddit' if level == 'author' else 'author'
    # Make boxplot
    fig = go.Figure(go.Box(x=active_sub[level], y=active_sub['tot_score'], boxmean=True, boxpoints='all',
                           whiskerwidth=0.5, marker_size=3, line_width=1, showlegend=False,
                           # Add hover box
                           hovertext=active_sub.title.apply(lambda t: "<br>".join(textwrap.wrap(t, width=100))),
                           text=active_sub[by], hoverlabel=dict(namelength=-1),
                           hovertemplate="<b>%{hovertext}</b> <br>By: %{text}<extra></extra>"))
    # Add text noting the number of headlines per group
    for i in active_sub[level].unique():
        fig.add_trace(go.Scatter(x=[i], y=[-1.3], mode='text',
                                 text=int(list(active_sub.loc[active_sub[level] == i, f'{level}_counts'])[0]),
                                 textposition='bottom center', textfont_size=13, showlegend=False,
                                 hovertemplate='Number of headlines by %{x} <extra></extra>'))
    # Aaand make it pretty
    fig.update_yaxes(title_text='<b>Sentiment score</b>', range=[-1.5, 1.5], mirror=True, ticks='outside', showline=True, linecolor='black', gridcolor='lightgrey')
    fig.update_xaxes(title_text=f'<b>{name}</b>', mirror=True, ticks='outside', showline=True, linecolor='black', gridcolor='lightgrey')
    fig.update_layout(plot_bgcolor='whitesmoke', width=1300, height=400, margin=dict(l=10, r=10, t=25, b=10))
    return fig

find_active('subreddit')
Same as above but looking at specific redditors...
find_active('author')
Conclusions
The popularity of "Novo Nordisk" as a Reddit post topic has risen considerably from Jan 2023 onwards. This was true not only for subreddits focused on healthcare (e.g. diabetes, drug development or "biotech"), but also for quite a few finance subreddits.
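That rise can also be tallied directly rather than eyeballed from the interactive plot; a minimal sketch counting posts per calendar month:

```python
# Count headlines per calendar month ('MS' = month start)
monthly = rs.set_index('date').resample('MS').size()
print(monthly.tail(12))
```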
The simple sentiment analysis conducted here seems to imply a general opinion of "Novo Nordisk" that is skewed towards positive values. However, a quick inspection of the datapoints in the interactive graphs reveals that SIA was not always very accurate in estimating the true sentiment of the posts. The scores based on both headlines and post text are better than those extracted from headlines only, but this is still far from satisfactory if you ask me...
Can we do better with this analysis? Do you have any ideas? P.S. Stay tuned for more advanced NLP analysis in the coming weeks! 😊