import pandas as pd
import numpy as np
# from pprint import pprint
import datetime
import textwrap
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import praw # Reddit API wrapper
import nltk # Natural Language Toolkit (NLTK)
# nltk.download() # grab datasets and models
Data mining
I use `praw` to scrape (all) Reddit posts which mention "Novo Nordisk" (or "NovoNordisk"). Why Reddit? Because their API (`praw`) is easy to use and ✨free✨ (unlike Twitter's, thanks Elon).
As always, it's important to keep in mind where the data is coming from when interpreting any insights derived from it. For example, it should be noted that Reddit users may be a somewhat peculiar community, one that does not represent general "public opinion" very well.
But just for the fun of it, let's take a look at what redditors think of Novo Nordisk...
# Read credentials file (client_id on line 2, client_secret on line 4)
with open('RedditApp_credentials') as f:
    lines = f.readlines()

# Create Reddit client ([:-1] drops the trailing newline)
reddit = praw.Reddit(client_id=lines[1][:-1],
                     client_secret=lines[3][:-1],
                     user_agent='SereDef')
# Small helper: convert a submission's UNIX timestamp to a datetime
def get_date(submission):
    return datetime.datetime.fromtimestamp(submission.created)
# Initiate a set { this way I don't get duplicates when running multiple times }
headlines = set()

# Iterate over "all" subreddits searching for the company name. Note: limit=None means up to 1000 headlines
for sub in reddit.subreddit('all').search(r'Novo Nordisk|NovoNordisk',
                                          limit=None, syntax='cloudsearch',
                                          sort='relevance', time_filter='all'):
    headlines.add((str(sub.title), str(sub.selftext), get_date(sub),
                   str(sub.subreddit), str(sub.author)))

print(len(headlines), 'posts found.')
244 posts found.
# Store in dataframe
hl = pd.DataFrame(headlines).rename(columns={0:'title_orig',1:'text_orig',2:'date',3:'subreddit',4:'author'})
hl.head()
| | title_orig | text_orig | date | subreddit | author |
|---|---|---|---|---|---|
| 0 | Pharma Giant Novo Nordisk To Buy Obesity Drug ... | | 2023-08-10 16:44:31 | Semaglutide | The-Techie |
| 1 | Novo Nordisk Moves to Stop Businesses From Sel... | | 2023-06-22 01:50:09 | IndustrialPharmacy | anonymous-shad0w |
| 2 | [Business] - Novo Nordisk sues clinics alleged... | | 2023-06-20 19:22:01 | AutoNewspaper | AutoNewspaperAdmin |
| 3 | Weight-Loss Drug Wegovy Launches In U.K. As Sh... | | 2023-09-04 12:24:41 | IndustrialPharmacy | anonymous-shad0w |
| 4 | 'Much More Needs to Be Done,' Says Sanders as ... | | 2023-03-15 12:14:12 | politics | newnemo |
# Some headlines are reposted multiple times
# hl.title_orig.value_counts()

# Order by date and drop duplicates (keeping the earliest headline)
hl = hl.sort_values('date').reset_index()
hl = hl.drop_duplicates('title_orig').drop('index', axis='columns')

print(hl.shape[0], 'headlines still in set.')
201 headlines still in set.
from googletrans import Translator
translator = Translator()

# New columns: detected language, English title, English text
hl['language'] = hl['title'] = hl['text'] = ''; count = 0

for i in hl.index:
    # Detect the title language; translate to English if needed
    lang = translator.detect(hl.loc[i, 'title_orig'])
    hl.loc[i, 'language'] = lang.lang
    if lang.lang != 'en':  # print(lang.lang)
        trans = translator.translate(hl.loc[i, 'title_orig'], dest='en').text
        count += 1
    else:
        trans = hl.loc[i, 'title_orig']
    hl.loc[i, 'title'] = trans
    # Same for the post text, skipping empty strings and (while at it) links
    if (hl.loc[i, 'text_orig'] != '') and (hl.loc[i, 'text_orig'][:4] != 'http'):
        lang_sub = translator.detect(hl.loc[i, 'text_orig'])
        if lang_sub.lang != 'en':
            trans_sub = translator.translate(hl.loc[i, 'text_orig'], dest='en').text
        else:
            trans_sub = hl.loc[i, 'text_orig']
        hl.loc[i, 'text'] = trans_sub

print(count, 'headlines translated.')
hl.language.value_counts()
33 headlines translated.
language
en    168
da     17
de      6
no      3
bg      2
sv      2
el      1
fi      1
tr      1
Name: count, dtype: int64
# > `TODO` **data input**: include comments as well. This takes quite a bit longer so I'll tune it later...

# keywz = ['Novo Nordisk', 'NovoNordisk']
# comments = set()
# for comment in reddit.subreddit('all').stream.comments():
#     cbody = comment.body
#     if any(keyword in cbody for keyword in keywz):
#         comments.add((cbody, get_date(comment), str(comment.subreddit)))
# print(len(comments), 'comments extracted.')
Sentiment extraction
I use a rule-based sentiment analyzer from NLTK (VADER). The model attaches a positive or negative rating to certain words (paying attention to negation where present). This tends to work fine, and has the advantage of being simple and extremely fast, but it has some weaknesses (see the sketch after this list):

- Longer sentences usually contain more neutral words, so the overall polarity score tends towards neutral as well
- Sarcasm is often misinterpreted (and so are emojis)
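To make these failure modes concrete, here is a minimal sketch (the sentences are made up for illustration, and the exact scores depend on the VADER lexicon version):

```python
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia_demo = SentimentIntensityAnalyzer()

# Dilution: the same positive phrase, padded with neutral words,
# shifts the neg/neu/pos proportions heavily towards 'neu'
print(sia_demo.polarity_scores('The results are great!'))
print(sia_demo.polarity_scores('The company stated on Tuesday, in its quarterly '
                               'report to shareholders, that the results are great'))

# Sarcasm: 'love' is a positive word, so this sarcastic complaint scores positive
print(sia_demo.polarity_scores('Oh I just love paying this much for insulin'))
```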
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Create Sentiment Intensity Analyzer (SIA)
sia = SentimentIntensityAnalyzer()

def analyse_sentiment(col, df=hl):
    results = []
    for line in df[col]:
        pol_score = sia.polarity_scores(line)  # get the sentiment
        pol_score[col] = line
        results.append(pol_score)  # append sentiment dictionary to results list
    # pprint(results[:2], width=100)
    return pd.DataFrame.from_records(results)  # returns a dataframe

res_headlines = analyse_sentiment('title')
res_text = analyse_sentiment('text')
# > `TODO` **sentiment analysis model**: try less parsimonious models...
The sentiment scoring yields 4 columns:

- `neg`, `neu` and `pos`: the sentiment score percentage of each category
- `compound`: a single sentiment score ranging from -1 (extremely negative) to 1 (extremely positive)
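For example, a single call to the analyzer created above returns all four scores at once (the headline is made up; exact numbers will vary):

```python
print(sia.polarity_scores('Novo Nordisk reports very positive trial results'))
# -> a dict of the form {'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...}
#    where neg + neu + pos sums to ~1.0 and compound lies in [-1, 1]
```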
# Bind together the sentiment scores from headlines and from text, and combine them for a more robust total score
rs = pd.concat([res_headlines, res_text[['compound', 'text']]], axis=1)
# rs now holds two 'compound' columns (one per source), so rs['compound']
# selects both and the row-wise sum adds them together
rs['tot_score'] = rs['compound'].sum(axis=1)

# Also append date, subreddit and author name
rs[['date', 'subreddit', 'author']] = hl[['date', 'subreddit', 'author']]
rs.head()
| | neg | neu | pos | compound | title | compound | text | tot_score | date | subreddit | author |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.208 | 0.792 | 0.000 | -0.2732 | Novo Nordisk to cut insulin prices in the U.S. | 0.0000 | | -0.2732 | 2019-09-06 19:41:05 | UpliftingNews | AldoTheeApache |
| 1 | 0.000 | 0.905 | 0.095 | 0.2500 | Reminder that Eli Lilly, Novo Nordisk, and San... | -0.6842 | I was asked to make a quick post about how the... | -0.4342 | 2019-10-28 18:20:20 | diabetes | cat_attack_ |
| 2 | 0.098 | 0.748 | 0.154 | 0.2960 | TIL Novo Nordisk provides free insulin for peo... | 0.0000 | | 0.2960 | 2020-03-26 20:28:47 | diabetes | cascer1 |
| 3 | 0.101 | 0.692 | 0.208 | 0.4019 | Novo Nordisk is offering free insulin for thos... | 0.0000 | | 0.4019 | 2020-05-13 22:29:12 | freebies | Sweetguy88 |
| 4 | 0.127 | 0.873 | 0.000 | -0.4939 | Novo Nordisk Foundation donates USD 335 millio... | 0.0000 | [New international research center to drive f... | -0.4939 | 2021-12-17 09:57:44 | diabetes | broogernavn |
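A side note on the `tot_score` line above: after the `pd.concat`, `rs` contains two columns both named `compound` (one from the headlines, one from the text), so `rs['compound']` returns both and the row-wise sum adds them. A minimal sketch of this pandas behaviour:

```python
# Two frames sharing a column name
a = pd.DataFrame({'compound': [0.5, -0.2]})
b = pd.DataFrame({'compound': [0.1, -0.3]})
both = pd.concat([a, b], axis=1)  # 'both' now has two 'compound' columns

print(both['compound'].sum(axis=1))  # 0.6 and -0.5: per-row totals across the duplicates
```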
Data visualization
# Plot overall sentiment
f, ax = plt.subplots(figsize=(15, 3))
ax = sns.kdeplot(rs.tot_score, color='silver', fill=True, lw=2)

# Make it prettier
ax.set_xticks(ticks=np.arange(-1, 1.5, 0.5), labels=['Negative', '◂-------', 'Neutral', '-------▸', 'Positive'])
ax.set_xlabel('Sentiment score', fontsize=15, fontweight='bold', labelpad=10)
ax.tick_params(axis='x', labelsize=12, width=0)
ax.set_yticks([]); ax.set_ylabel('Density', fontsize=12, labelpad=10)
ax.set_title('Sentiment distribution', fontsize=20, fontweight='bold', loc='left', color='royalblue', pad=15);
The scores seem somewhat symmetric around 0 (neutral values), but there is a visible little bump / skew towards positive sentiment.
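A quick numeric check can put a number on that skew; a minimal sketch that just prints summary statistics of the total scores:

```python
# Summary statistics of the total sentiment score
print('mean:  ', round(rs.tot_score.mean(), 3))
print('median:', round(rs.tot_score.median(), 3))
print('share of positive scores:', round((rs.tot_score > 0).mean(), 3))
```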
Let's see if the date of posts matters. Below I plot the sentiment scores against time, adding a few temporal marks (e.g., the history of FDA approvals for Wegovy, which I got from here) to see if they help explain any temporal trends...

NOTE: Hover over the datapoints to read the headlines they represent, their date, and the subreddit where they were posted.
# Interactive plot
fig = go.Figure(data=go.Scatter(x=rs.date, y=rs.tot_score, mode='markers',
                                marker=dict(size=10, symbol='circle', color=rs.tot_score,
                                            colorscale=[[0, 'crimson'], [0.5, 'silver'], [1.0, 'blue']],
                                            showscale=True, opacity=.5),
                                # Add hover box
                                hovertext=rs.title.apply(lambda t: "<br>".join(textwrap.wrap(t, width=100))),
                                text=rs.subreddit, hoverlabel=dict(namelength=-1),
                                hovertemplate="""<b>%{hovertext}</b> <br> Date: %{x} <br> Subreddit: %{text} <extra></extra>"""))

# Add some time marks
def time_stamp(y, m, d, text):
    text = text + ' '
    fig.add_vline(datetime.datetime(y, m, d).timestamp() * 1000,  # plotly expects milliseconds
                  line_dash="dash", line_color='black', line_width=1,
                  annotation_text=text, annotation_position='top left', annotation_align='right')

time_stamp(2021, 6, 4, 'FDA <br>approval <br>(1)')
# FDA approves Wegovy (semaglutide) to treat adults with obesity
time_stamp(2022, 12, 23, 'FDA approval (2)')
# FDA approves once-weekly Wegovy injection for the treatment of obesity in teens aged 12 years and older
time_stamp(2023, 8, 8, 'SELECT trial pub.')
# Novo Nordisk A/S: semaglutide 2.4 mg reduces the risk of major adverse cardiovascular events by 20% in adults with overweight or obesity in the SELECT trial

# Make it pretty
fig.update_yaxes(title_text='<b>Sentiment score</b>', range=[-1.5, 1.5], mirror=True, ticks='outside', showline=True, linecolor='black', gridcolor='lightgrey')
fig.update_xaxes(title_text='Date', mirror=True, ticks='outside', showline=True, linecolor='black', gridcolor='lightgrey')
fig.update_layout(plot_bgcolor='whitesmoke', width=1300, height=400, margin=dict(l=10, r=10, t=25, b=10))
There does not seem to be a temporal trend towards positive or negative values; the topic only seems to get more popular after the FDA approvals and the publication of certain trial results.
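One way to double-check the absence of a trend is a rank correlation between posting time and sentiment score. A minimal sketch, assuming `scipy` is available:

```python
from scipy.stats import spearmanr

# Spearman rank correlation between posting time (as ns timestamps) and sentiment;
# a coefficient near 0 supports the "no temporal trend" reading
rho, pval = spearmanr(rs.date.astype('int64'), rs.tot_score)
print(f'Spearman rho = {rho:.3f} (p = {pval:.3f})')
```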
Let's look for subreddit- or redditor-specific trends.

NOTE: Hover over the data points to read the headlines they represent and their Subreddit/Redditor. Hover over the boxplots to get mean, median and range values. The number of headlines collected from each Subreddit/Redditor is noted at the bottom.
# subr = rs.subreddit.value_counts() # how many headlines per subreddit?

def find_active(level):
    rs[f'{level}_counts'] = rs.groupby([level])['title'].transform('count')  # count how many headlines per level
    active_sub = rs.loc[rs[f'{level}_counts'] > 1, ]  # exclude those with only one headline
    active_sub = active_sub.sort_values(by=f'{level}_counts', ascending=False)  # sort
    name = level.capitalize(); by = 'subreddit' if level == 'author' else 'author'
    # Make boxplot
    fig = go.Figure(go.Box(x=active_sub[level], y=active_sub['tot_score'], boxmean=True, boxpoints='all',
                           whiskerwidth=0.5, marker_size=3, line_width=1, showlegend=False,
                           # Add hover box
                           hovertext=active_sub.title.apply(lambda t: "<br>".join(textwrap.wrap(t, width=100))),
                           text=active_sub[by], hoverlabel=dict(namelength=-1),
                           hovertemplate="<b>%{hovertext}</b> <br>By: %{text}<extra></extra>"))
    # Add text noting the number of headlines per group
    for i in active_sub[level].unique():
        fig.add_trace(go.Scatter(x=[i], y=[-1.3], mode='text',
                                 text=int(list(active_sub.loc[active_sub[level] == i, f'{level}_counts'])[0]),
                                 textposition='bottom center', textfont_size=13, showlegend=False,
                                 hovertemplate='Number of headlines by %{x} <extra></extra>'))
    # Aaand make it pretty
    fig.update_yaxes(title_text='<b>Sentiment score</b>', range=[-1.5, 1.5], mirror=True, ticks='outside', showline=True, linecolor='black', gridcolor='lightgrey')
    fig.update_xaxes(title_text=f'<b>{name}</b>', mirror=True, ticks='outside', showline=True, linecolor='black', gridcolor='lightgrey')
    fig.update_layout(plot_bgcolor='whitesmoke', width=1300, height=400, margin=dict(l=10, r=10, t=25, b=10))
    return fig

find_active('subreddit')
Same as above but looking at specific redditors...
find_active('author')
Conclusions
The popularity of "Novo Nordisk" as a Reddit post topic has risen considerably from Jan 2023 onwards. This was true not only for subreddits focused on healthcare (e.g. diabetes, drug development or "biotech"), but also for quite a few finance subreddits.
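That rise can also be tallied directly rather than eyeballed from the interactive plot; a minimal sketch counting posts per calendar month:

```python
# Count headlines per calendar month ('MS' = month start)
monthly = rs.set_index('date').resample('MS').size()
print(monthly.tail(12))
```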
The simple sentiment analysis conducted here seems to imply a general opinion of "Novo Nordisk" that is skewed towards positive values. However, a quick inspection of the datapoints in the interactive graphs reveals that SIA was not always very accurate in estimating the true sentiment of the posts. The scores based on both headlines and post text are better than those extracted from headlines only, but this is still far from satisfactory if you ask me...
Can we do better with this analysis? Do you have any ideas? P.S. Stay tuned for more advanced NLP analysis in the coming weeks! 😊