Stock data is from https://finance.yahoo.com/quote/DANSKE.CO
The Top1 to Top8 columns hold news headlines from Berlingske (https://www.berlingske.dk) and work as a ranking of the news; the ranking is Berlingske's own.
"1" when stock price value rise or stays the same
"0" when stock price value decrease.
Data from '2019-11-05 16:45:06' to '2020-01-17 17:05:06' is used for modelling. The train/test split is 80/20.
The evaluation metric is AUC.
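As a minimal sketch of this setup (the toy price series and the model scores below are placeholders, not data from the notebook), the labelling rule, the chronological 80/20 split and the AUC computation could look as follows:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
# Toy price series as a stand-in for the scraped Danske Bank prices (placeholder values).
prices = pd.Series([208.0, 209.0, 208.0, 210.0, 209.0, 211.0, 210.0, 212.0, 211.0, 213.0])
# Label: 1 when the next observed price rises or stays the same, 0 when it decreases.
labels = (prices.shift(-1) >= prices).astype(int).iloc[:-1]
# Chronological 80/20 train/test split.
split = int(len(labels) * 0.8)
train_labels, test_labels = labels.iloc[:split], labels.iloc[split:]
# AUC on hypothetical model scores for the held-out rows (placeholder scores).
test_scores = np.array([0.3, 0.7])
print(roc_auc_score(test_labels, test_scores))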
# nlp_notebook.ipynb
# Assumes python vers. 3.6
# __author__ = 'mizio'
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import csv as csv
import numpy as np
import pandas as pd
import pylab as plt
from wordcloud import WordCloud,STOPWORDS
from datetime import date
import datetime
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sys import path
import os
from os.path import dirname, abspath
path_of_doc = dirname(dirname(abspath(os.path.split(os.getcwd())[0])))
path_to_ml_lib = path_of_doc + '/machine_learning_modules/'
if path_to_ml_lib not in path:
    path.append(path_to_ml_lib)
from datamunging.explore import *
from machine_learning_estimators.tensorflow_NN import *
# Load data
#file_name = 'berlingske_scraped_data2020-01-18_13:00:21.197992.csv'
#file_name = 'berlingske_scraped_data2020-01-21_21:08:45.439396.csv'
file_name = 'berlingske_scraped_data2020-01-21_22:27:41.064382.csv'
df = pd.read_csv('../CorrelationDataDanskeBankBerlingske/' + file_name, header=0)
count = df.count()[0]
df.info()
# clean for rows with null
df = df.dropna()
df.info()
df.head(-1)
#file_name_danske = 'danske_bank_scraped_data2020-01-18_13:00:21.360295.csv'
#file_name_danske = 'danske_bank_scraped_data2020-01-21_21:08:45.914529.csv'
file_name_danske = 'danske_bank_scraped_data2020-01-21_22:27:41.902021.csv'
df_djia = pd.read_csv('../CorrelationDataDanskeBankBerlingske/' + file_name_danske, header=0)
df_djia.info()
df_djia.head()
Compare the timestamps of the stock prices with those of the Berlingske news.
df.TimeStamp.head()
df_djia.TimeStamp.head()
Align timestamps of Berlingske news media with open market hours for stock prices.
def is_during_market_hours(timestamp):
    # Keep only timestamps on weekdays between 08:50 and 17:10 (open market hours with a small margin).
    current_danish_time = timestamp
    open_hour = datetime.datetime(current_danish_time.year, current_danish_time.month, current_danish_time.day, hour=8, minute=50)
    close_hour = datetime.datetime(current_danish_time.year, current_danish_time.month, current_danish_time.day, hour=17, minute=10)
    return (timestamp > open_hour) & (timestamp < close_hour) & (timestamp.weekday() < 5)
def timestamp_format_to_datetime(df):
    df.loc[:, "TimeStamp"] = df['TimeStamp'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))
    return df
def prepare_timestamp_open_market_hours(df, start_time=None, end_time=None):
    if start_time is not None and end_time is not None:
        filtering = df["TimeStamp"].apply(lambda x: is_during_market_hours(x) & (x >= start_time) & (x <= end_time))
    else:
        filtering = df["TimeStamp"].apply(lambda x: is_during_market_hours(x))
    df.sort_values("TimeStamp", inplace=True)
    df.where(filtering, inplace=True)
    df = df.dropna()
    return df
df = timestamp_format_to_datetime(df)
df = prepare_timestamp_open_market_hours(df)
count = df.shape[0]
start_time = df.TimeStamp[df.index[0]]
end_time = df.TimeStamp[df.index[-1]]
#df = prepare_timestamp_open_market_hours(df, start_time, end_time)
training_test_ratio = 0.8
df_train = df.iloc[:int(count*training_test_ratio), :].copy()
df_test = df.iloc[int(count*training_test_ratio):, :].copy()
df_test.shape, df_train.shape
start_time, end_time
df_djia = timestamp_format_to_datetime(df_djia)
df_djia = prepare_timestamp_open_market_hours(df_djia, start_time, end_time)
df_djia.TimeStamp.head()
df.TimeStamp.head()
A start timestamp should be provided here, since the stock price data was scraped from an earlier date than the news data.
In the following cells a Label column with values (0, 1) defines a two-class classification problem. The news headlines appear in columns Top1 through Top8, which act as a ranking of the news. The task is to identify which rows hold information that can be connected to a change in the stock price.
df_djia['TimeStamp'].iloc[:10]
chronological_days = df['TimeStamp'].apply(lambda x: x.minute)
chronological_days_djia = df_djia['TimeStamp'].apply(lambda x: x.minute)
plt.figure()
plt.plot(chronological_days.values, "o")
plt.plot(chronological_days_djia.values, "+")
plt.show()
plt.close()
chronological_days = df['TimeStamp']
chronological_days_djia = df_djia['TimeStamp']
# Index the observations of each dataframe for plotting
index_df = np.arange(chronological_days.shape[0])
index_djia = np.arange(chronological_days_djia.shape[0])
plt.figure()
plt.plot(index_df, chronological_days.values, "o")
plt.plot(index_djia, chronological_days_djia.values, "+")
plt.show()
plt.close()
This shows that the dates of the financial and news data are correctly aligned, and that the data covers only open market hours, as expected.
Prepare the data such that all words in a row end up in a list of single-word elements.
df_train.head(1)
# Print Top news headline
ranking_news = 1
news_index = 1 + ranking_news
example_top_news = df_train.iloc[0, news_index]
print(example_top_news)
example_top_news.lower()
print(example_top_news.lower())
Clean the phrase for abbreviations, punctuation and other non-word parts. This could probably be optimized for the Danish language (see the sketch after the next cell).
headline_words_as_vector = CountVectorizer().build_tokenizer()(example_top_news.lower())
print(CountVectorizer().build_tokenizer()(example_top_news.lower()))
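As a hedged illustration of tailoring this step to Danish, CountVectorizer accepts a custom preprocessor and token_pattern; the small abbreviation map and the pattern below are illustrative choices, not something defined elsewhere in the notebook.
# Illustrative Danish abbreviation map; the entries are examples only.
abbreviations = {'bl.a.': 'blandt andet', 'f.eks.': 'for eksempel'}
def danish_preprocessor(text):
    # Lowercase and expand a few common Danish abbreviations before tokenizing.
    text = text.lower()
    for short, full in abbreviations.items():
        text = text.replace(short, full)
    return text
# The token pattern also keeps one-letter Danish words such as 'i' and 'å'.
danish_vectorizer = CountVectorizer(preprocessor=danish_preprocessor, token_pattern=r"(?u)\b\w+\b")
print(danish_vectorizer.build_analyzer()(example_top_news))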
Build a new dataframe with the words and their corresponding counts.
pd.DataFrame([[x, headline_words_as_vector.count(x)] for x in set(headline_words_as_vector)], columns=["word", "word_count"])
Instead of taking only one news headline, append all eight news headlines into one string of words and make a count.
headline_count = 8
all_headline_words_as_vector = ''
for ranking_news in range(1, headline_count + 1):
    news_index = 1 + ranking_news
    top_news = df_train.iloc[0, news_index]
    if ranking_news == 1:
        all_headline_words_as_vector = top_news
    else:
        all_headline_words_as_vector = ' '.join([all_headline_words_as_vector, top_news])
print(all_headline_words_as_vector)
all_headline_words_as_vector = CountVectorizer().build_tokenizer()(all_headline_words_as_vector.lower())
pd.DataFrame([[x, all_headline_words_as_vector.count(x)] for x in set(all_headline_words_as_vector)], columns=["word", "word_count"])
Notice that some words do not look Danish, which may be due to the web-scraping output format; some words may have been merged with others. One could identify this group of words and clean the data by mapping them back to their proper form, as sketched below.
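A hedged way to flag such artifacts, for example headlines merged into one overly long token; the 25-character cutoff is an arbitrary illustration, not a value used elsewhere in the notebook.
# Flag tokens that look like scraping artifacts, e.g. several words merged together.
# The length cutoff of 25 characters is an illustrative choice.
suspicious_tokens = sorted({token for token in all_headline_words_as_vector if len(token) > 25})
print(suspicious_tokens[:20])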
def prepare_data(df):
    # Concatenate the Top1-Top8 headlines of each row into one string of words.
    training_data_rows = []
    for row in range(0, df.shape[0]):
        all_headline_words_as_vector = ''
        for ranking_news in range(1, headline_count + 1):
            news_index = 1 + ranking_news
            top_news = df.iloc[row, news_index]
            if ranking_news == 1:
                all_headline_words_as_vector = str(top_news)
            else:
                all_headline_words_as_vector = ' '.join([all_headline_words_as_vector, str(top_news)])
        training_data_rows.append(all_headline_words_as_vector)
    return training_data_rows
training_data_rows = prepare_data(df_train)
test_data = prepare_data(df_test.iloc[:-1,:])
all_data_rows = []
all_data_rows.extend(test_data)
all_data_rows.extend(training_data_rows)
len(training_data_rows), len(test_data), len(all_data_rows)
Create a count column for each of the words appearing
# Define corpus for text input
corpus = all_data_rows
count_vectorizer = CountVectorizer()
# Inserts the corpus in vectorizer.
fit_matrix = count_vectorizer.fit(all_data_rows)
training_data_transformed = count_vectorizer.transform(training_data_rows)
print(training_data_transformed.shape)
#training_data_transformed[100:101]
#Todo: check that sparse matrix is correct since wordcloud shows all 2-word combination when it should only be 1-word.
dense_matrix = training_data_transformed.todense()
np.where(dense_matrix)
A quick comparison of sizes: if every row contributed 90 new words, the vocabulary would be about 90 * 4080 ≈ 367,000 words, but we expect far fewer since not all words are new, and the shape above indeed shows a vocabulary of only 8927 words.
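This can also be checked directly on the sparse matrix: nnz is the number of stored non-zero entries and vocabulary_ is the fitted word-to-column mapping (a quick sketch using the variables defined above).
# Compare the number of non-zero entries against the full dense size of the matrix.
n_rows, n_cols = training_data_transformed.shape
print('vocabulary size:', len(count_vectorizer.vocabulary_))
print('non-zero entries:', training_data_transformed.nnz)
print('fraction filled:', training_data_transformed.nnz / (n_rows * n_cols))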
training_data_rows[:3]
#type(training_data_transformed)
print(training_data_transformed)
print(count_vectorizer.get_feature_names()[1000:1400])
Put the words into a word cloud and show the most occurring words by size.
text = " ".join(all_heads for all_heads in training_data_rows)
#text = " ".join(all_heads for all_heads in training_data_rows[:4])
print(len(text))
#print(text)
wordcloud = WordCloud(stopwords=STOPWORDS,
                      background_color='white',
                      width=2500,
                      max_words=100,
                      height=2000
                      ).generate(text)
plt.figure(1,figsize=(13, 13))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
plt.close()
Two-word combinations may appear due to recurrences of exact phrases in the data, which indicates that some words always have the same neighbours. This Wordcloud feature is independent of how we set the n-gram parameter in the following cells.
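If the two-word pairs are unwanted in the cloud itself, the wordcloud library exposes a collocations flag (assuming a wordcloud version that supports it); turning it off keeps single words only, independent of the CountVectorizer settings. A sketch reusing the text variable from above:
# Disable WordCloud's built-in bigram ("collocations") detection so the cloud
# shows single words only; everything else mirrors the cell above.
wordcloud_single = WordCloud(stopwords=STOPWORDS,
                             background_color='white',
                             width=2500,
                             max_words=100,
                             height=2000,
                             collocations=False).generate(text)
plt.figure(figsize=(13, 13))
plt.imshow(wordcloud_single)
plt.axis('off')
plt.show()
plt.close()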
# The following mapping will be used for correlation modelling between Berlingske and Danske Bank stock price
# If price increase or const => 1
# If price decrease => 0
following_day_prices = pd.Series(df_djia.StockPrice[1:].values)
current_day_prices = pd.Series(df_djia.StockPrice[:-1].values)
current_day_prices.shape,following_day_prices.shape
# 1 when the next price is greater than or equal to the current price, matching the mapping above.
logical = (following_day_prices >= current_day_prices).astype(int)
Simple cheat model where the Label column is set manually, independent of the stock price movement.
if False:
    df_train["Label"] = 1
    df_train.loc[:10, "Label"] = 0
    df_test["Label"] = 1
    df_test.loc[:df_test.index[0] + 100, "Label"] = 0
logical[:10]
# correct model
#del df_train["Label"]
#del df_test["Label"]
#df_test.loc[:, :] = df_test.iloc[:-1, :]
logical[:df_train.shape[0]].values.shape, df_train.shape, logical[df_train.shape[0]:].values.shape, df_test.shape
# Difference between the number of remaining labels and the number of test rows.
diff_logical = logical[df_train.shape[0]:].values.shape[0] - df_test.shape[0]
count_df = df_train.shape[0]
df_train = df_train.iloc[:, :]
df_train["Label"] = logical[:count_df].values
# Trim df_test so its length matches the remaining labels (assumes diff_logical is negative).
df_test = df_test.iloc[:diff_logical, :]
df_test.loc[:, "Label"] = logical[df_train.shape[0]:].values
logical[:df_train.shape[0]].values.shape, df_train.shape, logical[df_train.shape[0]:].values.shape, df_test.shape
Use a simple Logistic Regression model for the two-class classification problem
log_reg_model = LogisticRegression(solver='liblinear')
#log_reg_model = LogisticRegression(solver='lbfgs')
log_reg_model = log_reg_model.fit(training_data_transformed, df_train.Label)
# test_data = prepare_data(df_test)
The number of feature columns is the same for both training and test data, as shown below by the shape of the transformed test data. Only a transform is applied to the test data, not a fit.
test_data_transformed = count_vectorizer.transform(test_data[:-1])
print(test_data_transformed.shape)
print(test_data_transformed)
predict = log_reg_model.predict(test_data_transformed)
test_data_transformed.shape
# Check and correct if shape mismatch.
#print(predict)
df_test.Label.shape, predict.shape
Confusion matrix that explores true and false positives
confusion_matrix_mod(df_test.Label, predict)
save_path = '../plots/confusion_matrix_logreg'
multipage(''.join([save_path, '.pdf']))
plt.show()
plt.close()
Notice that the true positive rate, 'True Pos'/'Num Pos' = 0.75, may at first sight seem good but actually is not. Ideally the confusion matrix should not have a majority of false negatives and false positives, as is the case in the above result with its high values in the off-diagonal fields.
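AUC was stated as the evaluation metric at the top of the notebook but is not reported by confusion_matrix_mod; a minimal sketch of adding it here with scikit-learn's roc_auc_score, using the predicted probability of the positive class (this assumes df_test.Label contains both classes, otherwise roc_auc_score raises an error):
from sklearn.metrics import roc_auc_score
# Probability of class 1 for each test row; the column order follows log_reg_model.classes_.
predicted_probabilities = log_reg_model.predict_proba(test_data_transformed)[:, 1]
print('Logistic regression AUC:', roc_auc_score(df_test.Label, predicted_probabilities))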
Check that the top 10 coefficients used in the model match some of the words from the word cloud.
def feature_coefficients(count_vectorizer, log_reg_model):
    feature_names = count_vectorizer.get_feature_names()
    model_coefficients = log_reg_model.coef_.tolist()[0]
    coefficients_in_dataframe = pd.DataFrame({'feature': feature_names, 'Coefficient': model_coefficients})
    coefficients_in_dataframe = coefficients_in_dataframe.sort_values(['Coefficient', 'feature'], ascending=[0, 1])
    return coefficients_in_dataframe
coefficients_in_dataframe = feature_coefficients(count_vectorizer, log_reg_model)
coefficients_in_dataframe.head(10)
coefficients_in_dataframe.tail(10)
# prepare data for xgboost
#print(type(training_data_transformed))
#print(training_data_transformed.shape)
#print(training_data_transformed)
#print(test_data_transformed.shape)
#xgb_train_x, xgb_train_y, xgb_test = select_feats_of_testdata(df, df_test, 'Label')
training_data_transformed.shape, test_data_transformed.shape, df_train.Label.shape
output = xgboost(training_data_transformed.todense(), df_train.Label, test_data_transformed.todense())
prediction_binary = (output > 0.5).astype(int)
prediction_binary
confusion_matrix_mod(df_test.Label, prediction_binary)
save_path = '../plots/confusion_matrix_xgboost'
multipage(''.join([save_path, '.pdf']))
plt.show()
plt.close()
Again the off-diagonal counts are not good, with many false positives.
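The local xgboost helper apparently returns a raw score per test row (it is thresholded at 0.5 above), so the same AUC check can be applied to it; a sketch assuming output holds one score per row of the test set:
from sklearn.metrics import roc_auc_score
# 'output' holds the raw xgboost scores before the 0.5 threshold is applied.
print('XGBoost AUC:', roc_auc_score(df_test.Label, np.ravel(output)))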
print(training_data_transformed[:,:-2].shape), print(test_data_transformed[:,:-2].shape)
# n=2 vectorizer
count_vectorizer_n2 = CountVectorizer(ngram_range=(2,2))
fit_matrix_n2 = count_vectorizer_n2.fit(all_data_rows)
train_data_transformed_n2 = count_vectorizer_n2.transform(training_data_rows)
train_data_transformed_n2.shape
The number of feature columns has increased sharply from 8927 to 28548, roughly a factor of 3.
log_reg_model_n2 = LogisticRegression()
log_reg_model_n2 = log_reg_model_n2.fit(train_data_transformed_n2, df_train.Label)
feature_coefficients_df = feature_coefficients(count_vectorizer_n2, log_reg_model_n2)
feature_coefficients_df.head(10)
feature_coefficients_df.tail(10)
test_data_transformed_n2 = count_vectorizer_n2.transform(test_data[:-1])
test_data_transformed_n2.shape
predict_n2 = log_reg_model_n2.predict(test_data_transformed_n2)
Confusion matrix that explores true and false positives
confusion_matrix_mod(df_test.Label, predict_n2)
save_path = '../plots/confusion_matrix_logreg_n2'
multipage(''.join([save_path, '.pdf']))
plt.show()
plt.close()
The off-diagonal counts are still not good, with many false positives.
output = xgboost(train_data_transformed_n2.todense(), df_train.Label, test_data_transformed_n2.todense())
prediction_binary = (output > 0.5).astype(int)
confusion_matrix_mod(df_test.Label, prediction_binary)
save_path = '../plots/confusion_matrix_xgboost_n2'
multipage(''.join([save_path, '.pdf']))
plt.show()
plt.close()
prediction_binary
The models obtained are not good enough. The four models are logistic regression and XGBoost applied to 1-gram and 2-gram prepared data. Only logistic regression applied to the 1-gram transformed data may have a microscopic chance. Common to all the models is that they set all Label values to 1, which may result in good accuracy, but the number of false positives is very high, meaning the models are oversimplified. A better model could probably be trained on data where simple non-relevant words ('er', 'en', 'jeg', 'har') have been removed, even though they occur frequently in the data set. Indeed, the WordCloud shows a high occurrence of words that may appear in any phrase and are therefore not particularly connected to movements of the Danske Bank stock price. A fast approach would be to remove the top X most frequent words, assuming they are non-relevant; a sketch of this follows below.
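As a hedged sketch of that last suggestion: CountVectorizer's stop_words and max_df parameters can drop the hand-picked filler words and any word appearing in more than half of the rows. The 0.5 cutoff is an illustrative choice; the rest reuses the variables and the confusion_matrix_mod helper defined above.
# Drop the hand-picked filler words and any word occurring in more than 50% of the rows.
danish_filler_words = ['er', 'en', 'jeg', 'har']
count_vectorizer_filtered = CountVectorizer(stop_words=danish_filler_words, max_df=0.5)
count_vectorizer_filtered.fit(all_data_rows)
training_data_filtered = count_vectorizer_filtered.transform(training_data_rows)
test_data_filtered = count_vectorizer_filtered.transform(test_data[:-1])
log_reg_model_filtered = LogisticRegression(solver='liblinear')
log_reg_model_filtered = log_reg_model_filtered.fit(training_data_filtered, df_train.Label)
predict_filtered = log_reg_model_filtered.predict(test_data_filtered)
confusion_matrix_mod(df_test.Label, predict_filtered)
plt.show()
plt.close()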