What is the data about?
The stock data is the Dow Jones Industrial Average (DJIA): https://finance.yahoo.com/quote/%5EDJI/history?p=%5EDJI
The news data is the Top25 headlines from the Reddit WorldNews Channel (/r/worldnews), ranked by reddit users' votes on a single date, so together they work as a ranking of global news. https://www.reddit.com/r/worldnews?hl
The Label is "1" when the DJIA Adj Close value rises or stays the same,
and "0" when the DJIA Adj Close value decreases.
Use data from 2008-08-08 to 2014-12-31 as training data and data from 2015-01-02 to 2016-07-01 as test data; this is roughly an 80/20 split.
AUC is the evaluation metric.
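As a minimal sketch of how AUC could be computed with scikit-learn (shown here on toy values, not on objects from this notebook):
# Minimal AUC sketch with toy values (roc_auc_score from scikit-learn).
from sklearn.metrics import roc_auc_score
y_true = [1, 0, 1, 1, 0]              # toy labels
y_score = [0.8, 0.3, 0.6, 0.9, 0.4]   # toy predicted probabilities for class "1"
print("AUC:", roc_auc_score(y_true, y_score))  # 1.0 for this perfectly ranked toy example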
# nlp_notebook.ipynb
# Assumes python vers. 3.6
# __author__ = 'mizio'
import csv
import numpy as np
import pandas as pd
import pylab as plt
from wordcloud import WordCloud,STOPWORDS
from datetime import date
import datetime
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sys import path
import os
from os.path import dirname, abspath
path_of_doc = dirname(dirname(abspath(os.path.split(os.getcwd())[0])))
path_to_ml_lib = path_of_doc + '/machine_learning_modules/'
if path_to_ml_lib not in path:
    path.append(path_to_ml_lib)
from datamunging.explore import *
from machine_learning_estimators.tensorflow_NN import *
# Load data
df = pd.read_csv('../stocknews/Combined_News_DJIA.csv', header=0)
df_train = df[df.Date < '2015-01-01']
df_test = df[df.Date >= '2015-01-02']
df.info()
# Drop rows containing null values
df = df.dropna()
df.info()
df.head()
Observe that the Label column with values (0, 1) makes this a two-class classification problem. The news headlines are in columns Top1 through Top25 and work as a ranking of global news. The problem is to identify which rows hold information that can be connected to the stock price.
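As a quick sanity check on how balanced the two classes are, a small sketch (not part of the original notebook) using the dataframe loaded above:
# Class balance of the target (sketch); a strong imbalance would make accuracy misleading.
print(df['Label'].value_counts())
print(df['Label'].value_counts(normalize=True))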
# Visualize whether the news stream only covers bank (trading) days.
# Set start pt. to 2008-08-08 and convert all datetimes to days.
start_date = datetime.datetime.strptime("2008-08-08", "%Y-%m-%d")
chronological_days = df['Date'].apply(lambda x: (datetime.datetime.strptime(x, "%Y-%m-%d") - start_date).days)
# print(chronological_days)
plt.figure()
plt.plot(chronological_days, ".")  # calendar days since the start date, one point per row
plt.plot(chronological_days.index)  # the row index, for comparison with the calendar days
plt.show()
plt.close()
This shows that the row index counts slower than the calendar (bank) days, which is to be expected. E.g. the first 5 trading days get enumerated 1-5, but the weekend then causes the following Monday (calendar day 8) to get an index value of only 6.
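The same observation can be checked numerically from the gaps between consecutive rows; a small sketch reusing `chronological_days` from above:
# Gap in calendar days between consecutive rows (sketch):
# 1 = regular weekday, 3 = weekend, larger values are holidays.
gaps = chronological_days.diff().dropna().astype(int)
print(gaps.value_counts().sort_index())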
df_djia = pd.read_csv('../stocknews/DJIA_table.csv', header=0)
df_djia.info()
df_djia.head(2)
Prepare the data such that all words in a row go into a list of single-word elements.
df_train.head(1)
# Print Top news headline
ranking_news = 4
news_index = 1 + ranking_news  # columns: 0 = Date, 1 = Label, so TopN sits at column N + 1
example_top_news = df_train.iloc[0, news_index]
print(example_top_news)
print(example_top_news[2:-1])  # strip the b'...' byte-string wrapper around the headline
example_top_news.lower()
print(example_top_news.lower())
Clean the phrase of abbreviations, punctuation, and other non-relevant parts.
headline_words_as_vector = CountVectorizer().build_tokenizer()(example_top_news.lower())
print(CountVectorizer().build_tokenizer()(example_top_news.lower()))
Build a new dataframe with the words and their corresponding counts.
pd.DataFrame([[x, headline_words_as_vector.count(x)] for x in set(headline_words_as_vector)], columns=["word", "word_count"])
Instead of taking only one news headline, append all 25 news headlines into one list of words and make a count.
all_headline_words_as_vector = ''
for ranking_news in range(1, 26):
    news_index = 1 + ranking_news
    top_news = df_train.iloc[0, news_index]
    all_headline_words_as_vector = ' '.join([all_headline_words_as_vector, top_news[2:-1]])
print(all_headline_words_as_vector)
all_headline_words_as_vector = CountVectorizer().build_tokenizer()(all_headline_words_as_vector.lower())
pd.DataFrame([[x, all_headline_words_as_vector.count(x)] for x in set(all_headline_words_as_vector)], columns=["word", "word_count"])
def prepare_data(df):
    training_data_rows = []
    for row in range(0, df.shape[0]):
        all_headline_words_as_vector = ''
        for ranking_news in range(1, 26):
            news_index = 1 + ranking_news
            top_news = df.iloc[row, news_index]
            all_headline_words_as_vector = ' '.join([all_headline_words_as_vector, str(top_news)[2:-1]])
        training_data_rows.append(all_headline_words_as_vector)
    return training_data_rows
training_data_rows = prepare_data(df_train)
print(len(training_data_rows))
Create a count column for each of the words that appear.
count_vectorizer = CountVectorizer()
training_data_transformed = count_vectorizer.fit_transform(training_data_rows)
print(training_data_transformed.shape)
A quick comparison of size: if each row contributed 230 new words, the total vocabulary would be 230 * 1611 ≈ 370,000, but we expect fewer since not all words will be new.
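The estimate can be compared with the actual vocabulary size from the fitted vectorizer; a small sketch using the objects defined above:
# Compare the rough upper bound with the actual number of feature columns (sketch).
estimated_upper_bound = 230 * training_data_transformed.shape[0]
actual_vocabulary_size = training_data_transformed.shape[1]
print(estimated_upper_bound, actual_vocabulary_size)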
type(training_data_transformed)
print(training_data_transformed)
print(count_vectorizer.get_feature_names()[1000:1400])
Put the words into a word cloud and show the most frequently occurring words by size.
text = " ".join(all_heads for all_heads in training_data_rows)
print(len(text))
wordcloud = WordCloud(stopwords=STOPWORDS,
                      background_color='white',
                      width=2500,
                      max_words=100,
                      height=2000
                      ).generate(text)
plt.figure(1,figsize=(13, 13))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
plt.close()
Use a simple logistic regression model for the two-class classification problem.
log_reg_model = LogisticRegression()
log_reg_model = log_reg_model.fit(training_data_transformed, df_train.Label)
Prepare the test data. One minor concern is new words appearing in the test data but not in the training data. How will the trained model account for new words? It will not: the vectorizer only counts words corresponding to the feature columns of the training data, and any unseen words are ignored.
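A tiny toy example, separate from the notebook's data, illustrates this behaviour of `CountVectorizer`:
# Toy sketch: words unseen during fit are simply ignored at transform time.
toy_vectorizer = CountVectorizer()
toy_vectorizer.fit(["stocks rise today", "markets fall today"])
unseen = toy_vectorizer.transform(["stocks crash today"])  # "crash" was never seen
print(toy_vectorizer.get_feature_names())  # no "crash" column
print(unseen.toarray())                    # counts only for the known words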
test_data = prepare_data(df_test)
The number of feature columns is the same for the training and test data, as shown below by the shape of the transformed test data. Only a transform is applied to the test data, not a fit.
test_data_transformed = count_vectorizer.transform(test_data)
print(test_data_transformed.shape)
print(test_data_transformed)
predict = log_reg_model.predict(test_data_transformed)
Confusion matrix that explores true and false positives
confusion_matrix_mod(df_test.Label, predict)
save_path = '../plots/confusion_matrix_logreg'
multipage(''.join([save_path, '.pdf']))
plt.show()
plt.close()
Notice that the rate of hitting a true positive is 'True Pos'/'Num Pos' = 0.47, which is a pretty poor result, worse than flipping a coin. Ideally the confusion matrix should not have a majority of false negatives and false positives, as is the case in the result above with high values in the off-diagonal fields.
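The same rate can be cross-checked with scikit-learn's standard metrics; a sketch (`confusion_matrix_mod` above comes from the local helper module, this uses the library functions directly):
# Cross-check the true positive rate and accuracy with standard sklearn metrics (sketch).
from sklearn.metrics import confusion_matrix, accuracy_score
tn, fp, fn, tp = confusion_matrix(df_test.Label, predict).ravel()
print("true positive rate:", tp / (tp + fn))
print("accuracy:", accuracy_score(df_test.Label, predict))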
Check that the top 10 coefficients used in the model match some of the words from the word cloud.
def feature_coefficients(count_vectorizer, log_reg_model):
    feature_names = count_vectorizer.get_feature_names()
    model_coefficients = log_reg_model.coef_.tolist()[0]
    coefficients_in_dataframe = pd.DataFrame({'feature': feature_names, 'Coefficient': model_coefficients})
    coefficients_in_dataframe = coefficients_in_dataframe.sort_values(['Coefficient', 'feature'], ascending=[0, 1])
    return coefficients_in_dataframe
coefficients_in_dataframe = feature_coefficients(count_vectorizer, log_reg_model)
coefficients_in_dataframe.head(10)
coefficients_in_dataframe.tail(10)
# prepare data for xgboost
print(type(training_data_transformed))
print(training_data_transformed.shape)
print(training_data_transformed)
print(test_data_transformed.shape)
#xgb_train_x, xgb_train_y, xgb_test = select_feats_of_testdata(df, df_test, 'Label')
output = xgboost(training_data_transformed[:,:-2], df_train.Label, test_data_transformed[:,:-2])
prediction_binary = (output > 0.5).astype(int)
confusion_matrix_mod(df_test.Label, prediction_binary)
save_path = '../plots/confusion_matrix_xgboost'
multipage(''.join([save_path, '.pdf']))
plt.show()
plt.close()
print(training_data_transformed[:, :-2].shape)
print(test_data_transformed[:, :-2].shape)
# n=2 vectorizer
count_vectorizer_n2 = CountVectorizer(ngram_range=(2,2))
train_data_transformed_n2 = count_vectorizer_n2.fit_transform(training_data_rows)
train_data_transformed_n2.shape
The number of feature columns has increased sharply, from 37014 to 377601, roughly a factor of 10.
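One way to curb this growth, not used in the results below but sketched here, is to require each bigram to occur in at least two documents via the `min_df` parameter:
# Sketch: prune rare bigrams with min_df; most bigrams occur in only one document.
count_vectorizer_n2_pruned = CountVectorizer(ngram_range=(2, 2), min_df=2)
train_transformed_n2_pruned = count_vectorizer_n2_pruned.fit_transform(training_data_rows)
print(train_transformed_n2_pruned.shape)  # typically far fewer columns than 377601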
log_reg_model_n2 = LogisticRegression()
log_reg_model_n2 = log_reg_model_n2.fit(train_data_transformed_n2, df_train.Label)
feature_coefficients_df = feature_coefficients(count_vectorizer_n2, log_reg_model_n2)
feature_coefficients_df.head(10)
feature_coefficients_df.tail(10)
test_data_transformed_n2 = count_vectorizer_n2.transform(test_data)
predict_n2 = log_reg_model_n2.predict(test_data_transformed_n2)
Confusion matrix that explores true and false positives
confusion_matrix_mod(df_test.Label, predict_n2)
save_path = '../plots/confusion_matrix_logreg_n2'
multipage(''.join([save_path, '.pdf']))
plt.show()
plt.close()