Top 8 news headlines, working as a ranking of global news. Uses one week of data, 2019-07-04 to 2019-07-11, from the Reddit WorldNews Channel (/r/worldnews), ranked by Reddit users' votes on a single date. https://www.reddit.com/r/worldnews?hl
# nlp_notebook.ipynb
# Assumes Python 3.6
# __author__ = 'mizio'
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from datetime import date
import datetime
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sys import path
import os
from os.path import dirname, abspath
# Make the local machine_learning_modules package importable
path_of_doc = dirname(dirname(abspath(os.path.split(os.getcwd())[0])))
path_to_ml_lib = path_of_doc + '/machine_learning_modules/'
if path_to_ml_lib not in path:
    path.append(path_to_ml_lib)
from datamunging.explore import *
# from machine_learning_estimators.tensorflow_NN import *
# Load data
df = pd.read_csv('../scraped_data/reddit_scraped_data2019-07-11_18:21:22.788215.csv', header=0)
df_train = df
df.info()
# Drop rows containing nulls (disabled for now; inspect the data first)
# df = df.dropna()
df.info()
df.head()
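Before deciding whether to enable the dropna above, it helps to see which columns actually contain nulls; this is a standard pandas check using nothing beyond the dataframe already loaded:
# Count missing values per column (missing headlines show up here)
print(df.isnull().sum())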
Observe that the Label column, taking values (0, 1), makes this a two-class classification problem. The news headlines are displayed in columns Top1 up to Top25 and work as a ranking of global news. The problem is to identify which rows hold information that can be connected to a stock price.
Prepare the data so that all words in a row end up in a list of single-word elements.
df_train.head(1)
# Print Top news headline
ranking_news = 3
news_index = 2 + ranking_news
example_top_news = df_train.iloc[0, news_index]
print(example_top_news)
#print(example_top_news[2:-1])
print(example_top_news.lower())
Clean the phrase of abbreviations, punctuation, and other irrelevant parts.
headline_words_as_vector = CountVectorizer().build_tokenizer()(example_top_news.lower())
print(headline_words_as_vector)
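Note that build_tokenizer() applies CountVectorizer's default token_pattern, r"(?u)\b\w\w+\b", so only runs of two or more word characters survive; punctuation and single letters are dropped. A minimal illustration on a made-up headline:
import re
# Default CountVectorizer token pattern: keep tokens of >= 2 word characters
print(re.findall(r"(?u)\b\w\w+\b", "u.s. to ban e-cigs?"))  # ['to', 'ban', 'cigs']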
Build a new dataframe with each word and its corresponding count.
pd.DataFrame([[x, headline_words_as_vector.count(x)] for x in set(headline_words_as_vector)], columns=["word", "word_count"])
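To surface the most frequent words, the same frame can be sorted by count (plain pandas, no new assumptions):
word_counts = pd.DataFrame([[x, headline_words_as_vector.count(x)] for x in set(headline_words_as_vector)],
                           columns=["word", "word_count"])
word_counts.sort_values("word_count", ascending=False).head(10)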
Instead of taking only one news headline, append all 8 news headlines into one list of words and count the words.
number_of_news = 8
all_headline_words_as_vector = ''
for ranking_news in range(1, number_of_news + 1):
    news_index = 1 + ranking_news
    top_news = str(df_train.iloc[0, news_index])
    #print(top_news)
    if top_news != 'nan':  # str(np.nan) == 'nan' marks a missing headline
        all_headline_words_as_vector = ' '.join([all_headline_words_as_vector, top_news])
print(all_headline_words_as_vector)
all_headline_words_as_vector = CountVectorizer().build_tokenizer()(all_headline_words_as_vector.lower())
pd.DataFrame([[x, all_headline_words_as_vector.count(x)] for x in set(all_headline_words_as_vector)], columns=["word", "word_count"])
def prepare_data(df):
    """Concatenate the Top1..Top8 headlines of each row into one string per row."""
    training_data_rows = []
    for row in range(0, df.shape[0]):
        all_headline_words_as_vector = ''
        for ranking_news in range(1, number_of_news + 1):
            news_index = 1 + ranking_news
            top_news = str(df.iloc[row, news_index])
            if top_news != 'nan':  # skip missing headlines
                all_headline_words_as_vector = ' '.join([all_headline_words_as_vector, top_news])
        training_data_rows.append(all_headline_words_as_vector)
    return training_data_rows
training_data_rows = prepare_data(df_train)
print(len(training_data_rows))
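A quick sanity check that the concatenation worked as intended, printing only the opening characters of the first row:
print(training_data_rows[0][:200])  # first 200 characters of the first concatenated row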
Create a count column for each word that appears.
count_vectorizer = CountVectorizer()
training_data_transformed = count_vectorizer.fit_transform(training_data_rows)
print(training_data_transformed.shape)
type(training_data_transformed)
#print(training_data_transformed)
#print(count_vectorizer.get_feature_names()[1000:1400])
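LogisticRegression was imported above but not yet used; a minimal sketch of the intended two-class fit on these count features might look like the following, assuming the Label column described earlier is present in the scraped data (the split parameters are illustrative):
from sklearn.model_selection import train_test_split

labels = df_train['Label']  # assumed two-class target column with values (0, 1)
X_train, X_test, y_train, y_test = train_test_split(
    training_data_transformed, labels, test_size=0.2, random_state=42)
clf = LogisticRegression()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))  # mean accuracy on the held-out split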
Put the words into a word cloud, which shows the most frequently occurring words by size.
text = " ".join(all_heads for all_heads in training_data_rows)
text = transformed_str = text.replace("`","'")
#print(text)
#print(len(text))
wordcloud = WordCloud(stopwords=STOPWORDS,
                      background_color='white',
                      width=2500,
                      max_words=100,
                      height=2000
                      ).generate(text)
plt.figure(1,figsize=(13, 13))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
plt.close()
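To keep the image next to the notebook, WordCloud can also write it straight to disk (the filename is illustrative):
wordcloud.to_file('wordcloud_top_news.png')  # hypothetical output filename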