#Download True.csv and Fake.csv from
#https://www.kaggle.com/clmentbisaillon/fake-and-real-news-dataset
#and upload them to your Google Drive. The following code connects to Google Drive using file IDs.
#!pip install pydrive
#!pip install scikit-learn
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
#Authentication happens only once and
# a file adc.json is created once authentication is done.
import os
if not os.path.exists("Fake.csv"):
    fakefile_id = "1nBBfiZOoZToCaGsLxU3s_FRcFcg0Swkk"  #ID of your Fake.csv file on Google Drive
    downloaded = drive.CreateFile({"id": fakefile_id})
    downloaded.GetContentFile("Fake.csv")
if not os.path.exists("True.csv"):
    truefile_id = "1Z_SJxYF-43MUBj3-jmOk_xUS-6znKN_0"  #ID of your True.csv file on Google Drive
    downloaded = drive.CreateFile({"id": truefile_id})
    downloaded.GetContentFile("True.csv")
import pandas as pd
df_true_news = pd.read_csv("True.csv")
df_fake_news = pd.read_csv("Fake.csv")
print (df_true_news.head(20))
print(df_fake_news.head(20))
print(df_true_news.count())
print(df_fake_news.count())
def find_missing_vals(data):
    #Report per-column missing-value counts and percentages.
    total = len(data)
    for column in data.columns:
        if data[column].isna().sum() != 0:
            print("{} has {:,} ({:.2%}) missing values.".format(
                column, data[column].isna().sum(), data[column].isna().sum() / total))
        else:
            print("{} has no missing values".format(column))
    print("\nMissing Value Summary\n{}".format("-" * 35))
    print("\ndf_db\n{}".format("-" * 15))
    print(data.isnull().sum(axis=0))
def remove_duplicates(data):
    print("\nCleaning Summary\n{}".format("-" * 35))
    size_before = len(data)
    data.drop_duplicates(subset=None, keep="first", inplace=True)
    size_after = len(data)
    print("...removed {} duplicate rows".format(size_before - size_after))
find_missing_vals(df_fake_news)
remove_duplicates(df_fake_news)
find_missing_vals(df_true_news)
remove_duplicates(df_true_news)
df_merged = pd.merge(df_fake_news, df_true_news, how="outer")
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="ticks", color_codes=True)
fig_dims = (20, 4.8)
fig, ax = plt.subplots(figsize=fig_dims)
sns.countplot(x="subject", data=df_merged, ax=ax)
df_fake_news["label"] = 0
df_true_news["label"] = 1
df_train = pd.merge(df_fake_news, df_true_news, how="outer")
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
import string
def text_process(text):
    #Strip punctuation, then drop English stopwords; returns a list of tokens.
    no_punctuation = [char for char in text if char not in string.punctuation]
    no_punctuation = "".join(no_punctuation)
    return [word for word in no_punctuation.split()
            if word.lower() not in stopwords.words("english")]
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(df_train["title"], df_train["label"],
                                                test_size=0.2, random_state=42)
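#Optional: confirm the 80/20 split sizes before training.
print("train titles: {}, test titles: {}".format(len(xtrain), len(xtest)))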
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
news_classifier = Pipeline([
    ("vectorizer", CountVectorizer(analyzer=text_process)),
    ("tfidf", TfidfTransformer()),
    ("classifier", MLPClassifier(solver="adam", activation="tanh", random_state=1,
                                 max_iter=200, early_stopping=True))
])
news_classifier.fit(xtrain,ytrain)
predicted = news_classifier.predict(xtest)
from sklearn.metrics import classification_report
print(classification_report(ytest, predicted))  #classification_report expects (y_true, y_pred)
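#Optional extra metrics (a sketch, not part of the original run): a confusion matrix
#and overall accuracy on the held-out titles.
from sklearn.metrics import confusion_matrix, accuracy_score
print(confusion_matrix(ytest, predicted))
print("Accuracy: {:.4f}".format(accuracy_score(ytest, predicted)))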
import joblib  #sklearn.externals.joblib was removed in newer scikit-learn versions
joblib.dump(news_classifier, "model.pkl")
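#Sketch of reloading the saved pipeline and scoring a new headline; the headline
#string is only an illustrative example. A prediction of 1 means the title is
#classified as real news, 0 as fake.
loaded_classifier = joblib.load("model.pkl")
print(loaded_classifier.predict(["Scientists discover water on the moon"]))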
from googleapiclient.discovery import build
drive_service = build("drive" , "v3")
from googleapiclient.http import MediaFileUpload
file_metadata = {
    "name": "model.pkl",
    "mimeType": "application/octet-stream",  #the pickled model is binary, not plain text
}
media = MediaFileUpload("model.pkl", mimetype="application/octet-stream", resumable=True)
created = drive_service.files().create(body=file_metadata, media_body=media,
                                       fields="id").execute()
print("File ID: {} ".format(created.get("id")))