Tuesday, June 22, 2021

Real or Fake News Classification Kaggle challenge

# Download True.csv and Fake.csv from
# https://www.kaggle.com/clmentbisaillon/fake-and-real-news-dataset
# and upload them to your Google Drive. The following code fetches them from Drive using file IDs.




#!pip install PyDrive
#!pip install scikit-learn


from pydrive.auth import GoogleAuth

from pydrive.drive import GoogleDrive

from google.colab import auth 

from oauth2client.client import GoogleCredentials


auth.authenticate_user()

gauth = GoogleAuth()

gauth.credentials = GoogleCredentials.get_application_default()

drive = GoogleDrive(gauth)

# Authentication happens only once;
# a credentials file (adc.json) is created once authentication completes.


import os 

if not os.path.exists("Fake.csv") : 

  fakefile_id = "1nBBfiZOoZToCaGsLxU3s_FRcFcg0Swkk"   #ID OF YOUR GDRIVE FILE.

  downloaded = drive.CreateFile({"id": fakefile_id})

  downloaded.GetContentFile("Fake.csv")


if not os.path.exists("True.csv") : 

  truefileid = "1Z_SJxYF-43MUBj3-jmOk_xUS-6znKN_0"

  downloaded = drive.CreateFile({"id": truefileid})

  downloaded.GetContentFile("True.csv")


import pandas as pd


df_true_news = pd.read_csv("True.csv")

df_fake_news = pd.read_csv("Fake.csv")



print(df_true_news.head(20))
print(df_fake_news.head(20))

print(df_true_news.count())
print(df_fake_news.count())



def find_missing_vals(data):
  total = len(data)
  for column in data.columns:
    missing = data[column].isna().sum()
    if missing != 0:
      # {:.2%} already multiplies by 100, so pass the raw fraction.
      print("{} has {:,} ({:.2%}) missing values.".format(column, missing, missing / total))
    else:
      print("{} has no missing values.".format(column))

  print("\nMissing Value Summary\n{}".format("-" * 35))
  print(data.isnull().sum(axis=0))


def remove_duplicates(data):
  print("\nCleaning Summary\n{}".format("-" * 35))
  size_before = len(data)
  data.drop_duplicates(subset=None, keep="first", inplace=True)
  size_after = len(data)
  print("...removed {} duplicate rows".format(size_before - size_after))




  

find_missing_vals(df_fake_news)

remove_duplicates(df_fake_news)

find_missing_vals(df_true_news)

remove_duplicates(df_true_news)


# Union of the two frames (outer merge on all shared columns), just for plotting the subject distribution.
df_merged = pd.merge(df_fake_news, df_true_news, how="outer")


import seaborn as sns

import matplotlib.pyplot as plt

sns.set(style="ticks", color_codes = True)


fig_dims = (20, 4.8) 

fig, ax = plt.subplots(figsize = fig_dims)

sns.countplot(df_merged['subject'], ax = ax , data = df_merged)



df_fake_news["label"] = 0 

df_true_news["label"] = 1 


df_train = pd.merge(df_fake_news, df_true_news , how = "outer")
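Aside: because the label column now differs between the two frames, no rows can match and the outer merge reduces to a row-wise union. pd.concat expresses the same thing more directly (an equivalent alternative, not the line used above):

df_train = pd.concat([df_fake_news, df_true_news], ignore_index=True)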




from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfTransformer



import nltk 

nltk.download("stopwords")

from nltk.corpus import stopwords

import string


stop_words = set(stopwords.words("english"))   # load once; a set makes lookups fast

def text_process(text):
  # Strip punctuation, then drop English stopwords (case-insensitive check).
  no_punctuation = "".join(char for char in text if char not in string.punctuation)
  return [word for word in no_punctuation.split() if word.lower() not in stop_words]



from sklearn.model_selection import train_test_split

# 80/20 split; the model is trained on headlines (title) only, not article bodies.
xtrain, xtest, ytrain, ytest = train_test_split(df_train["title"], df_train["label"],
                                                test_size=0.2, random_state=42)


from sklearn.neural_network import MLPClassifier 


from sklearn.pipeline import Pipeline


news_classifier = Pipeline([
  ("vectorizer", CountVectorizer(analyzer=text_process)),   # tokenize/clean with text_process
  ("tfidf", TfidfTransformer()),                            # re-weight raw counts by TF-IDF
  ("classifier", MLPClassifier(solver="adam", activation="tanh", random_state=1,
                               max_iter=200, early_stopping=True)),
])
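Aside: CountVectorizer followed by TfidfTransformer is exactly what sklearn's TfidfVectorizer does in one step, so the first two stages could be collapsed (an equivalent sketch, not the pipeline trained below):

from sklearn.feature_extraction.text import TfidfVectorizer

compact_classifier = Pipeline([
  ("vectorizer", TfidfVectorizer(analyzer=text_process)),
  ("classifier", MLPClassifier(solver="adam", activation="tanh", random_state=1,
                               max_iter=200, early_stopping=True)),
])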


news_classifier.fit(xtrain, ytrain)



predicted = news_classifier.predict(xtest)


from sklearn.metrics import classification_report

print(classification_report(ytest, predicted))   # y_true comes first, then the predictions
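A confusion matrix and overall accuracy round out the report (standard sklearn metrics, added here as a sketch):

from sklearn.metrics import confusion_matrix, accuracy_score
print(confusion_matrix(ytest, predicted))   # rows = true labels, columns = predictions
print("Accuracy: {:.4f}".format(accuracy_score(ytest, predicted)))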


import joblib   # sklearn.externals.joblib was removed in scikit-learn 0.23; import joblib directly
joblib.dump(news_classifier, "model.pkl")
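To reuse the model later, load it back with joblib. Whatever script loads the pickle must be able to import (or define) text_process, since the pickled vectorizer stores a reference to it. A minimal sketch with a made-up headline:

loaded = joblib.load("model.pkl")
print(loaded.predict(["You won't believe what this senator said"]))   # 0 = fake, 1 = real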


from googleapiclient.discovery import build 

drive_service = build("drive" , "v3")


from googleapiclient.http import MediaFileUpload 


file_metadata = {
    "name": "model.pkl",
    "mimeType": "application/octet-stream",   # the pickle is binary, not text
}

media = MediaFileUpload("model.pkl", mimetype="application/octet-stream", resumable=True)


created = drive_service.files().create(body=file_metadata, media_body=media,
                                       fields="id").execute()


print("File ID: {} ".format(created.get("id")))

