MultiOutputRegressor in spark/scala for xgboost - scala

Do we have MultiOutputRegressor as a wrapper for xgboost using spark/scala in distributed way with the help of xgboost4j-spark, my requirement is to implement multiple target prediction.
Please find below the python code snippet for reference.
from sklearn.multioutput import MultiOutputRegressor
from sklearn import ensemble
import xgboost as xgb
xgbr = xgb.XGBRegressor(max_depth=1, eta=0.01, silent=1, subsample= 0.8, reg_lambda = 1.515,
reg_alpha= 0.0017, min_child_weight=7, colsample_bytree=0.85, nthread= 32,
gamma= 0.01, objective='reg:linear',tree_method= 'approx', booster = 'gbtree', n_estimators=100)
X = ["indep_1","indep_2","indep3","indep_4","indep_5","indep_6","indep_7","indep_8"]
Y = ["dep_1","dep_2","dep_3","dep_4"]
model = MultiOutputRegressor(xgbr).fit(X, Y)

Related

Spark vs scikit-learn

I use pyspark for traffic classification using the decision tree model & I measure the time required for training the model. It took 2 min and 17 s. Then, I perform the same task using scikit-learn. In the second case, the training time is 1 min and 19 s. Why? since it is supposed that Spark performs the task in a distributed way.
This is the code for pyspark:
df = (spark.read.format("csv")\
.option('header', 'true')\
.option("inferSchema", "true")\
.load("D:/PHD Project/Paper_3/Datasets_Download/IP Network Traffic Flows Labeled with 75 Apps/Dataset-Unicauca-Version2-87Atts.csv"))
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 10)
pModel = dt.fit(trainDF)
in scikit - learn
import warnings
warnings.filterwarnings('ignore')
path = 'D:/PHD Project/Paper_3/Datasets_Download/IP Network Traffic Flows Labeled with 75 Apps/Dataset-Unicauca-Version2-87Atts.csv'
df= pd.read_csv(path)
#df.info()
%%time
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

TF Keras code adaptation from python2.7 to python3

I am working to adapt a python2.7 code that uses keras and tensorflow to implement a CNN but looks like the keras API has changed a little bit since when the original code was idealized. I keep getting an error about "Negative dimension after subtraction" and I can not find out what is causing it.
Unfortunately I am not able to provide an executable piece of code because I was not capable of make the original code works, but the repository containing all the source files can be found here.
The piece of code:
from keras.callbacks import EarlyStopping
from keras.layers.containers import Sequential
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.layers.core import Reshape, Flatten, Dropout, Dense
from keras.layers.embeddings import Embedding
from keras.models import Graph
from keras.preprocessing import sequence
filter_lengths = [3, 4, 5]
self.model = Graph()
'''Embedding Layer'''
self.model.add_input(name='input', input_shape=(max_len,), dtype=int)
self.model.add_node(Embedding(
max_features, emb_dim, input_length=max_len), name='sentence_embeddings', input='input')
'''Convolution Layer & Max Pooling Layer'''
for i in filter_lengths:
model_internal = Sequential()
model_internal.add(
Reshape(dims=(1, self.max_len, emb_dim), input_shape=(self.max_len, emb_dim))
)
model_internal.add(Convolution2D(
nb_filters, i, emb_dim, activation="relu"))
model_internal.add(
MaxPooling2D(pool_size=(self.max_len - i + 1, 1))
)
model_internal.add(Flatten())
self.model.add_node(model_internal, name='unit_' + str(i), input='sentence_embeddings')
What I have tried:
m = tf.keras.Sequential()
m.add(tf.keras.Input(shape=(max_len, ), name="input"))
m.add(tf.keras.layers.Embedding(max_features, emb_dim, input_length=max_len))
filter_lengths = [ 3, 4, 5 ]
for i in filter_lengths:
model_internal = tf.keras.Sequential(name=f'unit_{i}')
model_internal.add(
tf.keras.layers.Reshape(( 1, max_len, emb_dim ), input_shape=( max_len, emb_dim ))
)
model_internal.add(
tf.keras.layers.Convolution2D(100, i, emb_dim, activation="relu")
)
model_internal.add(
tf.keras.layers.MaxPooling2D(pool_size=( max_len - i + 1, 1 ))
)
model_internal.add(
tf.keras.layers.Flatten()
)
m.add(model_internal)
I do not expect a complete solution, what I am really trying to understand is what is the cause to the following error:
Negative dimension size caused by subtracting 3 from 1 for '{{node conv2d_5/Conv2D}} = Conv2D[T=DT_FLOAT, data_format="NHWC", dilations=[1, 1, 1, 1], explicit_paddings=[], padding="VALID", strides=[1, 200, 200, 1], use_cudnn_on_gpu=true](Placeholder, conv2d_5/Conv2D/ReadVariableOp)' with input shapes: [?,1,300,200], [3,3,200,100].

NotFittedError: Vocabulary not fitted or provided or TypeError: string indices must be integers

Hello everyone, im new hier and want to start learning how machine learning works. So i want to build a machine learning email spam detector in colab, but something seems to be wrong:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk import string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from google.colab import files
uploaded = files.upload() filename = ('spam.tsv')
content = []
with open(filename, "r") as file_content:
for line in file_content.readlines():
line = line.strip() content.append(line)
spam = line
#print(spam) for testinn if it is working
z = spam['EmailText']
y = spam["Label"]
z_train, z_test,y_train, y_test = train_test_split(z,y,test_size = 0.2)
count_vector = CountVectorizer()
features = count_vector.transform(z_train)
model = svm.SVC()
model.fit(features,y_train)
features_test = count_vector.transform(z_test)
print(model.score(features_test,y_test))`
NotFittedError: Vocabulary not fitted or provided TypeError: string indices must be integers
i tried everything but nothing works really haha
i hope, you can help
thank you

Getting argument missing error In ParamgridBuilder on Pyspark

I am currently implementing Gradientboost classification model in Pyspark.Based on kaggle dataset My current final columns after fitting pipeline is
I am now trying parameter tuning by PARAMGRIDBUILD. here is my Parameter grid build code
param_grid=ParamGridBuilder.addGrid(gradboost.maxDepth,[2,3,4]).addGrid(gradboost.minInfoGain,[0.0, 0.1, 0.2, 0.3]).addGrid(gradboost.stepSize,[0.05, 0.1, 0.2, 0.4]).build()
and I am getting below error
****param_grid=ParamGridBuilder.addGrid(gradboost.maxDepth,[2,3,4]).addGrid(gradboost.minInfoGain,[0.0, 0.1, 0.2, 0.3]).addGrid(gradboost.stepSize,[0.05, 0.1, 0.2, 0.4]).build()
TypeError: addGrid() missing 1 required positional argument: 'values'****
I did not use Paramgridbuild before. does this array values represent each column of my current dataframe? kindly help me to figure out the error and give me the basic concept of using this values. Here is my full code
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer,VectorIndexer,OneHotEncoder,VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder,CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
spark=SparkSession.builder.config("spark.sql.warehouse.dir", "file:///C:/temp").appName("Gradientboostapp").enableHiveSupport().getOrCreate()
data= spark.read.csv("C:/Users/codemen/Desktop/Timeseries Analytics/liver_patient.csv",header=True, inferSchema=True)
#data.show()
print(data.count())
#data.printSchema()
print("After deleting null values")
data=data.na.drop()
print(data.count())
data.show(5)
gender_column=data.columns[1:2]
#print(categorical_column)
stringindexstage=[StringIndexer(inputCol=c,outputCol='genderindexed')for c in gender_column]
#print(stringindexstage)
stringindexstage=stringindexstage+[StringIndexer(inputCol='category',outputCol='classlabel')]
for x in stringindexstage:
data=x.fit(data).transform(data)
data.show(3)
#data.show(3)
#print ("Type of",type(stringindexstage))
onehotencoderstage=[OneHotEncoder(inputCol='genderindexed', outputCol='onehot'+c) for c in gender_column]
for onehot in onehotencoderstage:
data=onehot.transform(data)
data.show()
#vector assembler
print("data current")
data.show(3)
feature_column=['Age','onehotGender','Total_Bilirubin', 'Direct_Bilirubin', 'Alkaline_Phosphotase', 'Alamine_Aminotransferase', 'Aspartate_Aminotransferase', 'Total_Protiens',
'Albumin', 'Albumin_and_Globulin_Ratio']
print(feature_column)
#Vector Assembler stage
vectorassmblestage=[VectorAssembler(inputCols=feature_column,outputCol="features")]
#pipeline model
#allstages=stringindexstage+onehotencoderstage+vectorassmblestage
#for i in allstages:
#
pipelinestage=Pipeline(stages=vectorassmblestage)
#
# #fitting variable
pipelinemodel=pipelinestage.fit(data)
#
# #Transform Data
#
finalcolumns=feature_column+['features','classlabel']
#
dataframe=pipelinemodel.transform(data).select(finalcolumns)
print("final column print")
dataframe.show(5)
#splitting data into train test
(traindata, testdata)=dataframe.randomSplit([0.7,0.3],seed=1234)
#gradientboosting
gradboost=GBTClassifier(featuresCol='features',labelCol='classlabel',maxIter=10)
#parameter tuning
param_grid=ParamGridBuilder.addGrid(gradboost.maxDepth,[2,3,4]).addGrid(gradboost.minInfoGain,[0.0, 0.1, 0.2, 0.3]).addGrid(gradboost.stepSize,[0.05, 0.1, 0.2, 0.4]).build()
##Evaluation
print("Evaluation stage")
evaluator=BinaryClassificationEvaluator(rawPredictionCol='prediction')
#crossvalidation state
print("cross validation stage")
crossvalidation=CrossValidator(estimator=gradboost,estimatorParamMaps=param_grid,evaluator=evaluator)
crossvalidateData=crossvalidation.fit(dataframe)
##prediction on Training Data
print("Prediction in Training data ....")
predictTrain=crossvalidateData.transform(traindata)
predictTrain.show(10)
Thank you in advance
For starters it looks like you need to call ParamGridBuilder() with parentheses
so;
param_grid = ParamGridBuilder() \
.addGrid(...)

Loss isn't decreasing in neural network

I am implementing a variant of the CNN described by this paper.
My problem is that the loss isn't decreasing and I don't understand why. Same have to be said concerning accuracy(stuck at 0.5 more or less).
This a problem of 2 classes classification. I am using the data from this website:
I suspected the optimizer so I changed it whithout any improvements. I am pretty sure the data I am using is ok because I used it on an LSTM and the classifier was fine.
Here is my code:
from keras.layers import Embedding
from keras.layers import Conv2D
from keras.models import Sequential
from keras.layers import MaxPooling2D
from keras.layers import Reshape
from keras.layers import Flatten
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Input
from keras import backend as K
from keras.models import Model
import tensorflow as tf
from sklearn import preprocessing
import keras
import numpy as np
import pdb
#using multiple filters
config = tf.ConfigProto()
# Don't pre-allocate memory; allocate as-needed
config.gpu_options.allow_growth = True
# Only allow a total of half the GPU memory to be allocated
config.gpu_options.per_process_gpu_memory_fraction = 0.7
# Create a session with the above options specified.
config.gpu_options.per_process_gpu_memory_fraction = 0.65
K.tensorflow_backend.set_session(tf.Session(config=config))
class Classifier():
def __init__(self,Vocab,maxlen=75):
self.model=Sequential()
self.EMBED_DIM=30
self.batch_size=30
self.MAX_SEQUENCE_LENGTH=maxlen
self.nb_labels=2
self.model = Sequential()
self.Vocab=Vocab
def fit(self, X,y):
#pdb.set_trace()
mainIn=Input(shape=(self.MAX_SEQUENCE_LENGTH,), dtype='int32', name='main_input')
x=Embedding(len(self.Vocab)+2,self.EMBED_DIM,input_length=self.MAX_SEQUENCE_LENGTH)(mainIn)
x=Reshape((1,self.MAX_SEQUENCE_LENGTH, self.EMBED_DIM))(x)
x1=Conv2D(128, strides=2,kernel_size=5 ,activation="relu", padding='same')(x)
x1=MaxPooling2D((self.MAX_SEQUENCE_LENGTH-5+1,1),padding='same')(x1)
x1=Flatten()(x1)
x2=Conv2D(128, strides=2, kernel_size=4, activation="sigmoid", padding='same')(x)
x2=MaxPooling2D((self.MAX_SEQUENCE_LENGTH-4+1,1),padding='same')(x2)
x2=Flatten()(x2)
x3=Conv2D(128, strides=2, kernel_size=3, activation="tanh", padding='same')(x)
x3=MaxPooling2D((self.MAX_SEQUENCE_LENGTH-3+1,1),padding='same')(x3)
x3=Flatten()(x3)
combinedX=keras.layers.concatenate([x1,x2,x3],axis=1)
combinedX=Dense(64, activation="relu")(combinedX)
combinedX=Dropout(0.2)(combinedX)
#output=Dense(self.nb_labels, activation="sigmoid")(combinedX)
#output=Dense(2, activation="softmax")(combinedX)
output=Dense(1, activation="sigmoid")(combinedX)
#encoder =preprocessing.LabelEncoder()
#encoder.fit(y)
#encoded_Y = encoder.transform(y)
#labels=keras.utils.to_categorical(encoded_Y, num_classes=2)
labels=y
pdb.set_trace()
inputs2=X
self.model = Model(inputs=mainIn, outputs=output)
# self.model.compile(loss='binary_crossentropy',
# optimizer='adam',
# metrics=['acc'])
self.model.compile(loss='binary_crossentropy',
optimizer='rmsprop',
metrics=['acc'])
self.model.fit(inputs2,labels,epochs=7, batch_size=self.batch_size)
def predict(self, X):
return self.model.predict(np.array(X))
def predict_proba(self, X):
return self.model.predict(np.array(X), self.batch_size)
Here is my code to preprocess the data:
#loading the file
file2=pd.read_csv("Sentiment_Analysis Dataset.csv",error_bad_lines=False)
#splitting into train et test set
from sklearn.model_selection import train_test_split
Text=list(file2.SentimentText)
file2.groupby('Sentiment').count()
train_data,test_data,train_label,test_label=train_test_split(Text, file2.Sentiment, test_size=0.4, random_state=42)
#Buidling the dictionary
vocabDic=dict()
for document in train_data:
document=document.split(" ")
for word in document:
if word not in vocabDic:
vocabDic[word]=len(vocabDic)+1
vocabDic['unk']=len(vocabDic)+1
#coding the documents
def codeDocuments(documents,dictionnary):
documentsArray=list()
for i,document in enumerate(documents):
tempList=list()
document=document.split(" ")
for word in document:
if word in vocabDic:
word=vocabDic[word]
else:
word=vocabDic['unk']
tempList.append(word)
documentsArray.append(tempList)
return np.array(documentsArray)
train_docs=codeDocuments(train_data,vocabDic)
test_docs=codeDocuments(test_data,vocabDic)
#padding the documents
from keras.preprocessing import sequence
maxlen=75
train_set = sequence.pad_sequences(train_docs, maxlen=maxlen)
test_set = sequence.pad_sequences(test_docs, maxlen=maxlen)
#calling the model
model=Classifier(vocabDic,maxlen)
model.fit(train_set[:50000],train_label[:50000])