Make prediction for a new data for the model trained with dummy data - linear-regression

I have converted my categorical data into columns using the dummy variables. And then performed the train test split. Finally, trained the model and tested with the test data. Since the test data is already in the same format the model understands, it predicts without any issues but when I want to make prediction for a totally new data, creating dummy variables for the new data is not working well. Can I know how its generally done?
Here is my code..
import pandas as pd
import numpy as np
df = pd.read_csv('salary_prediction_usa_finance_job_v2.csv')
df_columns = df.columns
degree = pd.get_dummies(df.degree, prefix='degree', drop_first=True)
masters = pd.get_dummies(df.masters, prefix='masters').iloc[:, 1:]
prof_member = pd.get_dummies(df.professional_membership, prefix='professional_membership', drop_first=True)
df = pd.concat([df, degree,masters,prof_member], axis=1)
df = df.drop(['degree','masters','professional_membership'], axis=1)
X = df.drop('salary_per_year', axis=1)
y = df['salary_per_year']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
new_data = {'degree':['yes'], 'masters':['no'], 'professional_membership':['no'], 'years_experience':[10],'audit_experience':[4], 'IT_skill_rate':[6], 'Size_of_the_company_worked':[3]}
single_df = pd.DataFrame(data=new_data)

Related

Keras to ONNX model export for Barracuda in Unity3D

this is my neural network model:
The input is an example of 10000 features. Each feature is a number (0 or 1).
The output is a number between 0 and 1.
from tensorflow.keras.datasets import imdb
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
num_words=10000)
import numpy as np
def vectorize_sequences(sequences, dimension=10000):
results = np.zeros((len(sequences), dimension))
for i, sequence in enumerate(sequences):
for j in sequence:
results[i, j] = 1.
return results
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)
y_train = np.asarray(train_labels).astype("float32")
y_test = np.asarray(test_labels).astype("float32")
from tensorflow import keras
from tensorflow.keras import layers
model = keras.Sequential([
layers.Dense(16, activation="relu"),
layers.Dense(16, activation="relu"),
layers.Dense(1, activation="sigmoid")
])
model.compile(optimizer="rmsprop",
loss="binary_crossentropy",
metrics=["accuracy"])
x_val = x_train[:10000]
partial_x_train = x_train[10000:]
y_val = y_train[:10000]
partial_y_train = y_train[10000:]
model.fit(partial_x_train,
partial_y_train,
epochs=20,
batch_size=512,
validation_data=(x_val, y_val))
I exported the model in this way:
import tensorflow as tf
spec = (tf.TensorSpec(model.inputs[0].shape, tf.float32, name="my input"),)
nchw_inputs_list = [model.inputs[0].name]
import tf2onnx
model_proto, _ = tf2onnx.convert.from_keras(model, input_signature=spec, custom_ops=None, opset=9, inputs_as_nchw=nchw_inputs_list, output_path="example.onnx")
And when I import it in Unity:
Can someone please help me to export a simple model from Keras to ONNX and import it in the right way in Unity3D?
Thank you.

XGBoost Classification with highly unbalanced dataset

I am trying to make a classification with XGBoost (using the Scikit-Learn API).
The problem is that, when the dataset is highly unbalanced, using the parameter scale_pos_weight does not help, because the resulting balanced accuracy is very low (0.5).
I think the scale_pos_weight parameter is not taken into account when using the eval_set.
Can you help? This is an example code:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
import xgboost as xgb
# create huge dataset
X, y = make_classification(n_samples=100000,
n_features=10,
n_informative=10,
n_redundant=0,
n_classes=2,
weights=[0.999],
random_state=1)
# split into train/test sets with same class ratio (stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y, random_state=1)
# standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# train the classifier
model = xgb.XGBClassifier(n_estimators=1000, scale_pos_weight=1/999)
model.fit(X_train, y_train,
early_stopping_rounds=100,
eval_metric='logloss',
eval_set=[(X_test, y_test)],
verbose=False)
# best iteration
print('best iteration', model.best_iteration)
best_iteration = model.best_ntree_limit#clg.get_booster().best_ntree_limit
#print(model.evals_result())
y_pred = model.predict(X_test, ntree_limit=best_iteration)
#y_pred = model.predict(X_test)
print('accuracy', accuracy_score(y_test, y_pred))
print('balanced_accuracy_score', balanced_accuracy_score(y_test, y_pred))

Neural network, unspecified size of input

I'm currently trying to use some images from the Sun dataset, with varying shapes, around (1000, 400, 1). Since they are varying in shape, my approach to it was to create a numpy array with numpy arrays in it, so that I dont have to define any shape of it. What I want to do is to train a basic CNN using these pictures. The problem is, I dont think my CNN understands how my input data is defined really. In my implementation, self.X_train[0] for example contains one image (with correspondsing target in self.Y_train[0] and so on). My code right now is looking like:
import os
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten
class network:
def __init__(self):
self.X_train, self.Y_train = self.generate_targets()
def generate_targets(self):
path = 'C:\\Users\\joaki\\PycharmProjects\\project\\project dl\\'
folder = os.fsencode(path)
targets = []
inputs = []
for file in os.listdir(folder):
filename = os.fsdecode(file)
if filename.endswith(('.jpg')):
img = Image.open(filename).convert('RGB')
img2 = Image.open(filename).convert('L')
arr2 = np.array(img2)
arr2 = arr2.reshape((arr2.shape[0], arr2.shape[1], 1))
inputs.append(arr2)
arr = np.array(img)
targets.append(arr)
Y = np.array(targets)
X = np.array(inputs)
return X, Y
def plotting(self, type):
plt.figure(figsize=(20, 10))
for i in range(self.X_train.shape[0]):
plt.subplot(2, 2, i+1)
if type == 'targets':
lum_img = self.Y_train[i][:, :, :] #[:,:,:] för färg
plt.imshow(lum_img)
if type == 'inputs':
lum_img = self.X_train[i][:, :, 0] # [:,:,:] för färg
plt.imshow(lum_img)
plt.show()
def train_network(self):
model = Sequential()
# add model layers
model.add(Conv2D(64, kernel_size=3, activation='relu', input_shape = (None, None, 1)))
model.add(Conv2D(32, kernel_size=3, activation='relu'))
model.add(Flatten())
model.add(Dense(10, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(self.X_train, self.Y_train, batch_size = 1, validation_data=(self.X_train, self.Y_train), epochs=3)
network1 = network()
#network1.plotting('inputs')
network1.train_network()
#print(network1.X_train[0].shape)
Is there solution to this problem, if so, can someone provide information or a source that I should follow? Thanks in advance!

plot Roc curve using keras

I have a neural network model and I am using KerasClassifier and then using KFold for cross-validation. Now I am having issues in plotting the ROC curve. I have tried few codes but most of them is giving me an error of multi-labeled is not interpreted. I have the following code till my neural network produces the accuracy. I will be thankful if anyone can help me with the later part of the code.
import numpy as np
import pandas as pd
from keras.layers import Dense, Input
from keras.models import Model, Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler,StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
seed = 7
np.random.seed(seed)
dataset = pd.read_csv('lukemia_2003.csv')
X_train = dataset.values[:,0:12600]
Y_train = dataset.values[:,12600]
scalar = MinMaxScaler()
scaled_data = scalar.fit_transform(X_train)
pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(scaled_data)
encoder = LabelEncoder()
encoder.fit(Y_train)
encoded_Y = encoder.transform(Y_train)
dummy_Y = np_utils.to_categorical(encoded_Y)
hid_layer1 = 4
hid_layer2 = 4
output_layer = 4
def my_model():
encoded = Sequential()
encoded.add(Dense(hid_layer1, input_dim = 10, activation='tanh'))
encoded.add(Dense(hid_layer2, activation='tanh'))
encoded.add(Dense(output_layer, activation='softmax'))
encoded.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return encoded
result_mean_list = []
std_list = []
for i in range(30):
estimator = KerasClassifier(build_fn=my_model, epochs=1500, batch_size=5, verbose=2)
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(estimator, X_train_pca, dummy_Y, cv=kfold)
result_mean_list.append(round(results.mean()*100,2))
std_list.append(round(results.std()*100,2))
print ("Result mean: ", result_mean_list)
print ("Standard Deviation List: ", std_list)
Here is the link to the dataset. https://drive.google.com/open?id=15emI90-sPZMkHLuwRbNfTBli0h_S-PpM
For your case since your target is multiclass, you cannot use ROC for the evaluation of the classifier. In a case that there was a binary classifier this link shows how to draw an ROC curve.

how to create a sequence prediction in keras

I want to predict an output sequence with keras LSTM. I have 6 features and 6 output values. However my code throws an error in my label values.
Error when checking model target: expected dense_1 to have shape (None, 1) but got array with shape (4000, 6)
import numpy as np
np.random.seed(seed=7)
import pandas as pd
numbers = pd.read_csv(r'C:\...\Desktop\LSTM.csv', sep=';')
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
numval = numbers.values.astype('float32')
scaler =MinMaxScaler()
scaler.fit_transform(numval)
X = numval[:4000,0:6]
y = numval[:4000,6:]
y_test = numval[4000,:6:]
X = np.reshape(X,(X.shape[0],1,X.shape[1]))
X_test = numval[4000:,0:6]
X_test = np.reshape(X_test,(X_test.shape[0],1,X_test.shape[1]))
print(X.shape)
model = Sequential()
model.add(LSTM(6,input_dim=6,stateful=True))
model.add(Dense(6))
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam')
model.fit(X,y,batch_size=200,nb_epoch=100,verbose=2)
scores = model.evaluate(X_test,y_test,batch_size=32,verbose=1)
print(scores[1])
How can i get multiple labeloutputs? Thx