How to make a custom dataset for a classification task when the class is the folder name, using PyTorch?

The problem is that the dataloader is returning the wrong class for the corresponding image.
For example, if I print the class_to_idx from the train_loader when the batch size is 1, I was expecting to get one class per batch, but currently it's returning all the classes, i.e. all 15 classes for every image.
In this case, the classes are folder names (all images that exist in one folder belong to one class).
The snippet is here (this is a function that returns the classes from the folder names in a directory):
import os
def find_classes(dir):
    """Find the class folders in a dataset.

    Args:
        dir (string): Root directory path.

    Returns:
        tuple: ``(classes, class_to_idx)`` where ``classes`` is the sorted list
        of sub-directory names directly under ``dir`` and ``class_to_idx`` maps
        each of those names to its position in that list.
    """
    # The context manager closes the directory handle as soon as the listing
    # is consumed. Sorting makes the name -> index mapping independent of the
    # order in which the operating system happens to list the folders.
    with os.scandir(dir) as entries:
        classes = sorted(entry.name for entry in entries if entry.is_dir())
    class_to_idx = {name: idx for idx, name in enumerate(classes)}
    return classes, class_to_idx
Here is the main snippet for creating a custom dataset and dataloader:
def main():
    """Create the train/valid/test datasets and DataLoaders for the image set.

    Every image lives in a folder named after its class, so the label of an
    image is looked up from the name of its parent folder.

    Returns:
        dict: maps 'train', 'valid' and 'test' to the corresponding DataLoader.
    """
    import random  # function-scope import: only used for the seeded shuffle

    from torch.utils.data import DataLoader

    class CustomDataset(Dataset):
        """Dataset yielding ``(image tensor, class index, image path)`` triples."""

        def __init__(self, image_paths, classes, class_to_id):
            self.image_paths = image_paths
            self.transforms = transforms.ToTensor()
            self.classes = classes
            self.class_to_idx = class_to_id

        def __getitem__(self, index):
            path = self.image_paths[index]
            # NOTE(review): the call that loads the image was garbled in the
            # source; PIL's Image.open is assumed because the result is used
            # with .convert('L'). Confirm `Image` is imported at file level.
            image = Image.open(path)
            t_image = image.convert('L')
            t_image = self.transforms(t_image)
            # The class of an image is the name of the folder containing it.
            # Return that single index, not the whole class_to_idx mapping.
            class_name = os.path.basename(os.path.dirname(path))
            label = self.class_to_idx[class_name]
            return t_image, label, path

        def __len__(self):
            return len(self.image_paths)

    # no augmentation
    folder_data = sorted(glob.glob("D:\\Neda\\Echo_View_Classification\\avi_images\\*\\*.png"))
    # glob lists the files folder by folder, so slicing the unshuffled list
    # would put whole classes into a single split. Shuffle once with a fixed
    # seed so the split mixes classes and is reproducible.
    random.Random(0).shuffle(folder_data)

    # split these paths using a certain percentage (60 / 20 / 20)
    len_data = len(folder_data)
    print("count of dataset: ", len_data)
    split_1 = int(0.6 * len_data)
    split_2 = int(0.8 * len_data)

    train_image_paths = folder_data[:split_1]
    print("count of train images is: ", len(train_image_paths))
    numpy.savetxt('im_training_path_1.csv', numpy.c_[train_image_paths], fmt=['%s'], comments='', delimiter=",")

    valid_image_paths = folder_data[split_1:split_2]
    print("count of validation image is: ", len(valid_image_paths))
    numpy.savetxt('im_valid_path_1.csv', numpy.c_[valid_image_paths], fmt=['%s'], comments='', delimiter=",")

    test_image_paths = folder_data[split_2:]
    print("count of test images is: ", len(test_image_paths))
    numpy.savetxt('im_testing_path_1.csv', numpy.c_[test_image_paths], fmt=['%s'], comments='', delimiter=",")

    class_to_idx = {
        '1_PLAX_1_PLAX_full': 0,
        '1_PLAX_2_PLAX_valves': 1,
        '1_PLAX_4_PLAX_TV': 2,
        '2_PSAX_1_PSAX_AV': 3,
        '2_PSAX_2_PSAX_LV': 4,
        '3_Apical_1_MV_LA_IAS': 5,
        '3_Apical_2_A2CH': 6,
        '3_Apical_3_A3CH': 7,
        '3_Apical_5_A5CH': 8,
        '4_A4CH_1_A4CH_LV': 9,
        '4_A4CH_2_A4CH_RV': 10,
        '4_Subcostal_1_Subcostal_heart': 11,
        '4_Subcostal_2_Subcostal_IVC': 12,
        'root_5_Suprasternal': 13,
        'root_6_OTHER': 14,
    }
    # Class names ordered by their index, derived from the mapping so the
    # two can never disagree.
    classes = sorted(class_to_idx, key=class_to_idx.get)

    # Arguments are passed in the order the constructor declares them:
    # (image_paths, classes, class_to_id).
    train_dataset = CustomDataset(train_image_paths, classes, class_to_idx)
    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=False, num_workers=0)
    valid_dataset = CustomDataset(valid_image_paths, classes, class_to_idx)
    valid_loader = DataLoader(valid_dataset, batch_size=1, shuffle=False, num_workers=0)
    test_dataset = CustomDataset(test_image_paths, classes, class_to_idx)
    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=0)

    dataLoaders = {
        'train': train_loader,
        'valid': valid_loader,
        'test': test_loader,
    }
    return dataLoaders

How to send signals/variables between a QDialog and Main Window

I am currently working on a project that involves graphing text file data in a PyQt graph, and I have been running into problems with a QDialog subclass. My goal is to have the QDialog box use a combo box to choose between different data sets to graph (the code below shows the "Steering Angle" setting being chosen). The problem is how to make it so that when the Create Graph button (found in the QDialog class) is pressed, it runs the createGraph(self) function in the main class. I don't know how to work with classes that well, so I don't know how to make this work.
If anyone has any pointers on either how to get this working, how to properly structure a PYQT Program or how to make it more efficient, I'm all ears.
Thank you for your time!
Main Window Code:
# Main window of the question's application (excerpt).
# NOTE(review): the indentation of this excerpt was lost and lines appear to
# be missing, so the nesting described in the comments below is inferred from
# statement order only. Confirm against the original file.
class MainWidget(QMainWindow):
# Set up the window title/size, the drag-and-drop label and the file tree view.
def __init__(self, parent=None):
super(MainWidget, self).__init__(parent)
# Integer flags, both initialised to 0; createGraph() sets graphcheck to 1.
self.dockcheck = 0
self.graphcheck = 0
self.setWindowTitle("Drag and Drop Test")
self.resize(1200, 800)
self.LBLDragAndDrop = QLabel("Drag And Drop Files Here")
self.LBLDragAndDrop.setAlignment(Qt.AlignHCenter | Qt.AlignVCenter)
# NOTE(review): graphcheck was assigned 0 a few lines above, so this test is
# always true here; how many of the following lines belong to it is unknown.
if self.graphcheck == 0:
self.path3 = "C:\\Users\\steph\\OneDrive\\Documents\\SAA Wing\\Coding\\Can Bus Data Reading\\Temporary Saves"
self.treeview = QTreeView()
self.fileModel = QFileSystemModel()
self.indexRoot = self.fileModel.index(self.fileModel.rootPath())
self.treeview.setColumnWidth(0, 250)
# Dialog Box
# Takes a model index, stores the selected item's path and file name, and
# creates the matching dialog when the name is a known ID.
def onSelectionChanged(self, index):
# The sender's model is asked for the file path of the given index.
self.selectionPath = self.sender().model().filePath(index)
# File name = last "/"-separated component of the path.
self.selectionFilename = (self.selectionPath.split("/")[-1])
IDList = ("ID 00d0","ID 00d1","ID 00d3","ID 00d4","ID 0140","ID 0141","ID 0360","ID 0361")
if self.selectionFilename in IDList:
if self.selectionFilename == "ID 00d0":
# NOTE(review): the excerpt creates the dialog but shows neither it being
# displayed nor its button being connected back to createGraph().
editDialog = Dialog00d0()
# Graphing data
# Reads the selected file into a DataFrame and builds the arrays to plot.
def createGraph(self):
self.graphcheck = 1
if self.graphcheck == 1:
# ID 00D0 Creating Graph
if self.selectionFilename == "ID 00d0":
# Whitespace-delimited file without a header row; every column is read as object.
self.df00d0 = pd.read_table(self.selectionPath, header=None , delim_whitespace=True, dtype=object)
self.df00d0.columns = ['Timestamp','ID',"B0","B1","B2","B3","B4","B5","B6","B7"]
# Empty float arrays that the loops below grow with np.append.
self.SA = np.array([], dtype=float)
self.LatAcc = np.array([], dtype=float)
self.LonAcc = np.array([], dtype=float)
self.ComAcc = np.array([], dtype=float)
self.Time00d0 = np.array([], dtype=float)
self.Timestamp00d0 = np.array([], dtype=float)
# Getting Time Stamps
for item in self.df00d0['Timestamp']:
# Formatted as "%H:%M:%S.%f" with the last four characters cut off.
self.Time00d0 = np.append(self.Time00d0, datetime.fromtimestamp(float(item)).strftime("%H:%M:%S.%f")[:-4])
self.Timestamp00d0 = np.append(self.Timestamp00d0, float(item))
# Steering Angle Graph
# NOTE(review): combobox00d0 is created in Dialog00d0.__init__, not on this
# class; nothing in this excerpt assigns self.combobox00d0 here.
if self.combobox00d0.currentText() == "Steering Angle":
# Columns B1 and B0 are concatenated, parsed as a base-16 integer and scaled by 0.1.
SA_ = (((self.df00d0['B1']) + (self.df00d0['B0'])).apply(int, base=16) * 0.1)
for item in SA_:
# NOTE(review): an `else:` between the two appends looks to be missing;
# without it a value above 6000 would be appended twice. Confirm.
if item > 6000:
self.SA = np.append(self.SA, round((item - 6553.6), 1))
self.SA = np.append(self.SA, round(item))
y_value = self.SA
Here is the QDialog Box class code:
class Dialog00d0(QDialog):
    """Dialog that lets the user pick which "ID 00d0" data set to graph.

    Args:
        on_create_graph: optional callable invoked when the "Create Graph"
            button is clicked, e.g. the main window's ``createGraph`` bound
            method. When omitted the button is left unconnected.
    """

    def __init__(self, on_create_graph=None):
        # QDialog.__init__ must run before any Qt method such as
        # setWindowTitle() is called on the instance.
        super(Dialog00d0, self).__init__()
        # NOTE(review): this attribute shadows QWidget.layout(); kept because
        # other code may read self.layout.
        self.layout = QVBoxLayout()
        hlay = QHBoxLayout()
        self.setWindowTitle("Create Graph")
        label = QLabel("Data Type")
        self.combobox00d0 = QComboBox()
        for data_type in (
            "Steering Angle",
            "Latitudinal Acceleration",
            "Longitudinal Acceleration",
            "Combined Acceleration",
        ):
            self.combobox00d0.addItem(data_type)
        self.BTNCreateGraph = QPushButton("Create Graph")
        self.BTNCancel = QPushButton("Cancel")
        # connect() needs a callable or a signal; the placeholder string that
        # stood here raises TypeError. The owner of the dialog passes the slot in.
        if on_create_graph is not None:
            self.BTNCreateGraph.clicked.connect(on_create_graph)
        # NOTE(review): the excerpt does not show the widgets being added to
        # the layouts; presumably that code was left out of the snippet.
I imagine this will help you.
The pyqtSignal() argument tells you what information you want to carry.
In this case, I'm passing a text.
Good luck, I hope I helped.
import sys
from PyQt5.QtWidgets import QMainWindow, QDialog, QApplication
from PyQt5.QtWidgets import QPushButton, QVBoxLayout
from PyQt5 import QtCore, QtGui
class MainWidget(QMainWindow):
    """Main window with one button that opens the dialog."""

    def __init__(self, parent=None):
        super(MainWidget, self).__init__(parent)
        button = QPushButton("Button to open dialog")
        button.clicked.connect(self.button_clicked)
        self.setCentralWidget(button)

    def button_clicked(self):
        """Open the dialog and route its signal to createGraph()."""
        dlg = Dialog00d0()
        # The dialog knows nothing about this window; it only emits signEmit.
        dlg.signEmit.connect(self.createGraph)
        dlg.exec_()

    def createGraph(self, _str):
        """Slot for Dialog00d0.signEmit; ``_str`` is the text the dialog sent."""
        print('Now Im here')
        print(_str)


class Dialog00d0(QDialog):
    """Dialog that emits ``signEmit`` with a text when its button is clicked."""

    # The argument of pyqtSignal() declares the payload type carried by the signal.
    signEmit = QtCore.pyqtSignal(str)

    def __init__(self):
        # QDialog.__init__ must run before Qt methods are used on the instance.
        super(Dialog00d0, self).__init__()
        self.layout = QVBoxLayout()
        self.BTNCreateGraph = QPushButton("link to createGraph()")
        self.layout.addWidget(self.BTNCreateGraph)
        self.setLayout(self.layout)
        self.BTNCreateGraph.clicked.connect(self.BTNCreateGraph_clicked)

    def BTNCreateGraph_clicked(self):
        """Forward the button click to whoever connected to signEmit."""
        self.signEmit.emit('But I passed by')


app = QApplication(sys.argv)
if __name__ == "__main__":
    win = MainWidget()
    win.show()
    sys.exit(app.exec_())
Using Spark Scala in EMR to get S3 Object size (folder, files)

I am trying to get the folder size for some S3 folders with scala from my command line EMR.
I have JSON data stored as GZ files in S3. I find I can count the number of JSON records within my files: spark.read.json("s3://mybucket/subfolder/subsubfolder/").count
But now I need to know how much GB that data accounts for.
I am finding options to get the size for distinct files, but not for a whole folder all up.
I am finding options to get the size for distinct files, but not for a
whole folder all up.
Solution :
Get the s3 access by FileSystem
// Resolve the FileSystem from the URI built from ipPath together with Spark's Hadoop configuration.
// NOTE(review): ipPath is not defined in this snippet; presumably it holds the s3:// path to inspect.
val fs = FileSystem.get(new URI(ipPath), spark.sparkContext.hadoopConfiguration)
Note :
1) new URI is important; otherwise it will connect to
the Hadoop file system path instead of the S3 file system (object store :-)) path. By using new URI you are giving the scheme s3://
2) org.apache.commons.io.FileUtils.byteCountToDisplaySize will
give display sizes of the file system in GB, MB, etc...
/**
  * Recursively print file sizes.
  *
  * Walks filePath depth-first, printing the length of every entry and, for
  * each non-empty file, its name, size in bytes and human-readable size.
  *
  * @param filePath directory to list
  * @param fs       file system used to list filePath
  * @return the paths of all non-empty files found under filePath
  */
def getDisplaysizesOfS3Files(filePath: org.apache.hadoop.fs.Path, fs: org.apache.hadoop.fs.FileSystem): scala.collection.mutable.ListBuffer[String] = {
  val fileList = new scala.collection.mutable.ListBuffer[String]
  val fileStatus = fs.listStatus(filePath)
  for (fileStat <- fileStatus) {
    println(s"file path Name : ${fileStat.getPath.toString} length is ${fileStat.getLen}")
    if (fileStat.isDirectory) fileList ++= (getDisplaysizesOfS3Files(fileStat.getPath, fs))
    else if (fileStat.getLen > 0 && !fileStat.getPath.toString.isEmpty) {
      println("fileStat.getPath.toString" + fileStat.getPath.toString)
      fileList += fileStat.getPath.toString
      val size = fileStat.getLen
      // NOTE(review): the right-hand side was lost in the source; the
      // surrounding text describes a helper that renders a byte count as
      // GB/MB, which matches commons-io's byteCountToDisplaySize.
      val display = org.apache.commons.io.FileUtils.byteCountToDisplaySize(size)
      println("Name = " + fileStat.getPath().getName());
      println("Size = " + size);
      println("Display = " + display);
    } else if (fileStat.getLen == 0) {
      println(" length zero files \n " + fileStat)
    }
  }
  fileList
}
based on your requirement, you can modify the code... you can sum up all the distinct files.
Option 2 : Simple and crispy using getContentSummary
// Implicit session; it satisfies the implicit SparkSession parameter of getDisplaysizesOfS3Files(path).
implicit val spark = SparkSession.builder().appName("ObjectSummary").getOrCreate()
/**
  * getDisplaysizesOfS3Files
  *
  * Prints the path, its total length in bytes as reported by
  * getContentSummary, and the same length in human-readable form.
  *
  * @param path  location whose content size is reported
  * @param spark [[org.apache.spark.sql.SparkSession]] supplying the Hadoop configuration
  */
def getDisplaysizesOfS3Files(path: String)( implicit spark: org.apache.spark.sql.SparkSession): Unit = {
  val filePath = new org.apache.hadoop.fs.Path(path)
  // The file system is derived from the path itself, so its scheme decides the implementation.
  val fileSystem = filePath.getFileSystem(spark.sparkContext.hadoopConfiguration)
  val size = fileSystem.getContentSummary(filePath).getLength
  // NOTE(review): the right-hand side was lost in the source; reconstructed
  // as commons-io's byte-count formatter. Confirm.
  val display = org.apache.commons.io.FileUtils.byteCountToDisplaySize(size)
  println("path = " + path);
  println("Size = " + size);
  println("Display = " + display);
}
Note : Any option shown above will work for
local or
hdfs or
s3 as well

Implementing K-medoids in Pyspark

I can not find a library to use PAM (K-medoids) in Pyspark.
I have found this in Scala :
And this issue in Spark which was resolved in 2016 :
But it seems not to be working and this is not included in the mllib documentation :
Does anyone know of any library for PAM in PySpark?
Thank you
I actually had a go at this the other day for fun. Can't say much about performance as I'm quite new to spark. But here is KMedoids with K++ seeding:
# (c) 2020 Jonathan Kelsey
# This code is licensed under MIT license
from pyspark.sql import functions as F
import pyspark
import numpy as np
import sys
def seed_kernel(data_broadcast, data_id_value, centeroids, k, metric):
    """Return the distance from one point to its nearest already-chosen seed.

    Args:
        data_broadcast: object whose ``.value`` is the indexable data set.
        data_id_value: ``(id, vector)`` record; only the vector is used.
        centeroids: indices into the data set of the seeds chosen so far.
        k: unused; kept for interface compatibility.
        metric: callable ``metric(point, other_point)`` returning a distance.

    Returns:
        The smallest distance to any seed, or ``sys.maxsize`` when
        ``centeroids`` is empty.
    """
    data = data_broadcast.value
    point = data_id_value[1]
    min_distance = sys.maxsize
    for centeroid_index in centeroids:
        min_distance = min(min_distance, metric(point, data[centeroid_index]))
    return min_distance
def seed_clusters(data_broadcast, data_frame, k, metric):
    """Pick ``k`` initial medoids: one at random, then farthest-point seeding.

    Args:
        data_broadcast: broadcast whose ``.value`` is the data array.
        data_frame: DataFrame of ``(id, vector)`` rows; its RDD is mapped
            through ``seed_kernel``.
        k: number of seeds to return.
        metric: point-to-point distance callable.

    Returns:
        list of ``k`` indices into the data array.
    """
    data = data_broadcast.value
    centeroids = list(np.random.choice(data.shape[0], 1, replace=False))
    for i in range(k - 1):
        print("clusterSeed", i)
        # Distance of every point to its closest seed so far.
        mK = data_frame.rdd.map(lambda data_id_value: seed_kernel(data_broadcast, data_id_value, centeroids, k, metric))
        distances = np.array(mK.collect())
        # The point farthest from all current seeds becomes the next seed.
        # NOTE(review): argmax is a position in the collected list; this
        # assumes rows come back in id order. Confirm.
        next_centeroid = np.argmax(distances)
        centeroids.append(int(next_centeroid))
        print("centeroids", centeroids)
    return centeroids
def nearest_centeroid_kernel(data_id_value, centeroid_id_values, metric):
    """Return the position of the centroid closest to one data point.

    Args:
        data_id_value: ``(id, vector)`` record; only the vector is used.
        centeroid_id_values: sequence of ``(cluster index, vector)`` pairs.
        metric: callable ``metric(point, centroid)`` returning a distance.

    Returns:
        int: position within ``centeroid_id_values`` of the nearest centroid
        (the first one on ties).
    """
    _, data_value = data_id_value
    data_np = np.asarray(data_value)
    # Collect one distance per centroid; argmin over them picks the nearest.
    distances = np.asarray([
        metric(data_np, np.asarray(centeroid_value))
        for _, centeroid_value in centeroid_id_values
    ])
    return int(np.argmin(distances))
def optimise_cluster_membership_spark(data, data_frame, n, metric, intital_cluster_indices=None):
    """Assign every non-medoid point to its nearest medoid.

    Args:
        data: array of the data vectors, indexed by point id.
        data_frame: DataFrame with columns ``id`` and ``vector``.
        n: number of medoids to draw at random when none are supplied.
        metric: point-to-point distance callable.
        intital_cluster_indices: optional sequence of medoid ids; when None,
            ``n`` distinct ids are drawn at random.

    Returns:
        tuple: ``(index, clusters)`` where ``index`` holds the medoid ids and
        ``clusters`` is a DataFrame with columns ``bestC`` (medoid position)
        and ``cluster`` (list of member ids, medoids themselves excluded).
    """
    data_length = data.shape[0]
    if intital_cluster_indices is None:
        index = np.random.choice(data_length, n, replace=False)
    else:
        index = intital_cluster_indices
    # A set gives constant-time membership tests inside the filter.
    medoid_ids = {int(i) for i in index}
    centeroid_id_values = [(position, data[medoid]) for position, medoid in enumerate(index)]
    # Medoids are not members of any cluster; drop them before assignment.
    data_rdd = data_frame.rdd.filter(lambda data_id_value: int(data_id_value["id"]) not in medoid_ids)
    associated_cluster_points = data_rdd.map(lambda data_id_value: (data_id_value[0],nearest_centeroid_kernel(data_id_value, centeroid_id_values, metric)))
    clusters = associated_cluster_points.toDF(["id", "bestC"]).groupBy("bestC").agg(F.collect_list("id").alias("cluster"))
    return index, clusters
def cost_kernel(data_broadcast, test_centeroid, cluster_data, metric):
    """Return the total distance from a candidate medoid to a cluster's members.

    Args:
        data_broadcast: object whose ``.value`` is the 2-D data array.
        test_centeroid: row index of the candidate medoid.
        cluster_data: sequence of row indices of the cluster members.
        metric: vectorised callable taking two equally shaped 2-D arrays and
            returning one distance per row.

    Returns:
        float: sum of the row-wise distances (0.0 for an empty cluster).
    """
    data = data_broadcast.value
    # astype keeps an empty cluster usable as an index array.
    cluster = np.asarray(cluster_data).astype(np.intp)
    cluster_length = cluster.shape[0]
    # One row per member, paired with a same-shaped stack of the candidate
    # so the vector metric can compare them row by row.
    new_cluster_column = data[cluster]
    test_centeroid_column = np.tile(data[int(test_centeroid)], (cluster_length, 1))
    pairwise_distance = metric(new_cluster_column, test_centeroid_column)
    return float(np.sum(pairwise_distance))
def optimise_centroid_selection_spark(data_broadcast, data_frame, centeroids, clusters_frames, metric):
    """Pick, for every cluster, the member with the lowest total distance.

    Args:
        data_broadcast: broadcast whose ``.value`` is the data array.
        data_frame: unused; kept for interface compatibility.
        centeroids: current medoid ids, one per cluster position.
        clusters_frames: DataFrame with columns ``bestC`` and ``cluster``.
        metric: vectorised distance callable passed on to ``cost_kernel``.

    Returns:
        tuple: ``(new_centeroid_ids, total_cost)``. A cluster with no members
        keeps its old medoid. ``total_cost`` sums the candidate costs of all
        clusters.
    """
    new_centeroid_ids = []
    total_cost = 0
    for cluster_idx, old_centeroid in enumerate(centeroids):
        # One row per member id of this cluster (explode names the column "col").
        cluster_frame = clusters_frames.filter(clusters_frames.bestC == cluster_idx).select(F.explode(clusters_frames.cluster))
        cluster_data = [row.col for row in cluster_frame.collect()]
        # Cost of promoting each member to medoid of its own cluster.
        cost_data = cluster_frame.rdd.map(lambda point_id: (point_id[0], cost_kernel(data_broadcast, point_id[0], cluster_data, metric)))
        cost = cost_data.map(lambda point_id_cost: point_id_cost[1]).sum()
        total_cost = total_cost + cost
        point_result = cost_data.sortBy(lambda point_id_cost: point_id_cost[1]).take(1)
        if point_result:
            best_point = point_result[0][0]
        else:
            best_point = old_centeroid
        new_centeroid_ids.append(best_point)
    return (new_centeroid_ids, total_cost)
def validate_metric(metric):
    """Check that ``metric`` is a known name or a well-formed metric dict.

    Args:
        metric: ``"euclidean"``, ``"hamming"``, or a dict with callable
            entries ``"point"`` and ``"vector"``, each taking two arguments.

    Returns:
        ``True`` when the metric is valid, otherwise a string describing
        the problem.
    """
    if metric == "euclidean" or metric == "hamming":
        return True
    if not isinstance(metric, dict):
        return "Metric is not a dictionary. And not a known string 'euclidean' or 'hamming'"
    if "point" not in metric or "vector" not in metric:
        return "Metric does not contain a member function for 'point' and/or 'vector'."
    if not callable(metric["point"]) or not callable(metric["vector"]):
        return "Metric.point and/or Metric.vector are not callable functions."
    # Either function having the wrong arity is an error. Callables without
    # a __code__ object (e.g. builtins) cannot be inspected and are accepted.
    for name in ("point", "vector"):
        code = getattr(metric[name], "__code__", None)
        if code is not None and code.co_argcount != 2:
            return "Metric.point and/or Metric.vector do not both have 2 arguments."
    return True
# pre-defined metrics
#vector metrics
def hamming_vector(stack1, stack2):
    """Return, per row, the number of positions where two 2-D stacks differ."""
    # asarray lets plain nested lists be compared element-wise.
    return (np.asarray(stack1) != np.asarray(stack2)).sum(axis=1)
def euclidean_vector(stack1, stack2):
    """Return, per row, the Euclidean distance between two 2-D stacks."""
    difference = np.asarray(stack2) - np.asarray(stack1)
    return np.sqrt((difference ** 2).sum(axis=1))
# point metrics
def hamming_point(p1, p2):
    """Return the number of positions where two vectors differ."""
    # Without asarray, two plain lists compare as a single bool rather
    # than element-wise.
    return np.sum(np.asarray(p1) != np.asarray(p2))
def euclidean_point(p1, p2):
    """Return the Euclidean distance between two vectors."""
    difference = np.asarray(p1) - np.asarray(p2)
    return np.sqrt(np.sum(difference ** 2))
def fit(sc, data, n_clusters = 2, metric = "euclidean", seeding = "heuristic"):
    """Cluster ``data`` with K-medoids on Spark.

    Alternates between choosing the best medoid of every cluster and
    re-assigning points to their nearest medoid, and stops at the first
    iteration whose total cost does not improve.

    Args:
        sc: SparkContext used to broadcast and parallelize the data.
        data: sequence of equally long numeric vectors.
        n_clusters: number of medoids.
        metric: ``"euclidean"``, ``"hamming"`` or a dict with callables
            ``"point"`` and ``"vector"`` (see ``validate_metric``).
        seeding: ``"heuristic"`` seeds with ``seed_clusters``; any other
            value starts from random medoids.

    Returns:
        tuple: ``(medoid ids, clusters)`` where ``clusters`` is a list of
        member-id lists ordered by cluster position; medoids are not listed
        as members.

    Raises:
        ValueError: when ``metric`` fails validation.
    """
    metric_valid = validate_metric(metric)
    if metric_valid is not True:
        # Fail here with the validation message instead of a NameError
        # further down when the metric functions are first used.
        raise ValueError(metric_valid)
    if metric == "euclidean":
        point_metric = euclidean_point
        vector_metric = euclidean_vector
    elif metric == "hamming":
        point_metric = hamming_point
        vector_metric = hamming_vector
    else:
        point_metric = metric["point"]
        vector_metric = metric["vector"]

    data_np = np.asarray(data)
    data_broadcast = sc.broadcast(data_np)
    seeds = None
    # Rows are (id, vector), the id being the position of the vector in data.
    data_frame = sc.parallelize(data).zipWithIndex().map(lambda xy: (xy[1],xy[0])).toDF(["id", "vector"]).cache()
    if seeding == "heuristic":
        seeds = list(seed_clusters(data_broadcast, data_frame, n_clusters, point_metric))
    last_centeroids, last_clusters = optimise_cluster_membership_spark(data_np, data_frame, n_clusters, point_metric, seeds)
    last_cost = float('inf')
    iteration = 0
    while True:
        iteration = iteration + 1
        current_centeroids, current_cost = optimise_centroid_selection_spark(data_broadcast, data_frame, last_centeroids, last_clusters, vector_metric)
        current_centeroids, current_clusters = optimise_cluster_membership_spark(data_np, data_frame, n_clusters, point_metric, current_centeroids)
        print((current_cost<last_cost, current_cost, last_cost, current_cost - last_cost))
        if not current_cost < last_cost:
            # Keep the previous (better) solution and stop.
            print(("iteration",iteration,"cost got worse or did not improve", current_cost, last_cost))
            break
        print(("iteration",iteration,"cost improving...", current_cost, last_cost, current_centeroids))
        last_cost = current_cost
        last_centeroids = current_centeroids
        last_clusters = current_clusters
    bc = last_clusters.sort("bestC", ascending=True).collect()
    unpacked_clusters = [row.cluster for row in bc]
    return (last_centeroids, unpacked_clusters)
I used some sample data from pyclustering as a sanity check:
from pyclustering.cluster import cluster_visualizer
from pyclustering.utils import read_sample
from pyclustering.samples.definitions import FCPS_SAMPLES
from pyclustering.samples.definitions import SIMPLE_SAMPLES
# Sanity check: cluster the pyclustering "golf ball" sample into 9 clusters.
# NOTE(review): `sc` is not defined in this snippet; fit() calls sc.broadcast
# and sc.parallelize on it, so a SparkContext is presumably already in scope.
sample = read_sample(FCPS_SAMPLES.SAMPLE_GOLF_BALL)
bestCentroids, bestClusters = fit(sc, sample, 9)
# Hand the returned clusters and the sample to the visualizer.
# NOTE(review): no call that displays the figure is visible in this snippet.
visualizer = cluster_visualizer()
visualizer.append_clusters(bestClusters, sample)
Your best choice is to adapt this Python implementation into Scala so you can take advantage of RDD partitions and distributed computation.

Combination of Map Container and Structure in matlab

I would like to visualize what I will get after combining a Map container and a struct in MATLAB. For instance, let us consider the following Map container:
% Map from ticket number (key) to passenger name (value); the i-th key is
% paired with the i-th value, e.g. 'A479GY' -> 'Sarah Latham'.
ticketMap = containers.Map(...
{'2R175', 'B7398', 'A479GY', 'NZ1452'}, ...
{'James Enright', 'Carl Haynes', 'Sarah Latham', ...
'Bradley Reid'});
key/value structure of this map is clear for me, now let us suppose we have following structure
% Five reservation structs s1..s5, each with the same four char fields:
% ticketNum, destination, reserved and origin.
s1.ticketNum = '2S185'; s1.destination = 'Barbados';
s1.reserved = '06-May-2008'; s1.origin = 'La Guardia';
s2.ticketNum = '947F4'; s2.destination = 'St. John';
s2.reserved = '14-Apr-2008'; s2.origin = 'Oakland';
s3.ticketNum = 'A479GY'; s3.destination = 'St. Lucia';
s3.reserved = '28-Mar-2008'; s3.origin = 'JFK';
s4.ticketNum = 'B7398'; s4.destination = 'Granada';
s4.reserved = '30-Apr-2008'; s4.origin = 'JFK';
s5.ticketNum = 'NZ1452'; s5.destination = 'Aruba';
s5.reserved = '01-May-2008'; s5.origin = 'Denver';
we have 5 structure with different fields, now following commands
% Map from seat number (key) to reservation struct (value):
% '23F' -> s5, '15C' -> s1, '15B' -> s3, '09C' -> s4, '12D' -> s2.
seatingMap = containers.Map( ...
{'23F', '15C', '15B', '09C', '12D'}, ...
{s5, s1, s3, s4, s2});
make sense to me because, for instance, using the key 23F I can access the fields of the s5 structure:
>> seatingMap('23F').origin
ans =
Denver
All of those parts are clear to me. Now, using ticketMap and seatingMap together, you can find the name of the person who has reserved seat 15B:
% Two-step lookup: seat -> ticket number (s3.ticketNum, i.e. 'A479GY'), then
% ticket number -> passenger name. The second line has no semicolon, so
% MATLAB echoes the result ('Sarah Latham').
ticket = seatingMap('15B').ticketNum;
passenger = ticketMap(ticket)
But is that the optimal way? Thanks in advance.

Attempting to print from method within class

I'm still relatively new to Python and some concepts so you'll have to bear with me.
I am trying to create an animal (in this case a horse), and generate some attributes to it automatically. Here I am attempting to generate and apply the height function to the class using my get_heights method. I am not getting any errors, however it is not printing out the generated numbers that it does when I define it as an independent function. (Outside of a class).
I added parentheses to Horse_heights.get_heights call, but then I get a type error saying get_heights() takes exactly 3 arguments <1 given>. All help is appreciated, apologies if I'm missing some fundamental aspect here.
import random
class Horse(object):
    """A horse with a name and generated or supplied height attributes."""

    def __init__(self, horse_name):
        self.horse_name = horse_name

    def get_heights(self, starting_height=None, max_height=None):
        """Set and print the horse's starting and maximum height.

        Args:
            starting_height: value to store in ``self.sh3``. When None, a
                random height from 14 to 15 is generated and stored instead.
            max_height: value to store in ``self.mh3``. When None, the last
                generated starting height plus a random 1 to 2 is stored.

        The starting height is printed 15 times and the maximum height
        3 times, once per loop iteration.
        """
        for _ in range(15):
            generated_start = random.randint(14, 15)
            # Fall back to the generated value only when none is supplied.
            self.sh3 = generated_start if starting_height is None else starting_height
            # Single-argument print() works on both Python 2 and Python 3.
            print(self.sh3)
        for _ in range(3):
            generated_max = generated_start + random.randint(1, 2)
            self.mh3 = generated_max if max_height is None else max_height
            print(self.mh3)


Horse_Heights = Horse("Secretariat")
As the error says, the function get_heights takes 3 arguments:
def get_heights(self, starting_height, max_height)
So when you call this function, pass the arguments:
import random
class Horse(object):
    """A horse with a name and generated or supplied height attributes."""

    def __init__(self, horse_name):
        self.horse_name = horse_name

    def get_heights(self, starting_height=None, max_height=None):
        """Set and print the horse's starting and maximum height.

        Args:
            starting_height: value to store in ``self.sh3``. When None, a
                random height from 14 to 15 is generated and stored instead.
            max_height: value to store in ``self.mh3``. When None, the last
                generated starting height plus a random 1 to 2 is stored.

        The starting height is printed 15 times and the maximum height
        3 times, once per loop iteration.
        """
        for _ in range(15):
            generated_start = random.randint(14, 15)
            # Fall back to the generated value only when none is supplied.
            self.sh3 = generated_start if starting_height is None else starting_height
            # Single-argument print() works on both Python 2 and Python 3.
            print(self.sh3)
        for _ in range(3):
            generated_max = generated_start + random.randint(1, 2)
            self.mh3 = generated_max if max_height is None else max_height
            print(self.mh3)


Horse_Heights = Horse("Secretariat")