Sorry in advance, as I am a beginner with Vivado HLS.
I want to synthesize the code below, but Vivado tells me that the mutexes (and everything that depends on them) cannot be used, and gives me the following errors.
ERROR: [SYNCHK 200-11] Global Variable 'readyQMutex' has an unsynthesizable struct type '%union.pthread_mutex_t.2.12.22 = type { %struct.__pthread_mu...' (a member pointer to struct itself).
ERROR: [SYNCHK 200-71] ../fpga_top.c:221: function 'pthread_mutex_lock' has no function body.
ERROR: [SYNCHK 200-71] ../fpga_top.c:225: function 'pthread_cond_wait' has no function body.
ERROR: [SYNCHK 200-71] ../fpga_top.c:237: function 'pthread_cond_signal' has no function body.
ERROR: [SYNCHK 200-71] ../fpga_top.c:238: function 'pthread_mutex_unlock' has no function body.
ERROR: [SYNCHK 200-11] ../fpga_top.c:18: Constant 'workerInfos' has an unsynthesizable type '[4 x %struct.threadInfo.6.16.26]*' (possible cause(s): structure variable cannot be decomposed due to (1) unsupported type conversion; (2) memory copy operation; (3) function pointer used in struct; (4) unsupported pointer comparison).
ERROR: [SYNCHK 200-61] ../fpga_top.c:75: unsupported memory access on variable 'child_task_ID' which is (or contains) an array with unknown size at compile time.
ERROR: [SYNCHK 200-71] ../fpga_top.c:77: function 'pthread_mutex_init' has no function body.
INFO: [SYNCHK 200-10] 8 error(s), 0 warning(s).
I gather that I am supposed to write the code that handles this myself; if so, how should I do it, and what exactly do I need to write?
#include <stdbool.h>
#include "fpga_top.h"
int outputIndex = 0;
double core_speed[CORE_MAX] = {1.0, 1.0, 1.0, 1.0};
double outputTable[WORKLOAD_MAX*TASK_COUNT_MAX][EXCEL_Column_Size];
int readyQueueHead = 0;
int readyQueueRear = 0;
int readyQueueSize = 0;
char canContinue_ = 1;
int wlCounter = 0;
bool flag = 1;
// Add Task to assignedQueue
void addToAssignedQueue(int task_ID, int workload_ID, int q)
{
pthread_mutex_lock(&(workerInfos[q].workerMutex));
while( workerInfos[q].assignedQSize>=DEEP)
{
pthread_cond_wait(&(workerInfos[q].workerWaitHandle_Add), &(workerInfos[q].workerMutex));
}
int i = workerInfos[q].assignedQRear;
workerInfos[q].assignedQueue[i].task_ID = task_ID;
workerInfos[q].assignedQueue[i].workload_ID = workload_ID;
workerInfos[q].assignedQRear = (workerInfos[q].assignedQRear + 1) % DEEP;
workerInfos[q].assignedQSize++;
// A signal to a worker waiting to read from this queue
pthread_cond_signal(&(workerInfos[q].workerWaitHandle));
pthread_mutex_unlock(&(workerInfos[q].workerMutex));
}
// Read from assignedQueue
struct workItem readFromAssignedQueue(int q)
{
struct threadInfo *workerInfo_ = &workerInfos[q];
pthread_mutex_lock(&(workerInfo_->workerMutex));
struct workItem tas_;
// Initialize the output values (which may not be necessary now)
tas_.task_ID = -1;
tas_.workload_ID = -1;
if(workerInfo_->assignedQSize <= 0)
{
struct timespec time_to_wait = {10, 0}; //10 sec wait
pthread_cond_timedwait(&(workerInfo_->workerWaitHandle), &(workerInfo_->workerMutex), &time_to_wait);
}
if(workerInfo_->assignedQSize >0)
{
// Reading the assignedQueue if data is available
tas_ = workerInfo_->assignedQueue[workerInfo_->assignedQHead];
// Move forward the queue head index rotationally
workerInfos[q].assignedQHead = (workerInfos[q].assignedQHead + 1) % DEEP;
// Decreasing the count number of queue elements
workerInfos[q].assignedQSize--;
pthread_cond_signal(&(workerInfos[q].workerWaitHandle_Add));
}
pthread_mutex_unlock(&(workerInfo_->workerMutex));
return tas_;
}
// Add Definition of Task to DAG
void addTask(int task_ID, int parentCount, int child_task_ID[], int childCount, int processingTime)
{
struct Task_Package_Profile *p_task_ = &(taskArray[task_ID]);
p_task_->parentCount = parentCount;
p_task_->childCount = childCount;
p_task_->processingTime = processingTime;
// Initialize the parentReady variable for all workloads
for (int i = 0; i < WORKLOAD_MAX;i++) {p_task_->parentReady[i] = 0;}
// Copy the child's index
for (int i = 0; i < childCount; i++) {p_task_->child_task_ID[i] = child_task_ID[i];}
// Make parentReady mutex
pthread_mutex_init(&(p_task_->parentReadyMutex), NULL);
}
// DAG Definition
void initDag()
{
int ch0[] = { 1, 2, 3, 4}; addTask( 0, 0, ch0, 4, 10000);
int ch1[] = { 5, 6, 7, 8}; addTask( 1, 1, ch1, 4, 20000);
int ch2[] = { 5, 6, 7, 8}; addTask( 2, 1, ch2, 4, 20000);
int ch3[] = { 5, 6, 7, 8}; addTask( 3, 1, ch3, 4, 20000);
int ch4[] = { 5, 6, 7, 8}; addTask( 4, 1, ch4, 4, 20000);
int ch5[] = { 9, 10}; addTask( 5, 4, ch5, 2, 30000);
int ch6[] = { 9, 10}; addTask( 6, 4, ch6, 2, 30000);
int ch7[] = { 9, 10}; addTask( 7, 4, ch7, 2, 30000);
int ch8[] = { 9, 10}; addTask( 8, 4, ch8, 2, 30000);
int ch9[] = { 11, 12}; addTask( 9, 4, ch9, 2, 40000);
int ch10[] = { 11, 12}; addTask( 10, 4, ch10, 2, 40000);
int ch11[] = {}; addTask( 11, 2, ch11, 0, 50000);
int ch12[] = {}; addTask( 12, 2, ch12, 0, 50000);
addToReadyQueue(0, 0); // Root task, addToReadyQueue(int task_ID, int workload_ID)
readFromReadyQueue();
//allocateTask(0, 0, 0); // allocateTask(int task_ID, int workload_ID, int core_ID)
}
// Add Task to the end of the readyQueue
void addToReadyQueue(int task_ID, int workload_ID)
{
pthread_mutex_lock(&readyQMutex);
while(readyQueueSize >= READY_LOOP_DEEP)
{
// Wait for space if the queue is full
int res = pthread_cond_wait( &readyQWaitHandleAdd, &readyQMutex);
}
#ifdef PRINT_ReadyQ
printf("Task #%d (workload #%d) added to readyQueue %d:%d.\n", task_ID, workload_ID,readyQueueRear, readyQueueSize);
#endif
readyQueue[readyQueueRear].task_ID = task_ID;
readyQueue[readyQueueRear].workload_ID = workload_ID;
// Move forward the queue rear index in rotation
readyQueueRear = (readyQueueRear + 1) % READY_LOOP_DEEP;
// Increasing the number of the queue elements
readyQueueSize++;
// The signal is given to workers waiting to read from the queue
pthread_cond_signal(&readyQWaitHandleRead);
pthread_mutex_unlock(&readyQMutex);
}
// Read from the beginning of the readyQueue
struct workItem readFromReadyQueue()
{
struct workItem witem;
witem.task_ID = -1;
witem.workload_ID = -1;
pthread_mutex_lock(&readyQMutex);
// Wait if the queue is empty
while(readyQueueSize <= 0)
{
pthread_cond_wait( &readyQWaitHandleRead, &readyQMutex);
}
// Picking up from queue head
witem = readyQueue[readyQueueHead];
// Move forward the queue head index in rotation
readyQueueHead = (readyQueueHead + 1) % READY_LOOP_DEEP;
// Reduce the number of queue elements
readyQueueSize--;
#ifdef PRINT_ReadyQ
printf("Task #%d (workload #%d) removed from readyQueue. %d : %d\n", witem.task_ID , witem.workload_ID, readyQueueHead, readyQueueSize);
#endif
// Signal workers that are waiting for space to add to the queue
pthread_cond_signal(&readyQWaitHandleAdd);
pthread_mutex_unlock(&readyQMutex);
return witem;
}
// Check if the readyQueue is empty with the corresponding mutex
int isReadyQueueEmpty()
{
int res = 0;
pthread_mutex_lock(&readyQMutex);
res = (readyQueueSize == 0);
pthread_mutex_unlock(&readyQMutex);
return res;
}
// Assigning Task to the Worker (Cores)
struct outputsFromFPGA allocateTask(int task_ID, int workload_ID, int core_ID)
{
if (flag == 1)
{
initDag();
flag = 0;
}
#ifdef PRINT_AllocateTask
printf("Task #%d (workload #%d) assigned to Core #%d;\n", task_ID, workload_ID, core_ID);
#endif
addToAssignedQueue( task_ID, workload_ID, core_ID);
struct outputsFromFPGA FPGAOutputs;
FPGAOutputs.task_ID = task_ID;
FPGAOutputs.workload_ID = workload_ID;
FPGAOutputs.core_ID = core_ID;
return FPGAOutputs;
}
// Finish a task and inform its children
void taskDone(int task_ID, int workload_ID, int core_ID)
{
struct Task_Package_Profile task_ = taskArray[task_ID];
#ifdef PRINT_TaskDone
printf("taskDone: Task #%d (workload #%d);\n", task_ID, workload_ID);
#endif
// Increase the child's parentReady variable and send the children to the ready queue if all parents are finished
struct Task_Package_Profile *p_task_ = &(taskArray[task_ID]);
for(int i = 0; i < p_task_->childCount; i++)
{
struct Task_Package_Profile *p_childTsk = &(taskArray[p_task_->child_task_ID[i]]);
int nbParentReady = 0;
// Increase the parentReady variable
pthread_mutex_lock(&(p_childTsk->parentReadyMutex));
nbParentReady = ++(p_childTsk->parentReady[workload_ID]);
pthread_mutex_unlock(&(p_childTsk->parentReadyMutex));
// Send the child to the ready queue if all parents are finished
if (nbParentReady == p_childTsk->parentCount)
addToReadyQueue(p_task_->child_task_ID[i], workload_ID);
}
pthread_mutex_lock(&assignQSizeCheckMutex);
// Find the most empty assignedQueue and assign ready tasks as much as possible
while(!isReadyQueueEmpty())
{ // Finds the best assignedQueue
int minQueue = 0;
int minSize = workerInfos[0].assignedQSize;
for (int i = 1; i < CORE_MAX; i++)
{
if(workerInfos[i].assignedQSize < minSize)
{
minSize = workerInfos[i].assignedQSize;
minQueue = i;
}
}
// The most empty queue should be smaller than Deep so that it can be added to the queue
if(minSize < DEEP)
{
struct workItem witem = readFromReadyQueue();
struct outputsFromFPGA FPGAOutputs = allocateTask(witem.task_ID, witem.workload_ID, minQueue);
}
else
{
break; // All assignedQueue are full
}
}
pthread_mutex_unlock(&assignQSizeCheckMutex);
}
// Check for the end of the program, i.e. that all workloads are done
void finishCheck()
{
if (wlCounter != WORKLOAD_MAX) return;
for(int i = 0; i < CORE_MAX; i++)
{
if (workerInfos[i].assignedQSize > 0) return;
if (workerInfos[i].coreState > 0) return;
}
if (!isReadyQueueEmpty()) return;
canContinue_ = 0;
for(int i = 0; i < CORE_MAX; i++)
pthread_cond_signal(&(workerInfos[i].workerWaitHandle));
}
Thread synchronization can be done in HLS as shown in this paper for example, but it is not supported in Vivado HLS yet.
That being said, it does not mean that it is impossible to implement your application on hardware. One approach is to implement every thread as a separate hardware kernel. Shared data can be put in another kernel, which ensures that accesses to the data are synchronized the way that you want. The kernels can communicate with the shared object via streaming interfaces. You can implement function parameters as streaming interfaces with hls::stream. After implementing each of the kernels as an IP module, you can connect them via FIFOs generated with FIFO generator in a Vivado block design.
You could, for example, make a control stream from each processing kernel to the shared object that allows the kernels to send requests to access the shared object. In the shared object, you use non-blocking reads on the streams to see whether any of them wants exclusive access. Then you take read or write requests only from the control stream of the kernel that was granted exclusive access. The data associated with the reads and writes can be communicated via dedicated data streams between the kernels and the shared object. When a kernel is done using the shared object, it sends a release command, and the shared object starts looking for requests on all control streams again. It takes a bit of labor, but it is a feasible solution...
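To make that concrete, below is a minimal, untested sketch (not part of the original answer) of what the shared-object kernel's arbitration loop could look like in Vivado HLS C++ with hls::stream; the command encoding, N_KERNELS, STORAGE_WORDS, and all names are assumptions made purely for illustration:
#include <hls_stream.h>
#define N_KERNELS 4       // assumed number of processing kernels
#define STORAGE_WORDS 64  // assumed size of the shared data
// Hypothetical command encoding for the control streams
enum { CMD_REQUEST = 0, CMD_READ = 1, CMD_WRITE = 2, CMD_RELEASE = 3 };
struct ctrl_word { int cmd; int addr; };
// Shared-object kernel: owns the data and serializes every access to it
void shared_object(hls::stream<ctrl_word> ctrl[N_KERNELS],
                   hls::stream<int> data_in[N_KERNELS],
                   hls::stream<int> data_out[N_KERNELS])
{
    static int storage[STORAGE_WORDS]; // the protected data
    int owner = -1;                    // kernel currently holding exclusive access
    while (true) {
        if (owner < 0) {
            // No owner: poll all control streams non-blockingly for a request
            for (int k = 0; k < N_KERNELS; k++) {
                ctrl_word c;
                if (ctrl[k].read_nb(c) && c.cmd == CMD_REQUEST) { owner = k; break; }
            }
        } else {
            // Serve only the owning kernel until it sends a release
            ctrl_word c = ctrl[owner].read(); // blocking read
            if (c.cmd == CMD_READ)
                data_out[owner].write(storage[c.addr]);
            else if (c.cmd == CMD_WRITE)
                storage[c.addr] = data_in[owner].read();
            else if (c.cmd == CMD_RELEASE)
                owner = -1;
        }
    }
}
A processing kernel would then send CMD_REQUEST on its control stream, perform its reads and writes through the data streams, and finish with CMD_RELEASE, which plays the role of the pthread_mutex_lock/unlock pairs in the original code.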
MICRO: PIC18LF47K42
compiler: XC8
application: MPLABX
I am trying to combine the values of a 12-bit ADC. They go into ADRESH and ADRESL. My ADC is set up for right-justification, which formats them like so:
ADRESH:(----MSB,x,x,x) ADRESL: (X,X,X,X,X,X,X,LSB)
From inspecting the value in my result register I can tell I don't have great resolution. I'm pretty sure it's because of how I'm combining ADRESH and ADRESL. How should I do this?
#include "myIncludes.h"
volatile unsigned char ZCDSoftwareFlag = 0;
volatile unsigned char switchValue = 0;
void main(void)
{
portInit();
triac = 0;
unsigned char result;
adcInit();
while(1)
{
__delay_us(4);
ADCON0bits.GO = 1; //Start conversion
while (ADCON0bits.GO); //Wait for conversion done
result = ADRESH;
result = result << 8;
result = result |ADRESL;
}
}
And here's the ADC init function:
void adcInit(void)
{
ADCON0bits.FM = 1; //right-justify
ADCON0bits.CS = 1; //ADCRC Clock
ADPCH = 0x00; //RA0 is Analog channel
ADCON0bits.ON = 1; //Turn ADC On
ADCON0bits.GO = 1; //Start conversion
}
You are trying to put a 12-bit result into an 8-bit variable. Switch it to 16 bits:
uint16_t result;
Then you can combine the values:
result = ADRESH;
result = result << 8;
result = result |ADRESL;
Edit
After quite a bit of playing around with my code I have written a few versions; one of them does what I was looking for.
I'd do the following:
Create a helper Set with all the numbers from 1 to 100 - O(n) in time and space
Create an (initially) empty Set to record the indexes with 0s - O(1)
Go over the array - O(n):
If the value is 0, add the index to the index set
If the value isn't 0, remove it from the helper Set
Go over the helper set, and assign the remaining values to the indexes saved to the index set - O(m), where m<=n
All in all - an O(n) solution in time and space.
In Java:
int[] numbers = /* the array with missing numbers */;
Set<Integer> allNumbers = IntStream.rangeClosed(1, 100).boxed().collect(Collectors.toSet());
Set<Integer> missingIndexes = new HashSet<>();
for (int i = 0; i < numbers.length; ++i) {
if (numbers[i] == 0) {
missingIndexes.add(i);
} else {
allNumbers.remove(numbers[i]);
}
}
Iterator<Integer> numberIter = allNumbers.iterator();
Iterator<Integer> indexIter = missingIndexes.iterator();
while (numberIter.hasNext() && indexIter.hasNext()) {
numbers[indexIter.next()] = numberIter.next();
}
If you can sacrifice a little more space to optimize for time, basically just create another missing array and fill it with what's missing by traversing your helper array. I modified your original solution:
int input [] = {5,6,0,3,0,2,1};
int output [] = new int[input.length];
boolean [] helper = new boolean[input.length];
for(int i = 0; i <input.length; i++)
{
if(input[i] != 0)
helper[input[i] - 1] = true; // mark the value input[i] as present
}
int missing [] = new int[input.length];
int missingCount = 0;
for(int j = 0; j < helper.length; j++)
{
if(!helper[j]){
missing[missingCount++] = j + 1; // j + 1 is a missing value
}
}
missingCount = 0;
for(int j = 0; j < input.length; j++){
if(input[j]==0){
input[j]=missing[missingCount++];
}
}
The code below finds the missing elements and adds them back in O(n) complexity:
notFound - stores the indexes of the input array that contain a zero
numberDetails - records for each number whether it is present in the input array (true or false)
Example: numberDetails[3] = false means 4 (3+1) is not present in the input array, numberDetails[4] = true means 5 (4+1) is present in the input array
int input[] = { 5, 6, 0, 3, 0, 2, 1 };
int notFound[] = new int[input.length];
boolean[] numberDetails = new boolean[input.length];
int notFoundIndex=0;
for(int i=0;i<input.length;i++) {
if(input[i]==0) {
notFound[notFoundIndex++]=i;
}
else {
numberDetails[input[i]-1]=true;
}
}
notFoundIndex=0;
for(int j=0;j<numberDetails.length;j++) {
if(!numberDetails[j]) {
input[notFound[notFoundIndex++]] = j+1;
}
}
System.out.println(Arrays.toString(input));
I am trying to compute the sum of a large array in parallel with Metal in Swift.
Is there a good way to do it?
My plan was to divide my array into sub-arrays, compute the sum of each sub-array in parallel, and then, when the parallel computation is finished, compute the sum of the sub-sums.
for example if I have
array = [a0,....an]
I divide array in sub arrays :
array_1 = [a_0,...a_i],
array_2 = [a_i+1,...a_2i],
....
array_n/i = [a_n-1, ... a_n]
The sums for these arrays are computed in parallel, and I get
sum_1, sum_2, sum_3, ... sum_n/i
At the end I just compute the sum of the sub-sums.
I created an application which runs my Metal shader, but there are some things I don't quite understand.
var array:[[Float]] = [[1,2,3], [4,5,6], [7,8,9]]
// get device
let device: MTLDevice! = MTLCreateSystemDefaultDevice()
// get library
let defaultLibrary:MTLLibrary! = device.newDefaultLibrary()
// queue
let commandQueue:MTLCommandQueue! = device.newCommandQueue()
// function
let kernerFunction: MTLFunction! = defaultLibrary.newFunctionWithName("calculateSum")
// pipeline with function
let pipelineState: MTLComputePipelineState! = try device.newComputePipelineStateWithFunction(kernerFunction)
// buffer for function
let commandBuffer:MTLCommandBuffer! = commandQueue.commandBuffer()
// encode function
let commandEncoder:MTLComputeCommandEncoder = commandBuffer.computeCommandEncoder()
// add function to encode
commandEncoder.setComputePipelineState(pipelineState)
// options
let resourceOption = MTLResourceOptions()
let arrayBiteLength = array.count * array[0].count * sizeofValue(array[0][0])
let arrayBuffer = device.newBufferWithBytes(&array, length: arrayBiteLength, options: resourceOption)
commandEncoder.setBuffer(arrayBuffer, offset: 0, atIndex: 0)
var result:[Float] = [0,0,0]
let resultBiteLenght = sizeofValue(result[0])
let resultBuffer = device.newBufferWithBytes(&result, length: resultBiteLenght, options: resourceOption)
commandEncoder.setBuffer(resultBuffer, offset: 0, atIndex: 1)
let threadGroupSize = MTLSize(width: 1, height: 1, depth: 1)
let threadGroups = MTLSize(width: (array.count), height: 1, depth: 1)
commandEncoder.dispatchThreadgroups(threadGroups, threadsPerThreadgroup: threadGroupSize)
commandEncoder.endEncoding()
commandBuffer.commit()
commandBuffer.waitUntilCompleted()
let data = NSData(bytesNoCopy: resultBuffer.contents(), length: sizeof(Float), freeWhenDone: false)
data.getBytes(&result, length: result.count * sizeof(Float))
print(result)
That is my Swift code; my shader is:
kernel void calculateSum(const device float *inFloat [[buffer(0)]],
device float *result [[buffer(1)]],
uint id [[ thread_position_in_grid ]]) {
float * f = inFloat[id];
float sum = 0;
for (int i = 0 ; i < 3 ; ++i) {
sum = sum + f[i];
}
result = sum;
}
I don't know how to define that inFloat is an array of arrays.
I don't know exactly what threadGroupSize and threadGroups are.
I don't know what device and uint are in the shader parameters.
Is this right approach?
I took the time to create a fully working example of this problem with Metal. The explanation is in the comments:
let count = 10_000_000
let elementsPerSum = 10_000
// Data type, has to be the same as in the shader
typealias DataType = CInt
let device = MTLCreateSystemDefaultDevice()!
let library = self.library(device: device)
let parsum = library.makeFunction(name: "parsum")!
let pipeline = try! device.makeComputePipelineState(function: parsum)
// Our data, randomly generated:
var data = (0..<count).map{ _ in DataType(arc4random_uniform(100)) }
var dataCount = CUnsignedInt(count)
var elementsPerSumC = CUnsignedInt(elementsPerSum)
// Number of individual results = count / elementsPerSum (rounded up):
let resultsCount = (count + elementsPerSum - 1) / elementsPerSum
// Our data in a buffer (copied):
let dataBuffer = device.makeBuffer(bytes: &data, length: MemoryLayout<DataType>.stride * count, options: [])!
// A buffer for individual results (zero initialized)
let resultsBuffer = device.makeBuffer(length: MemoryLayout<DataType>.stride * resultsCount, options: [])!
// Our results in convenient form to compute the actual result later:
let pointer = resultsBuffer.contents().bindMemory(to: DataType.self, capacity: resultsCount)
let results = UnsafeBufferPointer<DataType>(start: pointer, count: resultsCount)
let queue = device.makeCommandQueue()!
let cmds = queue.makeCommandBuffer()!
let encoder = cmds.makeComputeCommandEncoder()!
encoder.setComputePipelineState(pipeline)
encoder.setBuffer(dataBuffer, offset: 0, index: 0)
encoder.setBytes(&dataCount, length: MemoryLayout<CUnsignedInt>.size, index: 1)
encoder.setBuffer(resultsBuffer, offset: 0, index: 2)
encoder.setBytes(&elementsPerSumC, length: MemoryLayout<CUnsignedInt>.size, index: 3)
// We have to calculate the sum `resultCount` times => amount of threadgroups is `resultsCount` / `threadExecutionWidth` (rounded up) because each threadgroup will process `threadExecutionWidth` threads
let threadgroupsPerGrid = MTLSize(width: (resultsCount + pipeline.threadExecutionWidth - 1) / pipeline.threadExecutionWidth, height: 1, depth: 1)
// Here we set that each threadgroup should process `threadExecutionWidth` threads, the only important thing for performance is that this number is a multiple of `threadExecutionWidth` (here 1 times)
let threadsPerThreadgroup = MTLSize(width: pipeline.threadExecutionWidth, height: 1, depth: 1)
encoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup)
encoder.endEncoding()
var start, end : UInt64
var result : DataType = 0
start = mach_absolute_time()
cmds.commit()
cmds.waitUntilCompleted()
for elem in results {
result += elem
}
end = mach_absolute_time()
print("Metal result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")
result = 0
start = mach_absolute_time()
data.withUnsafeBufferPointer { buffer in
for elem in buffer {
result += elem
}
}
end = mach_absolute_time()
print("CPU result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")
I used my Mac to test it, but it should work just fine on iOS.
Output:
Metal result: 494936505, time: 0.024611456
CPU result: 494936505, time: 0.163341018
The Metal version is about 7 times faster. I'm sure you can get more speed if you implement something like divide-and-conquer with cutoff or whatever.
The accepted answer is annoyingly missing the kernel that was written for it. The source is here, but here is the full program and shader that can be run as a Swift command-line application.
/*
* Command line Metal Compute Shader for data processing
*/
import Metal
import Foundation
//------------------------------------------------------------------------------
let count = 10_000_000
let elementsPerSum = 10_000
//------------------------------------------------------------------------------
typealias DataType = CInt // Data type, has to be the same as in the shader
//------------------------------------------------------------------------------
let device = MTLCreateSystemDefaultDevice()!
let library = device.makeDefaultLibrary()!
let parsum = library.makeFunction(name: "parsum")!
let pipeline = try! device.makeComputePipelineState(function: parsum)
//------------------------------------------------------------------------------
// Our data, randomly generated:
var data = (0..<count).map{ _ in DataType(arc4random_uniform(100)) }
var dataCount = CUnsignedInt(count)
var elementsPerSumC = CUnsignedInt(elementsPerSum)
// Number of individual results = count / elementsPerSum (rounded up):
let resultsCount = (count + elementsPerSum - 1) / elementsPerSum
//------------------------------------------------------------------------------
// Our data in a buffer (copied):
let dataBuffer = device.makeBuffer(bytes: &data, length: MemoryLayout<DataType>.stride * count, options: [])!
// A buffer for individual results (zero initialized)
let resultsBuffer = device.makeBuffer(length: MemoryLayout<DataType>.stride * resultsCount, options: [])!
// Our results in convenient form to compute the actual result later:
let pointer = resultsBuffer.contents().bindMemory(to: DataType.self, capacity: resultsCount)
let results = UnsafeBufferPointer<DataType>(start: pointer, count: resultsCount)
//------------------------------------------------------------------------------
let queue = device.makeCommandQueue()!
let cmds = queue.makeCommandBuffer()!
let encoder = cmds.makeComputeCommandEncoder()!
//------------------------------------------------------------------------------
encoder.setComputePipelineState(pipeline)
encoder.setBuffer(dataBuffer, offset: 0, index: 0)
encoder.setBytes(&dataCount, length: MemoryLayout<CUnsignedInt>.size, index: 1)
encoder.setBuffer(resultsBuffer, offset: 0, index: 2)
encoder.setBytes(&elementsPerSumC, length: MemoryLayout<CUnsignedInt>.size, index: 3)
//------------------------------------------------------------------------------
// We have to calculate the sum `resultCount` times => amount of threadgroups is `resultsCount` / `threadExecutionWidth` (rounded up) because each threadgroup will process `threadExecutionWidth` threads
let threadgroupsPerGrid = MTLSize(width: (resultsCount + pipeline.threadExecutionWidth - 1) / pipeline.threadExecutionWidth, height: 1, depth: 1)
// Here we set that each threadgroup should process `threadExecutionWidth` threads, the only important thing for performance is that this number is a multiple of `threadExecutionWidth` (here 1 times)
let threadsPerThreadgroup = MTLSize(width: pipeline.threadExecutionWidth, height: 1, depth: 1)
//------------------------------------------------------------------------------
encoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup)
encoder.endEncoding()
//------------------------------------------------------------------------------
var start, end : UInt64
var result : DataType = 0
//------------------------------------------------------------------------------
start = mach_absolute_time()
cmds.commit()
cmds.waitUntilCompleted()
for elem in results {
result += elem
}
end = mach_absolute_time()
//------------------------------------------------------------------------------
print("Metal result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")
//------------------------------------------------------------------------------
result = 0
start = mach_absolute_time()
data.withUnsafeBufferPointer { buffer in
for elem in buffer {
result += elem
}
}
end = mach_absolute_time()
print("CPU result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")
//------------------------------------------------------------------------------
#include <metal_stdlib>
using namespace metal;
typedef unsigned int uint;
typedef int DataType;
kernel void parsum(const device DataType* data [[ buffer(0) ]],
const device uint& dataLength [[ buffer(1) ]],
device DataType* sums [[ buffer(2) ]],
const device uint& elementsPerSum [[ buffer(3) ]],
const uint tgPos [[ threadgroup_position_in_grid ]],
const uint tPerTg [[ threads_per_threadgroup ]],
const uint tPos [[ thread_position_in_threadgroup ]]) {
uint resultIndex = tgPos * tPerTg + tPos;
uint dataIndex = resultIndex * elementsPerSum; // Where the summation should begin
uint endIndex = dataIndex + elementsPerSum < dataLength ? dataIndex + elementsPerSum : dataLength; // The index where summation should end
for (; dataIndex < endIndex; dataIndex++)
sums[resultIndex] += data[dataIndex];
}
Objective-C
The same Swift command-line programme, but in Objective-C
#import <Foundation/Foundation.h>
#import <Metal/Metal.h>
typedef int DataType;
int main(int argc, const char * argv[]) {
@autoreleasepool {
unsigned int count = 10000000;
unsigned int elementsPerSum = 10000;
//----------------------------------------------------------------------
id<MTLDevice> device = MTLCreateSystemDefaultDevice();
id<MTLLibrary>library = [device newDefaultLibrary];
id<MTLFunction>parsum = [library newFunctionWithName:@"parsum"];
id<MTLComputePipelineState> pipeline = [device newComputePipelineStateWithFunction:parsum error:nil];
//----------------------------------------------------------------------
DataType* data = (DataType*) malloc(sizeof(DataType) * count);
for (int i = 0; i < count; i++){
data[i] = arc4random_uniform(100);
}
unsigned int dataCount = count;
unsigned int elementsPerSumC = elementsPerSum;
unsigned int resultsCount = (count + elementsPerSum - 1) / elementsPerSum;
//------------------------------------------------------------------------------
id<MTLBuffer>dataBuffer = [device newBufferWithBytes:data
length:(sizeof(int) * count)
options:MTLResourceStorageModeManaged];
id<MTLBuffer>resultsBuffer = [device newBufferWithLength:(sizeof(int) * resultsCount)
options:0];
DataType* results = resultsBuffer.contents;
//----------------------------------------------------------------------
id<MTLCommandQueue>queue = [device newCommandQueue];
id<MTLCommandBuffer>cmds = [queue commandBuffer];
id<MTLComputeCommandEncoder> encoder = [cmds computeCommandEncoder];
//----------------------------------------------------------------------
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:dataBuffer offset:0 atIndex:0];
[encoder setBytes:&dataCount length:sizeof(unsigned int) atIndex:1];
[encoder setBuffer:resultsBuffer offset:0 atIndex:2];
[encoder setBytes:&elementsPerSumC length:sizeof(unsigned int) atIndex:3];
//----------------------------------------------------------------------
MTLSize threadgroupsPerGrid =
{
(resultsCount + pipeline.threadExecutionWidth - 1) / pipeline.threadExecutionWidth,
1,
1
};
MTLSize threadsPerThreadgroup =
{
pipeline.threadExecutionWidth,
1,
1
};
//----------------------------------------------------------------------
[encoder dispatchThreadgroups:threadgroupsPerGrid threadsPerThreadgroup:threadsPerThreadgroup];
[encoder endEncoding];
//----------------------------------------------------------------------
uint64_t start, end;
DataType result = 0;
start = mach_absolute_time();
[cmds commit];
[cmds waitUntilCompleted];
for (int i = 0; i < resultsCount; i++){
result += results[i];
}
end = mach_absolute_time();
NSLog(@"Metal Result %d. time %f", result, (float)(end - start)/(float)(NSEC_PER_SEC));
//----------------------------------------------------------------------
result = 0;
start = mach_absolute_time();
for (int i = 0; i < count; i++){
result += data[i];
}
end = mach_absolute_time();
NSLog(@"CPU Result %d. time %f", result, (float)(end - start)/(float)(NSEC_PER_SEC));
//------------------------------------------------------------------------------
free(data);
}
return 0;
}
I've been running the app on a GT 740 (384 cores) vs. an i7-4790 with a multithreaded vector sum implementation, and here are my figures:
Metal lap time: 19.959092
CPU MT lap time: 4.353881
That's a 5:1 ratio in favor of the CPU, so unless you have a powerful GPU, using shaders is not worth it.
I've been testing the same code on an i7-3610QM with its integrated Intel HD 4000, and surprisingly the results are much better for Metal: 2:1.
Edited: after tweaking the thread parameters I've finally improved GPU performance; it's now up to 16x the CPU.
I am trying to make a 2D array in Objective-C, and I don't really want to use NSArray because I'm using int and the code would be annoying: [[array objectAtIndex:x] objectAtIndex:y], not to mention I would have to convert the numbers back from NSNumber... It seems like a lot of extra work.
Can't I do the following?
// .h file
int aTiles[10][2];
// .m file
aTiles = {
{ 0, 0}, // 0
{ 0, 1}, // 1
{ 1, 5}, // 2
{ 0, 0}, // 3
{ 0, 0}, // 4
{ 0, 0}, // 5
};
It works when combined on one line (int a[x][x] = {...};), but I need the array to be public so I can access it from any function.
The second line says 'expected semicolon'.
Thanks
Looks like you have an extra comma near //5
Do you need to declare a type for aTiles?
int aTiles = ...
I got slightly different errors when I tried your original code, but this worked:
// .h file
extern int aTiles[10][2];
// .m file
int aTiles[10][2] = {
{ 0, 0}, // 0
{ 0, 1}, // 1
{ 1, 5}, // 2
{ 0, 0}, // 3
{ 0, 0}, // 4
{ 0, 0}, // 5
};
Since this has also been tagged C++, you can use an ivar:
std::vector<std::vector<int> > tiles;
Then you just resize and set initial element values in your object's initializer.
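For instance, here is a minimal sketch of the resize-and-initialize step, using the values from the question; the makeTiles helper is made up purely for illustration (in practice you would do this assignment inside your -init method):
#include <vector>
// Build the 10x2 tile table; call this from the object's initializer
static std::vector<std::vector<int>> makeTiles() {
    std::vector<std::vector<int>> tiles(10, std::vector<int>(2, 0)); // 10 rows of 2 columns, all zero
    tiles[1][1] = 1; // row 1 -> { 0, 1}
    tiles[2][0] = 1; // row 2 -> { 1, 5}
    tiles[2][1] = 5;
    return tiles;
}
Access is then just tiles[x][y], with plain int values and no NSNumber boxing.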
Otherwise, is this a global or an ivar? Should it be const or mutable?
How about this awesomeness... (not really)
// .h
int aTiles[10][2];
// .m
int a[10][2] = {
{ 0, 0}, // 0
{ 0, 1}, // 1
{ 1, 5}, // 2
{ 0, 0}, // 3
{ 0, 0}, // 4
{ 0, 0}, // 5
};
for (int r = 0; r <= 5; r++) {
for (int c = 0; c < 2; c++) {
aTiles[r][c] = a[r][c];
}
}
Definitely a workaround, but... I can't figure out any other way. If anyone figures out the correct way, please let me know :P
What I'm Trying To Do
Basically, I've got several possible arrays that I define with macros:
#define ARRAY_ONE {0, 2, 7, 8}
#define ARRAY_TWO {3, 6, 9, 2}
#define ARRAY_THREE {3, 6, 4, 5}
//etc...
At runtime, I have a C array that gets used in a lot of places in a certain class. I want this array to use one of the #define values, i.e.:
int components[4];
if (caseOne)
{
components = ARRAY_ONE;
}
else if (caseTwo)
{
components = ARRAY_TWO;
}
else if (caseThree)
{
//etc...
}
The Problem
However, the above code does not work. Instead, I get a weird error
Expected expression before '[' token
Would anyone mind explaining what's going on, and how I could achieve what I'm attempting to? Any help would be much appreciated - Thanks!
I don't think that C arrays can be initialized using the curly-brace syntax after they've been declared. You can only do that when initializing them while declaring them.
Try adjusting the previously posted answer with:
const int ARRAY_ONE[] = {0, 2, 7, 8};
const int ARRAY_TWO[] = {3, 6, 9, 2};
const int ARRAY_THREE[] = {3, 6, 4, 5};
const int *components;
if (case1) {
components = ARRAY_ONE;
} else if (case2) {
components = ARRAY_TWO;
} else if (case3) {
components = ARRAY_THREE;
}
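If you would rather keep components as a fixed-size array instead of a pointer, a minimal sketch of another option is to copy the chosen constant array into it with memcpy; this reuses the caseOne/caseTwo/caseThree conditions from the question and goes inside whatever function selects the case:
#include <string.h> /* for memcpy */
const int ARRAY_ONE[]   = {0, 2, 7, 8};
const int ARRAY_TWO[]   = {3, 6, 9, 2};
const int ARRAY_THREE[] = {3, 6, 4, 5};
int components[4];
if (caseOne) {
    memcpy(components, ARRAY_ONE, sizeof(components));
} else if (caseTwo) {
    memcpy(components, ARRAY_TWO, sizeof(components));
} else if (caseThree) {
    memcpy(components, ARRAY_THREE, sizeof(components));
}
Here sizeof(components) is the full 4-int size of the destination array, so each branch copies exactly one of the constant arrays into it.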
I can't really work out what the error is. I suspect it might be coming from some code you haven't posted. Does it say the error is on the int components[4]; line?
Would this do? It uses constants instead of defines.
const int ARRAY_ONE[] = {0, 2, 7, 8};
const int ARRAY_TWO[] = {3, 6, 9, 2};
const int ARRAY_THREE[] = {3, 6, 4, 5};
const int* components = ARRAY_ONE;
int whatever = components[2];
try this:
int ARRAY_ONE[] = {0,2,7,8};
int ARRAY_TWO [] = {3,6,9,2};
int ARRAY_THREE[] = {3,6,4,5};
int components[4];
int count = sizeof(components)/sizeof(components[0]); // this will get the array length, or you can just put the array length
if (case1)
for (int i =0; i< count; i++)
components[i] = ARRAY_ONE[i];
else if (case2)
for (int i =0; i< count; i++)
components[i] = ARRAY_TWO[i];