Maximum number of writable data containers - C++ AMP

OS: Windows 8.1 64 Bit - fully updated
IDE: Visual Studio Professional 2013 - Version 12.0.30110.00 Update 1 - fully updated
I have a situation where I get the following exception at run time rather than at compile time:
The number of writable data containers referenced in the entry function of the parallel_for_each call (17) exceeds the selected accelerator's limit (8).
The function where this happens looks like the following:
void run_epoch(
    accelerator_view mainAccelView,
    ActivatorState activatorState,
    TrainingState trainingState,
    array_view<double, 2> avLayer1,
    array_view<double, 2> avLayer2,
    array_view<double, 2> avLayer3,
    array_view<const double, 2> avPredictors,
    array_view<const double, 2> avTargets,
    array_view<double> avErrors,
    int epoch
){
    int noOfColumnsPredictors = AmpUtils::get_no_of_columns(avPredictors);
    int noOfRowsPredictors = AmpUtils::get_no_of_rows(avPredictors, noOfColumnsPredictors);
    int noOfColumnsLayer1 = AmpUtils::get_no_of_columns(avLayer1);
    int noOfColumnsLayer2 = AmpUtils::get_no_of_columns(avLayer2);
    int noOfColumnsLayer3 = AmpUtils::get_no_of_columns(avLayer3);
    int noOfRowsLayer1 = AmpUtils::get_no_of_rows(avLayer1, noOfColumnsLayer1);
    int noOfRowsLayer2 = AmpUtils::get_no_of_rows(avLayer2, noOfColumnsLayer2);
    int noOfRowsLayer3 = AmpUtils::get_no_of_rows(avLayer3, noOfColumnsLayer3);

    array_view<double, 2> avOutputLayer1(noOfRowsPredictors, noOfRowsLayer1);
    array_view<double, 2> avOutputLayer2(noOfRowsPredictors, noOfRowsLayer2);
    array_view<double, 2> avOutputLayer3(noOfRowsPredictors, noOfRowsLayer3);
    array_view<double, 2> avErrorsLayer1(noOfRowsPredictors, noOfRowsLayer1);
    array_view<double, 2> avErrorsLayer2(noOfRowsPredictors, noOfRowsLayer2);
    array_view<double, 2> avErrorsLayer3(noOfRowsPredictors, noOfRowsLayer3);
    array_view<double, 2> avThresholdLayer1(noOfRowsPredictors, noOfRowsLayer1);
    array_view<double, 2> avThresholdLayer2(noOfRowsPredictors, noOfRowsLayer2);
    array_view<double, 2> avThresholdLayer3(noOfRowsPredictors, noOfRowsLayer3);
    array_view<double, 3> avWeightsLayer1(noOfRowsPredictors, noOfRowsLayer1, (noOfColumnsLayer1 - 1));
    array_view<double, 3> avWeightsLayer2(noOfRowsPredictors, noOfRowsLayer2, (noOfColumnsLayer2 - 1));
    array_view<double, 3> avWeightsLayer3(noOfRowsPredictors, noOfRowsLayer3, (noOfColumnsLayer3 - 1));
    array_view<double, 2> avErrorsTempBuffer(noOfRowsPredictors, noOfRowsLayer3);
    array_view<double> avEpochErrors(noOfRowsPredictors);

    try{
        parallel_for_each(extent<1>(noOfRowsPredictors), [=](index<1> idx) restrict(cpu, amp){
            int predictorRow = idx[0];
            // step 1: compute
            // step 11: compute layer 1
            compute_layer(activatorState, avPredictors[predictorRow], avLayer1, avOutputLayer1, noOfColumnsLayer1, predictorRow);
            // step 12: compute layer 2
            compute_layer(activatorState, avPredictors[predictorRow], avLayer2, avOutputLayer2, noOfColumnsLayer2, predictorRow);
            // step 13: compute layer 3
            compute_layer(activatorState, avPredictors[predictorRow], avLayer3, avOutputLayer3, noOfColumnsLayer3, predictorRow);
            // step 2: calculate_error
            // step 21: calculate_error layer 3
            for (int column = 0; column < noOfRowsLayer3; column++){
                double neuronError = avTargets[predictorRow][column] - avOutputLayer3[predictorRow][column];
                avErrorsTempBuffer[predictorRow][column] = neuronError * neuronError;
                avErrorsLayer3[predictorRow][column] = neuronError * AmpActivator::derivative2(activatorState, avOutputLayer3[predictorRow][column]);
            }
            // sum this row's squared errors (only the noOfRowsLayer3 entries of this row)
            double errorSum = 0.0;
            for (int column = 0; column < noOfRowsLayer3; column++){
                errorSum += avErrorsTempBuffer[predictorRow][column];
            }
            avEpochErrors[predictorRow] = errorSum;
            // step 22: calculate_error layer 2
            calculate_error_layer(activatorState, avErrorsLayer2[predictorRow], avErrorsLayer3, avLayer3, avOutputLayer2[predictorRow], noOfRowsLayer3, noOfRowsLayer3);
            // step 23: calculate_error layer 1
            calculate_error_layer(activatorState, avErrorsLayer1[predictorRow], avErrorsLayer2, avLayer2, avOutputLayer1[predictorRow], noOfRowsLayer2, noOfRowsLayer2);
            // step 3: calculate_updates
            // step 31: calculate_updates layer 1
            calculate_updates_layer(trainingState, avErrorsLayer1[predictorRow], avPredictors[predictorRow], avThresholdLayer1[predictorRow], avWeightsLayer1[predictorRow], (noOfColumnsLayer1 - 1), noOfRowsLayer1);
            // step 32: calculate_updates layer 2
            calculate_updates_layer(trainingState, avErrorsLayer2[predictorRow], avPredictors[predictorRow], avThresholdLayer2[predictorRow], avWeightsLayer2[predictorRow], (noOfColumnsLayer2 - 1), noOfRowsLayer2);
            // step 33: calculate_updates layer 3
            calculate_updates_layer(trainingState, avErrorsLayer3[predictorRow], avPredictors[predictorRow], avThresholdLayer3[predictorRow], avWeightsLayer3[predictorRow], (noOfColumnsLayer3 - 1), noOfRowsLayer3);
            // step 4: update_network
            // step 41: update_network layer 1
            update_layer(avLayer1, avWeightsLayer1[predictorRow], avThresholdLayer1[predictorRow], noOfColumnsLayer1, noOfRowsLayer1);
            // step 42: update_network layer 2
            update_layer(avLayer2, avWeightsLayer2[predictorRow], avThresholdLayer2[predictorRow], noOfColumnsLayer2, noOfRowsLayer2);
            // step 43: update_network layer 3
            update_layer(avLayer3, avWeightsLayer3[predictorRow], avThresholdLayer3[predictorRow], noOfColumnsLayer3, noOfRowsLayer3);
        });
        avEpochErrors.synchronize();
        double epochErrorsSum = 0.0;
        for (int i = 0; i < (int)avEpochErrors.extent.size(); i++){
            epochErrorsSum += avEpochErrors[i];
        }
        avErrors[epoch] = epochErrorsSum;
    }
    catch (const std::exception& e){
        std::wcout << "Exception Project::run_epoch: " << e.what() << std::endl;
    }
}
According to this MSDN post here and also here, the maximum number of writable containers should have increased to 64 since Windows 8.
My question is now: are there different types of writable containers, such that I may still only use a maximum of 8 of a certain type?

Strictly speaking, the limitation is on the number of UAVs (unordered access views). This is coupled to the DirectX version, not the Windows version.
Limited number of writable array_view/array/texture/writeonly_texture_view objects allowed per kernel
C++ AMP supports a limited number of writable array_view/array/texture/writeonly_texture_view objects per kernel. Specifically, the total number of writable array_view + array + texture + writeonly_texture_view objects per kernel should not exceed 8 on DirectX 11 and 64 on DirectX 11.1. The total number of allowed read-only array_view/array/texture objects per kernel is 128, and specifying the read-only restriction can help you avoid hitting the limit on the maximum number of allowed writable array_view/array/texture/writeonly_texture_view objects per kernel.
From Parallel Programming in Native Code.
DX11.1 is supported on Windows 8 and has been backported in some limited form to Windows 7. Looking at my machine, it is running Windows 8.1 but seems to be using DX 11, not 11.1, drivers. DXDIAG.EXE will tell you what you are using. You need to make sure that your card supports DX11.1 and that you have the latest drivers installed.
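If a kernel only reads from a container, declaring the view over const elements moves it out of the UAV budget (8 on DX11, 64 on DX11.1) and into the much larger read-only budget (128 objects per kernel). Here is a minimal sketch of the idea, with hypothetical names and sizes rather than the code from the question:

#include <amp.h>
#include <vector>
using namespace concurrency;

int main() {
    std::vector<double> in(1024), out(1024);
    // Read-only view: counts against the 128 read-only objects per kernel.
    array_view<const double> avIn(1024, in);
    // Writable view: counts against the 8 (DX11) / 64 (DX11.1) UAV limit.
    array_view<double> avOut(1024, out);
    avOut.discard_data(); // don't copy stale contents to the accelerator

    parallel_for_each(avOut.extent, [=](index<1> idx) restrict(amp) {
        avOut[idx] = avIn[idx] * 2.0;
    });
    avOut.synchronize();
}

In run_epoch above, the per-layer output/error/threshold/weight views all have to stay writable, so if the accelerator is stuck on DX11, the usual workaround is to split the work into several parallel_for_each passes that each reference at most 8 writable views.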

Related

"Program too large" threshold greater than actual instruction count

I've written a couple of production BPF agents, but my approach is very iterative: I keep tweaking until I please the verifier and can move on. I've reached my limit again.
Here's a program that works if I have one fewer && condition, and breaks otherwise. The confusing part is that the warning implies that 103 insns is greater than the 4096-insn maximum. There's obviously something I'm misunderstanding about how this is all strung together.
My ultimate goal is to do logging based on a process' environment -- so alternative approaches are welcome. :)
Error:
$ sudo python foo.py
bpf: Argument list too long. Program too large (103 insns), at most 4096 insns
Failed to load BPF program b'tracepoint__sched__sched_process_exec': Argument list too long
BPF Source:
#include <linux/mm_types.h>
#include <linux/sched.h>
#include <linux/version.h>
int tracepoint__sched__sched_process_exec(
    struct tracepoint__sched__sched_process_exec* args
) {
    struct task_struct* task = (typeof(task))bpf_get_current_task();
    const struct mm_struct* mm = task->mm;
    unsigned long env_start = mm->env_start;
    unsigned long env_end = mm->env_end;

    // Read up to 512 environment variables -- only way I could find to "limit"
    // the loop to satisfy the verifier.
    char var[12];
    for (int n = 0; n < 512; n++) {
        int result = bpf_probe_read_str(&var, sizeof var, (void*)env_start);
        if (result <= 0) {
            break;
        }
        env_start += result;
        if (
            var[0] == 'H' &&
            var[1] == 'I' &&
            var[2] == 'S' &&
            var[3] == 'T' &&
            var[4] == 'S' &&
            var[5] == 'I' &&
            var[6] == 'Z' &&
            var[7] == 'E'
        ) {
            bpf_trace_printk("Got it: %s\n", var);
            break;
        }
    }
    return 0;
}
Basic loader program for reproducing:
#!/usr/bin/env python3
import sys

from bcc import BPF

if __name__ == '__main__':
    source = open("./foo.c").read()
    try:
        BPF(text=source.encode("utf-8")).trace_print()
    except Exception as e:
        error = str(e)
        sys.exit(error)
bpf: Argument list too long. Program too large (103 insns), at most 4096 insns
Looking at the error message, my guess would be that your program has 103 instructions and it's rejected because it's too complex. That is, the verifier gave up before analyzing all instructions on all paths.
On Linux 5.15 with a privileged user, the verifier gives up after reading 1 million instructions (the complexity limit). Since it has to analyze all paths through the program, a program with a small number of instructions can have a very high complexity. That's particularly the case when you have loops and many conditions, as is your case.
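One common way to reduce complexity (a general suggestion on my part, not something the error message tells you) is to cut the number of conditional branches per loop iteration. In the program above, the eight chained && comparisons cost eight conditional jumps per iteration; loading the prefix as one 8-byte integer replaces them with a single compare. A sketch for the loop body, assuming a little-endian machine:

// Compare the first 8 bytes of var against "HISTSIZE" in one branch.
// __builtin_memcpy with a constant size is inlined by clang into plain loads.
unsigned long long prefix;
__builtin_memcpy(&prefix, var, sizeof(prefix));
if (prefix == 0x455a495354534948ULL) { /* "HISTSIZE" as a little-endian u64 */
    bpf_trace_printk("Got it: %s\n", var);
    break;
}

With one branch instead of eight per iteration, the verifier has far fewer states to explore across the 512 iterations.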
Why is the error message confusing? This error message is coming from libbpf.c:
if (ret < 0 && errno == E2BIG) {
    fprintf(stderr,
            "bpf: %s. Program %s too large (%u insns), at most %d insns\n\n",
            strerror(errno), attr->name, insns_cnt, BPF_MAXINSNS);
    return -1;
}
Since the bpf(2) syscall returns E2BIG both when the program is too large and when its complexity is too high, libbpf prints the same error message for both cases, always with at most 4096 instructions. I'm confident upstream would accept a patch to improve that error message.

BPF verifier rejects when try to access __sk_buff member

I'm trying to write a sample eBPF program which can access __sk_buff members and dump them into /sys/kernel/debug/tracing/trace.
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("dump_skb_member")
int test_prog(struct __sk_buff *skb)
{
    char fmt[] = "packet: local %u remote %u\n";
    __u32 local_ip4 = bpf_htonl(skb->local_ip4);
    __u32 remote_ip4 = bpf_htonl(skb->remote_ip4);

    bpf_trace_printk(fmt, sizeof(fmt), local_ip4, remote_ip4);
    return BPF_OK;
}
char _license[] SEC("license") = "GPL";
When I compile this code and load the program with
ip route add 192.168.56.104 encap bpf out obj sample.o section dump_skb_member dev enp0s8
the following error is thrown:
Prog section 'dump_skb_member' rejected: Permission denied (13)!
- Type: 11
- Instructions: 21 (0 over limit)
- License: GPL
Verifier analysis:
0: (b7) r2 = 685349
1: (63) *(u32 *)(r10 -8) = r2
2: (18) r2 = 0x2065746f6d657220
4: (7b) *(u64 *)(r10 -16) = r2
5: (18) r2 = 0x7525206c61636f6c
7: (7b) *(u64 *)(r10 -24) = r2
8: (18) r2 = 0x203a74656b636170
10: (7b) *(u64 *)(r10 -32) = r2
11: (61) r4 = *(u32 *)(r1 +92)
invalid bpf_context access off=92 size=4
processed 9 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0
Error fetching program/map!
But if I don't call bpf_trace_printk to dump the members, the program loads fine.
My question is: why is the error caused by calling bpf_trace_printk?
The error is not caused by bpf_trace_printk() itself, but by the skb accesses that are present in your bytecode only when you call bpf_trace_printk().
Accessing skb->local_ip4 and skb->remote_ip4 is not allowed for programs of type BPF_PROG_TYPE_LWT_OUT, which is what you are using.
See the kernel code: the function that checks for valid access for this type returns false for certain offsets or ranges in the skb:
case bpf_ctx_range_till(struct __sk_buff, family, local_port):
    [...]
    return false;
This corresponds to the range where local_ip4 and remote_ip4 are defined:
struct __sk_buff {
    [...]
    /* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */
    __u32 family;
    __u32 remote_ip4;    /* Stored in network byte order */
    __u32 local_ip4;     /* Stored in network byte order */
    __u32 remote_ip6[4]; /* Stored in network byte order */
    __u32 local_ip6[4];  /* Stored in network byte order */
    __u32 remote_port;   /* Stored in network byte order */
    __u32 local_port;    /* stored in host byte order */
    /* ... here. */
    [...]
};
When you remove your call to the bpf_trace_printk() helper, your local variables are no longer needed and clang compiles them out of the program. The attempt to read the forbidden offsets is no longer part of your bytecode, so the program loads successfully.
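For comparison, the same field reads pass the verifier under a program type whose is_valid_access callback allows that range, as the kernel comment above indicates for sk_skb programs. A hedged sketch (the SEC name follows libbpf's convention; this is only an illustration, not a replacement for the LWT use case):

#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

/* Loaded as BPF_PROG_TYPE_SK_SKB, for which the family..local_port
 * range of struct __sk_buff is readable. */
SEC("sk_skb/stream_parser")
int dump_ips(struct __sk_buff *skb)
{
    char fmt[] = "packet: local %u remote %u\n";
    __u32 local_ip4 = bpf_htonl(skb->local_ip4);
    __u32 remote_ip4 = bpf_htonl(skb->remote_ip4);

    bpf_trace_printk(fmt, sizeof(fmt), local_ip4, remote_ip4);
    return skb->len; /* a stream parser returns the parsed length */
}
char _license[] SEC("license") = "GPL";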

Data transmission Pi/MCP3008

I have a question regarding data transmission from a Raspberry Pi to an MCP3008. It is just a theoretical one. When they exchange bytes, does the master send 1 byte and receive 1 byte, then send the 2nd byte and receive the 2nd byte? Or does the master send 3 bytes and then receive 3 bytes? From my understanding it is the first one; am I right?
Adafruit's library for the MCP3008 has your answer. Check out the read_adc() function:
def read_adc(self, adc_number):
    """Read the current value of the specified ADC channel (0-7). The values
    can range from 0 to 1023 (10-bits).
    """
    assert 0 <= adc_number <= 7, 'ADC number must be a value of 0-7!'
    # Build a single channel read command.
    # For example channel zero = 0b11000000
    command = 0b11 << 6                  # Start bit, single channel read
    command |= (adc_number & 0x07) << 3  # Channel number (in 3 bits)
    # Note the bottom 3 bits of command are 0, this is to account for the
    # extra clock to do the conversion, and the low null bit returned at
    # the start of the response.
    resp = self._spi.transfer([command, 0x0, 0x0])
    # Parse out the 10 bits of response data and return it.
    result = (resp[0] & 0x01) << 9
    result |= (resp[1] & 0xFF) << 1
    result |= (resp[2] & 0x80) >> 7
    return result & 0x3FF
It appears that it sends a three-byte command (where only one byte is non-zero):
resp = self._spi.transfer([command, 0x0, 0x0])
The response is three bytes which contain the packed 10-bit ADC value. Note that SPI is full duplex: each byte the master clocks out simultaneously clocks one byte in, so the exchange happens byte by byte rather than as separate send and receive phases.
resp = self._spi.transfer([command, 0x0, 0x0])
# Parse out the 10 bits of response data and return it.
result = (resp[0] & 0x01) << 9
result |= (resp[1] & 0xFF) << 1
result |= (resp[2] & 0x80) >> 7
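As a concrete worked example (with made-up response bytes, using the same framing as the Adafruit code above): reading channel 3 sends 0b11011000 plus two padding bytes, and the 10-bit result is reassembled from the three echoed bytes like this:

# Build the command byte for channel 3, exactly as read_adc() does.
channel = 3
command = 0b11 << 6               # start bit + single-ended read
command |= (channel & 0x07) << 3
assert command == 0b11011000

# Unpack a hypothetical 3-byte response the same way read_adc() does.
resp = [0x01, 0xAA, 0x80]         # made-up bytes, for illustration only
result = (resp[0] & 0x01) << 9    # bit 9
result |= (resp[1] & 0xFF) << 1   # bits 8..1
result |= (resp[2] & 0x80) >> 7   # bit 0
print(result & 0x3FF)             # -> 853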

How to benefit from heap tagging by DLL?

How do I use and benefit from the GFlags setting Enable heap tagging by DLL?
I know how to activate the setting for a process, but I did not find useful information in the output of !heap -t in WinDbg. I was expecting some output like this:
0:000> !heap -t
Index Address Allocated by
1: 005c0000 MyDll.dll
2: 006b0000 AnotherDll.dll
so that I can identify which heap was created by which DLL and then e.g. identify the source of a memory leak.
Is this a misunderstanding of the term "heap tagging by DLL" or do I need some more commands to get to the desired result?
My research so far:
I googled for a tutorial on this topic, but I couldn't find a detailed description.
I read WinDbg's .hh !heap, but it's not explained there in detail either; Tag is only mentioned for !heap -b.
Again, a very late answer.
To benefit from heap tagging you need to create a tag first in your code. As far as I know (that is, up to XP SP3), there were no documented APIs to create a tag. (I haven't mucked with the heap since then, so I am not aware of the latest APIs in OS > Vista; rewrites were done to the heap manager, so many of the "features" I describe below might have been corrected, improved, or had their bugs removed.)
On XP SP3 you can use the undocumented RtlCreateTagHeap to create a new tag on either the process heap or a private heap. After you create the tag you need to set the global flags 8000 | 800:
htg - Enable heap tagging
htd - Enable heap tagging by DLL
Theoretically, all allocations and frees must then get tagged, but in practice only allocations > 512 kB get tagged on XP SP3 with these basic steps. It is either a bug or a feature that limits tagging to allocations and frees > 512 kB: HeapAlloc goes through ZwAllocateVirtualMemory for allocations > 512 kB in a 32-bit process (refer to the HeapCreate / HeapAlloc documentation on MSDN).
As a debugging aid, you can patch ntdll.dll on the fly to enable tagging for all allocations and frees.
Below is a sample that demonstrates the tagging and how to view it all in WinDbg. Compile using cl /Zi /analyze /W4 <src> /link /RELEASE, use WinDbg to execute the app, and watch the tagging with the !heap * -t command.
#include <windows.h>
#include <stdio.h>

// Heap tags are kinda broken, or they are intentionally given only to
// allocations > 512 kB: allocations > 512 kB go through the VirtualAlloc
// route for a heap created with maxsize set to 0. Uncomment ALLOCSIZE
// 0xfdfd2 and recompile to watch tagging increase by 100%; with ALLOCSIZE
// 0xfdfd1 only 50 allocs and frees that are > 512 kB will be tagged.
// These magic numbers are related to the comment in the HeapCreate
// documentation stating that slightly less than 512 kB will be allocated
// for a 32-bit process.
// Tagging can be dramatically increased by patching ntdll when stopped on
// the system breakpoint: patch 7c94b8a1 (xpsp3 ntdll.dll).
// Use the command below in windbg to find the offset of the pattern
// (the command must be on a single line, no line breaks):
// .foreach /pS 4 /ps 4 ( place { !grep -i -e call -c
//     "# call*RtlpUpdateTagEntry 7c900000 l?20000" } ) { ub place }
// The instruction we are searching to patch is
//     7c94b8a1 81e3ff0fffff and ebx,0FFFF0FFFh
// Patch 0f to 00 at the system breakpoint with eb 7c94b8a1+3 00.

#define BUFFERSIZE 100
#define ALLOCSIZE 0xfdfd1
//#define ALLOCSIZE 0xfdfd2

typedef int (__stdcall *g_RtlCreateTagHeap) (
    HANDLE    hHeap,
    void     *unknown,
    wchar_t  *BaseString,
    wchar_t  *TagString
);

void HeapTagwithHeapAllocPrivate()
{
    PCHAR pch[BUFFERSIZE] = {};
    HANDLE hHeap = 0;
    ULONG tag1 = 0;
    ULONG tag2 = 0;
    ULONG tag3 = 0;
    ULONG tag4 = 0;
    ULONG tag5 = 0;
    g_RtlCreateTagHeap RtlCreateTagHeap = 0;
    HMODULE hMod = LoadLibrary("ntdll.dll");
    if (hMod)
    {
        RtlCreateTagHeap = (g_RtlCreateTagHeap)
            GetProcAddress(hMod, "RtlCreateTagHeap");
    }
    if (hHeap == 0)
    {
        hHeap = HeapCreate(0, 0, 0);
        if (RtlCreateTagHeap != NULL)
        {
            tag1 = RtlCreateTagHeap(hHeap, 0, L"HeapTag!", L"MyTag1");
            tag2 = RtlCreateTagHeap(hHeap, 0, L"HeapTag!", L"MyTag2");
            tag3 = RtlCreateTagHeap(hHeap, 0, L"HeapTag!", L"MyTag3");
            tag4 = RtlCreateTagHeap(hHeap, 0, L"HeapTag!", L"MyTag4");
        }
    }
    HANDLE DefHeap = GetProcessHeap();
    if ((RtlCreateTagHeap != NULL) && (DefHeap != NULL))
    {
        tag5 = RtlCreateTagHeap(DefHeap, 0, L"HeapTag!", L"MyTag5");
        for (int i = 0; i < BUFFERSIZE; i++)
        {
            pch[i] = (PCHAR) HeapAlloc(DefHeap, HEAP_ZERO_MEMORY | tag5, 1);
            HeapFree(DefHeap, NULL, pch[i]);
        }
    }
    if (hHeap)
    {
        for (int i = 0; i < BUFFERSIZE; i++)
        {
            pch[i] = (PCHAR) HeapAlloc(hHeap, HEAP_ZERO_MEMORY | tag1, 1);
            // let's leak all allocs; patch ntdll to see the tagging details
            //HeapFree(hHeap, NULL, pch[i]);
        }
        for (int i = 0; i < BUFFERSIZE; i++)
        {
            pch[i] = (PCHAR) HeapAlloc(hHeap, HEAP_ZERO_MEMORY | tag2, 100);
            // let's leak 40% of allocs; patch ntdll to see the tagging details
            if (i >= 40)
                HeapFree(hHeap, NULL, pch[i]);
        }
        // slightly less than 512 kB: no tagging
        for (int i = 0; i < BUFFERSIZE / 2; i++)
        {
            pch[i] = (PCHAR) HeapAlloc(
                hHeap, HEAP_ZERO_MEMORY | tag3, ALLOCSIZE / 2);
        }
        // > 512 kB: default tagging
        for (int i = BUFFERSIZE / 2; i < BUFFERSIZE; i++)
        {
            pch[i] = (PCHAR) HeapAlloc(
                hHeap, HEAP_ZERO_MEMORY | tag4, ALLOCSIZE);
        }
        for (int i = 0; i < BUFFERSIZE; i++)
        {
            HeapFree(hHeap, NULL, pch[i]);
        }
    }
}

void _cdecl main()
{
    HeapTagwithHeapAllocPrivate();
}
The compiled exe is to be run under WinDbg as below.
Default execution and inspection (only 50 tags will be visible; all of them are > 512 kB allocations):
cdb -c "g;!heap * -t;q" newheaptag.exe | grep Tag
heaptag:\>cdb -c "g;!heap * -t;q" newheaptag.exe | grep Tag
Tag Name Allocs Frees Diff Allocated
Tag Name Allocs Frees Diff Allocated
Tag Name Allocs Frees Diff Allocated
0004: HeapTag!MyTag4 50 50 0 0
Patching ntdll at the system breakpoint should make all tags visible (eb = write byte). Patch, run the exe, and on exit inspect the heaps with their tags:
cdb -c "eb 7c94b8a1+3 00;g;!heap * -t;q" newheaptag.exe | grep Tag
heaptag:\>cdb -c "eb 7c94b8a1+3 00;g;!heap * -t;q" newheaptag.exe | grep Tag
Tag Name Allocs Frees Diff Allocated
0012: HeapTag!MyTag5 100 100 0 0 <-our tag in process heap
Tag Name Allocs Frees Diff Allocated
Tag Name Allocs Frees Diff Allocated
0001: HeapTag!MyTag1 100 0 100 3200 <--- leak all
0002: HeapTag!MyTag2 100 60 40 5120 <--- leak 40 %
0003: HeapTag!MyTag3 50 50 0 0 <--- clean < 512 kB
0004: HeapTag!MyTag4 50 50 0 0 <----clean > 512 kB

Why doesn't system return main's value?

[root# test]$ cat return10.c
#include <stdio.h>
int main(int argc, char *argv[]){
    return 10;
}
[root# test]$ perl -e 'print system("/path_to_return10")'
2560
I was expecting 10 but got 2560. Why?
See $? in perldoc perlvar.
You got 10 * 256 (return value = 10) + 0 * 128 (there was no core dump) + 0 (process wasn't killed by signal).
As specified in the documentation for Perl's system function (http://perldoc.perl.org/functions/system.html):
The return value is the exit status of the program as returned by the
wait call. To get the actual exit value, shift right by eight (see
below).
Indeed: 2560 >> 8 = 10.
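A short sketch of the usual decoding, since the 16-bit wait status packs the exit code into the high byte and the signal/core-dump information into the low byte:

my $status = system("/path_to_return10");
my $exit_code = $status >> 8;   # 2560 >> 8 == 10
my $signal    = $status & 127;  # signal that killed the process, if any
my $core      = $status & 128;  # non-zero if a core dump was produced
print "exit=$exit_code signal=$signal core=$core\n";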