Forums Neueste Beiträge
 

Phenom-Nutzer gesucht

12/02/2008 - 00:47 von Elcaro Nosille | Report spam
Ich suche einen Nutzer eines Phenoms oder Barcelona-Opterons mit Windows >= NT4/SP2.
Ich habe ein kleines Programm geschrieben, das testet, wie hoch der Durchsatz und die
Latenz im Transfer zwischen den L1/L2-Caches der Kerne sind, und mit dem sich letztlich
die Frage beantworten ließe, ob die Single-Die-Architektur des Phenoms wirklich
nennenswert vorteilhaft ist (ich glaub's nicht, und wenn überhaupt, dann in nicht
nennenswerter Größenordnung, die von allen anderen Faktoren überwogen wird).
Ich habe das Ganze schon auf einem 3-GHz Core 2 Extreme QX6850 getestet, und der
Random-Access-Transfer aus einem L1 in den anderen liegt bei 46.4 Takten für
Transfers zwischen den L1-Caches eines Dies und bei 66.1 Takten für Transfers zwischen
den L1-Caches der Cores auf unterschiedlichen Dies.
Das EXE ist 8kB groß und kann unter
http://rapidshare.com/files/9104686...e.exe.html
runtergeladen werden.

Source füg ich hier mal ein:



#include <windows.h>

#include <stdio.h>
#include <memory.h>

// Number of elements in a statically sized array `a`.
#define ASIZE(a) ((sizeof a) / (sizeof a[0]))

// Worker thread entry point handed to CreateThread; the parameter points to
// the ThreadMessage slot belonging to that worker.
DWORD WINAPI ThreadFunc( LPVOID lpvThreadParam );
// Returns the measured time-stamp-counter rate in ticks per second.
double getCpuFrequency();

// Stride, in bytes, used for every walk over the test block and for the
// throughput arithmetic. This is the cache-line size the benchmark assumes.
#define CACHELINE_SIZE 64

// Per-worker mailbox shared between main() and one ThreadFunc instance.
// main() fills in the request fields, signals hEvtStart, and the worker
// signals hEvtFinished once its role for this turn is done.
struct ThreadMessage
{
    // Auto-reset event the worker blocks on before each turn.
    HANDLE hEvtStart;
    // Event the worker signals when the turn is complete.
    HANDLE hEvtFinished;

    // Role the worker plays in the next turn.
    enum TYPE
    {
        ACQUIRE_BLOCK_LINEAR,   // write every cache line, linear order
        POP_BLOCK_LINEAR,       // read every cache line, linear order (timed)
        ACQUIRE_BLOCK_RANDOM,   // write every link of the pointer chain
        POP_BLOCK_RANDOM        // chase the pointer chain (timed)
    } type;

    // Start of the shared test block.
    void volatile *pvBlock;
    // Size of the test block in bytes.
    size_t blockSize;
    // Elapsed TSC ticks; written by the POP_* roles, read by main().
    DWORDLONG dwlTicks;
};

// In builds without NDEBUG every SetThreadPriority(...) call in this file
// expands to a no-op expression, so no thread priority is changed there.
// NOTE(review): presumably this keeps a debug session responsive while the
// workers spin; the intent is not stated in the source.
#if !defined(NDEBUG)
#define SetThreadPriority(thread, priority) (void)0
#endif

// One node of the pointer chain built by makeRandomChain(); each node sits
// at the start of a CACHELINE_SIZE-aligned slot of the test block.
// The inline assembly in ThreadFunc follows the pointer at offset 0 and
// uses the DWORD at offset 4 of a node as the handshake flag, so the next
// pointer must remain the first (and only declared) member.
struct ChainLink
{
ChainLink *pclNext;
};

// Links the cache lines of the block into a closed chain in bit-reversed
// index order (defined further down in this file).
ChainLink *makeRandomChain( void *pvBlock, size_t blockSize );

// Index of the highest set bit of u; -1 when u is zero (defined further down).
template<typename UNSIGNED>
int log2_floor( UNSIGNED u );

int main()
{
size_t const blockSize = 16 * 1024;
unsigned threads;
unsigned log2Threads;
SYSTEM_INFO si;
void *pvBlock;
double cpuFrequency;
HANDLE ahThreads[64];
ThreadMessage aThreadMessages[64];
DWORD dwThreadId;
HANDLE ahEvtFinished[2];

printf( "cache2cache transfer-speed"
"written 11.2.08 by Olli" );

if( (threads = (unsigned)(GetSystemInfo( &si ), si.dwNumberOfProcessors)) < 2 )
return printf( "only one CPU/core" ), 0;

if( threads != (1u << (log2Threads = (unsigned)log2_floor<size_t>( threads ))) )
return printf( "suports only 2^N cores" ), 0;

pvBlock = ::VirtualAlloc( NULL, blockSize, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE );

SetThreadPriority( ::GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL );
::SetThreadAffinityMask( ::GetCurrentThread(), 1 );
cpuFrequency = getCpuFrequency();
printf( "CPU frequency: %.2lfGHz", (double)(cpuFrequency / 1.0E9) );

for( unsigned thread = 0; thread < threads; thread++ )
aThreadMessages[thread].hEvtStart = ::CreateEvent( NULL, FALSE, FALSE, NULL ),
ahThreads[thread] = ::CreateThread( NULL, 0, ThreadFunc, &aThreadMessages[thread], 0, &dwThreadId ),
::SetThreadAffinityMask( ahThreads[thread], (DWORD_PTR)1 << thread ),
SetThreadPriority( ahThreads[thread], THREAD_PRIORITY_TIME_CRITICAL );

ahEvtFinished[0] = ::CreateEvent( NULL, FALSE, FALSE, NULL );
ahEvtFinished[1] = ::CreateEvent( NULL, FALSE, FALSE, NULL );

struct
{
ThreadMessage::TYPE typeAcq,
typePop;
char *pchType;
} aPasses[2] {
ThreadMessage::ACQUIRE_BLOCK_LINEAR, ThreadMessage::POP_BLOCK_LINEAR, "linear",
ThreadMessage::ACQUIRE_BLOCK_RANDOM, ThreadMessage::POP_BLOCK_RANDOM, "random"
};

for( unsigned pass = 0; pass <= 1; pass++ )
{
if( pass == 0 )
::memset( pvBlock, 0, blockSize );
else
::makeRandomChain( pvBlock, blockSize );

for( unsigned threadPop = 0; threadPop < threads; threadPop++ )
for( unsigned mask = 1u << (log2Threads - 1); mask != 0; mask >>= 1 )
{
unsigned threadAcquire;
DWORDLONG dwlLowestTicks;
double cyclesPerCacheline;
double GBperSec;

if( (threadAcquire = threadPop ^ mask) < threadPop )
continue;

aThreadMessages[threadAcquire].hEvtFinished = ahEvtFinished[0];
aThreadMessages[threadAcquire].type = aPasses[pass].typeAcq;
aThreadMessages[threadAcquire].pvBlock = (char *)pvBlock;
aThreadMessages[threadAcquire].blockSize = blockSize;

aThreadMessages[threadPop].hEvtFinished = ahEvtFinished[1];
aThreadMessages[threadPop].type = aPasses[pass].typePop;
aThreadMessages[threadPop].pvBlock = (char *)pvBlock;
aThreadMessages[threadPop].blockSize = blockSize;

dwlLowestTicks = 0xFFFFFFFFFFFFFFFFu;
for( unsigned turn = 32; turn > 0; turn-- )
{
::SetEvent( aThreadMessages[threadAcquire].hEvtStart );
::SetEvent( aThreadMessages[threadPop].hEvtStart );
::WaitForMultipleObjects( 2, ahEvtFinished, TRUE, INFINITE );

if( aThreadMessages[threadPop].dwlTicks < dwlLowestTicks )
dwlLowestTicks = aThreadMessages[threadPop].dwlTicks;
}

cyclesPerCacheline = (double)(LONGLONG)dwlLowestTicks / (blockSize / CACHELINE_SIZE);
GBperSec = (cpuFrequency / cyclesPerCacheline) / (1024.0 * 1024.0 * 1024.0 / CACHELINE_SIZE);
::printf( "core %d from core %d %s: %.1lf cycles per cacheline (%.1lfns / %.2lfGB/s)",
(int)threadPop,
(int)threadAcquire,
(char *)aPasses[pass].pchType,
(double)cyclesPerCacheline,
(double)(1.0e9 * cyclesPerCacheline * (1.0 / cpuFrequency)),
(double)GBperSec );
}
}
getchar();

return 0;
}

// Worker thread body. lpvThreadParam points to this worker's ThreadMessage.
//
// The worker loops forever: it waits for hEvtStart, performs the role given
// in ThreadMessage::type on the shared block, and signals hEvtFinished.
//
//   ACQUIRE_* roles write to every cache line of the block, then spin until
//             the partner clears the handshake flag.
//   POP_*     roles spin until the handshake flag is set, read every cache
//             line, clear the flag, and store the elapsed TSC ticks in
//             ThreadMessage::dwlTicks.
//
// Handshake flag: the DWORD at offset 0 of the block (linear roles) or the
// DWORD at offset 4 of the first chain link (random roles).
//
// The function never returns in practice; the trailing return only
// satisfies the thread-procedure signature.
DWORD WINAPI ThreadFunc( LPVOID lpvThreadParam )
{
    ThreadMessage volatile *pThreadMessage;
    void volatile *pvBlock;
    size_t blockSize;

    pThreadMessage = (ThreadMessage *)lpvThreadParam;

    for( ; ; )
    {
        while( ::WaitForSingleObject( pThreadMessage->hEvtStart, INFINITE ) != WAIT_OBJECT_0 );
        pvBlock = pThreadMessage->pvBlock;
        blockSize = pThreadMessage->blockSize;

        switch( pThreadMessage->type )
        {
        case ThreadMessage::ACQUIRE_BLOCK_LINEAR:
            {
                __asm
                {
                    mov esi, pvBlock
                    lea esi, [esi]
                    mov ebx, blockSize
                    lea edi, [esi + ebx - CACHELINE_SIZE]

                    // Write 1 into each cache line, last line first, so the
                    // flag in the first line is set last.
                acqLoop:
                    mov DWORD PTR [edi], 1
                    lea edi, [edi - CACHELINE_SIZE]
                    cmp edi, esi
                    jae acqLoop

                    // Spin until the popping thread clears the first flag.
                waitPopped:
                    cmp DWORD PTR [esi], 0
                    jne waitPopped
                }

                ::SetEvent( pThreadMessage->hEvtFinished );
                break;
            }

        case ThreadMessage::POP_BLOCK_LINEAR:
            {
                DWORD dwTscLo,
                      dwTscHi;

                __asm
                {
                    mov esi, pvBlock
                    lea esi, [esi]
                    mov ebx, blockSize
                    lea edi, [esi + ebx - CACHELINE_SIZE]

                    // Re-read the TSC on every spin so the start stamp is
                    // the one taken just before the flag was seen.
                waitAcquired:
                    rdtsc
                    cmp DWORD PTR [esi], 1
                    jne waitAcquired
                    lea esi, [esi + CACHELINE_SIZE]

                    mov dwTscHi, edx
                    mov dwTscLo, eax

                    // Touch the remaining cache lines in ascending order.
                popLoop:
                    mov eax, [esi]
                    lea esi, [esi + CACHELINE_SIZE]
                    cmp esi, edi
                    jbe popLoop

                    // 64-bit elapsed = end - start, kept as lo/hi halves.
                    rdtsc
                    sub eax, dwTscLo
                    sbb edx, dwTscHi
                    mov dwTscLo, eax
                    mov dwTscHi, edx

                    // esi is now block + blockSize; clear the first flag.
                    neg ebx
                    mov DWORD PTR [esi + ebx], 0
                }

                // Fix: the high half must be shifted up, not down.
                pThreadMessage->dwlTicks = ((DWORDLONG)dwTscHi << 32) | dwTscLo;
                ::SetEvent( pThreadMessage->hEvtFinished );
                break;
            }

        case ThreadMessage::ACQUIRE_BLOCK_RANDOM:
            {
                __asm
                {
                    mov edi, pvBlock
                    mov esi, edi

                    // Follow the chain once, writing 1 into each link's
                    // flag DWORD; the head link is reached and marked last.
                acqChainLoop:
                    mov esi, [esi]
                    // Explicit DWORD size, matching the other flag accesses.
                    mov DWORD PTR [esi + 4], 1
                    cmp esi, edi
                    jne acqChainLoop

                    // Spin until the popping thread clears the head flag.
                waitChainPopped:
                    cmp DWORD PTR [esi + 4], 0
                    jne waitChainPopped
                }

                ::SetEvent( pThreadMessage->hEvtFinished );
                break;
            }

        case ThreadMessage::POP_BLOCK_RANDOM:
            {
                DWORD dwTscLo,
                      dwTscHi;

                __asm
                {
                    mov edi, pvBlock
                    mov esi, edi

                    // Spin on the head flag, refreshing the start stamp.
                waitChainAcquired:
                    rdtsc
                    cmp DWORD PTR [esi + 4], 1
                    jne waitChainAcquired

                    mov dwTscHi, edx
                    mov dwTscLo, eax

                    // Chase the pointers until the chain returns to the head.
                popChainLoop:
                    mov esi, [esi]
                    cmp esi, edi
                    jne popChainLoop

                    mov DWORD PTR [esi + 4], 0

                    // 64-bit elapsed = end - start, kept as lo/hi halves.
                    rdtsc
                    sub eax, dwTscLo
                    sbb edx, dwTscHi
                    mov dwTscLo, eax
                    mov dwTscHi, edx
                }

                // Fix: the high half must be shifted up, not down.
                pThreadMessage->dwlTicks = ((DWORDLONG)dwTscHi << 32) | dwTscLo;
                ::SetEvent( pThreadMessage->hEvtFinished );
                break;
            }
        }
    }

    return 0;
}

// Forward declarations of the bit helpers used by makeRandomChain(); the
// definitions follow further down in this file.

// Reverses the order of the low `bits` bits of u.
template<typename UNSIGNED>
UNSIGNED reverse_bits( UNSIGNED u, unsigned bits );

// log2 of u, rounded up.
template<typename UNSIGNED>
int log2_ceil( UNSIGNED u );

// log2 of u, rounded down; -1 when u is zero.
template<typename UNSIGNED>
int log2_floor( UNSIGNED u );

// Threads the cache lines of [pvBlock, pvBlock + blockSize) into one closed
// singly linked chain.
//
// The chain starts at the first cache line and visits the remaining lines
// in bit-reversed index order, so consecutive links are not adjacent in
// memory. Indices that fall beyond the block (possible when blockSize is
// not a power of two) are skipped. The last link points back to the first.
//
// pvBlock   start of the block; a ChainLink is stored at the start of each
//           CACHELINE_SIZE-sized slot
// blockSize size of the block in bytes
//
// Always returns NULL; the chain is reached through pvBlock.
ChainLink *makeRandomChain( void *pvBlock, size_t blockSize )
{
    ChainLink *const head = (ChainLink *)pvBlock;

    unsigned const blockBits   = log2_ceil<size_t>( blockSize );
    unsigned const clBits      = log2_ceil<size_t>( CACHELINE_SIZE );
    unsigned const reverseBits = blockBits - clBits;

    // Number of bit-reversed indices to enumerate / number of real lines.
    size_t const slotCount = (size_t)1 << reverseBits;
    size_t const lineCount = blockSize >> clBits;

    // Index 0 is the head itself, so enumeration starts at 1.
    ChainLink *tail = head;
    for( unsigned slot = 1; slot != slotCount; ++slot )
    {
        unsigned const line = reverse_bits<unsigned>( slot, reverseBits );
        if( line >= lineCount )
            continue;

        ChainLink *const link = (ChainLink *)((char *)pvBlock + (line << clBits));
        tail->pclNext = link;
        tail = link;
    }

    // Close the ring.
    tail->pclNext = head;

    // Walk the finished ring once, back to the head.
    for( ChainLink *walk = head->pclNext; walk != head; walk = walk->pclNext )
        ;

    return NULL;
}

// Returns the current time-stamp-counter value (defined further down).
DWORDLONG __fastcall GetTsc();

double getCpuFrequency()
{
LONGLONG llPcFrequency;
LONGLONG llPcA,
llPcB;
LONGLONG llTscA,
llTscB;

QueryPerformanceFrequency( &(LARGE_INTEGER &)llPcFrequency );
QueryPerformanceCounter( &(LARGE_INTEGER &)llPcA );
llTscA = GetTsc();

__asm
{
mov ecx, 1000000000
billionLoop:
sub ecx, 1
jnz billionLoop
}

QueryPerformanceCounter( &(LARGE_INTEGER &)llPcB );
llTscB = GetTsc();

return (double)(llTscB - llTscA) / ((double)(llPcB - llPcA) / (double)llPcFrequency);
}

// Reads the processor's time-stamp counter.
// The function is declared naked, so it consists of exactly the two
// instructions below with no compiler-generated prologue or epilogue.
// rdtsc leaves the counter in EDX:EAX (high:low), which is returned
// unchanged as the 64-bit result.
// NOTE(review): relies on the 32-bit MSVC convention of returning 64-bit
// integers in EDX:EAX.
__declspec(naked)
DWORDLONG __fastcall GetTsc()
{
__asm
{
rdtsc
ret
}
}

// Reverses the bit order of u across the full width of UNSIGNED
// (bit 0 becomes the top bit and vice versa).
// UNSIGNED must be an unsigned integer type.
template<typename UNSIGNED>
inline UNSIGNED reverse_bits( UNSIGNED u )
{
    unsigned const width = (unsigned)(sizeof(UNSIGNED) * CHAR_BIT);
    UNSIGNED reversed = 0;

    // Shift the bits out of u from the bottom and into the result from the
    // bottom; after `width` steps the order is mirrored.
    for( unsigned bit = 0; bit < width; ++bit )
    {
        reversed = (UNSIGNED)((reversed << 1) | (u & 1));
        u >>= 1;
    }

    return reversed;
}

// Reverses the order of the low `bits` bits of u and returns them
// right-aligned; higher bits of u are ignored.
//
// bits == 0                    yields 0
// bits >= width of UNSIGNED    yields the full-width reversal
//
// Both edge cases previously shifted by an out-of-range amount, which is
// undefined behaviour.
template<typename UNSIGNED>
inline UNSIGNED reverse_bits( UNSIGNED u, unsigned bits )
{
    unsigned const width = (unsigned)(sizeof(UNSIGNED) * CHAR_BIT);

    if( bits == 0 )
        return 0;
    if( bits >= width )
        return reverse_bits<UNSIGNED>( u );

    return (UNSIGNED)(reverse_bits<UNSIGNED>( u ) >> (width - bits));
}

// Returns the index of the highest set bit of u, i.e. floor(log2(u)),
// or -1 when u is zero. UNSIGNED must be an unsigned integer type.
//
// The mask starts with all bits of UNSIGNED set and loses its lowest bit on
// each step; the loop ends once no set bit of u is covered any more.
// The mask is built from UNSIGNED rather than from `~0u`: the latter has
// only the low 32 bits set and gave wrong results for wider types.
template<typename UNSIGNED>
int log2_floor( UNSIGNED u )
{
    int result = -1;

    for( UNSIGNED mask = (UNSIGNED)~(UNSIGNED)0; (u & mask) != 0; mask <<= 1 )
        result++;

    return result;
}

// Returns ceil(log2(u)): the floor value, plus one when u is not an exact
// power of two. UNSIGNED must be an unsigned integer type.
//
// For u == 0 the result is -1, the same value log2_floor reports; the
// previous code shifted by -1 in that case, which is undefined behaviour.
template<typename UNSIGNED>
int log2_ceil( UNSIGNED u )
{
    int const floorLog2 = log2_floor<UNSIGNED>( u );

    if( floorLog2 < 0 )
        return floorLog2;

    return floorLog2 + (u > ((UNSIGNED)1 << floorLog2) ? 1 : 0);
}
 

Lesen Sie die Antworten

#1 Ralf Hildebrandt
13/02/2008 - 16:34 | Warnen spam
Elcaro Nosille schrieb:


die Frage beantworten ließe, ob die Single-Die-Architektur des Phenoms
wirklich nennenswert vorteilhaft ist (ich glaub's nicht, und wenn überhaupt,
dann in nicht nennenswerter Größenordnung, die von allen anderen Faktoren
überwogen wird).



Laut c't 2/2008, S. 75, im Artikel "Vierzylinder" steht:

Beim Rendering-Programm Cinema4D etwa leisten zwei Core2-Kerne ungefähr
das 1,9fache eines einzelnen, ein Quad aber lediglich das 3,5fache. Hier
wird übrigens der von AMD so oft herausgestellte Vorteil der "echten"
Vierkerne deutlich: Die Phenom-Kerne kooperieren in diesem Benchmark
tatsächlich besser, sein Skalierungsfaktor beträgt fast 3,9. Wegen der
pro Kern niedrigeren Performance der bisher lieferbaren Phenoms bei
diesem ist aber der Core 2 Quad Q6600 trotzdem schneller ...


Ralf

Ähnliche Fragen