Litecoin: More straightforward SSE2 detection

- Move CPU capabilities detection from init.cpp to scrypt.cpp
- scrypt_1024_1_1_256_sp is defined as a macro: no need to check for SSE2
  every time it is used
- Test correctness of both scrypt-generic and scrypt-sse2
This commit is contained in:
Nikolay Belikov 2013-12-25 18:33:11 +04:00 committed by Warren Togami
parent 4a96ed8039
commit d781651e4c
5 changed files with 61 additions and 83 deletions

View File

@ -22,18 +22,6 @@
#include <signal.h> #include <signal.h>
#endif #endif
#if defined(USE_SSE2)
#if !defined(MAC_OSX) && (defined(_M_IX86) || defined(__i386__) || defined(__i386))
#ifdef _MSC_VER
// MSVC 64bit is unable to use inline asm
#include <intrin.h>
#else
// GCC Linux or i686-w64-mingw32
#include <cpuid.h>
#endif
#endif
#endif
using namespace std; using namespace std;
using namespace boost; using namespace boost;
@ -512,23 +500,6 @@ bool AppInit2(boost::thread_group& threadGroup)
sigaction(SIGHUP, &sa_hup, NULL); sigaction(SIGHUP, &sa_hup, NULL);
#endif #endif
#if defined(USE_SSE2)
unsigned int cpuid_edx=0;
#if !defined(MAC_OSX) && (defined(_M_IX86) || defined(__i386__) || defined(__i386))
// 32bit x86 Linux or Windows, detect cpuid features
#if defined(_MSC_VER)
// MSVC
int x86cpuid[4];
__cpuid(x86cpuid, 1);
cpuid_edx = (unsigned int)buffer[3];
#else
// Linux or i686-w64-mingw32 (gcc-4.6.3)
unsigned int eax, ebx, ecx;
__get_cpuid(1, &eax, &ebx, &ecx, &cpuid_edx);
#endif
#endif
#endif
// ********************************************************* Step 2: parameter interactions // ********************************************************* Step 2: parameter interactions
fTestNet = GetBoolArg("-testnet"); fTestNet = GetBoolArg("-testnet");
@ -698,6 +669,10 @@ bool AppInit2(boost::thread_group& threadGroup)
int64 nStart; int64 nStart;
#if defined(USE_SSE2)
scrypt_detect_sse2();
#endif
// ********************************************************* Step 5: verify wallet database integrity // ********************************************************* Step 5: verify wallet database integrity
if (!fDisableWallet) { if (!fDisableWallet) {
@ -846,10 +821,6 @@ bool AppInit2(boost::thread_group& threadGroup)
// ********************************************************* Step 7: load block chain // ********************************************************* Step 7: load block chain
#if defined(USE_SSE2)
scrypt_detect_sse2(cpuid_edx);
#endif
fReindex = GetBoolArg("-reindex"); fReindex = GetBoolArg("-reindex");
// Upgrading to 0.8; hard-link the old blknnnn.dat files into /blocks/ // Upgrading to 0.8; hard-link the old blknnnn.dat files into /blocks/

View File

@ -4596,20 +4596,7 @@ void static LitecoinMiner(CWallet *pwallet)
char scratchpad[SCRYPT_SCRATCHPAD_SIZE]; char scratchpad[SCRYPT_SCRATCHPAD_SIZE];
loop loop
{ {
#if defined(USE_SSE2)
// Detection would work, but in cases where we KNOW it always has SSE2,
// it is faster to use directly than to use a function pointer or conditional.
#if defined(_M_X64) || defined(__x86_64__) || defined(_M_AMD64) || (defined(MAC_OSX) && defined(__i386__))
// Always SSE2: x86_64 or Intel MacOS X
scrypt_1024_1_1_256_sp_sse2(BEGIN(pblock->nVersion), BEGIN(thash), scratchpad);
#else
// Detect SSE2: 32bit x86 Linux or Windows
scrypt_1024_1_1_256_sp(BEGIN(pblock->nVersion), BEGIN(thash), scratchpad); scrypt_1024_1_1_256_sp(BEGIN(pblock->nVersion), BEGIN(thash), scratchpad);
#endif
#else
// Generic scrypt
scrypt_1024_1_1_256_sp_generic(BEGIN(pblock->nVersion), BEGIN(thash), scratchpad);
#endif
if (thash <= hashTarget) if (thash <= hashTarget)
{ {

View File

@ -34,6 +34,16 @@
#include <string.h> #include <string.h>
#include <openssl/sha.h> #include <openssl/sha.h>
#if defined(USE_SSE2) && !defined(USE_SSE2_ALWAYS)
#ifdef _MSC_VER
// MSVC 64bit is unable to use inline asm
#include <intrin.h>
#else
// GCC Linux or i686-w64-mingw32
#include <cpuid.h>
#endif
#endif
static inline uint32_t be32dec(const void *pp) static inline uint32_t be32dec(const void *pp)
{ {
const uint8_t *p = (uint8_t const *)pp; const uint8_t *p = (uint8_t const *)pp;
@ -202,26 +212,26 @@ static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16])
/* Operate on columns. */ /* Operate on columns. */
x04 ^= ROTL(x00 + x12, 7); x09 ^= ROTL(x05 + x01, 7); x04 ^= ROTL(x00 + x12, 7); x09 ^= ROTL(x05 + x01, 7);
x14 ^= ROTL(x10 + x06, 7); x03 ^= ROTL(x15 + x11, 7); x14 ^= ROTL(x10 + x06, 7); x03 ^= ROTL(x15 + x11, 7);
x08 ^= ROTL(x04 + x00, 9); x13 ^= ROTL(x09 + x05, 9); x08 ^= ROTL(x04 + x00, 9); x13 ^= ROTL(x09 + x05, 9);
x02 ^= ROTL(x14 + x10, 9); x07 ^= ROTL(x03 + x15, 9); x02 ^= ROTL(x14 + x10, 9); x07 ^= ROTL(x03 + x15, 9);
x12 ^= ROTL(x08 + x04, 13); x01 ^= ROTL(x13 + x09, 13); x12 ^= ROTL(x08 + x04, 13); x01 ^= ROTL(x13 + x09, 13);
x06 ^= ROTL(x02 + x14, 13); x11 ^= ROTL(x07 + x03, 13); x06 ^= ROTL(x02 + x14, 13); x11 ^= ROTL(x07 + x03, 13);
x00 ^= ROTL(x12 + x08, 18); x05 ^= ROTL(x01 + x13, 18); x00 ^= ROTL(x12 + x08, 18); x05 ^= ROTL(x01 + x13, 18);
x10 ^= ROTL(x06 + x02, 18); x15 ^= ROTL(x11 + x07, 18); x10 ^= ROTL(x06 + x02, 18); x15 ^= ROTL(x11 + x07, 18);
/* Operate on rows. */ /* Operate on rows. */
x01 ^= ROTL(x00 + x03, 7); x06 ^= ROTL(x05 + x04, 7); x01 ^= ROTL(x00 + x03, 7); x06 ^= ROTL(x05 + x04, 7);
x11 ^= ROTL(x10 + x09, 7); x12 ^= ROTL(x15 + x14, 7); x11 ^= ROTL(x10 + x09, 7); x12 ^= ROTL(x15 + x14, 7);
x02 ^= ROTL(x01 + x00, 9); x07 ^= ROTL(x06 + x05, 9); x02 ^= ROTL(x01 + x00, 9); x07 ^= ROTL(x06 + x05, 9);
x08 ^= ROTL(x11 + x10, 9); x13 ^= ROTL(x12 + x15, 9); x08 ^= ROTL(x11 + x10, 9); x13 ^= ROTL(x12 + x15, 9);
x03 ^= ROTL(x02 + x01, 13); x04 ^= ROTL(x07 + x06, 13); x03 ^= ROTL(x02 + x01, 13); x04 ^= ROTL(x07 + x06, 13);
x09 ^= ROTL(x08 + x11, 13); x14 ^= ROTL(x13 + x12, 13); x09 ^= ROTL(x08 + x11, 13); x14 ^= ROTL(x13 + x12, 13);
x00 ^= ROTL(x03 + x02, 18); x05 ^= ROTL(x04 + x07, 18); x00 ^= ROTL(x03 + x02, 18); x05 ^= ROTL(x04 + x07, 18);
x10 ^= ROTL(x09 + x08, 18); x15 ^= ROTL(x14 + x13, 18); x10 ^= ROTL(x09 + x08, 18); x15 ^= ROTL(x14 + x13, 18);
} }
@ -251,7 +261,7 @@ void scrypt_1024_1_1_256_sp_generic(const char *input, char *output, char *scrat
uint32_t i, j, k; uint32_t i, j, k;
V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
PBKDF2_SHA256((const uint8_t *)input, 80, (const uint8_t *)input, 80, 1, B, 128); PBKDF2_SHA256((const uint8_t *)input, 80, (const uint8_t *)input, 80, 1, B, 128);
for (k = 0; k < 32; k++) for (k = 0; k < 32; k++)
@ -277,47 +287,43 @@ void scrypt_1024_1_1_256_sp_generic(const char *input, char *output, char *scrat
} }
#if defined(USE_SSE2) #if defined(USE_SSE2)
#if defined(_M_X64) || defined(__x86_64__) || defined(_M_AMD64) || (defined(MAC_OSX) && defined(__i386__)) // Default to the generic scrypt function. This prevents a crash if scrypt_detect_sse2() was never called
/* Always SSE2 */ void (*scrypt_1024_1_1_256_sp_detected)(const char *input, char *output, char *scratchpad) = &scrypt_1024_1_1_256_sp_generic;
void scrypt_detect_sse2(unsigned int cpuid_edx)
{
printf("scrypt: using scrypt-sse2 as built.\n");
}
#else
/* Detect SSE2 */
void (*scrypt_1024_1_1_256_sp)(const char *input, char *output, char *scratchpad);
void scrypt_detect_sse2(unsigned int cpuid_edx) void scrypt_detect_sse2()
{ {
#if defined(USE_SSE2_ALWAYS)
printf("scrypt: using scrypt-sse2 as built.\n");
#else // USE_SSE2_ALWAYS
// 32bit x86 Linux or Windows, detect cpuid features
unsigned int cpuid_edx=0;
#if defined(_MSC_VER)
// MSVC
int x86cpuid[4];
__cpuid(x86cpuid, 1);
cpuid_edx = (unsigned int)buffer[3];
#else // _MSC_VER
// Linux or i686-w64-mingw32 (gcc-4.6.3)
unsigned int eax, ebx, ecx;
__get_cpuid(1, &eax, &ebx, &ecx, &cpuid_edx);
#endif // _MSC_VER
if (cpuid_edx & 1<<26) if (cpuid_edx & 1<<26)
{ {
scrypt_1024_1_1_256_sp = &scrypt_1024_1_1_256_sp_sse2; scrypt_1024_1_1_256_sp_detected = &scrypt_1024_1_1_256_sp_sse2;
printf("scrypt: using scrypt-sse2 as detected.\n"); printf("scrypt: using scrypt-sse2 as detected.\n");
} }
else else
{ {
scrypt_1024_1_1_256_sp = &scrypt_1024_1_1_256_sp_generic; scrypt_1024_1_1_256_sp_detected = &scrypt_1024_1_1_256_sp_generic;
printf("scrypt: using scrypt-generic, SSE2 unavailable.\n"); printf("scrypt: using scrypt-generic, SSE2 unavailable.\n");
} }
#endif // USE_SSE2_ALWAYS
} }
#endif #endif
#endif
// Convenience wrapper around the scratchpad-taking scrypt routine: declares a
// local scratchpad of SCRYPT_SCRATCHPAD_SIZE bytes and hashes `input` into
// `output` with it, so the caller does not have to supply a scratchpad.
// After this change the implementation is chosen by the scrypt_1024_1_1_256_sp
// macro from scrypt.h (sse2 / detected / generic); the per-call preprocessor
// dispatch shown below is the removed side of the diff.
void scrypt_1024_1_1_256(const char *input, char *output) void scrypt_1024_1_1_256(const char *input, char *output)
{ {
char scratchpad[SCRYPT_SCRATCHPAD_SIZE]; char scratchpad[SCRYPT_SCRATCHPAD_SIZE];
#if defined(USE_SSE2) scrypt_1024_1_1_256_sp(input, output, scratchpad);
// Detection would work, but in cases where we KNOW it always has SSE2,
// it is faster to use directly than to use a function pointer or conditional.
#if defined(_M_X64) || defined(__x86_64__) || defined(_M_AMD64) || (defined(MAC_OSX) && defined(__i386__))
// Always SSE2: x86_64 or Intel MacOS X
scrypt_1024_1_1_256_sp_sse2(input, output, scratchpad);
#else
// Detect SSE2: 32bit x86 Linux or Windows
scrypt_1024_1_1_256_sp(input, output, scratchpad);
#endif
#else
// Generic scrypt
scrypt_1024_1_1_256_sp_generic(input, output, scratchpad);
#endif
} }

View File

@ -2,15 +2,25 @@
#define SCRYPT_H #define SCRYPT_H
#include <stdlib.h> #include <stdlib.h>
#include <stdint.h> #include <stdint.h>
static const int SCRYPT_SCRATCHPAD_SIZE = 131072 + 63; static const int SCRYPT_SCRATCHPAD_SIZE = 131072 + 63;
void scrypt_1024_1_1_256(const char *input, char *output); void scrypt_1024_1_1_256(const char *input, char *output);
void scrypt_1024_1_1_256_sp_generic(const char *input, char *output, char *scratchpad); void scrypt_1024_1_1_256_sp_generic(const char *input, char *output, char *scratchpad);
#if defined(USE_SSE2) #if defined(USE_SSE2)
extern void scrypt_detect_sse2(unsigned int cpuid_edx); #if defined(_M_X64) || defined(__x86_64__) || defined(_M_AMD64) || (defined(MAC_OSX) && defined(__i386__))
#define USE_SSE2_ALWAYS 1
#define scrypt_1024_1_1_256_sp(input, output, scratchpad) scrypt_1024_1_1_256_sp_sse2((input), (output), (scratchpad))
#else
#define scrypt_1024_1_1_256_sp(input, output, scratchpad) scrypt_1024_1_1_256_sp_detected((input), (output), (scratchpad))
#endif
void scrypt_detect_sse2();
void scrypt_1024_1_1_256_sp_sse2(const char *input, char *output, char *scratchpad); void scrypt_1024_1_1_256_sp_sse2(const char *input, char *output, char *scratchpad);
extern void (*scrypt_1024_1_1_256_sp)(const char *input, char *output, char *scratchpad); extern void (*scrypt_1024_1_1_256_sp_detected)(const char *input, char *output, char *scratchpad);
#else
#define scrypt_1024_1_1_256_sp(input, output, scratchpad) scrypt_1024_1_1_256_sp_generic((input), (output), (scratchpad))
#endif #endif
void void

View File

@ -11,6 +11,9 @@ BOOST_AUTO_TEST_CASE(scrypt_hashtest)
#define HASHCOUNT 5 #define HASHCOUNT 5
const char* inputhex[HASHCOUNT] = { "020000004c1271c211717198227392b029a64a7971931d351b387bb80db027f270411e398a07046f7d4a08dd815412a8712f874a7ebf0507e3878bd24e20a3b73fd750a667d2f451eac7471b00de6659", "0200000011503ee6a855e900c00cfdd98f5f55fffeaee9b6bf55bea9b852d9de2ce35828e204eef76acfd36949ae56d1fbe81c1ac9c0209e6331ad56414f9072506a77f8c6faf551eac7471b00389d01", "02000000a72c8a177f523946f42f22c3e86b8023221b4105e8007e59e81f6beb013e29aaf635295cb9ac966213fb56e046dc71df5b3f7f67ceaeab24038e743f883aff1aaafaf551eac7471b0166249b", "010000007824bc3a8a1b4628485eee3024abd8626721f7f870f8ad4d2f33a27155167f6a4009d1285049603888fe85a84b6c803a53305a8d497965a5e896e1a00568359589faf551eac7471b0065434e", "0200000050bfd4e4a307a8cb6ef4aef69abc5c0f2d579648bd80d7733e1ccc3fbc90ed664a7f74006cb11bde87785f229ecd366c2d4e44432832580e0608c579e4cb76f383f7f551eac7471b00c36982" }; const char* inputhex[HASHCOUNT] = { "020000004c1271c211717198227392b029a64a7971931d351b387bb80db027f270411e398a07046f7d4a08dd815412a8712f874a7ebf0507e3878bd24e20a3b73fd750a667d2f451eac7471b00de6659", "0200000011503ee6a855e900c00cfdd98f5f55fffeaee9b6bf55bea9b852d9de2ce35828e204eef76acfd36949ae56d1fbe81c1ac9c0209e6331ad56414f9072506a77f8c6faf551eac7471b00389d01", "02000000a72c8a177f523946f42f22c3e86b8023221b4105e8007e59e81f6beb013e29aaf635295cb9ac966213fb56e046dc71df5b3f7f67ceaeab24038e743f883aff1aaafaf551eac7471b0166249b", "010000007824bc3a8a1b4628485eee3024abd8626721f7f870f8ad4d2f33a27155167f6a4009d1285049603888fe85a84b6c803a53305a8d497965a5e896e1a00568359589faf551eac7471b0065434e", "0200000050bfd4e4a307a8cb6ef4aef69abc5c0f2d579648bd80d7733e1ccc3fbc90ed664a7f74006cb11bde87785f229ecd366c2d4e44432832580e0608c579e4cb76f383f7f551eac7471b00c36982" };
const char* expected[HASHCOUNT] = { "00000000002bef4107f882f6115e0b01f348d21195dacd3582aa2dabd7985806" , "00000000003a0d11bdd5eb634e08b7feddcfbbf228ed35d250daf19f1c88fc94", "00000000000b40f895f288e13244728a6c2d9d59d8aff29c65f8dd5114a8ca81", "00000000003007005891cd4923031e99d8e8d72f6e8e7edc6a86181897e105fe", "000000000018f0b426a4afc7130ccb47fa02af730d345b4fe7c7724d3800ec8c" }; const char* expected[HASHCOUNT] = { "00000000002bef4107f882f6115e0b01f348d21195dacd3582aa2dabd7985806" , "00000000003a0d11bdd5eb634e08b7feddcfbbf228ed35d250daf19f1c88fc94", "00000000000b40f895f288e13244728a6c2d9d59d8aff29c65f8dd5114a8ca81", "00000000003007005891cd4923031e99d8e8d72f6e8e7edc6a86181897e105fe", "000000000018f0b426a4afc7130ccb47fa02af730d345b4fe7c7724d3800ec8c" };
#if defined(USE_SSE2)
scrypt_detect_sse2();
#endif
uint256 scrypthash; uint256 scrypthash;
std::vector<unsigned char> inputbytes; std::vector<unsigned char> inputbytes;
char scratchpad[SCRYPT_SCRATCHPAD_SIZE]; char scratchpad[SCRYPT_SCRATCHPAD_SIZE];
@ -19,6 +22,7 @@ BOOST_AUTO_TEST_CASE(scrypt_hashtest)
#if defined(USE_SSE2) #if defined(USE_SSE2)
// Test SSE2 scrypt // Test SSE2 scrypt
scrypt_1024_1_1_256_sp_sse2((const char*)&inputbytes[0], BEGIN(scrypthash), scratchpad); scrypt_1024_1_1_256_sp_sse2((const char*)&inputbytes[0], BEGIN(scrypthash), scratchpad);
BOOST_CHECK_EQUAL(scrypthash.ToString().c_str(), expected[i]);
#endif #endif
// Test generic scrypt // Test generic scrypt
scrypt_1024_1_1_256_sp_generic((const char*)&inputbytes[0], BEGIN(scrypthash), scratchpad); scrypt_1024_1_1_256_sp_generic((const char*)&inputbytes[0], BEGIN(scrypthash), scratchpad);