From b1b602a92887f271db3101d67e0319ce31fa68b3 Mon Sep 17 00:00:00 2001 From: antirez Date: Sun, 25 Nov 2012 16:21:21 +0100 Subject: [PATCH] On crash memory test rewritten so that it actually works. 1) We no longer test location by location, otherwise the CPU write cache completely makes our business useless. 2) We still need a memory test that operates in steps from the first to the last location in order to never hit the cache, but that is still able to retain the memory content. This was tested using a Linux box containing a bad memory module with a single bit error (always zero). So the final solution has an error propagation step that is: 1) Invert bits at every location. 2) Swap adjacent locations. 3) Swap adjacent locations again. 4) Invert bits at every location. 5) Swap adjacent locations. 6) Swap adjacent locations again. Before and after these steps, and after step 4, a CRC64 checksum is computed. If the three CRC64 checksums don't match, a memory error was detected. --- src/debug.c | 62 ++++++++++++++++++++++++++++++++++++++++++--------- src/memtest.c | 42 ++++++++++++++++------------------ 2 files changed, 70 insertions(+), 34 deletions(-) diff --git a/src/debug.c b/src/debug.c index 2725aae6..31cfac65 100644 --- a/src/debug.c +++ b/src/debug.c @@ -667,16 +667,22 @@ void logCurrentClient(void) { } #if defined(HAVE_PROC_MAPS) -int memtest_non_destructive(void *addr, size_t size); /* memtest.c */ +uint64_t crc64(uint64_t crc, const unsigned char *s, uint64_t l); +void memtest_non_destructive_invert(void *addr, size_t size); +void memtest_non_destructive_swap(void *addr, size_t size); +#define MEMTEST_MAX_REGIONS 128 int memtest_test_linux_anonymous_maps(void) { FILE *fp = fopen("/proc/self/maps","r"); char line[1024]; size_t start_addr, end_addr, size; + size_t start_vect[MEMTEST_MAX_REGIONS]; + size_t size_vect[MEMTEST_MAX_REGIONS]; + int regions = 0, j; + uint64_t crc1 = 0, crc2 = 0, crc3 = 0; while(fgets(line,sizeof(line),fp) != NULL) { char *start, 
*end, *p = line; - int j; start = p; p = strchr(p,'-'); @@ -695,17 +701,51 @@ int memtest_test_linux_anonymous_maps(void) { start_addr = strtoul(start,NULL,16); end_addr = strtoul(end,NULL,16); size = end_addr-start_addr; - redisLog(REDIS_WARNING, - "Testing memory at %lx (%lu bytes)", start_addr, size); - for (j = 0; j < 3; j++) { - if (memtest_non_destructive((void*)start_addr,size) != 0) { - fclose(fp); - return 1; - } - } + + start_vect[regions] = start_addr; + size_vect[regions] = size; + printf("Testing %lx %lu\n", start_vect[regions], size_vect[regions]); + regions++; } + + /* Test all the regions as an unique sequential region. + * 1) Take the CRC64 of the memory region. */ + for (j = 0; j < regions; j++) { + crc1 = crc64(crc1,(void*)start_vect[j],size_vect[j]); + } + + /* 2) Invert bits, swap adiacent words, swap again, invert bits. + * This is the error amplification step. */ + for (j = 0; j < regions; j++) + memtest_non_destructive_invert((void*)start_vect[j],size_vect[j]); + for (j = 0; j < regions; j++) + memtest_non_destructive_swap((void*)start_vect[j],size_vect[j]); + for (j = 0; j < regions; j++) + memtest_non_destructive_swap((void*)start_vect[j],size_vect[j]); + for (j = 0; j < regions; j++) + memtest_non_destructive_invert((void*)start_vect[j],size_vect[j]); + + /* 3) Take the CRC64 sum again. */ + for (j = 0; j < regions; j++) + crc2 = crc64(crc2,(void*)start_vect[j],size_vect[j]); + + /* 4) Swap + Swap again */ + for (j = 0; j < regions; j++) + memtest_non_destructive_swap((void*)start_vect[j],size_vect[j]); + for (j = 0; j < regions; j++) + memtest_non_destructive_swap((void*)start_vect[j],size_vect[j]); + + /* 5) Take the CRC64 sum again. */ + for (j = 0; j < regions; j++) + crc3 = crc64(crc3,(void*)start_vect[j],size_vect[j]); + + /* NOTE: It is very important to close the file descriptor only now + * because closing it before may result into unmapping of some memory + * region that we are testing. 
*/ fclose(fp); - return 0; + + /* If the two CRC are not the same, we trapped a memory error. */ + return crc1 != crc2 || crc2 != crc3; } #endif diff --git a/src/memtest.c b/src/memtest.c index 82da27c8..754d0202 100644 --- a/src/memtest.c +++ b/src/memtest.c @@ -241,34 +241,30 @@ void memtest_test(size_t megabytes, int passes) { } } -/* This is a fast O(N) best effort memory test, only ZERO-ONE tests and - * checkerboard tests are performed, without pauses between setting and - * reading the value, so this can only detect a subclass of permanent errors. - * - * However the function does not destroy the content of the memory tested that - * is left unmodified. - * - * If a memory error is detected, 1 is returned. Otherwise 0 is returned. */ -int memtest_non_destructive(void *addr, size_t size) { +void memtest_non_destructive_invert(void *addr, size_t size) { volatile unsigned long *p = addr; - unsigned long val; + size_t words = size / sizeof(unsigned long); size_t j; - size /= sizeof(unsigned long); - for (j = 0; j < size; j++) { - val = p[j]; + /* Invert */ + for (j = 0; j < words; j++) + p[j] = ~p[j]; +} - p[j] = 0; if (p[j] != 0) goto err; - p[j] = (unsigned long)-1; if (p[j] != (unsigned long)-1) goto err; - p[j] = ULONG_ONEZERO; if (p[j] != ULONG_ONEZERO) goto err; - p[j] = ULONG_ZEROONE; if (p[j] != ULONG_ZEROONE) goto err; - p[j] = val; /* restore the original value. */ - } - return 0; +void memtest_non_destructive_swap(void *addr, size_t size) { + volatile unsigned long *p = addr; + size_t words = size / sizeof(unsigned long); + size_t j; -err: /* memory error detected. */ - p[j] = val; - return 1; + /* Swap */ + for (j = 0; j < words; j += 2) { + unsigned long a, b; + + a = p[j]; + b = p[j+1]; + p[j] = b; + p[j+1] = a; + } } void memtest(size_t megabytes, int passes) { -- 2.47.2