+ /* Fast path: as far as we have data for all the input bitmaps we
+ * can take a fast path that performs much better than the
+ * vanilla algorithm. */
+ j = 0;
+ if (minlen && numkeys <= 16) {
+ unsigned long *lp[16];
+ unsigned long *lres = (unsigned long*) res;
+
+ /* Note: sds pointer is always aligned to 8 byte boundary. */
+ memcpy(lp,src,sizeof(unsigned long*)*numkeys);
+ memcpy(res,src[0],minlen);
+
+ /* Different branches per different operations for speed (sorry). */
+ if (op == BITOP_AND) {
+ while(minlen >= sizeof(unsigned long)*4) {
+ for (i = 1; i < numkeys; i++) {
+ lres[0] &= lp[i][0];
+ lres[1] &= lp[i][1];
+ lres[2] &= lp[i][2];
+ lres[3] &= lp[i][3];
+ lp[i]+=4;
+ }
+ lres+=4;
+ j += sizeof(unsigned long)*4;
+ minlen -= sizeof(unsigned long)*4;
+ }
+ } else if (op == BITOP_OR) {
+ while(minlen >= sizeof(unsigned long)*4) {
+ for (i = 1; i < numkeys; i++) {
+ lres[0] |= lp[i][0];
+ lres[1] |= lp[i][1];
+ lres[2] |= lp[i][2];
+ lres[3] |= lp[i][3];
+ lp[i]+=4;
+ }
+ lres+=4;
+ j += sizeof(unsigned long)*4;
+ minlen -= sizeof(unsigned long)*4;
+ }
+ } else if (op == BITOP_XOR) {
+ while(minlen >= sizeof(unsigned long)*4) {
+ for (i = 1; i < numkeys; i++) {
+ lres[0] ^= lp[i][0];
+ lres[1] ^= lp[i][1];
+ lres[2] ^= lp[i][2];
+ lres[3] ^= lp[i][3];
+ lp[i]+=4;
+ }
+ lres+=4;
+ j += sizeof(unsigned long)*4;
+ minlen -= sizeof(unsigned long)*4;
+ }
+ } else if (op == BITOP_NOT) {
+ while(minlen >= sizeof(unsigned long)*4) {
+ lres[0] = ~lres[0];
+ lres[1] = ~lres[1];
+ lres[2] = ~lres[2];
+ lres[3] = ~lres[3];
+ lres+=4;
+ j += sizeof(unsigned long)*4;
+ minlen -= sizeof(unsigned long)*4;
+ }
+ }
+ }
+
+ /* j is set to the next byte to process by the previous loop. */
+ for (; j < maxlen; j++) {