Thread: June 2022
View Single Post
Old 2022-07-22, 04:06   #10
uau
 
Jan 2017

2·3·52 Posts
Default

Here's the program I used and some notes:

Sizes 2 or 3 mod 4 like 22/23 are actually easier to calculate: for such sizes, the result is just half the total polyomino count (which you can look up on OEIS for example). You can prove that as follows:

The two orderings are rotated 1/4 turn from each other (one is "standard order", the other is "rotate polyomino 1/4 turn, then apply standard order"). Now take an arbitrary polyomino. Its parity is the parity of the difference permutation between "standard order" and "1/4 turn, then standard order". Now consider instead the parity of the polyomino you get by rotating the original by 1/4 turn. Now the relevant permutation is between "1/4 turn, then standard order" and "1/4 + 1/4 turns, then standard order". After 1/2 turn, standard order is inverse to the original. For a number of elements 2 or 3 mod 4, a permutation inversing their order is odd. So you have two permutations - from original to 1/4 turn, and 1/4 turn to 1/2 turn. Their combination from original to inverted is odd for 2 or 3 mod 4. It follows that one of the two permutations must be odd and the other even. From this it follows that rotating a polyomino by 1/4 turn always changes its parity, and half the total polyominoes must be odd and half even.

The overall approach of my code is nothing special - it enumerates every polyomino of the given size and determines its parity. I think it is efficient for that approach though. The parity is incrementally determined when adding each new square to the polyomino by counting the number of existing squares that are smaller in standard order and smaller in rotated order; if the sum of those is odd, adding the square changes parity. This "count numbers smaller than given constant" part uses vectorized code that probably only works when compiling with GCC; changing "shuffle" to "shufflevector" may make it work with clang/icc.

The output is odd polyominoes and total polyominoes. It's fast enough that it should be quite realistic to calculate at least 24 and 25 (and then you get 26 and 27 "for free" as above). 28/29 might be possible at least on a fast machine or distributed between machines (the thread splitting implementation doesn't require any communication after the initial split, so would work equally well between machines).
Code:
/* This compiles into notably faster code with GCC 11 when using
 * profile-guided optimization.
 * To generate a binary:
 * $ gcc -O3 -march=native -fprofile-generate ponderthis_202206.c -lpthread
 * $ ./a.out 17 1   # To generate a profile, run up to n=17 with 1 thread
 * $ gcc -O3 -march=native -fprofile-use ponderthis_202206.c -lpthread
 *
 * Now you can run the optimized version "for real":
 * ./a.out 21 4 # run to n=21 with 4 threads
 */

#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <string.h>
#include <assert.h>
#include <pthread.h>

#define MAXN 33    // larger values somewhat slow down the program overall
#define SPLITDEPTH 10
#define VECTORSIZE 16
#define MAXTHREADS 200

typedef short vshort __attribute__((vector_size(VECTORSIZE)));

struct stackentry {
    short pointpos;
    bool odd;
    bool is_cell;
    char dir;
};

#define VECC (((MAXN-1)*2+VECTORSIZE-1)/VECTORSIZE)
#define POINTC (VECC*VECTORSIZE/2)

union points {
    vshort v[VECC];
    short a[POINTC];
};


long long solve(int n, long long *rop, int threadcount, int threadsplit) {
    long long r = 0;
    long long ro = 0;
    union points points = {0};
    union points points2 = {0};
    for (int i = 1; i < POINTC; i++)
        points2.a[i] = -32768;
    char mem[1 + (MAXN-1)*MAXN] = {1};
    struct stackentry stack[MAXN*3];
    int pointcount = 1;
    int stackpos = 0;
    int pointpos = 0;
    int dir = 2;
    bool odd = false;
    const int dirs[] = {-MAXN, -1, 1, MAXN};
    const int dirs2[] = {1, -MAXN, MAXN, -1};
    while (1) {
        if (dir >= 4) {
            dir = 0;
            pointpos++;
        }
        if (pointpos >= pointcount) {
            while (1) {
                if (stackpos == 0) {
                    *rop = ro;
                    return r;
                }
                stackpos--;
                bool is_cell = stack[stackpos].is_cell;
                pointpos = stack[stackpos].pointpos;
                dir = stack[stackpos].dir;
                odd = stack[stackpos].odd;
                mem[points.a[pointpos] + dirs[dir]] = is_cell;
                if (is_cell)
                    break;
            }
            pointcount -= 1;
            points.a[pointcount] = 0;
            points2.a[pointcount] = -32768;
            stack[stackpos].is_cell = false;
            stackpos += 1;
            dir++;
            continue;
        }
        int pos = points.a[pointpos] + dirs[dir];
        if (pos < 0 || mem[pos]) {
            dir++;
            continue;
        }
        mem[pos] = 1;
        stack[stackpos].is_cell = true;
        stack[stackpos].pointpos = pointpos;
        stack[stackpos].dir = dir;
        stack[stackpos].odd = odd;
        if (pointcount == SPLITDEPTH) {
            if (--threadsplit) {
                stack[stackpos].is_cell = false;
                stackpos++;
                continue;
            }
            threadsplit = threadcount;
        }

        short pos2 = points2.a[pointpos] + dirs2[dir];
        vshort x = {0};
        for (int i = 0; i < VECC; i++) {
            x ^= points.v[i] < (short)pos;
            x ^= points2.v[i] < (short)pos2;
        }
#if VECTORSIZE == 16
        vshort mask1 = {1, 1, 3, 3, 5, 5, 7, 7};
        vshort mask2 = {2, 2, 2, 2, 6, 6, 6, 6};
        vshort mask3 = {4, 4, 4, 4, 4, 4, 4, 4};
        x ^= __builtin_shuffle(x, mask1);
        x ^= __builtin_shuffle(x, mask2);
        x ^= __builtin_shuffle(x, mask3);
        odd ^= x[0] & 1;
#else
#error unimplemented
#endif
        if (pointcount == n - 1) {
            stack[stackpos].is_cell = false;
            r++;
            ro += odd;
            odd = stack[stackpos].odd;
        } else {
            points.a[pointcount] = pos;
            points2.a[pointcount] = pos2;
            pointcount++;
        }
        stackpos++;
        dir++;
    }
}

static pthread_t threads[MAXTHREADS];
static long long thread_results[MAXTHREADS][2];
static int global_n;
static int global_threadcount;

void *thread_worker(void *arg) {
    int thread_id = (pthread_t *)arg - threads;
    thread_results[thread_id][1] = solve(global_n, thread_results[thread_id],
                                         global_threadcount, thread_id + 1);
    return NULL;
}

int main(int argc, char *argv[]) {
    assert(argc == 3);
    int maxn = atoi(argv[1]);
    assert(maxn <= MAXN);
    int n_threads = atoi(argv[2]);
    assert(n_threads >= 1);
    global_threadcount = n_threads;
    for (int i = 2; i <= maxn; i++) {
        long long odd = 0;
        long long total = 0;
        if (i < SPLITDEPTH + 3 || n_threads == 1) {
            total = solve(i, &odd, 1, 1);
        } else {
            global_n = i;
            for (int j = 0; j < n_threads; j++)
                pthread_create(&threads[j], NULL, thread_worker, &threads[j]);
            for (int j = 0; j < n_threads; j++) {
                pthread_join(threads[j], NULL);
                odd += thread_results[j][0];
                total += thread_results[j][1];
            }
        }
        printf("%d %lld / %lld\n", i, odd, total);
    }
}
uau is offline   Reply With Quote