From 0e52aaa51092c5535cd4e2686bc5c15f7f7b9c5b Mon Sep 17 00:00:00 2001 From: Dirk Engling Date: Thu, 7 Mar 2019 15:24:32 +0100 Subject: Rework merge_entries to account for chunks moving between Zusaetze and Verweise column --- src/postprocess/merge_entries.c | 268 ++++++++++++++++++---------------------- 1 file changed, 123 insertions(+), 145 deletions(-) diff --git a/src/postprocess/merge_entries.c b/src/postprocess/merge_entries.c index 3ebfa8c..f89113e 100644 --- a/src/postprocess/merge_entries.c +++ b/src/postprocess/merge_entries.c @@ -6,17 +6,21 @@ #include #include +extern int halfsiphash(const uint8_t *in, const size_t inlen, const uint8_t *k, uint8_t *out, const size_t outlen); + enum { COLUMNS = 15 }; typedef struct { - char *ptr; long rows; long outoff; long flag; + int year; } entry_t; typedef struct { - char *ptr; - size_t size; + char *ptr; + size_t size; + uint64_t data; /* might store precision or normalized verweise+zusaetze */ } outvec_t; +static outvec_t * g_out_array; const char *g_year_map[] = { "1992_Q2", "1995_Q0", "1996_Q0", "1996_Q1", "1997_Q1", "1997_Q3", "1998_Q1", "1998_Q3", "1999_Q1", "1999_Q3", "2000_Q1", "2000_Q3", "2001_Q1", "2001_Q2", "2001_Q3", "2001_Q4", "2002_Q1", @@ -25,11 +29,7 @@ const char *g_year_map[] = { 0 }; -void SKIP_1_COLUMN(char **ptr) { *ptr = strchr(*ptr, 10) + 1; } -void SKIP_2_COLUMNS(char **ptr) { SKIP_1_COLUMN(ptr); SKIP_1_COLUMN(ptr); } -void SKIP_3_COLUMNS(char **ptr) { SKIP_1_COLUMN(ptr); SKIP_1_COLUMN(ptr); SKIP_1_COLUMN(ptr); } - -int year_to_offset(const char *year) { +static int year_to_offset(const char *year) { const char **y = g_year_map; int off = 0; while (*y) { @@ -39,8 +39,33 @@ int year_to_offset(const char *year) { return -1; } +static uint64_t string_to_hash(const char *start, const char *end) { + const uint64_t k = 0xdc082e4772c897e7LL; + uint64_t hash = 1LL, acc = 0LL; + while (start < end) { + const char *tokend = memchr(start, ' ', end - start); + const char *tokend2 = memchr(start, '.', end - start); + const char *compare_end = tokend; + + if (tokend2 && (!tokend || (tokend > tokend2))) { + tokend = 0; + compare_end = tokend2 + 1; + } + if (!tokend && !tokend2) + compare_end = end; -int + if (compare_end != start) { + halfsiphash((const uint8_t*)start, (const size_t)(compare_end - start), (const uint8_t*)&k, (uint8_t*)&acc, sizeof(acc)); + hash *= acc; + //printf("HASH %" PRIX64 " %" PRIX64 ":%" PRIX64 " TOKEN(%zd): %.*s\n", hash, acc, k, tokend - start, (int)(tokend - start), start); + } + start = compare_end; + if (tokend) start++; // for space, we only compared up to the char + } + return hash; +} + +static int STRCMP_n (const char *p1, const char *p2) { const unsigned char *s1 = (const unsigned char *) p1; @@ -57,103 +82,85 @@ STRCMP_n (const char *p1, const char *p2) return c1 - c2; } -int compare_entries(entry_t*a, entry_t*b, int *prec) { - char *pa = a->ptr, *pb = b->ptr; - int col, row, res = 0, nprec = -1; +static int compare_entries(entry_t*a, entry_t*b) { + int col, row, nprec_a, nprec_b, a_is_newer; + outvec_t *oa = g_out_array + a->outoff; + outvec_t *ob = g_out_array + b->outoff; /* Multi line entries never match single line entries */ if (a->rows != b->rows) return -1; - /* Assume house number precision first .. unless */ - if (!memcmp(pa,"2006_Q3",7)) - *prec = 2; - else - *prec = 3; - - if (!memcmp(pb,"2006_Q3",7)) - nprec = 2; - else - nprec = 3; - - /* Skip year and flags */ - SKIP_2_COLUMNS(&pa); - SKIP_2_COLUMNS(&pb); - - /* Test all columns for identity */ - for (col=2; colrows; ++row) + for (col=2; col 0; + nprec_a = memcmp(oa[0].ptr,"2006_Q3",7) ? 2 : 1; + nprec_b = memcmp(ob[0].ptr,"2006_Q3",7) ? 2 : 1; - /* Only if precision is the same, difference in coordinates - is significant. - if (*prec == nprec && STRCMP_n(pa, pb)) - return -1; */ + for (row=0; row <= a->rows; ++row) { + int present_a = oa[row * COLUMNS + 14].size != 1; + int present_b = ob[row * COLUMNS + 14].size != 1; - /* Row 1 has been compared, check the rest of lines */ - for (row=0; rowrows; ++row) { + if (!present_a) continue; - /* Skip last row's coordinate columns, year and flags */ - SKIP_3_COLUMNS(&pa); - SKIP_3_COLUMNS(&pb); + /* If the current entry's coords were copied, use the stored precision */ + if (oa[row * COLUMNS + 14].data > 0) + nprec_a = oa[row * COLUMNS + 14].data; - for (col=2; col nprec_b ) || ( a_is_newer && (nprec_a >= nprec_b))) { + ob[row * COLUMNS + 14].ptr = oa[row * COLUMNS + 14].ptr; + ob[row * COLUMNS + 14].size = oa[row * COLUMNS + 14].size; + ob[row * COLUMNS + 14].data = nprec_a; } - - /* Only if precision is the same, difference in coordinates - is significant. - if (*prec == nprec && STRCMP_n(pa, pb)) - return -1; */ } return 0; } -int sort_me(const void *f_a, const void *f_b) { - entry_t *e_a = (entry_t *)f_a; - entry_t *e_b = (entry_t *)f_b; +static int sort_me(const void *f_a, const void *f_b) { + entry_t *e_a = (entry_t *)f_a; + entry_t *e_b = (entry_t *)f_b; - char * pa = (char*)e_a->ptr; - char * pb = (char*)e_b->ptr; + outvec_t *oa = g_out_array + e_a->outoff; + outvec_t *ob = g_out_array + e_b->outoff; - int results[COLUMNS], c; + int res, row; if (e_a->rows != e_b->rows) return e_a->rows - e_b->rows; - for (c = 0; crows; ++row) { + outvec_t *oa_row = oa + row * COLUMNS; + outvec_t *ob_row = ob + row * COLUMNS; + + if ((res = STRCMP_n(oa_row[10].ptr, ob_row[10].ptr))) return res; /* Vorwahl */ + if ((res = STRCMP_n(oa_row[11].ptr, ob_row[11].ptr))) return res; /* Rufnummer */ + if ((res = STRCMP_n(oa_row[ 2].ptr, ob_row[ 2].ptr))) return res; /* PLZ */ + if ((res = STRCMP_n(oa_row[ 6].ptr, ob_row[ 6].ptr))) return res; /* Strasse */ + if ((res = STRCMP_n(oa_row[ 7].ptr, ob_row[ 7].ptr))) return res; /* Hausnummer */ + if ((res = STRCMP_n(oa_row[ 3].ptr, ob_row[ 3].ptr))) return res; /* Nachname */ + if ((res = STRCMP_n(oa_row[ 4].ptr, ob_row[ 4].ptr))) return res; /* Vorname */ + if (oa_row[8].data != ob_row[8].data ) return oa_row[8].data - ob_row[8].data; } - if (results[10]) return results[10]; /* Vorwahl */ - if (results[11]) return results[11]; /* Rufnummer */ - if (results[2]) return results[2]; /* PLZ */ - if (results[3]) return results[3]; /* Nachname */ - if (results[4]) return results[4]; /* Vorname */ - if (results[6]) return results[6]; /* Strasse */ - if (results[7]) return results[7]; /* Hausnummer */ - if (results[8]) return results[8]; /* Verweise */ - if (results[0]) return results[0]; /* Year */ - return 0; + return STRCMP_n(oa[0].ptr, ob[0].ptr); } -enum { OUTPUT_BUFFER_SIZE = 1024*1024*128 }; - static void do_escape_string(char * s, size_t len) { size_t i; @@ -187,10 +194,9 @@ static void escape_string(char * s, size_t len) { int main(int argc, char **args) { MAP tbuch = map_file(args[1], 1); - char *ptr, *start; + char *ptr; entry_t * sort_array; - outvec_t * out_array; - int current = -1, outoff = 0, lines = 1, i, truth = 0, truth_prec = -1; + int current = -1, outoff = 0, lines = COLUMNS, i; uint64_t year_list = 0, revflag_list = 0, bizflag_list = 0; long flag = 0; @@ -199,24 +205,29 @@ int main(int argc, char **args) { if (tbuch->addr[i] == 10) ++lines; - sort_array = (entry_t*)malloc((lines / COLUMNS) * sizeof(entry_t)); - out_array = (outvec_t*)malloc((lines / COLUMNS) * sizeof(outvec_t)); + g_out_array = (outvec_t*)malloc(lines * sizeof(outvec_t)); + sort_array = (entry_t*)malloc(lines * sizeof(entry_t)); ptr = (char*)tbuch->addr; - start = ptr; while (ptr < (char*)tbuch->addr + tbuch->size) { - int c; - - start = ptr; + int c, year; /* Look for field terminator */ for (c=0; c= 0); sort_array[current].rows++; } else { - sort_array[++current].ptr = start; - sort_array[current].rows = 0; + sort_array[++current].rows = 0; sort_array[current].outoff = outoff; sort_array[current].flag = flag; + sort_array[current].year = year; } - out_array[outoff].size = ptr - out_array[outoff].ptr; - outoff++; + outoff += COLUMNS; } /* Sort the whole thing */ qsort(sort_array, current, sizeof(entry_t), sort_me); for (i=0; i<=current; ++i) { - int j, dump = 0, prec; - - int year = year_to_offset(sort_array[i].ptr); - - year_list |= 1LL << year; - if (sort_array[i].flag & 0x80 ) bizflag_list |= 1LL << year; - if (sort_array[i].flag & 0x40 ) revflag_list |= 1LL << year; - - /* The last entry always needs to be dumped, but check if its - precision is better than the old truth's - The second comparision checks for equality of entries (modulo - coordinate mismatch) - */ - if (i == current) { - compare_entries(sort_array+i, sort_array+i, &prec); - dump = 1; - } else if (compare_entries(sort_array+i, sort_array+i+1, &prec)) - dump = 1; - - /* If this entry's precision is higher than the one of possible - earlier matches, then the current entry becomes the truth */ - if (prec >= truth_prec) { - truth = i; - truth_prec = prec; - } + year_list |= 1LL << sort_array[i].year; + if (sort_array[i].flag & 0x80 ) bizflag_list |= 1LL << sort_array[i].year; + if (sort_array[i].flag & 0x40 ) revflag_list |= 1LL << sort_array[i].year; - if (dump) { + if ((i == current) || compare_entries(sort_array+i, sort_array+i+1)) { printf("%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t", year_list, bizflag_list, revflag_list); - for (int c=0; cptr, 10); - size_t len = s - out->ptr; - if (!len || out->ptr[0] == 9) + for (int c=2; cptr, len); + if (c != COLUMNS-1) + escape_string(out[c].ptr, out[c].size); else { char coords[64], *tab; -// memcpy(coords, "POINT(", 6); -// memcpy(coords + 6, out->ptr, len); -// tab = memchr(coords + 6, 9, len); -// if (tab) *tab = ' '; -// coords[6+len] = ')'; -// fwrite(coords, 7 + len, 1, stdout); - memcpy(coords, out->ptr, len); - tab = memchr(coords, 9, len); - if (tab) *tab = ' '; - fwrite(coords, len, 1, stdout); + memcpy(coords, out[c].ptr, out[c].size); + tab = memchr(coords, 9, out[c].size); + if (tab) *tab = ' '; + fwrite(coords, out[c].size, 1, stdout); } skipped = 0; } - out->ptr = s + 1; ++out; } if (started) putchar('}'); - if (c