/*
 * NOTE(review): this copy of the file looks extraction-damaged. The original
 * line breaks are gone and, apparently, every span that started at a '<' and
 * ended at a later '>' was dropped (header names after #include, several loop
 * conditions, and parts of compare_entries, do_escape_string and main).
 * The code below is reproduced token for token; nothing was reconstructed.
 * Damaged spots carry their own NOTE(review) marker and must be restored
 * from a pristine copy before this file can compile.
 */
#include "mystdlib.h"
/* NOTE(review): the header names of the following five includes are missing. */
#include
#include
#include
#include
#include

/* Hash routine defined in another translation unit; called by string_to_hash(). */
extern int halfsiphash(const uint8_t *in, const size_t inlen, const uint8_t *k, uint8_t *out, const size_t outlen);

/* Number of outvec_t slots that make up one row of an entry. */
enum { COLUMNS = 15 };

/* One sortable entry: rows + 1 consecutive rows of COLUMNS slots in g_out_array. */
typedef struct {
  long rows;   /* count of additional rows; loops run for row = 0 .. rows */
  long outoff; /* index of the entry's first slot in g_out_array */
  long flag;   /* stored by main(); not read anywhere in the visible code */
  int year;    /* stored by main(); not read anywhere in the visible code */
} entry_t;

/* One field (column) of a row. */
typedef struct {
  char *ptr;     /* field text; the comparators read it up to a newline byte (see STRCMP_n) */
  size_t size;   /* for column 14 a size of 1 is treated as "field absent" (see compare_entries) */
  uint64_t data; /* might store a precision or the normalized references+additions ("verweise+zusaetze") */
} outvec_t;

/* Flat array holding the fields of all entries; allocated in main(). */
static outvec_t * g_out_array;

/* Known year/quarter labels, 7 characters each, terminated by a null pointer. */
const char *g_year_map[] = {
  "1992_Q2", "1995_Q0", "1996_Q0", "1996_Q1", "1997_Q1", "1997_Q3", "1998_Q1", "1998_Q3", "1999_Q1", "1999_Q3",
  "2000_Q1", "2000_Q3", "2001_Q1", "2001_Q2", "2001_Q3", "2001_Q4", "2002_Q1", "2002_Q3", "2003_Q1", "2003_Q3",
  "2004_Q1", "2004_Q3", "2005_Q1", "2005_Q3", "2006_Q1", "2006_Q3", "2007_Q1", "2007_Q3", "2008_Q1", "2008_Q3",
  "2009_Q1", "2009_Q3", "2010_Q1", "2010_Q3", "2011_Q1", "2011_Q3", "2012_Q1", "2012_Q3", "2013_Q1", "2013_Q3",
  "2014_Q1", "2014_Q3", "2015_Q1", "2015_Q3", "2016_Q1", "2016_Q3", "2017_Q1", "2017_Q3", "2018_Q1", "2018_Q3",
  "2019_Q1", 0
};

/*
 * Look up a year/quarter label in g_year_map.
 *
 * Only the first 7 bytes of `year` are compared, so `year` does not have to
 * be NUL-terminated but must be at least 7 bytes long.
 *
 * Returns the zero-based index of the matching label, or -1 if none matches.
 */
static int year_to_offset(const char *year) {
  const char **y = g_year_map;
  int off = 0;
  while (*y) {
    if (!memcmp(year, *y, 7))
      return off;
    ++off;
    ++y;
  }
  return -1;
}

/*
 * Hash the text in [start, end) token by token.
 *
 * A token ends at the next ' ' or '.', whichever comes first; a '.' is kept
 * as the last byte of its token (compare_end = tokend2 + 1), a ' ' is not.
 * Every non-empty token is fed to halfsiphash() with a pointer to the fixed
 * constant `k` as key, and the 64-bit results are multiplied together,
 * starting from 1. Because the accumulation is a product, the result does
 * not depend on the order of the tokens.
 */
static uint64_t string_to_hash(const char *start, const char *end) {
  const uint64_t k = 0xdc082e4772c897e7LL;
  uint64_t hash = 1LL, acc = 0LL;
  while (start < end) {
    const char *tokend = memchr(start, ' ', end - start);
    const char *tokend2 = memchr(start, '.', end - start);
    const char *compare_end = tokend;

    /* The '.' comes before any space: the token ends just after the dot. */
    if (tokend2 && (!tokend || (tokend > tokend2))) {
      tokend = 0;
      compare_end = tokend2 + 1;
    }
    /* Neither separator found: the rest of the input is the last token. */
    if (!tokend && !tokend2)
      compare_end = end;
    if (compare_end != start) {
      halfsiphash((const uint8_t*)start, (const size_t)(compare_end - start), (const uint8_t*)&k, (uint8_t*)&acc, sizeof(acc));
      hash *= acc;
      /* NOTE(review): line-break damage. In this copy the closing brace of the
       * block above, "start = compare_end; if (tokend) start++;" and the
       * remark "for space, we" all sit inside the // comment below, and the
       * words "only compared up to the char" on the following line are bare
       * tokens. Presumably the debug printf and the remark "for space, we
       * only compared up to the char" were two separate line comments;
       * restore the original line breaks. */
      //printf("HASH %" PRIX64 " %" PRIX64 ":%" PRIX64 " TOKEN(%zd): %.*s\n", hash, acc, k, tokend - start, (int)(tokend - start), start); } start = compare_end; if (tokend) start++; // for space, we
only compared up to the char } return hash; }

/*
 * Byte-wise comparison of two newline-terminated fields.
 *
 * Walks both strings in lock step and stops at the first differing byte or
 * when the byte read from p1 is a newline (10). Returns the difference of
 * the last bytes read (c1 - c2), i.e. 0 when both fields are equal up to
 * and including the newline.
 *
 * NOTE(review): a NUL byte does not stop the loop; both inputs must contain
 * a newline or differ before the end of their buffers.
 */
static int STRCMP_n (const char *p1, const char *p2) {
  const unsigned char *s1 = (const unsigned char *) p1;
  const unsigned char *s2 = (const unsigned char *) p2;
  unsigned char c1, c2;
  do {
    c1 = (unsigned char) *s1++;
    c2 = (unsigned char) *s2++;
    if (c1 == 10)
      return c1 - c2;
  } while (c1 == c2);
  return c1 - c2;
}

/*
 * Compare entry `a` with entry `b` and, in the part that is still visible,
 * copy column 14 of `a` into `b` when `a`'s value is preferable.
 *
 * Returns -1 when the row counts differ and 0 at the end of the visible
 * code. What the lost column comparison returned cannot be told from here.
 */
static int compare_entries(entry_t*a, entry_t*b) {
  int col, row, nprec_a, nprec_b, a_is_newer;
  outvec_t *oa = g_out_array + a->outoff;
  outvec_t *ob = g_out_array + b->outoff;

  /* Multi line entries never match single line entries */
  if (a->rows != b->rows)
    return -1;

  /* Check all columns except year, flag and coords for equality */
  for (row=0; row <= a->rows; ++row)
    /* NOTE(review): text is missing after "col". The rest of this loop
     * header and its body are lost; the surviving "0;" presumably ends the
     * assignment to a_is_newer, which is read below but never assigned in
     * the visible code. */
    for (col=2; col 0;

  /* Column 0 holds the year/quarter label: "2006_Q3" gets precision 1,
   * every other label precision 2. */
  nprec_a = memcmp(oa[0].ptr,"2006_Q3",7) ? 2 : 1;
  nprec_b = memcmp(ob[0].ptr,"2006_Q3",7) ? 2 : 1;

  for (row=0; row <= a->rows; ++row) {
    /* Column 14 counts as present unless its size is exactly 1. */
    int present_a = oa[row * COLUMNS + 14].size != 1;
    int present_b = ob[row * COLUMNS + 14].size != 1;

    if (!present_a)
      continue;

    /* If the current entry's coords were copied, use the stored precision */
    /* NOTE(review): once overwritten here, nprec_a is not reset for the
     * following rows. */
    if (oa[row * COLUMNS + 14].data > 0)
      nprec_a = oa[row * COLUMNS + 14].data;

    /* Take a's column 14 when b has none, when a is more precise, or when a
     * is newer and at least as precise; remember the precision in data. */
    if (!present_b || (nprec_a > nprec_b ) || ( a_is_newer && (nprec_a >= nprec_b))) {
      ob[row * COLUMNS + 14].ptr = oa[row * COLUMNS + 14].ptr;
      ob[row * COLUMNS + 14].size = oa[row * COLUMNS + 14].size;
      ob[row * COLUMNS + 14].data = nprec_a;
    }
  }

  return 0;
}

/*
 * qsort() comparator for entry_t elements.
 *
 * Orders first by row count, then row by row by columns 10, 11, 2, 6, 7, 3
 * and 4 (compared with STRCMP_n) and by the data value of column 8, and
 * finally by the text of column 0 of the first row.
 *
 * NOTE(review): two results are differences narrowed to int (a long
 * difference for rows, a uint64_t difference for column 8). After narrowing
 * the sign may no longer reflect the ordering and the value can even be 0
 * for unequal inputs, which would violate the comparator contract.
 */
static int sort_me(const void *f_a, const void *f_b) {
  entry_t *e_a = (entry_t *)f_a;
  entry_t *e_b = (entry_t *)f_b;
  outvec_t *oa = g_out_array + e_a->outoff;
  outvec_t *ob = g_out_array + e_b->outoff;
  int res, row;

  if (e_a->rows != e_b->rows)
    return e_a->rows - e_b->rows;

  for (row = 0; row <= e_a->rows; ++row) {
    outvec_t *oa_row = oa + row * COLUMNS;
    outvec_t *ob_row = ob + row * COLUMNS;

    if ((res = STRCMP_n(oa_row[10].ptr, ob_row[10].ptr))) return res; /* area code */
    if ((res = STRCMP_n(oa_row[11].ptr, ob_row[11].ptr))) return res; /* phone number */
    if ((res = STRCMP_n(oa_row[ 2].ptr, ob_row[ 2].ptr))) return res; /* postal code */
    if ((res = STRCMP_n(oa_row[ 6].ptr, ob_row[ 6].ptr))) return res; /* street */
    if ((res = STRCMP_n(oa_row[ 7].ptr, ob_row[ 7].ptr))) return res; /* house number */
    if ((res = STRCMP_n(oa_row[ 3].ptr, ob_row[ 3].ptr))) return res; /* last name */
    if ((res = STRCMP_n(oa_row[ 4].ptr, ob_row[ 4].ptr))) return res; /* first name */
    if (oa_row[8].data != ob_row[8].data )
      return oa_row[8].data - ob_row[8].data;
  }
  /* Everything else equal: fall back to column 0 of the first row. */
  return STRCMP_n(oa[0].ptr, ob[0].ptr);
}

/*
 * Writes an opening double quote to stdout; the rest of the function is
 * missing from this copy.
 */
static void do_escape_string(char * s, size_t len) {
  size_t i;
  putchar('"');
  /* NOTE(review): text is missing after "for (i=0; i". The remainder of
   * do_escape_string and the beginning of what is presumably main()
   * (declarations of tbuch, lines, sort_array, ptr, current, outoff, flag,
   * i) are lost. The surviving tail of the loop below counts the newline
   * bytes (10) in tbuch->addr. */
  for (i=0; isize; ++i)
    if (tbuch->addr[i] == 10)
      ++lines;

  /* Both arrays get one element per counted newline.
   * NOTE(review): no NULL check on either allocation is visible here. */
  g_out_array = (outvec_t*)malloc(lines * sizeof(outvec_t));
  sort_array = (entry_t*)malloc(lines * sizeof(entry_t));

  ptr = (char*)tbuch->addr;
  while (ptr < (char*)tbuch->addr + tbuch->size) {
    int c, year;

    /* Look for field terminator */
    /* NOTE(review): text is missing after "for (c=0; c". The field scan and
     * the condition that opens the branch closed below are lost. The
     * surviving branch adds a row to the previous entry; the else branch
     * starts a new entry at the current output offset. */
    for (c=0; c0);
      sort_array[current-1].rows++;
    } else {
      sort_array[current].rows = 0;
      sort_array[current].outoff = outoff;
      sort_array[current].flag = flag;
      sort_array[current].year = year;
      current++;
    }
    /* Each pass of the loop consumes one row of COLUMNS output slots. */
    outoff += COLUMNS;
  }

  /* Sort the whole thing */
  qsort(sort_array, current, sizeof(entry_t), sort_me);

  /* NOTE(review): the file ends here, in the middle of this loop header. */
  for (i=0; i