diff options
Diffstat (limited to 'src/postprocess/merge_entries.c')
| -rw-r--r-- | src/postprocess/merge_entries.c | 313 |
1 files changed, 313 insertions, 0 deletions
diff --git a/src/postprocess/merge_entries.c b/src/postprocess/merge_entries.c new file mode 100644 index 0000000..3ebfa8c --- /dev/null +++ b/src/postprocess/merge_entries.c | |||
| @@ -0,0 +1,313 @@ | |||
| 1 | #include "mystdlib.h" | ||
| 2 | |||
| 3 | #include <stdlib.h> | ||
| 4 | #include <stdio.h> | ||
| 5 | #include <string.h> | ||
| 6 | #include <inttypes.h> | ||
| 7 | #include <assert.h> | ||
| 8 | |||
| 9 | enum { COLUMNS = 15 }; | ||
| 10 | typedef struct { | ||
| 11 | char *ptr; | ||
| 12 | long rows; | ||
| 13 | long outoff; | ||
| 14 | long flag; | ||
| 15 | } entry_t; | ||
| 16 | typedef struct { | ||
| 17 | char *ptr; | ||
| 18 | size_t size; | ||
| 19 | } outvec_t; | ||
| 20 | |||
| 21 | const char *g_year_map[] = { | ||
| 22 | "1992_Q2", "1995_Q0", "1996_Q0", "1996_Q1", "1997_Q1", "1997_Q3", "1998_Q1", "1998_Q3", "1999_Q1", "1999_Q3", "2000_Q1", "2000_Q3", "2001_Q1", "2001_Q2", "2001_Q3", "2001_Q4", "2002_Q1", | ||
| 23 | "2002_Q3", "2003_Q1", "2003_Q3", "2004_Q1", "2004_Q3", "2005_Q1", "2005_Q3", "2006_Q1", "2006_Q3", "2007_Q1", "2007_Q3", "2008_Q1", "2008_Q3", "2009_Q1", "2009_Q3", "2010_Q1", "2010_Q3", | ||
| 24 | "2011_Q1", "2011_Q3", "2012_Q1", "2012_Q3", "2013_Q1", "2013_Q3", "2014_Q1", "2014_Q3", "2015_Q1", "2015_Q3", "2016_Q1", "2016_Q3", "2017_Q1", "2017_Q3", "2018_Q1", "2018_Q3", "2019_Q1", | ||
| 25 | 0 | ||
| 26 | }; | ||
| 27 | |||
| 28 | void SKIP_1_COLUMN(char **ptr) { *ptr = strchr(*ptr, 10) + 1; } | ||
| 29 | void SKIP_2_COLUMNS(char **ptr) { SKIP_1_COLUMN(ptr); SKIP_1_COLUMN(ptr); } | ||
| 30 | void SKIP_3_COLUMNS(char **ptr) { SKIP_1_COLUMN(ptr); SKIP_1_COLUMN(ptr); SKIP_1_COLUMN(ptr); } | ||
| 31 | |||
| 32 | int year_to_offset(const char *year) { | ||
| 33 | const char **y = g_year_map; | ||
| 34 | int off = 0; | ||
| 35 | while (*y) { | ||
| 36 | if (!memcmp(year, *y, 7)) return off; | ||
| 37 | ++off; ++y; | ||
| 38 | } | ||
| 39 | return -1; | ||
| 40 | } | ||
| 41 | |||
| 42 | |||
| 43 | int | ||
| 44 | STRCMP_n (const char *p1, const char *p2) | ||
| 45 | { | ||
| 46 | const unsigned char *s1 = (const unsigned char *) p1; | ||
| 47 | const unsigned char *s2 = (const unsigned char *) p2; | ||
| 48 | unsigned char c1, c2; | ||
| 49 | do | ||
| 50 | { | ||
| 51 | c1 = (unsigned char) *s1++; | ||
| 52 | c2 = (unsigned char) *s2++; | ||
| 53 | if (c1 == 10) | ||
| 54 | return c1 - c2; | ||
| 55 | } | ||
| 56 | while (c1 == c2); | ||
| 57 | return c1 - c2; | ||
| 58 | } | ||
| 59 | |||
| 60 | int compare_entries(entry_t*a, entry_t*b, int *prec) { | ||
| 61 | char *pa = a->ptr, *pb = b->ptr; | ||
| 62 | int col, row, res = 0, nprec = -1; | ||
| 63 | |||
| 64 | /* Multi line entries never match single line entries */ | ||
| 65 | if (a->rows != b->rows) | ||
| 66 | return -1; | ||
| 67 | |||
| 68 | /* Assume house number precision first .. unless */ | ||
| 69 | if (!memcmp(pa,"2006_Q3",7)) | ||
| 70 | *prec = 2; | ||
| 71 | else | ||
| 72 | *prec = 3; | ||
| 73 | |||
| 74 | if (!memcmp(pb,"2006_Q3",7)) | ||
| 75 | nprec = 2; | ||
| 76 | else | ||
| 77 | nprec = 3; | ||
| 78 | |||
| 79 | /* Skip year and flags */ | ||
| 80 | SKIP_2_COLUMNS(&pa); | ||
| 81 | SKIP_2_COLUMNS(&pb); | ||
| 82 | |||
| 83 | /* Test all columns for identity */ | ||
| 84 | for (col=2; col<COLUMNS-1; ++col) { | ||
| 85 | if (!res && STRCMP_n(pa, pb)) | ||
| 86 | res = -1; | ||
| 87 | SKIP_1_COLUMN(&pa); | ||
| 88 | SKIP_1_COLUMN(&pb); | ||
| 89 | } | ||
| 90 | |||
| 91 | /* If no coords, downgrade precision */ | ||
| 92 | if (*pa == 9) *prec = 1; | ||
| 93 | if (*pb == 9) nprec = 1; | ||
| 94 | |||
| 95 | /* If entries differ, return after precision has been found */ | ||
| 96 | if (res) return res; | ||
| 97 | |||
| 98 | /* Only if precision is the same, difference in coordinates | ||
| 99 | is significant. | ||
| 100 | if (*prec == nprec && STRCMP_n(pa, pb)) | ||
| 101 | return -1; */ | ||
| 102 | |||
| 103 | /* Row 1 has been compared, check the rest of lines */ | ||
| 104 | for (row=0; row<a->rows; ++row) { | ||
| 105 | |||
| 106 | /* Skip last row's coordinate columns, year and flags */ | ||
| 107 | SKIP_3_COLUMNS(&pa); | ||
| 108 | SKIP_3_COLUMNS(&pb); | ||
| 109 | |||
| 110 | for (col=2; col<COLUMNS-1; ++col) { | ||
| 111 | if (STRCMP_n(pa, pb)) | ||
| 112 | return -1; | ||
| 113 | SKIP_1_COLUMN(&pa); | ||
| 114 | SKIP_1_COLUMN(&pb); | ||
| 115 | } | ||
| 116 | |||
| 117 | /* Only if precision is the same, difference in coordinates | ||
| 118 | is significant. | ||
| 119 | if (*prec == nprec && STRCMP_n(pa, pb)) | ||
| 120 | return -1; */ | ||
| 121 | } | ||
| 122 | return 0; | ||
| 123 | } | ||
| 124 | |||
| 125 | int sort_me(const void *f_a, const void *f_b) { | ||
| 126 | entry_t *e_a = (entry_t *)f_a; | ||
| 127 | entry_t *e_b = (entry_t *)f_b; | ||
| 128 | |||
| 129 | char * pa = (char*)e_a->ptr; | ||
| 130 | char * pb = (char*)e_b->ptr; | ||
| 131 | |||
| 132 | int results[COLUMNS], c; | ||
| 133 | |||
| 134 | if (e_a->rows != e_b->rows) | ||
| 135 | return e_a->rows - e_b->rows; | ||
| 136 | |||
| 137 | for (c = 0; c<COLUMNS; ++c) { | ||
| 138 | results[c] = STRCMP_n(pa, pb); | ||
| 139 | SKIP_1_COLUMN(&pa); | ||
| 140 | SKIP_1_COLUMN(&pb); | ||
| 141 | } | ||
| 142 | |||
| 143 | if (results[10]) return results[10]; /* Vorwahl */ | ||
| 144 | if (results[11]) return results[11]; /* Rufnummer */ | ||
| 145 | if (results[2]) return results[2]; /* PLZ */ | ||
| 146 | if (results[3]) return results[3]; /* Nachname */ | ||
| 147 | if (results[4]) return results[4]; /* Vorname */ | ||
| 148 | if (results[6]) return results[6]; /* Strasse */ | ||
| 149 | if (results[7]) return results[7]; /* Hausnummer */ | ||
| 150 | if (results[8]) return results[8]; /* Verweise */ | ||
| 151 | if (results[0]) return results[0]; /* Year */ | ||
| 152 | return 0; | ||
| 153 | } | ||
| 154 | |||
| 155 | enum { OUTPUT_BUFFER_SIZE = 1024*1024*128 }; | ||
| 156 | |||
| 157 | static void do_escape_string(char * s, size_t len) { | ||
| 158 | size_t i; | ||
| 159 | |||
| 160 | putchar('"'); | ||
| 161 | for (i=0; i<len; ++i) { | ||
| 162 | switch(s[i]) { | ||
| 163 | case '\\': | ||
| 164 | putchar('\\'); | ||
| 165 | case '"': | ||
| 166 | putchar('\\'); | ||
| 167 | case ',': | ||
| 168 | putchar('\\'); | ||
| 169 | default: | ||
| 170 | break; | ||
| 171 | } | ||
| 172 | putchar(s[i]); | ||
| 173 | } | ||
| 174 | putchar('"'); | ||
| 175 | } | ||
| 176 | |||
| 177 | static void escape_string(char * s, size_t len) { | ||
| 178 | size_t i; | ||
| 179 | if (len == 4 && ( ( s[0] | 0x20 ) == 'n' ) && ( ( s[1] | 0x20 ) == 'u' ) && ( ( s[2] | 0x20 ) == 'l' ) && ( ( s[3] | 0x20 ) == 'l' ) ) | ||
| 180 | return do_escape_string(s, len); | ||
| 181 | for (i=0; i<len; ++i) | ||
| 182 | switch(s[i]) { | ||
| 183 | case '{': case '\\': case '}': case '"': case ',': return do_escape_string(s, len); | ||
| 184 | } | ||
| 185 | fwrite(s, len, 1, stdout); | ||
| 186 | } | ||
| 187 | |||
| 188 | int main(int argc, char **args) { | ||
| 189 | MAP tbuch = map_file(args[1], 1); | ||
| 190 | char *ptr, *start; | ||
| 191 | entry_t * sort_array; | ||
| 192 | outvec_t * out_array; | ||
| 193 | int current = -1, outoff = 0, lines = 1, i, truth = 0, truth_prec = -1; | ||
| 194 | uint64_t year_list = 0, revflag_list = 0, bizflag_list = 0; | ||
| 195 | long flag = 0; | ||
| 196 | |||
| 197 | /* Estimate upper bound for amount of lines */ | ||
| 198 | for (i=0; i<tbuch->size; ++i) | ||
| 199 | if (tbuch->addr[i] == 10) | ||
| 200 | ++lines; | ||
| 201 | |||
| 202 | sort_array = (entry_t*)malloc((lines / COLUMNS) * sizeof(entry_t)); | ||
| 203 | out_array = (outvec_t*)malloc((lines / COLUMNS) * sizeof(outvec_t)); | ||
| 204 | |||
| 205 | ptr = (char*)tbuch->addr; | ||
| 206 | start = ptr; | ||
| 207 | |||
| 208 | while (ptr < (char*)tbuch->addr + tbuch->size) { | ||
| 209 | int c; | ||
| 210 | |||
| 211 | start = ptr; | ||
| 212 | |||
| 213 | /* Look for field terminator */ | ||
| 214 | for (c=0; c<COLUMNS; ++c) { | ||
| 215 | char * end = strchr(ptr, 10); | ||
| 216 | if (c==1) { | ||
| 217 | flag = strtoul(ptr, 0, 16); | ||
| 218 | out_array[outoff].ptr = end + 1; | ||
| 219 | } | ||
| 220 | ptr = end + 1; | ||
| 221 | } | ||
| 222 | |||
| 223 | if (flag&2) { | ||
| 224 | assert( current >= 0); | ||
| 225 | sort_array[current].rows++; | ||
| 226 | } else { | ||
| 227 | sort_array[++current].ptr = start; | ||
| 228 | sort_array[current].rows = 0; | ||
| 229 | sort_array[current].outoff = outoff; | ||
| 230 | sort_array[current].flag = flag; | ||
| 231 | } | ||
| 232 | out_array[outoff].size = ptr - out_array[outoff].ptr; | ||
| 233 | outoff++; | ||
| 234 | } | ||
| 235 | |||
| 236 | /* Sort the whole thing */ | ||
| 237 | qsort(sort_array, current, sizeof(entry_t), sort_me); | ||
| 238 | |||
| 239 | for (i=0; i<=current; ++i) { | ||
| 240 | int j, dump = 0, prec; | ||
| 241 | |||
| 242 | int year = year_to_offset(sort_array[i].ptr); | ||
| 243 | |||
| 244 | year_list |= 1LL << year; | ||
| 245 | if (sort_array[i].flag & 0x80 ) bizflag_list |= 1LL << year; | ||
| 246 | if (sort_array[i].flag & 0x40 ) revflag_list |= 1LL << year; | ||
| 247 | |||
| 248 | /* The last entry always needs to be dumped, but check if its | ||
| 249 | precision is better than the old truth's | ||
| 250 | The second comparision checks for equality of entries (modulo | ||
| 251 | coordinate mismatch) | ||
| 252 | */ | ||
| 253 | if (i == current) { | ||
| 254 | compare_entries(sort_array+i, sort_array+i, &prec); | ||
| 255 | dump = 1; | ||
| 256 | } else if (compare_entries(sort_array+i, sort_array+i+1, &prec)) | ||
| 257 | dump = 1; | ||
| 258 | |||
| 259 | /* If this entry's precision is higher than the one of possible | ||
| 260 | earlier matches, then the current entry becomes the truth */ | ||
| 261 | if (prec >= truth_prec) { | ||
| 262 | truth = i; | ||
| 263 | truth_prec = prec; | ||
| 264 | } | ||
| 265 | |||
| 266 | if (dump) { | ||
| 267 | printf("%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t", year_list, bizflag_list, revflag_list); | ||
| 268 | for (int c=0; c<COLUMNS-2; ++c) { | ||
| 269 | outvec_t * out = out_array + sort_array[truth].outoff; | ||
| 270 | int started = 0, skipped = 0; | ||
| 271 | for (j=0; j<=sort_array[truth].rows; ++j) { | ||
| 272 | char *s = strchr(out->ptr, 10); | ||
| 273 | size_t len = s - out->ptr; | ||
| 274 | if (!len || out->ptr[0] == 9) | ||
| 275 | skipped++; | ||
| 276 | else { | ||
| 277 | if (!started++) | ||
| 278 | putchar('{'); | ||
| 279 | else | ||
| 280 | putchar(','); | ||
| 281 | for (int x=0; x<skipped; ++x) fputs("null,", stdout); | ||
| 282 | if (c != COLUMNS-3) | ||
| 283 | escape_string(out->ptr, len); | ||
| 284 | else { | ||
| 285 | char coords[64], *tab; | ||
| 286 | // memcpy(coords, "POINT(", 6); | ||
| 287 | // memcpy(coords + 6, out->ptr, len); | ||
| 288 | // tab = memchr(coords + 6, 9, len); | ||
| 289 | // if (tab) *tab = ' '; | ||
| 290 | // coords[6+len] = ')'; | ||
| 291 | // fwrite(coords, 7 + len, 1, stdout); | ||
| 292 | memcpy(coords, out->ptr, len); | ||
| 293 | tab = memchr(coords, 9, len); | ||
| 294 | if (tab) *tab = ' '; | ||
| 295 | fwrite(coords, len, 1, stdout); | ||
| 296 | } | ||
| 297 | skipped = 0; | ||
| 298 | } | ||
| 299 | out->ptr = s + 1; | ||
| 300 | ++out; | ||
| 301 | } | ||
| 302 | if (started) putchar('}'); | ||
| 303 | if (c<COLUMNS-3) putchar(9); | ||
| 304 | } | ||
| 305 | putchar(10); | ||
| 306 | |||
| 307 | truth_prec = -1; | ||
| 308 | year_list = 0; | ||
| 309 | bizflag_list = 0; | ||
| 310 | revflag_list = 0; | ||
| 311 | } | ||
| 312 | } | ||
| 313 | } | ||
