summaryrefslogtreecommitdiff
path: root/src/postprocess
diff options
context:
space:
mode:
authorDirk Engling <erdgeist@erdgeist.org>2019-03-07 15:24:32 +0100
committerDirk Engling <erdgeist@erdgeist.org>2019-03-07 15:24:32 +0100
commit0e52aaa51092c5535cd4e2686bc5c15f7f7b9c5b (patch)
tree60e35c2ca6bc72144681820f72ee982e5558642a /src/postprocess
parent2eff180815619302f101a4ff32db6cf293b529c8 (diff)
Rework merge_entries to account for chunks moving between Zusaetze and Verweise column
Diffstat (limited to 'src/postprocess')
-rw-r--r--src/postprocess/merge_entries.c268
1 files changed, 123 insertions, 145 deletions
diff --git a/src/postprocess/merge_entries.c b/src/postprocess/merge_entries.c
index 3ebfa8c..f89113e 100644
--- a/src/postprocess/merge_entries.c
+++ b/src/postprocess/merge_entries.c
@@ -6,17 +6,21 @@
6#include <inttypes.h> 6#include <inttypes.h>
7#include <assert.h> 7#include <assert.h>
8 8
9extern int halfsiphash(const uint8_t *in, const size_t inlen, const uint8_t *k, uint8_t *out, const size_t outlen);
10
9enum { COLUMNS = 15 }; 11enum { COLUMNS = 15 };
10typedef struct { 12typedef struct {
11 char *ptr;
12 long rows; 13 long rows;
13 long outoff; 14 long outoff;
14 long flag; 15 long flag;
16 int year;
15} entry_t; 17} entry_t;
16typedef struct { 18typedef struct {
17 char *ptr; 19 char *ptr;
18 size_t size; 20 size_t size;
21 uint64_t data; /* might store precision or normalized verweise+zusaetze */
19} outvec_t; 22} outvec_t;
23static outvec_t * g_out_array;
20 24
21const char *g_year_map[] = { 25const char *g_year_map[] = {
22"1992_Q2", "1995_Q0", "1996_Q0", "1996_Q1", "1997_Q1", "1997_Q3", "1998_Q1", "1998_Q3", "1999_Q1", "1999_Q3", "2000_Q1", "2000_Q3", "2001_Q1", "2001_Q2", "2001_Q3", "2001_Q4", "2002_Q1", 26"1992_Q2", "1995_Q0", "1996_Q0", "1996_Q1", "1997_Q1", "1997_Q3", "1998_Q1", "1998_Q3", "1999_Q1", "1999_Q3", "2000_Q1", "2000_Q3", "2001_Q1", "2001_Q2", "2001_Q3", "2001_Q4", "2002_Q1",
@@ -25,11 +29,7 @@ const char *g_year_map[] = {
250 290
26}; 30};
27 31
28void SKIP_1_COLUMN(char **ptr) { *ptr = strchr(*ptr, 10) + 1; } 32static int year_to_offset(const char *year) {
29void SKIP_2_COLUMNS(char **ptr) { SKIP_1_COLUMN(ptr); SKIP_1_COLUMN(ptr); }
30void SKIP_3_COLUMNS(char **ptr) { SKIP_1_COLUMN(ptr); SKIP_1_COLUMN(ptr); SKIP_1_COLUMN(ptr); }
31
32int year_to_offset(const char *year) {
33 const char **y = g_year_map; 33 const char **y = g_year_map;
34 int off = 0; 34 int off = 0;
35 while (*y) { 35 while (*y) {
@@ -39,8 +39,33 @@ int year_to_offset(const char *year) {
39 return -1; 39 return -1;
40} 40}
41 41
42static uint64_t string_to_hash(const char *start, const char *end) {
43 const uint64_t k = 0xdc082e4772c897e7LL;
44 uint64_t hash = 1LL, acc = 0LL;
45 while (start < end) {
46 const char *tokend = memchr(start, ' ', end - start);
47 const char *tokend2 = memchr(start, '.', end - start);
48 const char *compare_end = tokend;
49
50 if (tokend2 && (!tokend || (tokend > tokend2))) {
51 tokend = 0;
52 compare_end = tokend2 + 1;
53 }
54 if (!tokend && !tokend2)
55 compare_end = end;
42 56
43int 57 if (compare_end != start) {
58 halfsiphash((const uint8_t*)start, (const size_t)(compare_end - start), (const uint8_t*)&k, (uint8_t*)&acc, sizeof(acc));
59 hash *= acc;
60 //printf("HASH %" PRIX64 " %" PRIX64 ":%" PRIX64 " TOKEN(%zd): %.*s\n", hash, acc, k, tokend - start, (int)(tokend - start), start);
61 }
62 start = compare_end;
63 if (tokend) start++; // for space, we only compared up to the char
64 }
65 return hash;
66}
67
68static int
44STRCMP_n (const char *p1, const char *p2) 69STRCMP_n (const char *p1, const char *p2)
45{ 70{
46 const unsigned char *s1 = (const unsigned char *) p1; 71 const unsigned char *s1 = (const unsigned char *) p1;
@@ -57,103 +82,85 @@ STRCMP_n (const char *p1, const char *p2)
57 return c1 - c2; 82 return c1 - c2;
58} 83}
59 84
60int compare_entries(entry_t*a, entry_t*b, int *prec) { 85static int compare_entries(entry_t*a, entry_t*b) {
61 char *pa = a->ptr, *pb = b->ptr; 86 int col, row, nprec_a, nprec_b, a_is_newer;
62 int col, row, res = 0, nprec = -1; 87 outvec_t *oa = g_out_array + a->outoff;
88 outvec_t *ob = g_out_array + b->outoff;
63 89
64 /* Multi line entries never match single line entries */ 90 /* Multi line entries never match single line entries */
65 if (a->rows != b->rows) 91 if (a->rows != b->rows)
66 return -1; 92 return -1;
67 93
68 /* Assume house number precision first .. unless */ 94 /* Check all columns except year, flag and coords for equality */
69 if (!memcmp(pa,"2006_Q3",7)) 95 for (row=0; row <= a->rows; ++row)
70 *prec = 2; 96 for (col=2; col<COLUMNS-1; ++col) {
71 else 97 int off = COLUMNS*row+col;
72 *prec = 3; 98 if (col == 5) continue;
73 99 if (col == 8) {
74 if (!memcmp(pb,"2006_Q3",7)) 100 if (oa[off].data != ob[off].data)
75 nprec = 2; 101 return -1;
76 else 102 else
77 nprec = 3; 103 continue;
78 104 }
79 /* Skip year and flags */ 105 if (oa[off].size != ob[off].size || memcmp(oa[off].ptr, ob[off].ptr, oa[off].size))
80 SKIP_2_COLUMNS(&pa); 106 return -1;
81 SKIP_2_COLUMNS(&pb); 107 }
82
83 /* Test all columns for identity */
84 for (col=2; col<COLUMNS-1; ++col) {
85 if (!res && STRCMP_n(pa, pb))
86 res = -1;
87 SKIP_1_COLUMN(&pa);
88 SKIP_1_COLUMN(&pb);
89 }
90
91 /* If no coords, downgrade precision */
92 if (*pa == 9) *prec = 1;
93 if (*pb == 9) nprec = 1;
94 108
95 /* If entries differ, return after precision has been found */ 109 /* we haven't returned, so they're identical enough
96 if (res) return res; 110 we always want to keep the newest coordinates,
111 unless they are not present or from 2006_Q3 */
112 a_is_newer = memcmp(oa[0].ptr,ob[0].ptr,oa[0].size) > 0;
113 nprec_a = memcmp(oa[0].ptr,"2006_Q3",7) ? 2 : 1;
114 nprec_b = memcmp(ob[0].ptr,"2006_Q3",7) ? 2 : 1;
97 115
98 /* Only if precision is the same, difference in coordinates 116 for (row=0; row <= a->rows; ++row) {
99 is significant. 117 int present_a = oa[row * COLUMNS + 14].size != 1;
100 if (*prec == nprec && STRCMP_n(pa, pb)) 118 int present_b = ob[row * COLUMNS + 14].size != 1;
101 return -1; */
102 119
103 /* Row 1 has been compared, check the rest of lines */ 120 if (!present_a) continue;
104 for (row=0; row<a->rows; ++row) {
105 121
106 /* Skip last row's coordinate columns, year and flags */ 122 /* If the current entry's coords were copied, use the stored precision */
107 SKIP_3_COLUMNS(&pa); 123 if (oa[row * COLUMNS + 14].data > 0)
108 SKIP_3_COLUMNS(&pb); 124 nprec_a = oa[row * COLUMNS + 14].data;
109 125
110 for (col=2; col<COLUMNS-1; ++col) { 126 if (!present_b || (nprec_a > nprec_b ) || ( a_is_newer && (nprec_a >= nprec_b))) {
111 if (STRCMP_n(pa, pb)) 127 ob[row * COLUMNS + 14].ptr = oa[row * COLUMNS + 14].ptr;
112 return -1; 128 ob[row * COLUMNS + 14].size = oa[row * COLUMNS + 14].size;
113 SKIP_1_COLUMN(&pa); 129 ob[row * COLUMNS + 14].data = nprec_a;
114 SKIP_1_COLUMN(&pb);
115 } 130 }
116
117 /* Only if precision is the same, difference in coordinates
118 is significant.
119 if (*prec == nprec && STRCMP_n(pa, pb))
120 return -1; */
121 } 131 }
122 return 0; 132 return 0;
123} 133}
124 134
125int sort_me(const void *f_a, const void *f_b) { 135static int sort_me(const void *f_a, const void *f_b) {
126 entry_t *e_a = (entry_t *)f_a; 136 entry_t *e_a = (entry_t *)f_a;
127 entry_t *e_b = (entry_t *)f_b; 137 entry_t *e_b = (entry_t *)f_b;
128 138
129 char * pa = (char*)e_a->ptr; 139 outvec_t *oa = g_out_array + e_a->outoff;
130 char * pb = (char*)e_b->ptr; 140 outvec_t *ob = g_out_array + e_b->outoff;
131 141
132 int results[COLUMNS], c; 142 int res, row;
133 143
134 if (e_a->rows != e_b->rows) 144 if (e_a->rows != e_b->rows)
135 return e_a->rows - e_b->rows; 145 return e_a->rows - e_b->rows;
136 146
137 for (c = 0; c<COLUMNS; ++c) { 147 for (row = 0; row <= e_a->rows; ++row) {
138 results[c] = STRCMP_n(pa, pb); 148 outvec_t *oa_row = oa + row * COLUMNS;
139 SKIP_1_COLUMN(&pa); 149 outvec_t *ob_row = ob + row * COLUMNS;
140 SKIP_1_COLUMN(&pb); 150
151 if ((res = STRCMP_n(oa_row[10].ptr, ob_row[10].ptr))) return res; /* Vorwahl */
152 if ((res = STRCMP_n(oa_row[11].ptr, ob_row[11].ptr))) return res; /* Rufnummer */
153 if ((res = STRCMP_n(oa_row[ 2].ptr, ob_row[ 2].ptr))) return res; /* PLZ */
154 if ((res = STRCMP_n(oa_row[ 6].ptr, ob_row[ 6].ptr))) return res; /* Strasse */
155 if ((res = STRCMP_n(oa_row[ 7].ptr, ob_row[ 7].ptr))) return res; /* Hausnummer */
156 if ((res = STRCMP_n(oa_row[ 3].ptr, ob_row[ 3].ptr))) return res; /* Nachname */
157 if ((res = STRCMP_n(oa_row[ 4].ptr, ob_row[ 4].ptr))) return res; /* Vorname */
158 if (oa_row[8].data != ob_row[8].data ) return oa_row[8].data - ob_row[8].data;
141 } 159 }
142 160
143 if (results[10]) return results[10]; /* Vorwahl */ 161 return STRCMP_n(oa[0].ptr, ob[0].ptr);
144 if (results[11]) return results[11]; /* Rufnummer */
145 if (results[2]) return results[2]; /* PLZ */
146 if (results[3]) return results[3]; /* Nachname */
147 if (results[4]) return results[4]; /* Vorname */
148 if (results[6]) return results[6]; /* Strasse */
149 if (results[7]) return results[7]; /* Hausnummer */
150 if (results[8]) return results[8]; /* Verweise */
151 if (results[0]) return results[0]; /* Year */
152 return 0;
153} 162}
154 163
155enum { OUTPUT_BUFFER_SIZE = 1024*1024*128 };
156
157static void do_escape_string(char * s, size_t len) { 164static void do_escape_string(char * s, size_t len) {
158 size_t i; 165 size_t i;
159 166
@@ -187,10 +194,9 @@ static void escape_string(char * s, size_t len) {
187 194
188int main(int argc, char **args) { 195int main(int argc, char **args) {
189 MAP tbuch = map_file(args[1], 1); 196 MAP tbuch = map_file(args[1], 1);
190 char *ptr, *start; 197 char *ptr;
191 entry_t * sort_array; 198 entry_t * sort_array;
192 outvec_t * out_array; 199 int current = -1, outoff = 0, lines = COLUMNS, i;
193 int current = -1, outoff = 0, lines = 1, i, truth = 0, truth_prec = -1;
194 uint64_t year_list = 0, revflag_list = 0, bizflag_list = 0; 200 uint64_t year_list = 0, revflag_list = 0, bizflag_list = 0;
195 long flag = 0; 201 long flag = 0;
196 202
@@ -199,24 +205,29 @@ int main(int argc, char **args) {
199 if (tbuch->addr[i] == 10) 205 if (tbuch->addr[i] == 10)
200 ++lines; 206 ++lines;
201 207
202 sort_array = (entry_t*)malloc((lines / COLUMNS) * sizeof(entry_t)); 208 g_out_array = (outvec_t*)malloc(lines * sizeof(outvec_t));
203 out_array = (outvec_t*)malloc((lines / COLUMNS) * sizeof(outvec_t)); 209 sort_array = (entry_t*)malloc(lines * sizeof(entry_t));
204 210
205 ptr = (char*)tbuch->addr; 211 ptr = (char*)tbuch->addr;
206 start = ptr;
207 212
208 while (ptr < (char*)tbuch->addr + tbuch->size) { 213 while (ptr < (char*)tbuch->addr + tbuch->size) {
209 int c; 214 int c, year;
210
211 start = ptr;
212 215
213 /* Look for field terminator */ 216 /* Look for field terminator */
214 for (c=0; c<COLUMNS; ++c) { 217 for (c=0; c<COLUMNS; ++c) {
215 char * end = strchr(ptr, 10); 218 char * end = strchr(ptr, 10);
216 if (c==1) { 219 if (c==0)
220 year = year_to_offset(ptr);
221 if (c==1)
217 flag = strtoul(ptr, 0, 16); 222 flag = strtoul(ptr, 0, 16);
218 out_array[outoff].ptr = end + 1; 223 if (c==5)
219 } 224 g_out_array[outoff+8].data = string_to_hash(ptr, end);
225 g_out_array[outoff+c].ptr = ptr;
226 g_out_array[outoff+c].size = end - ptr;
227 if (c==8)
228 g_out_array[outoff+c].data *= string_to_hash(ptr, end);
229 else
230 g_out_array[outoff+c].data = 0;
220 ptr = end + 1; 231 ptr = end + 1;
221 } 232 }
222 233
@@ -224,54 +235,29 @@ int main(int argc, char **args) {
224 assert( current >= 0); 235 assert( current >= 0);
225 sort_array[current].rows++; 236 sort_array[current].rows++;
226 } else { 237 } else {
227 sort_array[++current].ptr = start; 238 sort_array[++current].rows = 0;
228 sort_array[current].rows = 0;
229 sort_array[current].outoff = outoff; 239 sort_array[current].outoff = outoff;
230 sort_array[current].flag = flag; 240 sort_array[current].flag = flag;
241 sort_array[current].year = year;
231 } 242 }
232 out_array[outoff].size = ptr - out_array[outoff].ptr; 243 outoff += COLUMNS;
233 outoff++;
234 } 244 }
235 245
236 /* Sort the whole thing */ 246 /* Sort the whole thing */
237 qsort(sort_array, current, sizeof(entry_t), sort_me); 247 qsort(sort_array, current, sizeof(entry_t), sort_me);
238 248
239 for (i=0; i<=current; ++i) { 249 for (i=0; i<=current; ++i) {
240 int j, dump = 0, prec; 250 year_list |= 1LL << sort_array[i].year;
241 251 if (sort_array[i].flag & 0x80 ) bizflag_list |= 1LL << sort_array[i].year;
242 int year = year_to_offset(sort_array[i].ptr); 252 if (sort_array[i].flag & 0x40 ) revflag_list |= 1LL << sort_array[i].year;
243
244 year_list |= 1LL << year;
245 if (sort_array[i].flag & 0x80 ) bizflag_list |= 1LL << year;
246 if (sort_array[i].flag & 0x40 ) revflag_list |= 1LL << year;
247
248 /* The last entry always needs to be dumped, but check if its
249 precision is better than the old truth's
250 The second comparision checks for equality of entries (modulo
251 coordinate mismatch)
252 */
253 if (i == current) {
254 compare_entries(sort_array+i, sort_array+i, &prec);
255 dump = 1;
256 } else if (compare_entries(sort_array+i, sort_array+i+1, &prec))
257 dump = 1;
258
259 /* If this entry's precision is higher than the one of possible
260 earlier matches, then the current entry becomes the truth */
261 if (prec >= truth_prec) {
262 truth = i;
263 truth_prec = prec;
264 }
265 253
266 if (dump) { 254 if ((i == current) || compare_entries(sort_array+i, sort_array+i+1)) {
267 printf("%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t", year_list, bizflag_list, revflag_list); 255 printf("%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t", year_list, bizflag_list, revflag_list);
268 for (int c=0; c<COLUMNS-2; ++c) { 256 for (int c=2; c<COLUMNS; ++c) {
269 outvec_t * out = out_array + sort_array[truth].outoff; 257 int j, started = 0, skipped = 0;
270 int started = 0, skipped = 0; 258 for (j=0; j<=sort_array[i].rows; ++j) {
271 for (j=0; j<=sort_array[truth].rows; ++j) { 259 outvec_t * out = g_out_array + sort_array[i].outoff + j * COLUMNS;
272 char *s = strchr(out->ptr, 10); 260 if (!out[c].size || *(out[c].ptr) == 9)
273 size_t len = s - out->ptr;
274 if (!len || out->ptr[0] == 9)
275 skipped++; 261 skipped++;
276 else { 262 else {
277 if (!started++) 263 if (!started++)
@@ -279,32 +265,24 @@ int main(int argc, char **args) {
279 else 265 else
280 putchar(','); 266 putchar(',');
281 for (int x=0; x<skipped; ++x) fputs("null,", stdout); 267 for (int x=0; x<skipped; ++x) fputs("null,", stdout);
282 if (c != COLUMNS-3) 268 if (c != COLUMNS-1)
283 escape_string(out->ptr, len); 269 escape_string(out[c].ptr, out[c].size);
284 else { 270 else {
285 char coords[64], *tab; 271 char coords[64], *tab;
286// memcpy(coords, "POINT(", 6); 272 memcpy(coords, out[c].ptr, out[c].size);
287// memcpy(coords + 6, out->ptr, len); 273 tab = memchr(coords, 9, out[c].size);
288// tab = memchr(coords + 6, 9, len); 274 if (tab) *tab = ' ';
289// if (tab) *tab = ' '; 275 fwrite(coords, out[c].size, 1, stdout);
290// coords[6+len] = ')';
291// fwrite(coords, 7 + len, 1, stdout);
292 memcpy(coords, out->ptr, len);
293 tab = memchr(coords, 9, len);
294 if (tab) *tab = ' ';
295 fwrite(coords, len, 1, stdout);
296 } 276 }
297 skipped = 0; 277 skipped = 0;
298 } 278 }
299 out->ptr = s + 1;
300 ++out; 279 ++out;
301 } 280 }
302 if (started) putchar('}'); 281 if (started) putchar('}');
303 if (c<COLUMNS-3) putchar(9); 282 if (c<COLUMNS-1) putchar(9);
304 } 283 }
305 putchar(10); 284 putchar(10);
306 285
307 truth_prec = -1;
308 year_list = 0; 286 year_list = 0;
309 bizflag_list = 0; 287 bizflag_list = 0;
310 revflag_list = 0; 288 revflag_list = 0;