summaryrefslogtreecommitdiff
path: root/src/postprocess
diff options
context:
space:
mode:
authorDirk Engling <erdgeist@erdgeist.org>2019-02-23 23:27:40 +0100
committerDirk Engling <erdgeist@erdgeist.org>2019-02-23 23:27:40 +0100
commit602c32207ac0085489dcdd436a6866d35455fc89 (patch)
treef8ebd07c3f0681194a48e4ef972bde326b7ecafe /src/postprocess
parent380b982004e35e14f6b2ad5c5d82f33c6a3c2b4e (diff)
Add code for database import
Diffstat (limited to 'src/postprocess')
-rw-r--r--src/postprocess/merge_entries.c313
-rw-r--r--src/postprocess/postgres.txt41
-rw-r--r--src/postprocess/postprocess.sh18
-rw-r--r--src/postprocess/sort_plz.c4
4 files changed, 374 insertions, 2 deletions
diff --git a/src/postprocess/merge_entries.c b/src/postprocess/merge_entries.c
new file mode 100644
index 0000000..3ebfa8c
--- /dev/null
+++ b/src/postprocess/merge_entries.c
@@ -0,0 +1,313 @@
1#include "mystdlib.h"
2
3#include <stdlib.h>
4#include <stdio.h>
5#include <string.h>
6#include <inttypes.h>
7#include <assert.h>
8
9enum { COLUMNS = 15 };
10typedef struct {
11 char *ptr;
12 long rows;
13 long outoff;
14 long flag;
15} entry_t;
16typedef struct {
17 char *ptr;
18 size_t size;
19} outvec_t;
20
21const char *g_year_map[] = {
22"1992_Q2", "1995_Q0", "1996_Q0", "1996_Q1", "1997_Q1", "1997_Q3", "1998_Q1", "1998_Q3", "1999_Q1", "1999_Q3", "2000_Q1", "2000_Q3", "2001_Q1", "2001_Q2", "2001_Q3", "2001_Q4", "2002_Q1",
23"2002_Q3", "2003_Q1", "2003_Q3", "2004_Q1", "2004_Q3", "2005_Q1", "2005_Q3", "2006_Q1", "2006_Q3", "2007_Q1", "2007_Q3", "2008_Q1", "2008_Q3", "2009_Q1", "2009_Q3", "2010_Q1", "2010_Q3",
24"2011_Q1", "2011_Q3", "2012_Q1", "2012_Q3", "2013_Q1", "2013_Q3", "2014_Q1", "2014_Q3", "2015_Q1", "2015_Q3", "2016_Q1", "2016_Q3", "2017_Q1", "2017_Q3", "2018_Q1", "2018_Q3", "2019_Q1",
250
26};
27
28void SKIP_1_COLUMN(char **ptr) { *ptr = strchr(*ptr, 10) + 1; }
29void SKIP_2_COLUMNS(char **ptr) { SKIP_1_COLUMN(ptr); SKIP_1_COLUMN(ptr); }
30void SKIP_3_COLUMNS(char **ptr) { SKIP_1_COLUMN(ptr); SKIP_1_COLUMN(ptr); SKIP_1_COLUMN(ptr); }
31
32int year_to_offset(const char *year) {
33 const char **y = g_year_map;
34 int off = 0;
35 while (*y) {
36 if (!memcmp(year, *y, 7)) return off;
37 ++off; ++y;
38 }
39 return -1;
40}
41
42
43int
44STRCMP_n (const char *p1, const char *p2)
45{
46 const unsigned char *s1 = (const unsigned char *) p1;
47 const unsigned char *s2 = (const unsigned char *) p2;
48 unsigned char c1, c2;
49 do
50 {
51 c1 = (unsigned char) *s1++;
52 c2 = (unsigned char) *s2++;
53 if (c1 == 10)
54 return c1 - c2;
55 }
56 while (c1 == c2);
57 return c1 - c2;
58}
59
60int compare_entries(entry_t*a, entry_t*b, int *prec) {
61 char *pa = a->ptr, *pb = b->ptr;
62 int col, row, res = 0, nprec = -1;
63
64 /* Multi line entries never match single line entries */
65 if (a->rows != b->rows)
66 return -1;
67
68 /* Assume house number precision first .. unless */
69 if (!memcmp(pa,"2006_Q3",7))
70 *prec = 2;
71 else
72 *prec = 3;
73
74 if (!memcmp(pb,"2006_Q3",7))
75 nprec = 2;
76 else
77 nprec = 3;
78
79 /* Skip year and flags */
80 SKIP_2_COLUMNS(&pa);
81 SKIP_2_COLUMNS(&pb);
82
83 /* Test all columns for identity */
84 for (col=2; col<COLUMNS-1; ++col) {
85 if (!res && STRCMP_n(pa, pb))
86 res = -1;
87 SKIP_1_COLUMN(&pa);
88 SKIP_1_COLUMN(&pb);
89 }
90
91 /* If no coords, downgrade precision */
92 if (*pa == 9) *prec = 1;
93 if (*pb == 9) nprec = 1;
94
95 /* If entries differ, return after precision has been found */
96 if (res) return res;
97
98 /* Only if precision is the same, difference in coordinates
99 is significant.
100 if (*prec == nprec && STRCMP_n(pa, pb))
101 return -1; */
102
103 /* Row 1 has been compared, check the rest of lines */
104 for (row=0; row<a->rows; ++row) {
105
106 /* Skip last row's coordinate columns, year and flags */
107 SKIP_3_COLUMNS(&pa);
108 SKIP_3_COLUMNS(&pb);
109
110 for (col=2; col<COLUMNS-1; ++col) {
111 if (STRCMP_n(pa, pb))
112 return -1;
113 SKIP_1_COLUMN(&pa);
114 SKIP_1_COLUMN(&pb);
115 }
116
117 /* Only if precision is the same, difference in coordinates
118 is significant.
119 if (*prec == nprec && STRCMP_n(pa, pb))
120 return -1; */
121 }
122 return 0;
123}
124
125int sort_me(const void *f_a, const void *f_b) {
126 entry_t *e_a = (entry_t *)f_a;
127 entry_t *e_b = (entry_t *)f_b;
128
129 char * pa = (char*)e_a->ptr;
130 char * pb = (char*)e_b->ptr;
131
132 int results[COLUMNS], c;
133
134 if (e_a->rows != e_b->rows)
135 return e_a->rows - e_b->rows;
136
137 for (c = 0; c<COLUMNS; ++c) {
138 results[c] = STRCMP_n(pa, pb);
139 SKIP_1_COLUMN(&pa);
140 SKIP_1_COLUMN(&pb);
141 }
142
143 if (results[10]) return results[10]; /* Vorwahl */
144 if (results[11]) return results[11]; /* Rufnummer */
145 if (results[2]) return results[2]; /* PLZ */
146 if (results[3]) return results[3]; /* Nachname */
147 if (results[4]) return results[4]; /* Vorname */
148 if (results[6]) return results[6]; /* Strasse */
149 if (results[7]) return results[7]; /* Hausnummer */
150 if (results[8]) return results[8]; /* Verweise */
151 if (results[0]) return results[0]; /* Year */
152 return 0;
153}
154
155enum { OUTPUT_BUFFER_SIZE = 1024*1024*128 };
156
157static void do_escape_string(char * s, size_t len) {
158 size_t i;
159
160 putchar('"');
161 for (i=0; i<len; ++i) {
162 switch(s[i]) {
163 case '\\':
164 putchar('\\');
165 case '"':
166 putchar('\\');
167 case ',':
168 putchar('\\');
169 default:
170 break;
171 }
172 putchar(s[i]);
173 }
174 putchar('"');
175}
176
177static void escape_string(char * s, size_t len) {
178 size_t i;
179 if (len == 4 && ( ( s[0] | 0x20 ) == 'n' ) && ( ( s[1] | 0x20 ) == 'u' ) && ( ( s[2] | 0x20 ) == 'l' ) && ( ( s[3] | 0x20 ) == 'l' ) )
180 return do_escape_string(s, len);
181 for (i=0; i<len; ++i)
182 switch(s[i]) {
183 case '{': case '\\': case '}': case '"': case ',': return do_escape_string(s, len);
184 }
185 fwrite(s, len, 1, stdout);
186}
187
188int main(int argc, char **args) {
189 MAP tbuch = map_file(args[1], 1);
190 char *ptr, *start;
191 entry_t * sort_array;
192 outvec_t * out_array;
193 int current = -1, outoff = 0, lines = 1, i, truth = 0, truth_prec = -1;
194 uint64_t year_list = 0, revflag_list = 0, bizflag_list = 0;
195 long flag = 0;
196
197 /* Estimate upper bound for amount of lines */
198 for (i=0; i<tbuch->size; ++i)
199 if (tbuch->addr[i] == 10)
200 ++lines;
201
202 sort_array = (entry_t*)malloc((lines / COLUMNS) * sizeof(entry_t));
203 out_array = (outvec_t*)malloc((lines / COLUMNS) * sizeof(outvec_t));
204
205 ptr = (char*)tbuch->addr;
206 start = ptr;
207
208 while (ptr < (char*)tbuch->addr + tbuch->size) {
209 int c;
210
211 start = ptr;
212
213 /* Look for field terminator */
214 for (c=0; c<COLUMNS; ++c) {
215 char * end = strchr(ptr, 10);
216 if (c==1) {
217 flag = strtoul(ptr, 0, 16);
218 out_array[outoff].ptr = end + 1;
219 }
220 ptr = end + 1;
221 }
222
223 if (flag&2) {
224 assert( current >= 0);
225 sort_array[current].rows++;
226 } else {
227 sort_array[++current].ptr = start;
228 sort_array[current].rows = 0;
229 sort_array[current].outoff = outoff;
230 sort_array[current].flag = flag;
231 }
232 out_array[outoff].size = ptr - out_array[outoff].ptr;
233 outoff++;
234 }
235
236 /* Sort the whole thing */
237 qsort(sort_array, current, sizeof(entry_t), sort_me);
238
239 for (i=0; i<=current; ++i) {
240 int j, dump = 0, prec;
241
242 int year = year_to_offset(sort_array[i].ptr);
243
244 year_list |= 1LL << year;
245 if (sort_array[i].flag & 0x80 ) bizflag_list |= 1LL << year;
246 if (sort_array[i].flag & 0x40 ) revflag_list |= 1LL << year;
247
248 /* The last entry always needs to be dumped, but check if its
249 precision is better than the old truth's
250 The second comparision checks for equality of entries (modulo
251 coordinate mismatch)
252 */
253 if (i == current) {
254 compare_entries(sort_array+i, sort_array+i, &prec);
255 dump = 1;
256 } else if (compare_entries(sort_array+i, sort_array+i+1, &prec))
257 dump = 1;
258
259 /* If this entry's precision is higher than the one of possible
260 earlier matches, then the current entry becomes the truth */
261 if (prec >= truth_prec) {
262 truth = i;
263 truth_prec = prec;
264 }
265
266 if (dump) {
267 printf("%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t", year_list, bizflag_list, revflag_list);
268 for (int c=0; c<COLUMNS-2; ++c) {
269 outvec_t * out = out_array + sort_array[truth].outoff;
270 int started = 0, skipped = 0;
271 for (j=0; j<=sort_array[truth].rows; ++j) {
272 char *s = strchr(out->ptr, 10);
273 size_t len = s - out->ptr;
274 if (!len || out->ptr[0] == 9)
275 skipped++;
276 else {
277 if (!started++)
278 putchar('{');
279 else
280 putchar(',');
281 for (int x=0; x<skipped; ++x) fputs("null,", stdout);
282 if (c != COLUMNS-3)
283 escape_string(out->ptr, len);
284 else {
285 char coords[64], *tab;
286// memcpy(coords, "POINT(", 6);
287// memcpy(coords + 6, out->ptr, len);
288// tab = memchr(coords + 6, 9, len);
289// if (tab) *tab = ' ';
290// coords[6+len] = ')';
291// fwrite(coords, 7 + len, 1, stdout);
292 memcpy(coords, out->ptr, len);
293 tab = memchr(coords, 9, len);
294 if (tab) *tab = ' ';
295 fwrite(coords, len, 1, stdout);
296 }
297 skipped = 0;
298 }
299 out->ptr = s + 1;
300 ++out;
301 }
302 if (started) putchar('}');
303 if (c<COLUMNS-3) putchar(9);
304 }
305 putchar(10);
306
307 truth_prec = -1;
308 year_list = 0;
309 bizflag_list = 0;
310 revflag_list = 0;
311 }
312 }
313}
diff --git a/src/postprocess/postgres.txt b/src/postprocess/postgres.txt
new file mode 100644
index 0000000..d1b4669
--- /dev/null
+++ b/src/postprocess/postgres.txt
@@ -0,0 +1,41 @@
1CREATE TABLE telefonbuch (
2 presence_flag bigint,
3 reverse_flag bigint,
4 biz_flag bigint,
5 zip character varying(64)[] DEFAULT '{}',
6 nachname character varying(256)[] DEFAULT '{}',
7 vorname character varying(192)[] DEFAULT '{}',
8 zusaetze character varying(512)[] DEFAULT '{}',
9 strasse character varying(64)[] DEFAULT '{}',
10 hausnummer character varying(32)[] DEFAULT '{}',
11 verweise text[] DEFAULT '{}',
12 ort character varying(64)[] DEFAULT '{}',
13 vorwahl character varying(10)[] DEFAULT '{}',
14 rufnummer character varying(32)[] DEFAULT '{}',
15 web character varying(256)[] DEFAULT '{}',
16 email character varying(256)[] DEFAULT '{}',
17 coords character varying(32)[] DEFAULT '{}'
18);
19
20COPY Telefonbuch ( presence_flag, reverse_flag, biz_flag, zip, nachname, vorname, zusaetze, strasse, hausnummer, verweise, ort, vorwahl, rufnummer, web, email, coords ) FROM 'telefonbuch.txt' WITH NULL AS 'NULL';
21
22CREATE MATERIALZED VIEW nachname_view AS SELECT id, nachname FROM telefonbuch CROSS JOIN UNNEST(nachname) WITH ORDINALITY u(name, name_id) WITH data;
23
24CREATE INDEX idx_hausnummer ON public.telefonbuch USING btree (hausnummer);
25CREATE INDEX idx_multilinegroup ON public.telefonbuch USING btree (multilinegroup);
26CREATE INDEX idx_nachname ON public.telefonbuch USING btree (nachname);
27CREATE INDEX idx_ort ON public.telefonbuch USING btree (ort);
28CREATE INDEX idx_rufnummer ON public.telefonbuch USING btree (rufnummer);
29CREATE INDEX idx_strasse ON public.telefonbuch USING btree (strasse);
30CREATE INDEX idx_vorname ON public.telefonbuch USING btree (vorname);
31CREATE INDEX idx_vorwahl ON public.telefonbuch USING btree (vorwahl);
32CREATE INDEX idx_zip ON public.telefonbuch USING btree (zip);
33CREATE INDEX idx_zusaetze ON public.telefonbuch USING btree (zusaetze);
34
35CREATE INDEX trgm_idx_nachname ON public.telefonbuch USING gin (nachname gin_trgm_ops);
36CREATE INDEX trgm_idx_rufnummer ON public.telefonbuch USING gin (rufnummer gin_trgm_ops);
37CREATE INDEX trgm_idx_verweise ON public.telefonbuch USING gin (verweise gin_trgm_ops);
38CREATE INDEX trgm_idx_vorname ON public.telefonbuch USING gin (vorname gin_trgm_ops);
39CREATE INDEX trgm_idx_zusaetze ON public.telefonbuch USING gin (zusaetze gin_trgm_ops);
40
41SELECT *, ST_AsText(coords) FROM Telefonbuch WHERE nachname = 'Engling';
diff --git a/src/postprocess/postprocess.sh b/src/postprocess/postprocess.sh
new file mode 100644
index 0000000..cc63a4e
--- /dev/null
+++ b/src/postprocess/postprocess.sh
@@ -0,0 +1,18 @@
1export LANG=C
2export LC_ALL=C
3export LC_CTYPE=C
4
5sudo sysctl kern.maxfilesperproc=65563
6sudo sysctl kern.maxfiles=65563
7ulimit -n 65563
8
9for a in 199[56789]_Q? 20*_Q?; do echo $a; ./sort_plz $a; done
10
11rm -rf sorted
12mkdir sorted
13cd output
14
15for a in *; do echo $a >&2; ../merge_entries $a | iconv -f iso8859-15 -t utf-8 > ../sorted/$a; done
16
17cd ../output
18
diff --git a/src/postprocess/sort_plz.c b/src/postprocess/sort_plz.c
index 26ac9d0..4c30ea3 100644
--- a/src/postprocess/sort_plz.c
+++ b/src/postprocess/sort_plz.c
@@ -17,10 +17,10 @@
17#include <unistd.h> 17#include <unistd.h>
18#include <err.h> 18#include <err.h>
19 19
20enum { F_00, F_01, F_10, F_02, F_03, F_04, F_07, F_08, F_09, F_11, F_12, F_13, F_14, F_16, F_COUNT }; 20enum { F_00, F_01, F_10, F_02, F_03, F_04, F_07, F_08, F_09, F_11, F_12, F_13, F_14, F_15, F_16, F_COUNT };
21 21
22static char *g_filenames[] = { 22static char *g_filenames[] = {
23 "00_Jahr", "01_Flags", "10_Postleitzahl", "02_Nachname", "03_Vorname", "04_Zusaetze", "07_Strasse", "08_Hausnummer", "09_Verweise", "11_Ort", "12_Vorwahl", "13_Rufnummer", "14_15_Email_Webadresse", "16_Koordinaten" }; 23 "00_Jahr", "01_Flags", "10_Postleitzahl", "02_Nachname", "03_Vorname", "04_Zusaetze", "07_Strasse", "08_Hausnummer", "09_Verweise", "11_Ort", "12_Vorwahl", "13_Rufnummer", "14_Webadresse", "15_Email", "16_Koordinaten" };
24 24
25typedef struct { 25typedef struct {
26 char plz[8]; 26 char plz[8];