summary | refs | log | tree | commit | diff
path: root/src/postprocess
diff options
context:
space:
mode:
author Dirk Engling <erdgeist@erdgeist.org> 2019-06-29 18:51:00 +0200
committer Dirk Engling <erdgeist@erdgeist.org> 2019-06-29 18:51:00 +0200
commit fb3616e06ca20ffe78dfb00b962a2599a46b2a5f (patch)
tree fe7ad54e8cb17174b3296b63f41ad1f5b1038f8e /src/postprocess
parent b4bf8417af0d8ebff2c50570c70fdecaf6a53ed9 (diff)
Clean up 1992 post processing
Diffstat (limited to 'src/postprocess')
-rw-r--r-- src/postprocess/map_plz.c 4
-rw-r--r-- src/postprocess/merge_entries.c 5
-rw-r--r-- src/postprocess/postprocess-1992.sh 15
3 files changed, 11 insertions, 13 deletions
diff --git a/src/postprocess/map_plz.c b/src/postprocess/map_plz.c
index ab0db71..9dec6bb 100644
--- a/src/postprocess/map_plz.c
+++ b/src/postprocess/map_plz.c
@@ -18,7 +18,7 @@ int main(int argc, char **args) {
18 FILE *bfile, *streetfile_out; 18 FILE *bfile, *streetfile_out;
19 char *ptr, *input = malloc(65335); 19 char *ptr, *input = malloc(65335);
20 char *ort = malloc(65335), vorwahl_block[16]; 20 char *ort = malloc(65335), vorwahl_block[16];
21 int i, brutes_count = 0, report = 0; 21 unsigned int i, brutes_count = 0, report = 0;
22 brute_t *brutes = malloc(200000*sizeof(brute_t)); 22 brute_t *brutes = malloc(200000*sizeof(brute_t));
23 23
24 /* prepare io */ 24 /* prepare io */
@@ -63,7 +63,7 @@ int main(int argc, char **args) {
63 g_book_by_name = (entry_t*)malloc(g_book_size * sizeof(entry_t)); 63 g_book_by_name = (entry_t*)malloc(g_book_size * sizeof(entry_t));
64 64
65 /* Split pointers into input files into our arrays */ 65 /* Split pointers into input files into our arrays */
66 for (i = 0, ptr = (char*)tbuch->addr; i < g_book_size; ++i) { 66 for (i=0, ptr=(char*)tbuch->addr; i<g_book_size; ++i) {
67 g_book[i].vorwahl = ptr; ptr += strlen(ptr) + 1; 67 g_book[i].vorwahl = ptr; ptr += strlen(ptr) + 1;
68 g_book[i].ort = ptr; ptr += strlen(ptr) + 1; 68 g_book[i].ort = ptr; ptr += strlen(ptr) + 1;
69 g_book[i].strasse = ptr; ptr += strlen(ptr) + 1; 69 g_book[i].strasse = ptr; ptr += strlen(ptr) + 1;
diff --git a/src/postprocess/merge_entries.c b/src/postprocess/merge_entries.c
index f9ee67d..1dd7d50 100644
--- a/src/postprocess/merge_entries.c
+++ b/src/postprocess/merge_entries.c
@@ -148,9 +148,10 @@ static int sort_me(const void *f_a, const void *f_b) {
148 outvec_t *oa_row = oa + row * COLUMNS; 148 outvec_t *oa_row = oa + row * COLUMNS;
149 outvec_t *ob_row = ob + row * COLUMNS; 149 outvec_t *ob_row = ob + row * COLUMNS;
150 150
151 if ((res = STRCMP_n(oa_row[ 2].ptr, ob_row[ 2].ptr))) return res; /* PLZ */
152 if ((res = STRCMP_n(oa_row[ 9].ptr, ob_row[ 9].ptr))) return res; /* Ort */
151 if ((res = STRCMP_n(oa_row[10].ptr, ob_row[10].ptr))) return res; /* Vorwahl */ 153 if ((res = STRCMP_n(oa_row[10].ptr, ob_row[10].ptr))) return res; /* Vorwahl */
152 if ((res = STRCMP_n(oa_row[11].ptr, ob_row[11].ptr))) return res; /* Rufnummer */ 154 if ((res = STRCMP_n(oa_row[11].ptr, ob_row[11].ptr))) return res; /* Rufnummer */
153 if ((res = STRCMP_n(oa_row[ 2].ptr, ob_row[ 2].ptr))) return res; /* PLZ */
154 if ((res = STRCMP_n(oa_row[ 6].ptr, ob_row[ 6].ptr))) return res; /* Strasse */ 155 if ((res = STRCMP_n(oa_row[ 6].ptr, ob_row[ 6].ptr))) return res; /* Strasse */
155 if ((res = STRCMP_n(oa_row[ 7].ptr, ob_row[ 7].ptr))) return res; /* Hausnummer */ 156 if ((res = STRCMP_n(oa_row[ 7].ptr, ob_row[ 7].ptr))) return res; /* Hausnummer */
156 if ((res = STRCMP_n(oa_row[ 3].ptr, ob_row[ 3].ptr))) return res; /* Nachname */ 157 if ((res = STRCMP_n(oa_row[ 3].ptr, ob_row[ 3].ptr))) return res; /* Nachname */
@@ -200,7 +201,7 @@ int main(int argc, char **args) {
200 unsigned long current = 0, i, flag; 201 unsigned long current = 0, i, flag;
201 uint64_t year_list = 0, revflag_list = 0, bizflag_list = 0; 202 uint64_t year_list = 0, revflag_list = 0, bizflag_list = 0;
202 203
203 if (argc != 1) exit(1); 204 if (argc != 2) exit(1);
204 tbuch = map_file(args[1], 1); 205 tbuch = map_file(args[1], 1);
205 206
206 /* Estimate upper bound for amount of lines */ 207 /* Estimate upper bound for amount of lines */
diff --git a/src/postprocess/postprocess-1992.sh b/src/postprocess/postprocess-1992.sh
index 1e685d2..6720991 100644
--- a/src/postprocess/postprocess-1992.sh
+++ b/src/postprocess/postprocess-1992.sh
@@ -4,19 +4,16 @@ paste 1992_Q2/{01_Flags,12_Vorwahl,12_Vorwahl_block,11_Ort,07_Strasse,08_Hausnum
4# Generate lookup file from 1995 4# Generate lookup file from 1995
5paste 1995_Q0/{12_Vorwahl,11_Ort,07_Strasse,08_Hausnummer,02_Nachname,03_Vorname,13_Rufnummer,10_Postleitzahl} | tr '\n\t' '\0' > 1995-voshnvrp.bin 5paste 1995_Q0/{12_Vorwahl,11_Ort,07_Strasse,08_Hausnummer,02_Nachname,03_Vorname,13_Rufnummer,10_Postleitzahl} | tr '\n\t' '\0' > 1995-voshnvrp.bin
6 6
7# To debug in lldb
8process launch -i 1992_testfile.txt -- 1995-vorwahl-ort-strasse-hnr-name-vorname-rufnummer-plz.bin
9
10# Compile plz mapper 7# Compile plz mapper
11cc -O3 -o map_plz map_plz.c -I ../src/export/ ../src/export/mystdlib.c 8cc -O3 -o map_plz map_plz.c -I ../src/export/ ../src/export/mystdlib.c
12 9
10# To debug in lldb
11# cc -O0 -g -o map_plz map_plz.c -I ../src/export/ ../src/export/mystdlib.c
12# process launch -i 1992_testfile.txt -- 1995-vorwahl-ort-strasse-hnr-name-vorname-rufnummer-plz.bin
13
13# outputs mapped plz, generates brutemap.txt 14# outputs mapped plz, generates brutemap.txt
14touch brutemap_input.bin zip_simple_map.bin 15touch brutemap_input.bin zip_simple_map.bin
15./map_plz 1995-voshnvrp.bin zip_simple_map.bin brutemap_input.bin < 1992-fvvoshnvrpp.txt > 10_Postleitzahl 16./map_plz 1995-voshnvrp.bin zip_simple_map.bin brutemap_input.bin < 1992-fvvoshnvrpp.txt > 1992_Q2/10_Postleitzahl
16
17# generate street name translation table from brutemap,
18# only taking into account similar street names
19# cut -f 3,4 brutemap.txt | tr '[:upper:]' '[:lower:]' | paste brutemap.txt - | cut -f 1-4,6,7 | ./jaro | cut -f 1-5 > brutemap_filtered.txt
20 17
21# generate street name translation table from brutemap, 18# generate street name translation table from brutemap,
22# only taking into account similar street names, new style 19# only taking into account similar street names, new style
@@ -29,4 +26,4 @@ cut -f 1-5 brutemap_simifiltered.txt | sort | uniq -c | sed -E $'s:^ *([[:digit:
29sort -u zip_mapfile.txt | tr '\n' '\0' > zip_simple_map.bin 26sort -u zip_mapfile.txt | tr '\n' '\0' > zip_simple_map.bin
30 27
31# Redo the mapping with the data from brutemap and zipmap 28# Redo the mapping with the data from brutemap and zipmap
32./map_plz 1995-voshnvrp.bin zip_simple_map.bin brutemap_input.bin < 1992-fvvoshnvrpp.txt > 10_Postleitzahl 29./map_plz 1995-voshnvrp.bin zip_simple_map.bin brutemap_input.bin < 1992-fvvoshnvrpp.txt > 1992_Q2/10_Postleitzahl