From b4bf8417af0d8ebff2c50570c70fdecaf6a53ed9 Mon Sep 17 00:00:00 2001 From: Dirk Engling Date: Wed, 20 Mar 2019 04:30:29 +0100 Subject: Add code to lookup new zip codes for 1995 entries and fix up streetnames --- src/postprocess/map_plz.c | 388 ++++++++++++++++++++++++++++++++++++ src/postprocess/map_plz.h | 39 ++++ src/postprocess/postprocess-1992.sh | 32 +++ src/postprocess/simi.py | 11 + 4 files changed, 470 insertions(+) create mode 100644 src/postprocess/map_plz.c create mode 100644 src/postprocess/map_plz.h create mode 100644 src/postprocess/postprocess-1992.sh create mode 100755 src/postprocess/simi.py diff --git a/src/postprocess/map_plz.c b/src/postprocess/map_plz.c new file mode 100644 index 0000000..ab0db71 --- /dev/null +++ b/src/postprocess/map_plz.c @@ -0,0 +1,388 @@ +#include "mystdlib.h" + +#include +#include +#include +#include +#include + +#include "map_plz.h" + +static entry_t *g_book, *g_book_by_name; +static size_t g_book_size, g_zipmap_size; +static const char * g_zipmap[32000]; +static FILE *g_mapfile_out; + +int main(int argc, char **args) { + MAP tbuch, zipmap, brutemap; + FILE *bfile, *streetfile_out; + char *ptr, *input = malloc(65335); + char *ort = malloc(65335), vorwahl_block[16]; + int i, brutes_count = 0, report = 0; + brute_t *brutes = malloc(200000*sizeof(brute_t)); + + /* prepare io */ + if (argc != 4) exit(1); + tbuch = map_file(args[1], 1); + zipmap = map_file(args[2], 1); + brutemap = map_file(args[3], 1); + + /* read all files */ + g_zipmap[0] = (char*)zipmap->addr; + for (i=0; isize; ++i) + if (!zipmap->addr[i]) + g_zipmap[++g_zipmap_size] = (char*)(zipmap->addr + i + 1); + qsort(g_zipmap, g_zipmap_size, sizeof(char*), cc); + + ptr = (char*)brutemap->addr; + + /* Split brute records */ + while (ptr < (char*)brutemap->addr + brutemap->size) { + brutes[brutes_count].count = atol(ptr); ptr += strlen(ptr) + 1; + brutes[brutes_count].similarity = atol(ptr); ptr += strlen(ptr) + 1; + + brutes[brutes_count].vorwahl = ptr; ptr += 
strlen(ptr) + 1; + brutes[brutes_count].ort = ptr; ptr += strlen(ptr) + 1; + brutes[brutes_count].strasse_1992 = ptr; ptr += strlen(ptr) + 1; + brutes[brutes_count].strasse_1995 = ptr; ptr += strlen(ptr) + 1; + + if (brutes[brutes_count].count >= 20 || brutes[brutes_count].similarity >= 70) + brutes_count++; + } + qsort(brutes, brutes_count, sizeof(brute_t), sort_brutes); + + /* count phonebook lines */ + for (i=0; isize; ++i) + if (!tbuch->addr[i]) + ++g_book_size; + + /* We expect 8 columns per line */ + g_book_size /= 8; + + g_book = (entry_t*)malloc(g_book_size * sizeof(entry_t)); + g_book_by_name = (entry_t*)malloc(g_book_size * sizeof(entry_t)); + + /* Split pointers into input files into our arrays */ + for (i = 0, ptr = (char*)tbuch->addr; i < g_book_size; ++i) { + g_book[i].vorwahl = ptr; ptr += strlen(ptr) + 1; + g_book[i].ort = ptr; ptr += strlen(ptr) + 1; + g_book[i].strasse = ptr; ptr += strlen(ptr) + 1; + g_book[i].hnr = ptr; ptr += strlen(ptr) + 1; + g_book[i].name = ptr; ptr += strlen(ptr) + 1; + g_book[i].vorname = ptr; ptr += strlen(ptr) + 1; + g_book[i].nummer = ptr; ptr += strlen(ptr) + 1; + g_book[i].plz = ptr; ptr += strlen(ptr) + 1; + } + /* Copy input array so that it can be sorted by different criteria */ + memcpy(g_book_by_name, g_book, g_book_size * sizeof(entry_t)); + + fprintf(stderr, "STEP 1: import done\n"); + + /* Sort the whole thing */ + qsort(g_book, g_book_size, sizeof(entry_t), sort_by_voshnvn); + qsort(g_book_by_name, g_book_size, sizeof(entry_t), sort_by_vonvh); + + fprintf(stderr, "STEP 2: sort done\n"); + + g_mapfile_out = fopen("zip_mapfile.txt", "w"); + streetfile_out = fopen("07_Strasse_fixed", "w"); + bfile = fopen("brutemap.txt", "w"); + + while (1) { + char flag, *t, *l = fgets(input, 65536, stdin); + const char *orig_strasse; + entry_t local; + brute_t *found_brute; + int once = 0, fixed = 0; + + if (!l) break; + if ((report++ % 300000) == 0) fprintf(stderr, "% 10d lines done\n", report); + + /* Scan and skip flags */ 
+ flag = strtoul(l, &l, 16); ++l; + + /* Copy vorwahl, if in field */ + local.vorwahl = advance_and_replace(&l, 9, 0); + + /* Copy over vorwahl for whole exported block, if we're not in cont */ + t = advance_and_replace(&l, 9, 0); + if (flag < 2) strcpy(vorwahl_block, t); + + /* Only copy over ort from continuations, if present */ + if (flag < 2 || (*l != 9) ) local.ort = l; else local.ort = ort; + advance_and_replace(&l, 9, 0); + + /* Take copy of ort for continuations, if on start of multi line record */ + if (flag == 1) strcpy(ort, local.ort); + + /* Copy rest of the fields verbatim */ + local.strasse = advance_and_replace(&l, 9, 0); + local.hnr = advance_and_replace(&l, 9, 0); + local.name = advance_and_replace(&l, 9, 0); + local.vorname = advance_and_replace(&l, 9, 0); + local.nummer = advance_and_replace(&l, 9, 0); + local.plz = advance_and_replace(&l, 9, ':'); + advance_and_replace(&l, 10, 0); + + orig_strasse = local.strasse; +rescan: + + if (search_and_verify(&local, sort_by_voshnvn, 1) || + search_and_verify(&local, sort_by_voshnv, 1) || + search_and_verify(&local, sort_by_vosh, 0) || + search_and_verify(&local, sort_by_vos, 0)) { + fputs(fixed ? 
local.strasse : orig_strasse, streetfile_out); + fputc(10, streetfile_out); + continue; + } + + /* If we can't find the street, it might be due to an incorrect vorwahl, try to fix it up */ + if (strcmp(local.vorwahl, vorwahl_block)) { + local.vorwahl = vorwahl_block; + goto rescan; + } + + /* If we do have vorwahl + ort + strasse + hnr, but no match, street might have changed name */ + if (*local.vorwahl && *local.ort && *local.strasse && *local.hnr && *local.name) { + entry_t *found = bsearch_first(&local, g_book_by_name, g_book_size, sizeof(entry_t), sort_by_vonvh); + if (found) + fprintf(bfile, "%s\t%s\t%s\t%s\t%s\n", local.vorwahl, local.ort, local.strasse, found->strasse, local.hnr); + } + + /* See if we can find and correct the street name */ + if (!once++) { + found_brute = bsearch(&local, brutes, brutes_count, sizeof(brute_t), search_brute); + if (found_brute && ( (found_brute->similarity >= 70) || (found_brute->count >= 20)) ) { + fixed = found_brute->similarity >= 80; + local.strasse = found_brute->strasse_1995; + goto rescan; + } + } + + /* If nothing works, see if the whole village has only one zip, else just print an empty line */ + if (!search_and_verify(&local, sort_by_vo, 2)) + putchar(10); + fputs(orig_strasse, streetfile_out); + fputc(10, streetfile_out); + } +} + +static char * advance_and_replace(char **p, char find, char replace) { + char *copy = *p; + *p = strchr(*p, find); + **p = replace; + ++*p; + return copy; +} + +static void *bsearch_first( const void * const key, const void * base, const size_t nel, const size_t width, int (*compar) (const void *, const void *)) { + size_t interval = nel; + const void * first = base; + int cmp; + + while (interval) { + uint8_t *lookat = ((uint8_t*)base) + width * ( interval / 2 ); + cmp = compar(key, (void*)lookat); + if(cmp == 0 && ((base == first) || compar(key, (void*)(lookat-width)))) + return lookat; + if(cmp > 0) { + base = lookat + width; + interval--; + } + interval /= 2; + } + + return 0; +} + 
+/* For if we have vorwahl and ort and strasse */ +static int sort_by_voshnvn(const void *a, const void *b) { + int res; + entry_t *ea = (entry_t *)a; + entry_t *eb = (entry_t *)b; + + if ((res = strcmp(ea->vorwahl, eb->vorwahl))) return res; + if ((res = strcmp(ea->ort, eb->ort ))) return res; + if ((res = strcmp(ea->strasse, eb->strasse))) return res; + if ((res = strcmp(ea->hnr, eb->hnr ))) return res; + if ((res = strcmp(ea->name, eb->name ))) return res; + if ((res = strcmp(ea->vorname, eb->vorname))) return res; + if ((res = strcmp(ea->nummer, eb->nummer ))) return res; + return 0; +} + +/* more relaxed, if rufnummer missmatches */ +static int sort_by_voshnv(const void *a, const void *b) { + int res; + entry_t *ea = (entry_t *)a; + entry_t *eb = (entry_t *)b; + + if ((res = strcmp(ea->vorwahl, eb->vorwahl))) return res; + if ((res = strcmp(ea->ort, eb->ort ))) return res; + if ((res = strcmp(ea->strasse, eb->strasse))) return res; + if ((res = strcmp(ea->hnr, eb->hnr ))) return res; + if ((res = strcmp(ea->name, eb->name ))) return res; + if ((res = strcmp(ea->vorname, eb->vorname))) return res; + return 0; +} + +/* more relaxed, if rufnummer missmatches */ +static int sort_by_vosh(const void *a, const void *b) { + int res; + entry_t *ea = (entry_t *)a; + entry_t *eb = (entry_t *)b; + + if ((res = strcmp(ea->vorwahl, eb->vorwahl))) return res; + if ((res = strcmp(ea->ort, eb->ort ))) return res; + if ((res = strcmp(ea->strasse, eb->strasse))) return res; + if ((res = strcmp(ea->hnr, eb->hnr ))) return res; + return 0; +} + +/* more relaxed, if rufnummer missmatches */ +static int sort_by_vos(const void *a, const void *b) { + int res; + entry_t *ea = (entry_t *)a; + entry_t *eb = (entry_t *)b; + + if ((res = strcmp(ea->vorwahl, eb->vorwahl))) return res; + if ((res = strcmp(ea->ort, eb->ort ))) return res; + if ((res = strcmp(ea->strasse, eb->strasse))) return res; + return 0; +} + +/* last resort, check if the whole vorwahl+ort set matches a single zip */ 
+static int sort_by_vo(const void *a, const void *b) { + /* compares vorwahl first, then ort; all other fields are ignored */ + int res; + entry_t *ea = (entry_t *)a; + entry_t *eb = (entry_t *)b; + + if ((res = strcmp(ea->vorwahl, eb->vorwahl))) return res; + if ((res = strcmp(ea->ort, eb->ort ))) return res; + return 0; +} + +/* For brute forcing by name if we can't find strasse: orders by vorwahl, ort, name, vorname, hnr */ +static int sort_by_vonvh(const void *a, const void *b) { + int res; + entry_t *ea = (entry_t *)a; + entry_t *eb = (entry_t *)b; + + if ((res = strcmp(ea->vorwahl, eb->vorwahl))) return res; + if ((res = strcmp(ea->ort, eb->ort ))) return res; + if ((res = strcmp(ea->name, eb->name ))) return res; + if ((res = strcmp(ea->vorname, eb->vorname))) return res; + if ((res = strcmp(ea->hnr, eb->hnr ))) return res; + return 0; +} + +/* Ordering of brute_t records: vorwahl, ort, strasse_1992 */ +static int sort_brutes(const void *a, const void *b) { + int res; + brute_t *ea = (brute_t *)a; + brute_t *eb = (brute_t *)b; + + if ((res = strcmp(ea->vorwahl, eb->vorwahl ))) return res; + if ((res = strcmp(ea->ort, eb->ort ))) return res; + if ((res = strcmp(ea->strasse_1992, eb->strasse_1992))) return res; + return 0; +} + +/* Search comparator over brute_t records: a is an entry_t key, b a brute_t element; the key's strasse is matched against strasse_1992 */ +static int search_brute(const void *a, const void *b) { + int res; + entry_t *ea = (entry_t*)a; + brute_t *eb = (brute_t *)b; + + if ((res = strcmp(ea->vorwahl, eb->vorwahl ))) return res; + if ((res = strcmp(ea->ort, eb->ort ))) return res; + if ((res = strcmp(ea->strasse, eb->strasse_1992))) return res; + return 0; +} + +/* If zip codes end in dots, always choose the one with more info. Returns 0 if neither plz contains a dot or if the two differ before the earlier dot. Otherwise returns 1 after pointing the plz that has a dot (or whose dot comes first) at the other entry's plz; nothing is reassigned when both dots sit at the same offset. */ +static int test_dot(entry_t * iter, entry_t * found) { + char * dot_iter = strchr(iter->plz, '.'); + char * dot_found = strchr(found->plz, '.'); + int di = 64, df = 64; // magic value large enough so that the MIN() always chooses the other one + + if (!dot_iter && !dot_found) return 0; + + if (dot_iter) di = dot_iter - iter->plz; + if (dot_found) df = dot_found - found->plz; + + if (memcmp(iter->plz, found->plz, ((di < df) ? 
di : df))) return 0; + + if (dot_iter && !dot_found) iter->plz = found->plz; + if (!dot_iter && dot_found) found->plz = iter->plz; + if (dot_iter && dot_found && di > df) found->plz = iter->plz; + if (dot_iter && dot_found && di < df) iter->plz = found->plz; + + return 1; +} + +/* Walks the entries behind found for as long as compar(candidate, entry) is 0. Returns 0 as soon as one of them carries a plz that differs from found->plz and test_dot() can not reconcile, otherwise found. NOTE(review): the continue after test_dot() skips ++test_iter, so the same entry is compared again; should test_dot() return 1 without reassigning either plz, this loop would not advance - confirm that can not happen with real data. */ +static entry_t * verify_unique_zip(entry_t *found, entry_t *candidate, int (*compar) (const void *, const void *)) { + entry_t *test_iter = found + 1; + entry_t *end = g_book + g_book_size; + + while (test_iter < end && !compar((void*)candidate, (void*)test_iter)) { + if (strcmp(test_iter->plz, found->plz)) { + if (test_dot(test_iter, found)) + continue; + return 0; + } + ++test_iter; + } + + return found; +} + +/* Falls back to verify_unique_zip() when g_zipmap is empty. Otherwise looks up the string candidate->plz, tab, found->plz in g_zipmap and returns found on a hit. Failing that it walks the entries behind found that still match candidate and returns the first one whose plz forms a known pair, or found if there is none. NOTE(review): pair holds 32 bytes and is filled with strcpy/strcat - presumably two plz strings always fit; verify. The continue after test_dot() skips ++test_iter here as well. */ +static entry_t * verify_zip(entry_t *found, entry_t *candidate, int (*compar) (const void *, const void *)) { + entry_t *test_iter = found + 1; + entry_t *end = g_book + g_book_size; + char pair[32]; + + if (!g_zipmap_size) + return verify_unique_zip(found, candidate, compar); + + /* Do we know about the oldzip-newzip mapping? 
*/ + strcpy(pair, candidate->plz); strcat(pair,"\t"); strcat(pair, found->plz); + if (bsearch(pair, g_zipmap, g_zipmap_size, sizeof(char*), cc2)) + return found; + + while ((test_iter < end) && !compar((void*)candidate, (void*)test_iter)) { + if (strcmp(test_iter->plz, found->plz)) { + if (test_dot(test_iter, found)) + continue; + strcpy(pair, candidate->plz); strcat(pair,"\t"); strcat(pair, test_iter->plz); + if (bsearch(pair, g_zipmap, g_zipmap_size, sizeof(char*), cc2)) { + // printf ("FIXED %s -> %s: ", found->plz, test_iter->plz); + return test_iter; + } + } + ++test_iter; + } + + return found; +} + +/* Looks candidate up in g_book with bsearch_first() under compar and checks the plz of the hit: flag 2 uses verify_unique_zip(), any other flag verify_zip(). On success the resulting plz is written to stdout with puts() and 1 is returned; flag 1 additionally writes candidate plz and found plz to g_mapfile_out. Returns 0 if nothing was found or the check failed. */ +static int search_and_verify(entry_t *candidate, int (*compar) (const void *, const void *), int flag) { + entry_t *found = bsearch_first(candidate, g_book, g_book_size, sizeof(entry_t), compar); + + if (!found) return 0; + + if (flag == 2) + found = verify_unique_zip(found, candidate, compar); + else + found = verify_zip(found, candidate, compar); + + if (!found) + return 0; + + if (flag == 1) + fprintf(g_mapfile_out, "%s\t%s\n", candidate->plz, found->plz); + + /* Output plz */ + puts(found->plz); + + return 1; +} + diff --git a/src/postprocess/map_plz.h b/src/postprocess/map_plz.h new file mode 100644 index 0000000..8839b5c --- /dev/null +++ b/src/postprocess/map_plz.h @@ -0,0 +1,39 @@ +#pragma once + +/* One phone book record; main() fills the fields with pointers into the mapped phone book or into the current input line */ +typedef struct { + const char * vorwahl; + const char * ort; + const char * strasse; + const char * hnr; + const char * name; + const char * vorname; + const char * nummer; + const char * plz; +} entry_t; + +/* One street rename candidate (strasse_1992 -> strasse_1995 within a vorwahl/ort) with the similarity and count values read from the brute map */ +typedef struct { + const char * vorwahl; + const char * ort; + const char * strasse_1992; + const char * strasse_1995; + int similarity; + int count; +} brute_t; + +static void *bsearch_first( const void * const key, const void * base, const size_t nel, const size_t width, int (*compar) (const void *, const void *)); +static int sort_by_voshnvn(const void *a, const void *b); +static int sort_by_voshnv(const void *a, const void *b); +static int 
sort_by_vosh(const void *a, const void *b); +static int sort_by_vos(const void *a, const void *b); +static int sort_by_vonvh(const void *a, const void *b); +static int sort_by_vo(const void *a, const void *b); +static int sort_brutes(const void *a, const void *b); +static int search_brute(const void *a, const void *b); +static int test_dot(entry_t * iter, entry_t * found); + +/* cc compares two char* elements; cc2 compares a plain string key against a char* element */ +static int cc(const void *a, const void *b) { return strcmp(*(char**)a, *(char**)b); } +static int cc2(const void *a, const void *b) { return strcmp((char*)a, *(char**)b); } +static entry_t * verify_unique_zip(entry_t *found, entry_t *candidate, int (*compar) (const void *, const void *)); +static entry_t * verify_zip(entry_t *found, entry_t *candidate, int (*compar) (const void *, const void *)); +static int search_and_verify(entry_t *candidate, int (*compar) (const void *, const void *), int flag); +static char * advance_and_replace(char **p, char find, char replace); diff --git a/src/postprocess/postprocess-1992.sh b/src/postprocess/postprocess-1992.sh new file mode 100644 index 0000000..1e685d2 --- /dev/null +++ b/src/postprocess/postprocess-1992.sh @@ -0,0 +1,32 @@ +# Generate file with all relevant columns from 1992 +paste 1992_Q2/{01_Flags,12_Vorwahl,12_Vorwahl_block,11_Ort,07_Strasse,08_Hausnummer,02_Nachname,03_Vorname,13_Rufnummer,10_Postleitzahl_West,10_Zustellamt_PLZOst} > 1992-fvvoshnvrpp.txt + +# Generate lookup file from 1995 +paste 1995_Q0/{12_Vorwahl,11_Ort,07_Strasse,08_Hausnummer,02_Nachname,03_Vorname,13_Rufnummer,10_Postleitzahl} | tr '\n\t' '\0' > 1995-voshnvrp.bin + +# To debug in lldb +process launch -i 1992_testfile.txt -- 1995-vorwahl-ort-strasse-hnr-name-vorname-rufnummer-plz.bin + +# Compile plz mapper +cc -O3 -o map_plz map_plz.c -I ../src/export/ ../src/export/mystdlib.c + +# outputs mapped plz, generates brutemap.txt +touch brutemap_input.bin zip_simple_map.bin +./map_plz 1995-voshnvrp.bin zip_simple_map.bin brutemap_input.bin < 1992-fvvoshnvrpp.txt > 
10_Postleitzahl + +# generate street name translation table from brutemap, +# only taking into account similar street names +# cut -f 3,4 brutemap.txt | tr '[:upper:]' '[:lower:]' | paste brutemap.txt - | cut -f 1-4,6,7 | ./jaro | cut -f 1-5 > brutemap_filtered.txt + +# generate street name translation table from brutemap, +# only taking into account similar street names, new style +cut -f 3,4 brutemap.txt | python simi.py | paste - brutemap.txt > brutemap_simifiltered.txt + +# Sort and prepare similarity filtered files for the merge +cut -f 1-5 brutemap_simifiltered.txt | sort | uniq -c | sed -E $'s:^ *([[:digit:]]+) :\\1\t:' | tr '\n\t' '\0' > brutemap_input.bin + +# compile zipmap into a binary format +sort -u zip_mapfile.txt | tr '\n' '\0' > zip_simple_map.bin + +# Redo the mapping with the data from brutemap and zipmap +./map_plz 1995-voshnvrp.bin zip_simple_map.bin brutemap_input.bin < 1992-fvvoshnvrpp.txt > 10_Postleitzahl diff --git a/src/postprocess/simi.py b/src/postprocess/simi.py new file mode 100755 index 0000000..62ff1ff --- /dev/null +++ b/src/postprocess/simi.py @@ -0,0 +1,11 @@ +#!python + +import textdistance +from sys import stdin + +for line in stdin.readlines(): + x,y = line.split('\t') + x = x.casefold() + y = y.casefold() + v = textdistance.ratcliff_obershelp.normalized_similarity(x,y) + textdistance.jaro_winkler.normalized_similarity(x,y) + textdistance.cosine.normalized_similarity(x,y) + print (int(100*(v/3))) -- cgit v1.2.3