/*
 * Maps entries read from stdin onto a reference phone book in order to
 * look up the matching zip code (plz) for every input line.
 *
 * Usage: <prog> <phonebook> <zipmap> <brutemap>
 *
 * Inputs
 *   phonebook  NUL-separated fields, 8 per record:
 *              vorwahl, ort, strasse, hnr, name, vorname, nummer, plz
 *   zipmap     NUL-separated "oldzip\tnewzip" strings
 *   brutemap   NUL-separated fields, 6 per record:
 *              count, similarity, vorwahl, ort, strasse_1992, strasse_1995
 *   stdin      one record per line: hex flags, then tab-separated fields
 *
 * Outputs
 *   stdout             one line per input line: the zip found, or empty
 *   zip_mapfile.txt    "input plz\tfound plz" for strict matches
 *   07_Strasse_fixed   one street name per input line
 *   brutemap.txt       candidates for streets that may have been renamed
 *
 * entry_t, brute_t, MAP, map_file(), cc() and cc2() are not defined in this
 * file; presumably they come from mystdlib.h / map_plz.h -- TODO confirm.
 */
#include "mystdlib.h"

/* NOTE(review): the system header names were unreadable in the reviewed
 * copy of this file; these are the ones the code below actually needs. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "map_plz.h"

enum {
    LINE_BUFFER_SIZE = 65536,  /* size of the stdin line buffer */
    ZIPMAP_CAPACITY  = 32000,  /* max number of zip map strings */
    BRUTES_CAPACITY  = 200000, /* max number of brute records */
    BOOK_COLUMNS     = 8,      /* NUL-terminated fields per phone book record */
    ZIP_PAIR_SIZE    = 32      /* buffer for an "oldzip\tnewzip" lookup key */
};

static entry_t *g_book, *g_book_by_name;
static size_t g_book_size, g_zipmap_size;
static const char *g_zipmap[ZIPMAP_CAPACITY];
static FILE *g_mapfile_out;

static char *advance_and_replace(char **p, char find, char replace);
static void *bsearch_first(const void * const key, const void * base,
                           const size_t nel, const size_t width,
                           int (*compar) (const void *, const void *));
static int sort_by_voshnvn(const void *a, const void *b);
static int sort_by_voshnv(const void *a, const void *b);
static int sort_by_vosh(const void *a, const void *b);
static int sort_by_vos(const void *a, const void *b);
static int sort_by_vo(const void *a, const void *b);
static int sort_by_vonvh(const void *a, const void *b);
static int sort_brutes(const void *a, const void *b);
static int search_brute(const void *a, const void *b);
static int search_and_verify(entry_t *candidate,
                             int (*compar) (const void *, const void *),
                             int flag);

/* Print a message to stderr and terminate with a failure status. */
static void die(const char *message)
{
    fprintf(stderr, "%s\n", message);
    exit(1);
}

/* Allocate an array of nel elements of the given width, or terminate. */
static void *xmalloc_array(size_t nel, size_t width)
{
    void *mem;

    if (width && nel > SIZE_MAX / width)
        die("allocation size overflow");
    /* malloc(0) may legitimately return NULL, so always ask for a byte */
    mem = malloc(nel * width ? nel * width : 1);
    if (!mem)
        die("out of memory");
    return mem;
}

/* Open a file, or report the failing path and terminate. */
static FILE *xfopen(const char *path, const char *mode)
{
    FILE *stream = fopen(path, mode);

    if (!stream) {
        perror(path);
        exit(1);
    }
    return stream;
}

int main(int argc, char **args)
{
    MAP tbuch, zipmap, brutemap;
    FILE *bfile, *streetfile_out;
    char *ptr, *input, *ort;
    char vorwahl_block[16] = "";
    size_t i, brutes_count = 0;
    int report = 0, close_failed;
    brute_t *brutes;

    /* prepare io */
    if (argc != 4) {
        fprintf(stderr, "Usage: %s <phonebook> <zipmap> <brutemap>\n",
                argc > 0 ? args[0] : "map_plz");
        exit(1);
    }

    input = xmalloc_array(LINE_BUFFER_SIZE, 1);
    ort = xmalloc_array(LINE_BUFFER_SIZE, 1);
    ort[0] = 0; /* a leading continuation line must not read garbage */
    brutes = xmalloc_array(BRUTES_CAPACITY, sizeof(brute_t));

    /* NOTE(review): how map_file() reports failure is not visible here */
    tbuch = map_file(args[1], 1);
    zipmap = map_file(args[2], 1);
    brutemap = map_file(args[3], 1);

    /* Collect a pointer to the start of every NUL-terminated zip map string */
    g_zipmap[0] = (char*)zipmap->addr;
    for (i = 0; i < zipmap->size; ++i) {
        if (zipmap->addr[i])
            continue;
        if (g_zipmap_size + 1 >= ZIPMAP_CAPACITY)
            die("zip map has too many entries");
        g_zipmap[++g_zipmap_size] = (char*)(zipmap->addr + i + 1);
    }
    qsort(g_zipmap, g_zipmap_size, sizeof(char*), cc);

    /* Split brute records, keeping only those that pass a threshold */
    ptr = (char*)brutemap->addr;
    while (ptr < (char*)brutemap->addr + brutemap->size) {
        /* The slot is filled before we know whether the record is kept */
        if (brutes_count >= BRUTES_CAPACITY)
            die("brute map has too many entries");

        brutes[brutes_count].count = atol(ptr);     ptr += strlen(ptr) + 1;
        brutes[brutes_count].similarity = atol(ptr); ptr += strlen(ptr) + 1;
        brutes[brutes_count].vorwahl = ptr;         ptr += strlen(ptr) + 1;
        brutes[brutes_count].ort = ptr;             ptr += strlen(ptr) + 1;
        brutes[brutes_count].strasse_1992 = ptr;    ptr += strlen(ptr) + 1;
        brutes[brutes_count].strasse_1995 = ptr;    ptr += strlen(ptr) + 1;

        if (brutes[brutes_count].count >= 20 ||
            brutes[brutes_count].similarity >= 70)
            brutes_count++;
    }
    qsort(brutes, brutes_count, sizeof(brute_t), sort_brutes);

    /* Count phone book fields; every field ends in a NUL byte */
    for (i = 0; i < tbuch->size; ++i)
        if (!tbuch->addr[i])
            ++g_book_size;

    /* We expect 8 columns per line */
    g_book_size /= BOOK_COLUMNS;

    g_book = xmalloc_array(g_book_size, sizeof(entry_t));
    g_book_by_name = xmalloc_array(g_book_size, sizeof(entry_t));

    /* Split pointers into input files into our arrays */
    for (i = 0, ptr = (char*)tbuch->addr; i < g_book_size; ++i) {
        g_book[i].vorwahl = ptr; ptr += strlen(ptr) + 1;
        g_book[i].ort = ptr;     ptr += strlen(ptr) + 1;
        g_book[i].strasse = ptr; ptr += strlen(ptr) + 1;
        g_book[i].hnr = ptr;     ptr += strlen(ptr) + 1;
        g_book[i].name = ptr;    ptr += strlen(ptr) + 1;
        g_book[i].vorname = ptr; ptr += strlen(ptr) + 1;
        g_book[i].nummer = ptr;  ptr += strlen(ptr) + 1;
        g_book[i].plz = ptr;     ptr += strlen(ptr) + 1;
    }

    /* Copy input array so that it can be sorted by different criteria */
    memcpy(g_book_by_name, g_book, g_book_size * sizeof(entry_t));
    fprintf(stderr, "STEP 1: import done\n");

    /* Sort the whole thing */
    qsort(g_book, g_book_size, sizeof(entry_t), sort_by_voshnvn);
    qsort(g_book_by_name, g_book_size, sizeof(entry_t), sort_by_vonvh);
    fprintf(stderr, "STEP 2: sort done\n");

    g_mapfile_out = xfopen("zip_mapfile.txt", "w");
    streetfile_out = xfopen("07_Strasse_fixed", "w");
    bfile = xfopen("brutemap.txt", "w");

    while (1) {
        char flag, *t, *l = fgets(input, LINE_BUFFER_SIZE, stdin);
        const char *orig_strasse;
        entry_t local;
        brute_t *found_brute;
        int once = 0, fixed = 0;

        if (!l)
            break;

        if ((report++ % 300000) == 0)
            fprintf(stderr, "% 10d lines done\n", report);

        /* Scan and skip flags, plus the separator that follows them */
        flag = strtoul(l, &l, 16);
        if (*l)
            ++l;

        /* Copy vorwahl, if in field */
        local.vorwahl = advance_and_replace(&l, 9, 0);

        /* Copy over vorwahl for whole exported block, if we're not in cont */
        t = advance_and_replace(&l, 9, 0);
        if (flag < 2)
            snprintf(vorwahl_block, sizeof vorwahl_block, "%s", t);

        /* Only copy over ort from continuations, if present */
        if (flag < 2 || (*l != 9))
            local.ort = l;
        else
            local.ort = ort;
        advance_and_replace(&l, 9, 0);

        /* Take copy of ort for continuations, if on start of multi line record */
        if (flag == 1)
            snprintf(ort, LINE_BUFFER_SIZE, "%s", local.ort);

        /* Copy rest of the fields verbatim */
        local.strasse = advance_and_replace(&l, 9, 0);
        local.hnr = advance_and_replace(&l, 9, 0);
        local.name = advance_and_replace(&l, 9, 0);
        local.vorname = advance_and_replace(&l, 9, 0);
        local.nummer = advance_and_replace(&l, 9, 0);
        /* The tab after plz becomes ':' so plz runs on to the end of line */
        local.plz = advance_and_replace(&l, 9, ':');
        advance_and_replace(&l, 10, 0);

        orig_strasse = local.strasse;

rescan:
        /* Try the strictest comparison first, then progressively relax */
        if (search_and_verify(&local, sort_by_voshnvn, 1) ||
            search_and_verify(&local, sort_by_voshnv, 1) ||
            search_and_verify(&local, sort_by_vosh, 0) ||
            search_and_verify(&local, sort_by_vos, 0)) {
            fputs(fixed ? local.strasse : orig_strasse, streetfile_out);
            fputc(10, streetfile_out);
            continue;
        }

        /* If we can't find the street, it might be due to an incorrect
           vorwahl, try to fix it up */
        if (strcmp(local.vorwahl, vorwahl_block)) {
            local.vorwahl = vorwahl_block;
            goto rescan;
        }

        /* If we do have vorwahl + ort + strasse + hnr, but no match,
           street might have changed name */
        if (*local.vorwahl && *local.ort && *local.strasse &&
            *local.hnr && *local.name) {
            entry_t *found = bsearch_first(&local, g_book_by_name, g_book_size,
                                           sizeof(entry_t), sort_by_vonvh);
            if (found)
                fprintf(bfile, "%s\t%s\t%s\t%s\t%s\n", local.vorwahl,
                        local.ort, local.strasse, found->strasse, local.hnr);
        }

        /* See if we can find and correct the street name (once per line) */
        if (!once++) {
            found_brute = bsearch(&local, brutes, brutes_count,
                                  sizeof(brute_t), search_brute);
            if (found_brute && ((found_brute->similarity >= 70) ||
                                (found_brute->count >= 20))) {
                fixed = found_brute->similarity >= 80;
                local.strasse = found_brute->strasse_1995;
                goto rescan;
            }
        }

        /* If nothing works, see if the whole village has only one zip,
           else just print an empty line */
        if (!search_and_verify(&local, sort_by_vo, 2))
            putchar(10);
        fputs(orig_strasse, streetfile_out);
        fputc(10, streetfile_out);
    }

    /* fclose flushes, so a failure here means lost output */
    close_failed = fclose(g_mapfile_out) != 0;
    close_failed |= fclose(streetfile_out) != 0;
    close_failed |= fclose(bfile) != 0;
    if (close_failed)
        die("failed to write output files");

    free(brutes);
    free(ort);
    free(input);
    return 0;
}

/*
 * Cut the next field off the string at *p.
 *
 * The first occurrence of `find` is overwritten with `replace` and *p is
 * moved just behind it. If `find` does not occur, nothing is written and *p
 * is left on the terminating NUL, so later calls yield empty fields.
 *
 * Returns the start of the field, i.e. the old value of *p.
 */
static char *advance_and_replace(char **p, char find, char replace)
{
    char *copy = *p;
    char *hit = strchr(copy, find);

    if (!hit) {
        /* e.g. a truncated record or a last line without newline */
        *p = copy + strlen(copy);
        return copy;
    }

    *hit = replace;
    *p = hit + 1;
    return copy;
}

/*
 * Binary search like bsearch(), but returns the FIRST of several elements
 * that compare equal to key, or NULL if there is none.
 *
 * base must point to nel elements of width bytes, sorted consistently
 * with compar.
 */
static void *bsearch_first(const void * const key, const void * base,
                           const size_t nel, const size_t width,
                           int (*compar) (const void *, const void *))
{
    const uint8_t *first = base;
    const uint8_t *low = base;
    size_t interval = nel;

    while (interval) {
        const uint8_t *lookat = low + width * (interval / 2);
        int cmp = compar(key, lookat);

        /* A hit only counts if it has no equal predecessor. The very first
           array element has no predecessor that could be inspected. */
        if (cmp == 0 && (lookat == first || compar(key, lookat - width)))
            return (void *)lookat;

        if (cmp > 0) {
            /* continue in the part right of lookat */
            low = lookat + width;
            interval--;
        }
        /* on cmp <= 0 continue left of lookat, where the first hit must be */
        interval /= 2;
    }
    return NULL;
}

/* Strictest order: vorwahl, ort, strasse, hnr, name, vorname, nummer */
static int sort_by_voshnvn(const void *a, const void *b)
{
    const entry_t *ea = a;
    const entry_t *eb = b;
    int res = sort_by_voshnv(a, b);

    if (res)
        return res;
    return strcmp(ea->nummer, eb->nummer);
}

/* More relaxed, if nummer mismatches: vorwahl, ort, strasse, hnr, name, vorname */
static int sort_by_voshnv(const void *a, const void *b)
{
    const entry_t *ea = a;
    const entry_t *eb = b;
    int res = sort_by_vosh(a, b);

    if (res)
        return res;
    if ((res = strcmp(ea->name, eb->name)))
        return res;
    return strcmp(ea->vorname, eb->vorname);
}

/* More relaxed, ignoring who lives there: vorwahl, ort, strasse, hnr */
static int sort_by_vosh(const void *a, const void *b)
{
    const entry_t *ea = a;
    const entry_t *eb = b;
    int res = sort_by_vos(a, b);

    if (res)
        return res;
    return strcmp(ea->hnr, eb->hnr);
}

/* More relaxed, ignoring the house number: vorwahl, ort, strasse */
static int sort_by_vos(const void *a, const void *b)
{
    const entry_t *ea = a;
    const entry_t *eb = b;
    int res = sort_by_vo(a, b);

    if (res)
        return res;
    return strcmp(ea->strasse, eb->strasse);
}

/* Last resort, check if the whole vorwahl+ort set matches a single zip */
static int sort_by_vo(const void *a, const void *b)
{
    const entry_t *ea = a;
    const entry_t *eb = b;
    int res = strcmp(ea->vorwahl, eb->vorwahl);

    if (res)
        return res;
    return strcmp(ea->ort, eb->ort);
}

/* For brute forcing name if we can't find strasse:
   vorwahl, ort, name, vorname, hnr */
static int sort_by_vonvh(const void *a, const void *b)
{
    const entry_t *ea = a;
    const entry_t *eb = b;
    int res = sort_by_vo(a, b);

    if (res)
        return res;
    if ((res = strcmp(ea->name, eb->name)))
        return res;
    if ((res = strcmp(ea->vorname, eb->vorname)))
        return res;
    return strcmp(ea->hnr, eb->hnr);
}

/* Order brute records by vorwahl, ort, strasse_1992 */
static int sort_brutes(const void *a, const void *b)
{
    const brute_t *ea = a;
    const brute_t *eb = b;
    int res;

    if ((res = strcmp(ea->vorwahl, eb->vorwahl)))
        return res;
    if ((res = strcmp(ea->ort, eb->ort)))
        return res;
    return strcmp(ea->strasse_1992, eb->strasse_1992);
}

/* bsearch() comparator: key a is an entry_t, element b is a brute_t.
   Must agree with the order established by sort_brutes(). */
static int search_brute(const void *a, const void *b)
{
    const entry_t *ea = a;
    const brute_t *eb = b;
    int res;

    if ((res = strcmp(ea->vorwahl, eb->vorwahl)))
        return res;
    if ((res = strcmp(ea->ort, eb->ort)))
        return res;
    return strcmp(ea->strasse, eb->strasse_1992);
}

/*
 * If zip codes end in dots, always chose the one with more info.
 *
 * Returns 0 if neither plz contains a dot or if the two disagree in front
 * of the first dot. Otherwise returns 1, after pointing the plz with the
 * earlier dot (or the only one with a dot) at the other entry's plz.
 * When both dots sit at the same offset nothing is modified.
 */
static int test_dot(entry_t * iter, entry_t * found)
{
    const char *dot_iter = strchr(iter->plz, '.');
    const char *dot_found = strchr(found->plz, '.');
    /* magic value large enough so that the minimum always is the other one */
    int di = 64, df = 64;

    if (!dot_iter && !dot_found)
        return 0;

    if (dot_iter)
        di = (int)(dot_iter - iter->plz);
    if (dot_found)
        df = (int)(dot_found - found->plz);

    /* Both must agree on every digit in front of the earlier dot */
    if (memcmp(iter->plz, found->plz, (size_t)((di < df) ? di : df)))
        return 0;

    if (dot_iter && !dot_found)
        iter->plz = found->plz;
    if (!dot_iter && dot_found)
        found->plz = iter->plz;
    if (dot_iter && dot_found && di > df)
        found->plz = iter->plz;
    if (dot_iter && dot_found && di < df)
        iter->plz = found->plz;

    return 1;
}

/*
 * Check that all g_book entries following `found` that still compare equal
 * to candidate carry the same plz as found (modulo test_dot()).
 *
 * Returns found if so, NULL as soon as a differing plz shows up.
 */
static entry_t * verify_unique_zip(entry_t *found, entry_t *candidate,
                                   int (*compar) (const void *, const void *))
{
    entry_t *test_iter = found + 1;
    entry_t *end = g_book + g_book_size;

    while (test_iter < end && !compar((void*)candidate, (void*)test_iter)) {
        /* Always advance: test_dot() may accept a pair without changing
           either plz, so re-testing the same entry could spin forever */
        if (strcmp(test_iter->plz, found->plz) && !test_dot(test_iter, found))
            return NULL;
        ++test_iter;
    }

    return found;
}

/* Returns non-zero if "old_plz\tnew_plz" is listed in the zip map.
   Keys that do not fit the lookup buffer are treated as unknown. */
static int zipmap_has_pair(const char *old_plz, const char *new_plz)
{
    char pair[ZIP_PAIR_SIZE];
    int len = snprintf(pair, sizeof pair, "%s\t%s", old_plz, new_plz);

    if (len < 0 || (size_t)len >= sizeof pair)
        return 0;
    return bsearch(pair, g_zipmap, g_zipmap_size, sizeof(char*), cc2) != NULL;
}

/*
 * Pick the entry whose plz is a known successor of candidate->plz.
 *
 * Without a zip map this is verify_unique_zip(). Otherwise `found` is
 * returned if its pair is in the map; failing that, the first following
 * entry that compares equal to candidate and has a mapped plz; failing
 * that, `found` itself.
 */
static entry_t * verify_zip(entry_t *found, entry_t *candidate,
                            int (*compar) (const void *, const void *))
{
    entry_t *test_iter = found + 1;
    entry_t *end = g_book + g_book_size;

    if (!g_zipmap_size)
        return verify_unique_zip(found, candidate, compar);

    /* Do we know about the oldzip-newzip mapping? */
    if (zipmap_has_pair(candidate->plz, found->plz))
        return found;

    while ((test_iter < end) && !compar((void*)candidate, (void*)test_iter)) {
        /* Entries with the same (or dot-compatible) plz cannot do better
           than found, so only look up genuinely different ones */
        if (strcmp(test_iter->plz, found->plz) &&
            !test_dot(test_iter, found) &&
            zipmap_has_pair(candidate->plz, test_iter->plz))
            return test_iter;
        ++test_iter;
    }

    return found;
}

/*
 * Look candidate up in g_book using compar and print the resulting plz.
 *
 * flag == 2  accept only if all matching entries share one plz
 * flag == 1  additionally log "candidate plz\tfound plz" to g_mapfile_out
 * otherwise  plain lookup via verify_zip()
 *
 * Returns 1 if a plz line was written to stdout, 0 if nothing was found.
 */
static int search_and_verify(entry_t *candidate,
                             int (*compar) (const void *, const void *),
                             int flag)
{
    entry_t *found = bsearch_first(candidate, g_book, g_book_size,
                                   sizeof(entry_t), compar);

    if (!found)
        return 0;

    found = (flag == 2) ? verify_unique_zip(found, candidate, compar)
                        : verify_zip(found, candidate, compar);
    if (!found)
        return 0;

    if (flag == 1)
        fprintf(g_mapfile_out, "%s\t%s\n", candidate->plz, found->plz);

    /* Output plz */
    puts(found->plz);
    return 1;
}