#include "mystdlib.h"
/* NOTE(review): five system #include lines stood here, but their header names
   are missing from this copy of the file. The four below are the ones the
   visible code needs (FILE/fputs, malloc/qsort/bsearch, strcmp/strchr,
   uint8_t). The fifth could not be recovered - restore it from the original
   source. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "map_plz.h"

/* Phone book entries; g_book is the array searched by search_and_verify(). */
static entry_t *g_book, *g_book_by_name;
/* Number of entries in g_book, and number of sorted entries in g_zipmap. */
static size_t g_book_size, g_zipmap_size;
/* Pointers to the NUL-terminated records of the zip map file. */
static const char * g_zipmap[32000];
static FILE *g_mapfile_out;

/*
 * Program entry point. Expects exactly three file arguments, which are mapped
 * with map_file(): args[1] the phone book, args[2] the zip map and args[3] the
 * brute-force street table. Exits with status 1 on a wrong argument count,
 * on allocation failure, or when an input exceeds a fixed table capacity.
 */
int main(int argc, char **args) {
  enum { MAX_BRUTES = 200000 };
  const size_t zipmap_cap = sizeof g_zipmap / sizeof g_zipmap[0];
  MAP tbuch, zipmap, brutemap;
  FILE *bfile, *streetfile_out;
  char *ptr, *input = malloc(65335);
  char *ort = malloc(65335), vorwahl_block[16];
  unsigned int i, brutes_count = 0, report = 0;
  brute_t *brutes = malloc(MAX_BRUTES*sizeof(brute_t));

  /* prepare io */
  if (argc != 4) exit(1);
  if (!input || !ort || !brutes) {
    fputs("out of memory\n", stderr);
    exit(1);
  }
  tbuch = map_file(args[1], 1);
  zipmap = map_file(args[2], 1);
  brutemap = map_file(args[3], 1);

  /* read all files */
  /* Every NUL byte in the zip map ends one record; remember where each
     record starts. Slot g_zipmap_size + 1 is written, so that index has to
     stay inside the table. */
  g_zipmap[0] = (char*)zipmap->addr;
  for (i=0; i<zipmap->size; ++i)
    if (!zipmap->addr[i]) {
      if (g_zipmap_size + 1 >= zipmap_cap) {
        fputs("zip map has too many records\n", stderr);
        exit(1);
      }
      g_zipmap[++g_zipmap_size] = (char*)(zipmap->addr + i + 1);
    }
  qsort(g_zipmap, g_zipmap_size, sizeof(char*), cc);

  ptr = (char*)brutemap->addr;

  /* Split brute records: six NUL-terminated fields each. A record is only
     kept (brutes_count advanced) if it passes the count/similarity
     threshold, otherwise its slot is overwritten by the next record. */
  while (ptr < (char*)brutemap->addr + brutemap->size) {
    if (brutes_count >= MAX_BRUTES) {
      fputs("brute table has too many records\n", stderr);
      exit(1);
    }
    brutes[brutes_count].count = atol(ptr);
    ptr += strlen(ptr) + 1;
    brutes[brutes_count].similarity = atol(ptr);
    ptr += strlen(ptr) + 1;
    brutes[brutes_count].vorwahl = ptr;
    ptr += strlen(ptr) + 1;
    brutes[brutes_count].ort = ptr;
    ptr += strlen(ptr) + 1;
    brutes[brutes_count].strasse_1992 = ptr;
    ptr += strlen(ptr) + 1;
    brutes[brutes_count].strasse_1995 = ptr;
    ptr += strlen(ptr) + 1;

    if (brutes[brutes_count].count >= 20 || brutes[brutes_count].similarity >= 70)
      brutes_count++;
  }
  qsort(brutes, brutes_count, sizeof(brute_t), sort_brutes);

  /* count phonebook lines */
  for (i=0; i<tbuch->size; ++i)
    if (!tbuch->addr[i])
      ++g_book_size;
  /* We expect 8 columns per line */
  g_book_size /= 8;
  g_book = (entry_t*)malloc(g_book_size * sizeof(entry_t));
  g_book_by_name = (entry_t*)malloc(g_book_size * sizeof(entry_t));

  /* Split pointers into input files into our arrays */
  /* NOTE(review): the text of this file is damaged from here on. The rest of
     this loop header, its body, and the beginning of the lookup loop
     (including the declarations of local, once, fixed, found_brute and
     orig_strasse and the rescan label) are missing. The fragment is kept
     exactly as found and must be restored from the original source before
     this function can compile. */
  for (i=0, ptr=(char*)tbuch->addr; istrasse, local.hnr); }

    /* See if we can find and correct the street name */
    if (!once++) {
      found_brute = bsearch(&local, brutes, brutes_count, sizeof(brute_t), search_brute);
      if (found_brute && ( (found_brute->similarity >= 70) || (found_brute->count >= 20)) ) {
        fixed = found_brute->similarity >= 80;
        local.strasse = found_brute->strasse_1995;
        goto rescan;
      }
    }

    /* If nothing works, see if the whole village has only one zip, else just print an empty line */
    if (!search_and_verify(&local, sort_by_vo, 2))
      putchar(10);
    fputs(orig_strasse, streetfile_out);
    fputc(10, streetfile_out);
  }
}

/*
 * Cut the next field off the string at *p.
 *
 * Overwrites the first occurrence of `find` with `replace`, moves *p to the
 * character after it, and returns the old value of *p (the start of the
 * field). If `find` does not occur, nothing is overwritten and *p is moved to
 * the string's terminating NUL, so the remaining text is returned as the last
 * field instead of dereferencing a NULL pointer.
 */
static char * advance_and_replace(char **p, char find, char replace) {
  char *copy = *p;
  char *hit = strchr(copy, find);

  if (!hit) {
    *p = copy + strlen(copy);
    return copy;
  }
  *hit = replace;
  *p = hit + 1;
  return copy;
}

/*
 * Binary search that returns the FIRST element comparing equal to key.
 *
 * Parameters mirror bsearch(): `base` points to `nel` elements of `width`
 * bytes, sorted consistently with `compar`. Returns a pointer to the first
 * matching element, or 0 if none matches.
 */
static void *bsearch_first(
  const void * const key,
  const void * base,
  const size_t nel,
  const size_t width,
  int (*compar) (const void *, const void *))
{
  size_t interval = nel;
  const void * first = base;
  int cmp;

  while (interval) {
    uint8_t *lookat = ((uint8_t*)base) + width * ( interval / 2 );
    cmp = compar(key, (void*)lookat);

    /* A hit is the first one only if it is the very first array element or
       its predecessor does not match. Comparing lookat (not base) against
       the array start also keeps lookat - width inside the array. */
    if (cmp == 0 && (((const void*)lookat == first) || compar(key, (void*)(lookat - width))))
      return lookat;

    if (cmp > 0) {
      base = lookat + width;
      interval--;
    }
    /* cmp < 0, or a hit that is not the first one: continue in the left half */
    interval /= 2;
  }
  return 0;
}

/* Strictest order: vorwahl, ort, strasse, hnr, name, vorname, nummer */
static int sort_by_voshnvn(const void *a, const void *b) {
  const entry_t *ea = (const entry_t *)a;
  const entry_t *eb = (const entry_t *)b;
  int res;

  if ((res = strcmp(ea->vorwahl, eb->vorwahl))) return res;
  if ((res = strcmp(ea->ort,     eb->ort    ))) return res;
  if ((res = strcmp(ea->strasse, eb->strasse))) return res;
  if ((res = strcmp(ea->hnr,     eb->hnr    ))) return res;
  if ((res = strcmp(ea->name,    eb->name   ))) return res;
  if ((res = strcmp(ea->vorname, eb->vorname))) return res;
  return strcmp(ea->nummer, eb->nummer);
}

/* More relaxed, ignores nummer: vorwahl, ort, strasse, hnr, name, vorname */
static int sort_by_voshnv(const void *a, const void *b) {
  const entry_t *ea = (const entry_t *)a;
  const entry_t *eb = (const entry_t *)b;
  int res;

  if ((res = strcmp(ea->vorwahl, eb->vorwahl))) return res;
  if ((res = strcmp(ea->ort,     eb->ort    ))) return res;
  if ((res = strcmp(ea->strasse, eb->strasse))) return res;
  if ((res = strcmp(ea->hnr,     eb->hnr    ))) return res;
  if ((res = strcmp(ea->name,    eb->name   ))) return res;
  return strcmp(ea->vorname, eb->vorname);
}

/* More relaxed, also ignores name and vorname: vorwahl, ort, strasse, hnr */
static int sort_by_vosh(const void *a, const void *b) {
  const entry_t *ea = (const entry_t *)a;
  const entry_t *eb = (const entry_t *)b;
  int res;

  if ((res = strcmp(ea->vorwahl, eb->vorwahl))) return res;
  if ((res = strcmp(ea->ort,     eb->ort    ))) return res;
  if ((res = strcmp(ea->strasse, eb->strasse))) return res;
  return strcmp(ea->hnr, eb->hnr);
}

/* More relaxed, also ignores hnr: vorwahl, ort, strasse */
static int sort_by_vos(const void *a, const void *b) {
  const entry_t *ea = (const entry_t *)a;
  const entry_t *eb = (const entry_t *)b;
  int res;

  if ((res = strcmp(ea->vorwahl, eb->vorwahl))) return res;
  if ((res = strcmp(ea->ort,     eb->ort    ))) return res;
  return strcmp(ea->strasse, eb->strasse);
}

/* last resort, check if the whole vorwahl+ort set matches a single zip */
static int sort_by_vo(const void *a, const void *b) {
  const entry_t *ea = (const entry_t *)a;
  const entry_t *eb = (const entry_t *)b;
  int res;

  if ((res = strcmp(ea->vorwahl, eb->vorwahl))) return res;
  return strcmp(ea->ort, eb->ort);
}

/* For brute forcing name if we can't find strasse: vorwahl, ort, name, vorname, hnr */
static int sort_by_vonvh(const void *a, const void *b) {
  const entry_t *ea = (const entry_t *)a;
  const entry_t *eb = (const entry_t *)b;
  int res;

  if ((res = strcmp(ea->vorwahl, eb->vorwahl))) return res;
  if ((res = strcmp(ea->ort,     eb->ort    ))) return res;
  if ((res = strcmp(ea->name,    eb->name   ))) return res;
  if ((res = strcmp(ea->vorname, eb->vorname))) return res;
  return strcmp(ea->hnr, eb->hnr);
}

/* qsort() order of the brutes array: vorwahl, ort, strasse_1992 */
static int sort_brutes(const void *a, const void *b) {
  const brute_t *ea = (const brute_t *)a;
  const brute_t *eb = (const brute_t *)b;
  int res;

  if ((res = strcmp(ea->vorwahl, eb->vorwahl))) return res;
  if ((res = strcmp(ea->ort,     eb->ort    ))) return res;
  return strcmp(ea->strasse_1992, eb->strasse_1992);
}
static int search_brute(const void *a, const void *b) { int res; entry_t *ea = (entry_t*)a; brute_t *eb = (brute_t *)b; if ((res = strcmp(ea->vorwahl, eb->vorwahl ))) return res; if ((res = strcmp(ea->ort, eb->ort ))) return res; if ((res = strcmp(ea->strasse, eb->strasse_1992))) return res; return 0; } /* If zip codes end in dots, always chose the one with more info */ static int test_dot(entry_t * iter, entry_t * found) { char * dot_iter = strchr(iter->plz, '.'); char * dot_found = strchr(found->plz, '.'); int di = 64, df = 64; // magic value large enough so that the MIN() always choses the other one if (!dot_iter && !dot_found) return 0; if (dot_iter) di = dot_iter - iter->plz; if (dot_found) df = dot_found - found->plz; if (memcmp(iter->plz, found->plz, ((di < df) ? di : df))) return 0; if (dot_iter && !dot_found) iter->plz = found->plz; if (!dot_iter && dot_found) found->plz = iter->plz; if (dot_iter && dot_found && di > df) found->plz = iter->plz; if (dot_iter && dot_found && di < df) iter->plz = found->plz; return 1; } static entry_t * verify_unique_zip(entry_t *found, entry_t *candidate, int (*compar) (const void *, const void *)) { entry_t *test_iter = found + 1; entry_t *end = g_book + g_book_size; while (test_iter < end && !compar((void*)candidate, (void*)test_iter)) { if (strcmp(test_iter->plz, found->plz)) { if (test_dot(test_iter, found)) continue; return 0; } ++test_iter; } return found; } static entry_t * verify_zip(entry_t *found, entry_t *candidate, int (*compar) (const void *, const void *)) { entry_t *test_iter = found + 1; entry_t *end = g_book + g_book_size; char pair[32]; if (!g_zipmap_size) return verify_unique_zip(found, candidate, compar); /* Do we know about the oldzip-newzip mapping? 
*/ strcpy(pair, candidate->plz); strcat(pair,"\t"); strcat(pair, found->plz); if (bsearch(pair, g_zipmap, g_zipmap_size, sizeof(char*), cc2)) return found; while ((test_iter < end) && !compar((void*)candidate, (void*)test_iter)) { if (strcmp(test_iter->plz, found->plz)) { if (test_dot(test_iter, found)) continue; strcpy(pair, candidate->plz); strcat(pair,"\t"); strcat(pair, test_iter->plz); if (bsearch(pair, g_zipmap, g_zipmap_size, sizeof(char*), cc2)) { // printf ("FIXED %s -> %s: ", found->plz, test_iter->plz); return test_iter; } } ++test_iter; } return found; } static int search_and_verify(entry_t *candidate, int (*compar) (const void *, const void *), int flag) { entry_t *found = bsearch_first(candidate, g_book, g_book_size, sizeof(entry_t), compar); if (!found) return 0; if (flag == 2) found = verify_unique_zip(found, candidate, compar); else found = verify_zip(found, candidate, compar); if (!found) return 0; if (flag == 1) fprintf(g_mapfile_out, "%s\t%s\n", candidate->plz, found->plz); /* Output plz*/ puts(found->plz); return 1; }