#!/usr/bin/env bash
#
# Builds 1992_Q2/10_Postleitzahl by running ./map_plz over the 1992 records,
# using a lookup file assembled from the 1995_Q0 columns.
#
# The mapper is run twice:
#   1. A first pass, which (per the original notes) also generates
#      brutemap.txt.
#   2. A second pass, after brutemap_input.bin and zip_simple_map.bin have
#      been rebuilt from brutemap.txt and zip_mapfile.txt.
#
# Run from the directory that contains 1992_Q2/, 1995_Q0/, map_plz.c and
# simi.py. Requires bash: brace expansion and $'...' quoting are used below.

# Every step consumes the previous step's output, so stop at the first
# failure instead of carrying on with truncated or stale files.
# NOTE(review): this assumes map_plz and simi.py exit 0 on success - confirm.
set -euo pipefail

readonly INPUT_1992=1992-fvvoshnvrpp.txt
readonly LOOKUP_1995=1995-voshnvrp.bin
readonly ZIP_MAP=zip_simple_map.bin
readonly BRUTE_MAP=brutemap_input.bin
readonly OUTPUT_PLZ=1992_Q2/10_Postleitzahl

#######################################
# Run one mapping pass.
# Globals:   LOOKUP_1995, ZIP_MAP, BRUTE_MAP, INPUT_1992, OUTPUT_PLZ (read)
# Arguments: none
# Outputs:   overwrites $OUTPUT_PLZ with map_plz's stdout
# Returns:   exit status of map_plz
#######################################
run_map_plz() {
  ./map_plz "$LOOKUP_1995" "$ZIP_MAP" "$BRUTE_MAP" \
    < "$INPUT_1992" > "$OUTPUT_PLZ"
}

# Generate file with all relevant columns from 1992 (tab-separated by paste).
paste \
  1992_Q2/{01_Flags,12_Vorwahl,12_Vorwahl_block,11_Ort,07_Strasse,08_Hausnummer,02_Nachname,03_Vorname,13_Rufnummer,10_Postleitzahl_West,10_Zustellamt_PLZOst} \
  > "$INPUT_1992"

# Generate lookup file from 1995. Newlines and tabs both become NUL bytes.
# Both replacement characters are spelled out because POSIX leaves the result
# unspecified when tr's second string is shorter than the first.
paste \
  1995_Q0/{12_Vorwahl,11_Ort,07_Strasse,08_Hausnummer,02_Nachname,03_Vorname,13_Rufnummer,10_Postleitzahl} \
  | tr '\n\t' '\0\0' > "$LOOKUP_1995"

# Compile plz mapper.
cc -O3 -o map_plz map_plz.c -I ../src/export/ ../src/export/mystdlib.c

# To debug in lldb:
# cc -O0 -g -o map_plz map_plz.c -I ../src/export/ ../src/export/mystdlib.c
# process launch -i 1992_testfile.txt -- 1995-vorwahl-ort-strasse-hnr-name-vorname-rufnummer-plz.bin
# NOTE(review): the lookup file name and argument count in the lldb line
# differ from the real invocation in run_map_plz - the note looks stale.

# First pass: outputs mapped plz, generates brutemap.txt.
# The two map files only need to exist for this pass. touch does not empty
# them, so on a re-run the first pass reads the maps left by the previous run.
# NOTE(review): confirm that reusing old maps here is intended.
touch "$BRUTE_MAP" "$ZIP_MAP"
run_map_plz

# Generate street name translation table from brutemap,
# only taking into account similar street names, new style.
# simi.py reads columns 3 and 4 of brutemap.txt; its output is pasted in
# front of the full brutemap.txt rows.
cut -f 3,4 brutemap.txt \
  | python simi.py \
  | paste - brutemap.txt > brutemap_simifiltered.txt

# Sort and prepare similarity filtered files for the merge.
# uniq -c prefixes each distinct row with a space-padded count and a space;
# the sed turns that prefix into "<count><TAB>". Newlines and tabs are then
# turned into NUL bytes.
cut -f 1-5 brutemap_simifiltered.txt \
  | sort \
  | uniq -c \
  | sed -E $'s:^ *([[:digit:]]+) :\\1\t:' \
  | tr '\n\t' '\0\0' > "$BRUTE_MAP"

# Compile zipmap into a binary format: unique sorted lines, NUL-terminated.
sort -u zip_mapfile.txt | tr '\n' '\0' > "$ZIP_MAP"

# Redo the mapping with the data from brutemap and zipmap.
run_map_plz