From fb3616e06ca20ffe78dfb00b962a2599a46b2a5f Mon Sep 17 00:00:00 2001 From: Dirk Engling Date: Sat, 29 Jun 2019 18:51:00 +0200 Subject: Clean up 1992 post processing --- Makefile | 5 ++++- makecolumns.sh | 13 ++++++++++--- parasort.sh | 17 +++++++++++------ postprocess.sh | 2 +- src/export/extract_version_1.c | 16 +++++++++------- src/export/mystdlib.c | 4 +++- src/postprocess/map_plz.c | 4 ++-- src/postprocess/merge_entries.c | 5 +++-- src/postprocess/postprocess-1992.sh | 15 ++++++--------- 9 files changed, 49 insertions(+), 32 deletions(-) diff --git a/Makefile b/Makefile index 836af77..1bb45a0 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -BINARIES=bin/extract_version_1 bin/extract_version_2 bin/extract_version_3 bin/extract_version_4 bin/split_version_2 bin/split_version_3 bin/map_coords bin/map_branches_v3 bin/map_branches_v4 bin/convert_coords bin/merge_entries bin/sort_plz # bin/join +BINARIES=bin/extract_version_1 bin/extract_version_2 bin/extract_version_3 bin/extract_version_4 bin/split_version_2 bin/split_version_3 bin/map_coords bin/map_branches_v3 bin/map_branches_v4 bin/convert_coords bin/merge_entries bin/sort_plz bin/map_plz # bin/join CFLAGS += -W -Wall -Wextra -O3 -I src/export # -Weverything -Wno-cast-align -Wno-padded all: $(BINARIES) @@ -42,6 +42,9 @@ bin/merge_entries: src/postprocess/merge_entries.c src/export/mystdlib.c src/pos bin/sort_plz: src/postprocess/sort_plz.c $(CC) $(CFLAGS) -o $@ src/postprocess/sort_plz.c +bin/map_plz: src/postprocess/map_plz.c src/export/mystdlib.c + $(CC) $(CFLAGS) -o $@ src/postprocess/map_plz.c src/export/mystdlib.c + .PHONY: clean clean: @rm -f $(BINARIES) diff --git a/makecolumns.sh b/makecolumns.sh index 3f05a61..a505d31 100755 --- a/makecolumns.sh +++ b/makecolumns.sh @@ -115,10 +115,17 @@ handle_format_version_1() { mv 11_unknown 12_Vorwahl mv 12_unknown 13_Rufnummer mv 13_unknown 11_Ort - mv 14_unknown 10_Postleitzahl + mv 14_unknown 10_Postleitzahl_West + mv 15_unknown 12_Vorwahl_block + + printf "Splitting appartement to zusaetze ... " + paste 07_Strasse 08_Hausnummer 09_unknown | sed -E $'s:^(.*)\;([0-9]+.*)\t(.*)\t.*$:\\1\t\\2\tWohnung \\3:;s:^(.*)tr(\t.*\t.*)$:\\1tr.\\2:' > tm_unknown + cut -f 1 tm_unknown > 07_Strasse + cut -f 2 tm_unknown > 08_Hausnummer + printf "done.\n" printf "Normalizing zusaetze ... " - sed -E -e 's:^, +:u. :' 09_unknown > 04_Namenszusatz + cut -f 3 tm_unknown | sed -E -e 's:^, +:u. :' > 04_Namenszusatz sed -E -e 's:^, +:u. :' 04_unknown > 05_Adresszusatz paste 04_Namenszusatz 05_Adresszusatz | awk '{$1=$1};1' > 04_Zusaetze printf "done.\n" @@ -501,7 +508,7 @@ tidy_columns () { printf "Tidying up streetnames ... " # Replace any dots at end of line by a single one # finish any str abbreviation without a period with a period - sed -E 's/\.+/./g;s/(S|s)tr( |:)?$/\1tr./;s/(.*)-(.*) -/\1-\2-Str./;s/ -$/ Str./;s/-$/str./' 07_Strasse | iconv -f iso-8859-15 -t utf-8 > 07_Strasse.new + sed -E 's/\.+/./g;s/(S|s)tr( |:)?$/\1tr./;s/(.*)-(.*) -/\1-\2-Str./;s/ -$/ Str./;s/-$/str./;s/^(.*-.*) Str\.?$/\1-Str./' 07_Strasse | iconv -f iso-8859-15 -t utf-8 > 07_Strasse.new mv 07_Strasse.new 07_Strasse printf "done.\n" diff --git a/parasort.sh b/parasort.sh index 6565d61..b593bb9 100644 --- a/parasort.sh +++ b/parasort.sh @@ -18,11 +18,16 @@ export PATH=${PATH}:`pwd -P`/bin/ mkdir -p work/sorted cd work/output || exit 1 -for a in *[05]; do echo "$a" >&2; merge_entries $a > ../sorted/$a; done & -for a in *[16]; do echo " $a" >&2; merge_entries $a > ../sorted/$a; done & -for a in *[27]; do echo " $a" >&2; merge_entries $a > ../sorted/$a; done & -for a in *[38]; do echo " $a" >&2; merge_entries $a > ../sorted/$a; done & -for a in *[49]; do echo " $a" >&2; merge_entries $a > ../sorted/$a; done & -for a in brken *_; do echo " $a" >&2; merge_entries $a > ../sorted/$a; done & +starttime=`date +%s` + +for a in ???[05]?; do echo "$a" >&2; merge_entries $a > ../sorted/$a; done & +for a in ???[16]?; do echo " $a" >&2; merge_entries $a > ../sorted/$a; done & +for a in ???[27]?; do echo " $a" >&2; merge_entries $a > ../sorted/$a; done & +for a in ???[38]?; do echo " $a" >&2; merge_entries $a > ../sorted/$a; done & +for a in ???[49]?; do echo " $a" >&2; merge_entries $a > ../sorted/$a; done & +for a in brken ???_?; do echo " $a" >&2; merge_entries $a > ../sorted/$a; done & wait + +elapsed=$(( `date +%s` - starttime )) +printf "Finished in %d seconds (%d minutes)\n" ${elapsed} $(( elapsed / 60 )) diff --git a/postprocess.sh b/postprocess.sh index 9316357..bc70492 100644 --- a/postprocess.sh +++ b/postprocess.sh @@ -26,4 +26,4 @@ fi cd work || exit 1 -for a in 199[56789]_Q? 20*_Q?; do ./sort_plz $a; done +for a in ????_Q?; do ./sort_plz $a; done diff --git a/src/export/extract_version_1.c b/src/export/extract_version_1.c index 8ec199e..6d67a5d 100644 --- a/src/export/extract_version_1.c +++ b/src/export/extract_version_1.c @@ -39,9 +39,9 @@ static uint8_t cp437_to_iso8859_1_table[] = { }; static struct { - int outfiles[15]; - uint8_t *outbuf[15]; - size_t outfill[15]; + int outfiles[16]; + uint8_t *outbuf[16]; + size_t outfill[16]; char * vorwahl; char ort[1024]; size_t ort_len; @@ -101,11 +101,13 @@ static void split_to_files( uint8_t *entries, int num_entries ) *( g_state.outbuf[0] + g_state.outfill[0]++ ) = num_entries > 1 ? '1' : '0'; memcpy( g_state.outbuf[12] + g_state.outfill[12], g_state.ort, g_state.ort_len ); g_state.outfill[12] += g_state.ort_len; memcpy( g_state.outbuf[13] + g_state.outfill[13], g_state.zip, g_state.zip_len ); g_state.outfill[13] += g_state.zip_len; + strcpy( g_state.outbuf[14] + g_state.outfill[14], g_state.vorwahl); g_state.outfill[14] += strlen(g_state.vorwahl); } *( g_state.outbuf[0 ] + g_state.outfill[0 ]++ ) = '\n'; *( g_state.outbuf[12] + g_state.outfill[12]++ ) = '\n'; *( g_state.outbuf[13] + g_state.outfill[13]++ ) = '\n'; + *( g_state.outbuf[14] + g_state.outfill[14]++ ) = '\n'; if( !end ) return; } @@ -178,7 +180,7 @@ static void act_on_file( uint8_t *file ) g_state.zip_len = snprintf( g_state.zip, sizeof(g_state.zip), "%s", zip ); g_state.vorwahl = vorwahl; - /* printf( "Working on a %04d page and %06d records file, city: %4s %-32s with prefix %s\n", num_pages, num_records, zip, ort, vorwahl ); */ + printf( "Working on a %04d page and %06d records file, city: %4s %-32s with prefix %s\n", num_pages, num_records, zip, ort, vorwahl ); (void)num_records; /* silence warning about unused variable */ for( page = 0; page < num_pages; ++page ) @@ -193,7 +195,7 @@ int main( ) ssize_t temp = 0; int i; - for( i=0; i<14; ++i ) + for( i=0; i<15; ++i ) { sprintf( filename, "%02d_unknown", i+1 ); g_state.outfiles[i] = open( filename, O_WRONLY | O_APPEND | O_CREAT, 0644 ); @@ -209,14 +211,14 @@ int main( ) unmap_file( &f ); /* Write out results */ - for( i=0; i<14; ++i ) { + for( i=0; i<15; ++i ) { /* if( g_state.outfill[i] > 1024*1024*6 ) printf( "Large: %s %zd\n", g_state.ort, g_state.outfill[i] ); */ temp += write( g_state.outfiles[i], g_state.outbuf[i], g_state.outfill[i] ); g_state.outfill[i] = 0; } } - for( i=0; i<14; ++i ) { + for( i=0; i<15; ++i ) { temp += write( g_state.outfiles[i], g_state.outbuf[i], g_state.outfill[i] ); close( g_state.outfiles[i] ); } diff --git a/src/export/mystdlib.c b/src/export/mystdlib.c index b65f63d..31c991e 100644 --- a/src/export/mystdlib.c +++ b/src/export/mystdlib.c @@ -21,7 +21,9 @@ MAP map_file( char *filename, int readonly ) if( ( map->fh = open( filename, readonly ? O_RDONLY : O_RDWR ) ) >= 0 ) { fstat( map->fh, &fstatus ); - if( ( map->addr = mmap( NULL, map->size = (size_t)fstatus.st_size, + map->size = (size_t)fstatus.st_size; + if (!map->size) return map; + if( ( map->addr = mmap( NULL, map->size, PROT_READ | ( readonly ? 0 : PROT_WRITE), (readonly ? MAP_PRIVATE : MAP_SHARED), map->fh, 0) ) == MAP_FAILED ) { fprintf( stderr, "Mapping file '%s' failed\n", filename ); diff --git a/src/postprocess/map_plz.c b/src/postprocess/map_plz.c index ab0db71..9dec6bb 100644 --- a/src/postprocess/map_plz.c +++ b/src/postprocess/map_plz.c @@ -18,7 +18,7 @@ int main(int argc, char **args) { FILE *bfile, *streetfile_out; char *ptr, *input = malloc(65335); char *ort = malloc(65335), vorwahl_block[16]; - int i, brutes_count = 0, report = 0; + unsigned int i, brutes_count = 0, report = 0; brute_t *brutes = malloc(200000*sizeof(brute_t)); /* prepare io */ @@ -63,7 +63,7 @@ int main(int argc, char **args) { g_book_by_name = (entry_t*)malloc(g_book_size * sizeof(entry_t)); /* Split pointers into input files into our arrays */ - for (i = 0, ptr = (char*)tbuch->addr; i < g_book_size; ++i) { + for (i=0, ptr=(char*)tbuch->addr; i 1995-voshnvrp.bin -# To debug in lldb -process launch -i 1992_testfile.txt -- 1995-vorwahl-ort-strasse-hnr-name-vorname-rufnummer-plz.bin - # Compile plz mapper cc -O3 -o map_plz map_plz.c -I ../src/export/ ../src/export/mystdlib.c +# To debug in lldb +# cc -O0 -g -o map_plz map_plz.c -I ../src/export/ ../src/export/mystdlib.c +# process launch -i 1992_testfile.txt -- 1995-vorwahl-ort-strasse-hnr-name-vorname-rufnummer-plz.bin + # outputs mapped plz, generates brutemap.txt touch brutemap_input.bin zip_simple_map.bin -./map_plz 1995-voshnvrp.bin zip_simple_map.bin brutemap_input.bin < 1992-fvvoshnvrpp.txt > 10_Postleitzahl - -# generate street name translation table from brutemap, -# only taking into account similar street names -# cut -f 3,4 brutemap.txt | tr '[:upper:]' '[:lower:]' | paste brutemap.txt - | cut -f 1-4,6,7 | ./jaro | cut -f 1-5 > brutemap_filtered.txt +./map_plz 1995-voshnvrp.bin zip_simple_map.bin brutemap_input.bin < 1992-fvvoshnvrpp.txt > 1992_Q2/10_Postleitzahl # generate street name translation table from brutemap, # only taking into account similar street names, new style @@ -29,4 +26,4 @@ cut -f 1-5 brutemap_simifiltered.txt | sort | uniq -c | sed -E $'s:^ *([[:digit: sort -u zip_mapfile.txt | tr '\n' '\0' > zip_simple_map.bin # Redo the mapping with the data from brutemap and zipmap -./map_plz 1995-voshnvrp.bin zip_simple_map.bin brutemap_input.bin < 1992-fvvoshnvrpp.txt > 10_Postleitzahl +./map_plz 1995-voshnvrp.bin zip_simple_map.bin brutemap_input.bin < 1992-fvvoshnvrpp.txt > 1992_Q2/10_Postleitzahl -- cgit v1.2.3