summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile5
-rwxr-xr-xmakecolumns.sh13
-rw-r--r--parasort.sh17
-rw-r--r--postprocess.sh2
-rw-r--r--src/export/extract_version_1.c16
-rw-r--r--src/export/mystdlib.c4
-rw-r--r--src/postprocess/map_plz.c4
-rw-r--r--src/postprocess/merge_entries.c5
-rw-r--r--src/postprocess/postprocess-1992.sh15
9 files changed, 49 insertions, 32 deletions
diff --git a/Makefile b/Makefile
index 836af77..1bb45a0 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
1BINARIES=bin/extract_version_1 bin/extract_version_2 bin/extract_version_3 bin/extract_version_4 bin/split_version_2 bin/split_version_3 bin/map_coords bin/map_branches_v3 bin/map_branches_v4 bin/convert_coords bin/merge_entries bin/sort_plz # bin/join 1BINARIES=bin/extract_version_1 bin/extract_version_2 bin/extract_version_3 bin/extract_version_4 bin/split_version_2 bin/split_version_3 bin/map_coords bin/map_branches_v3 bin/map_branches_v4 bin/convert_coords bin/merge_entries bin/sort_plz bin/map_plz # bin/join
2CFLAGS += -W -Wall -Wextra -O3 -I src/export # -Weverything -Wno-cast-align -Wno-padded 2CFLAGS += -W -Wall -Wextra -O3 -I src/export # -Weverything -Wno-cast-align -Wno-padded
3 3
4all: $(BINARIES) 4all: $(BINARIES)
@@ -42,6 +42,9 @@ bin/merge_entries: src/postprocess/merge_entries.c src/export/mystdlib.c src/pos
42bin/sort_plz: src/postprocess/sort_plz.c 42bin/sort_plz: src/postprocess/sort_plz.c
43 $(CC) $(CFLAGS) -o $@ src/postprocess/sort_plz.c 43 $(CC) $(CFLAGS) -o $@ src/postprocess/sort_plz.c
44 44
45bin/map_plz: src/postprocess/map_plz.c src/export/mystdlib.c
46 $(CC) $(CFLAGS) -o $@ src/postprocess/map_plz.c src/export/mystdlib.c
47
45.PHONY: clean 48.PHONY: clean
46clean: 49clean:
47 @rm -f $(BINARIES) 50 @rm -f $(BINARIES)
diff --git a/makecolumns.sh b/makecolumns.sh
index 3f05a61..a505d31 100755
--- a/makecolumns.sh
+++ b/makecolumns.sh
@@ -115,10 +115,17 @@ handle_format_version_1() {
115 mv 11_unknown 12_Vorwahl 115 mv 11_unknown 12_Vorwahl
116 mv 12_unknown 13_Rufnummer 116 mv 12_unknown 13_Rufnummer
117 mv 13_unknown 11_Ort 117 mv 13_unknown 11_Ort
118 mv 14_unknown 10_Postleitzahl 118 mv 14_unknown 10_Postleitzahl_West
119 mv 15_unknown 12_Vorwahl_block
120
121 printf "Splitting appartement to zusaetze ... "
122 paste 07_Strasse 08_Hausnummer 09_unknown | sed -E $'s:^(.*)\;([0-9]+.*)\t(.*)\t.*$:\\1\t\\2\tWohnung \\3:;s:^(.*)tr(\t.*\t.*)$:\\1tr.\\2:' > tm_unknown
123 cut -f 1 tm_unknown > 07_Strasse
124 cut -f 2 tm_unknown > 08_Hausnummer
125 printf "done.\n"
119 126
120 printf "Normalizing zusaetze ... " 127 printf "Normalizing zusaetze ... "
121 sed -E -e 's:^, +:u. :' 09_unknown > 04_Namenszusatz 128 cut -f 3 tm_unknown | sed -E -e 's:^, +:u. :' > 04_Namenszusatz
122 sed -E -e 's:^, +:u. :' 04_unknown > 05_Adresszusatz 129 sed -E -e 's:^, +:u. :' 04_unknown > 05_Adresszusatz
123 paste 04_Namenszusatz 05_Adresszusatz | awk '{$1=$1};1' > 04_Zusaetze 130 paste 04_Namenszusatz 05_Adresszusatz | awk '{$1=$1};1' > 04_Zusaetze
124 printf "done.\n" 131 printf "done.\n"
@@ -501,7 +508,7 @@ tidy_columns () {
501 printf "Tidying up streetnames ... " 508 printf "Tidying up streetnames ... "
502 # Replace any dots at end of line by a single one 509 # Replace any dots at end of line by a single one
503 # finish any str abbreviation without a period with a period 510 # finish any str abbreviation without a period with a period
504 sed -E 's/\.+/./g;s/(S|s)tr( |:)?$/\1tr./;s/(.*)-(.*) -/\1-\2-Str./;s/ -$/ Str./;s/-$/str./' 07_Strasse | iconv -f iso-8859-15 -t utf-8 > 07_Strasse.new 511 sed -E 's/\.+/./g;s/(S|s)tr( |:)?$/\1tr./;s/(.*)-(.*) -/\1-\2-Str./;s/ -$/ Str./;s/-$/str./;s/^(.*-.*) Str\.?$/\1-Str./' 07_Strasse | iconv -f iso-8859-15 -t utf-8 > 07_Strasse.new
505 mv 07_Strasse.new 07_Strasse 512 mv 07_Strasse.new 07_Strasse
506 printf "done.\n" 513 printf "done.\n"
507 514
diff --git a/parasort.sh b/parasort.sh
index 6565d61..b593bb9 100644
--- a/parasort.sh
+++ b/parasort.sh
@@ -18,11 +18,16 @@ export PATH=${PATH}:`pwd -P`/bin/
18mkdir -p work/sorted 18mkdir -p work/sorted
19cd work/output || exit 1 19cd work/output || exit 1
20 20
21for a in *[05]; do echo "$a" >&2; merge_entries $a > ../sorted/$a; done & 21starttime=`date +%s`
22for a in *[16]; do echo " $a" >&2; merge_entries $a > ../sorted/$a; done & 22
23for a in *[27]; do echo " $a" >&2; merge_entries $a > ../sorted/$a; done & 23for a in ???[05]?; do echo "$a" >&2; merge_entries $a > ../sorted/$a; done &
24for a in *[38]; do echo " $a" >&2; merge_entries $a > ../sorted/$a; done & 24for a in ???[16]?; do echo " $a" >&2; merge_entries $a > ../sorted/$a; done &
25for a in *[49]; do echo " $a" >&2; merge_entries $a > ../sorted/$a; done & 25for a in ???[27]?; do echo " $a" >&2; merge_entries $a > ../sorted/$a; done &
26for a in brken *_; do echo " $a" >&2; merge_entries $a > ../sorted/$a; done & 26for a in ???[38]?; do echo " $a" >&2; merge_entries $a > ../sorted/$a; done &
27for a in ???[49]?; do echo " $a" >&2; merge_entries $a > ../sorted/$a; done &
28for a in brken ???_?; do echo " $a" >&2; merge_entries $a > ../sorted/$a; done &
27 29
28wait 30wait
31
32elapsed=$(( `date +%s` - starttime ))
33printf "Finished in %d seconds (%d minutes)\n" ${elapsed} $(( elapsed / 60 ))
diff --git a/postprocess.sh b/postprocess.sh
index 9316357..bc70492 100644
--- a/postprocess.sh
+++ b/postprocess.sh
@@ -26,4 +26,4 @@ fi
26 26
27cd work || exit 1 27cd work || exit 1
28 28
29for a in 199[56789]_Q? 20*_Q?; do ./sort_plz $a; done 29for a in ????_Q?; do ./sort_plz $a; done
diff --git a/src/export/extract_version_1.c b/src/export/extract_version_1.c
index 8ec199e..6d67a5d 100644
--- a/src/export/extract_version_1.c
+++ b/src/export/extract_version_1.c
@@ -39,9 +39,9 @@ static uint8_t cp437_to_iso8859_1_table[] = {
39}; 39};
40 40
41static struct { 41static struct {
42 int outfiles[15]; 42 int outfiles[16];
43 uint8_t *outbuf[15]; 43 uint8_t *outbuf[16];
44 size_t outfill[15]; 44 size_t outfill[16];
45 char * vorwahl; 45 char * vorwahl;
46 char ort[1024]; 46 char ort[1024];
47 size_t ort_len; 47 size_t ort_len;
@@ -101,11 +101,13 @@ static void split_to_files( uint8_t *entries, int num_entries )
101 *( g_state.outbuf[0] + g_state.outfill[0]++ ) = num_entries > 1 ? '1' : '0'; 101 *( g_state.outbuf[0] + g_state.outfill[0]++ ) = num_entries > 1 ? '1' : '0';
102 memcpy( g_state.outbuf[12] + g_state.outfill[12], g_state.ort, g_state.ort_len ); g_state.outfill[12] += g_state.ort_len; 102 memcpy( g_state.outbuf[12] + g_state.outfill[12], g_state.ort, g_state.ort_len ); g_state.outfill[12] += g_state.ort_len;
103 memcpy( g_state.outbuf[13] + g_state.outfill[13], g_state.zip, g_state.zip_len ); g_state.outfill[13] += g_state.zip_len; 103 memcpy( g_state.outbuf[13] + g_state.outfill[13], g_state.zip, g_state.zip_len ); g_state.outfill[13] += g_state.zip_len;
104 strcpy( g_state.outbuf[14] + g_state.outfill[14], g_state.vorwahl); g_state.outfill[14] += strlen(g_state.vorwahl);
104 } 105 }
105 106
106 *( g_state.outbuf[0 ] + g_state.outfill[0 ]++ ) = '\n'; 107 *( g_state.outbuf[0 ] + g_state.outfill[0 ]++ ) = '\n';
107 *( g_state.outbuf[12] + g_state.outfill[12]++ ) = '\n'; 108 *( g_state.outbuf[12] + g_state.outfill[12]++ ) = '\n';
108 *( g_state.outbuf[13] + g_state.outfill[13]++ ) = '\n'; 109 *( g_state.outbuf[13] + g_state.outfill[13]++ ) = '\n';
110 *( g_state.outbuf[14] + g_state.outfill[14]++ ) = '\n';
109 if( !end ) 111 if( !end )
110 return; 112 return;
111 } 113 }
@@ -178,7 +180,7 @@ static void act_on_file( uint8_t *file )
178 g_state.zip_len = snprintf( g_state.zip, sizeof(g_state.zip), "%s", zip ); 180 g_state.zip_len = snprintf( g_state.zip, sizeof(g_state.zip), "%s", zip );
179 g_state.vorwahl = vorwahl; 181 g_state.vorwahl = vorwahl;
180 182
181 /* printf( "Working on a %04d page and %06d records file, city: %4s %-32s with prefix %s\n", num_pages, num_records, zip, ort, vorwahl ); */ 183 printf( "Working on a %04d page and %06d records file, city: %4s %-32s with prefix %s\n", num_pages, num_records, zip, ort, vorwahl );
182 (void)num_records; /* silence warning about unused variable */ 184 (void)num_records; /* silence warning about unused variable */
183 185
184 for( page = 0; page < num_pages; ++page ) 186 for( page = 0; page < num_pages; ++page )
@@ -193,7 +195,7 @@ int main( )
193 ssize_t temp = 0; 195 ssize_t temp = 0;
194 int i; 196 int i;
195 197
196 for( i=0; i<14; ++i ) 198 for( i=0; i<15; ++i )
197 { 199 {
198 sprintf( filename, "%02d_unknown", i+1 ); 200 sprintf( filename, "%02d_unknown", i+1 );
199 g_state.outfiles[i] = open( filename, O_WRONLY | O_APPEND | O_CREAT, 0644 ); 201 g_state.outfiles[i] = open( filename, O_WRONLY | O_APPEND | O_CREAT, 0644 );
@@ -209,14 +211,14 @@ int main( )
209 unmap_file( &f ); 211 unmap_file( &f );
210 212
211 /* Write out results */ 213 /* Write out results */
212 for( i=0; i<14; ++i ) { 214 for( i=0; i<15; ++i ) {
213 /* if( g_state.outfill[i] > 1024*1024*6 ) printf( "Large: %s %zd\n", g_state.ort, g_state.outfill[i] ); */ 215 /* if( g_state.outfill[i] > 1024*1024*6 ) printf( "Large: %s %zd\n", g_state.ort, g_state.outfill[i] ); */
214 temp += write( g_state.outfiles[i], g_state.outbuf[i], g_state.outfill[i] ); 216 temp += write( g_state.outfiles[i], g_state.outbuf[i], g_state.outfill[i] );
215 g_state.outfill[i] = 0; 217 g_state.outfill[i] = 0;
216 } 218 }
217 } 219 }
218 220
219 for( i=0; i<14; ++i ) { 221 for( i=0; i<15; ++i ) {
220 temp += write( g_state.outfiles[i], g_state.outbuf[i], g_state.outfill[i] ); 222 temp += write( g_state.outfiles[i], g_state.outbuf[i], g_state.outfill[i] );
221 close( g_state.outfiles[i] ); 223 close( g_state.outfiles[i] );
222 } 224 }
diff --git a/src/export/mystdlib.c b/src/export/mystdlib.c
index b65f63d..31c991e 100644
--- a/src/export/mystdlib.c
+++ b/src/export/mystdlib.c
@@ -21,7 +21,9 @@ MAP map_file( char *filename, int readonly )
21 if( ( map->fh = open( filename, readonly ? O_RDONLY : O_RDWR ) ) >= 0 ) 21 if( ( map->fh = open( filename, readonly ? O_RDONLY : O_RDWR ) ) >= 0 )
22 { 22 {
23 fstat( map->fh, &fstatus ); 23 fstat( map->fh, &fstatus );
24 if( ( map->addr = mmap( NULL, map->size = (size_t)fstatus.st_size, 24 map->size = (size_t)fstatus.st_size;
25 if (!map->size) return map;
26 if( ( map->addr = mmap( NULL, map->size,
25 PROT_READ | ( readonly ? 0 : PROT_WRITE), (readonly ? MAP_PRIVATE : MAP_SHARED), map->fh, 0) ) == MAP_FAILED ) 27 PROT_READ | ( readonly ? 0 : PROT_WRITE), (readonly ? MAP_PRIVATE : MAP_SHARED), map->fh, 0) ) == MAP_FAILED )
26 { 28 {
27 fprintf( stderr, "Mapping file '%s' failed\n", filename ); 29 fprintf( stderr, "Mapping file '%s' failed\n", filename );
diff --git a/src/postprocess/map_plz.c b/src/postprocess/map_plz.c
index ab0db71..9dec6bb 100644
--- a/src/postprocess/map_plz.c
+++ b/src/postprocess/map_plz.c
@@ -18,7 +18,7 @@ int main(int argc, char **args) {
18 FILE *bfile, *streetfile_out; 18 FILE *bfile, *streetfile_out;
19 char *ptr, *input = malloc(65335); 19 char *ptr, *input = malloc(65335);
20 char *ort = malloc(65335), vorwahl_block[16]; 20 char *ort = malloc(65335), vorwahl_block[16];
21 int i, brutes_count = 0, report = 0; 21 unsigned int i, brutes_count = 0, report = 0;
22 brute_t *brutes = malloc(200000*sizeof(brute_t)); 22 brute_t *brutes = malloc(200000*sizeof(brute_t));
23 23
24 /* prepare io */ 24 /* prepare io */
@@ -63,7 +63,7 @@ int main(int argc, char **args) {
63 g_book_by_name = (entry_t*)malloc(g_book_size * sizeof(entry_t)); 63 g_book_by_name = (entry_t*)malloc(g_book_size * sizeof(entry_t));
64 64
65 /* Split pointers into input files into our arrays */ 65 /* Split pointers into input files into our arrays */
66 for (i = 0, ptr = (char*)tbuch->addr; i < g_book_size; ++i) { 66 for (i=0, ptr=(char*)tbuch->addr; i<g_book_size; ++i) {
67 g_book[i].vorwahl = ptr; ptr += strlen(ptr) + 1; 67 g_book[i].vorwahl = ptr; ptr += strlen(ptr) + 1;
68 g_book[i].ort = ptr; ptr += strlen(ptr) + 1; 68 g_book[i].ort = ptr; ptr += strlen(ptr) + 1;
69 g_book[i].strasse = ptr; ptr += strlen(ptr) + 1; 69 g_book[i].strasse = ptr; ptr += strlen(ptr) + 1;
diff --git a/src/postprocess/merge_entries.c b/src/postprocess/merge_entries.c
index f9ee67d..1dd7d50 100644
--- a/src/postprocess/merge_entries.c
+++ b/src/postprocess/merge_entries.c
@@ -148,9 +148,10 @@ static int sort_me(const void *f_a, const void *f_b) {
148 outvec_t *oa_row = oa + row * COLUMNS; 148 outvec_t *oa_row = oa + row * COLUMNS;
149 outvec_t *ob_row = ob + row * COLUMNS; 149 outvec_t *ob_row = ob + row * COLUMNS;
150 150
151 if ((res = STRCMP_n(oa_row[ 2].ptr, ob_row[ 2].ptr))) return res; /* PLZ */
152 if ((res = STRCMP_n(oa_row[ 9].ptr, ob_row[ 9].ptr))) return res; /* Ort */
151 if ((res = STRCMP_n(oa_row[10].ptr, ob_row[10].ptr))) return res; /* Vorwahl */ 153 if ((res = STRCMP_n(oa_row[10].ptr, ob_row[10].ptr))) return res; /* Vorwahl */
152 if ((res = STRCMP_n(oa_row[11].ptr, ob_row[11].ptr))) return res; /* Rufnummer */ 154 if ((res = STRCMP_n(oa_row[11].ptr, ob_row[11].ptr))) return res; /* Rufnummer */
153 if ((res = STRCMP_n(oa_row[ 2].ptr, ob_row[ 2].ptr))) return res; /* PLZ */
154 if ((res = STRCMP_n(oa_row[ 6].ptr, ob_row[ 6].ptr))) return res; /* Strasse */ 155 if ((res = STRCMP_n(oa_row[ 6].ptr, ob_row[ 6].ptr))) return res; /* Strasse */
155 if ((res = STRCMP_n(oa_row[ 7].ptr, ob_row[ 7].ptr))) return res; /* Hausnummer */ 156 if ((res = STRCMP_n(oa_row[ 7].ptr, ob_row[ 7].ptr))) return res; /* Hausnummer */
156 if ((res = STRCMP_n(oa_row[ 3].ptr, ob_row[ 3].ptr))) return res; /* Nachname */ 157 if ((res = STRCMP_n(oa_row[ 3].ptr, ob_row[ 3].ptr))) return res; /* Nachname */
@@ -200,7 +201,7 @@ int main(int argc, char **args) {
200 unsigned long current = 0, i, flag; 201 unsigned long current = 0, i, flag;
201 uint64_t year_list = 0, revflag_list = 0, bizflag_list = 0; 202 uint64_t year_list = 0, revflag_list = 0, bizflag_list = 0;
202 203
203 if (argc != 1) exit(1); 204 if (argc != 2) exit(1);
204 tbuch = map_file(args[1], 1); 205 tbuch = map_file(args[1], 1);
205 206
206 /* Estimate upper bound for amount of lines */ 207 /* Estimate upper bound for amount of lines */
diff --git a/src/postprocess/postprocess-1992.sh b/src/postprocess/postprocess-1992.sh
index 1e685d2..6720991 100644
--- a/src/postprocess/postprocess-1992.sh
+++ b/src/postprocess/postprocess-1992.sh
@@ -4,19 +4,16 @@ paste 1992_Q2/{01_Flags,12_Vorwahl,12_Vorwahl_block,11_Ort,07_Strasse,08_Hausnum
4# Generate lookup file from 1995 4# Generate lookup file from 1995
5paste 1995_Q0/{12_Vorwahl,11_Ort,07_Strasse,08_Hausnummer,02_Nachname,03_Vorname,13_Rufnummer,10_Postleitzahl} | tr '\n\t' '\0' > 1995-voshnvrp.bin 5paste 1995_Q0/{12_Vorwahl,11_Ort,07_Strasse,08_Hausnummer,02_Nachname,03_Vorname,13_Rufnummer,10_Postleitzahl} | tr '\n\t' '\0' > 1995-voshnvrp.bin
6 6
7# To debug in lldb
8process launch -i 1992_testfile.txt -- 1995-vorwahl-ort-strasse-hnr-name-vorname-rufnummer-plz.bin
9
10# Compile plz mapper 7# Compile plz mapper
11cc -O3 -o map_plz map_plz.c -I ../src/export/ ../src/export/mystdlib.c 8cc -O3 -o map_plz map_plz.c -I ../src/export/ ../src/export/mystdlib.c
12 9
10# To debug in lldb
11# cc -O0 -g -o map_plz map_plz.c -I ../src/export/ ../src/export/mystdlib.c
12# process launch -i 1992_testfile.txt -- 1995-vorwahl-ort-strasse-hnr-name-vorname-rufnummer-plz.bin
13
13# outputs mapped plz, generates brutemap.txt 14# outputs mapped plz, generates brutemap.txt
14touch brutemap_input.bin zip_simple_map.bin 15touch brutemap_input.bin zip_simple_map.bin
15./map_plz 1995-voshnvrp.bin zip_simple_map.bin brutemap_input.bin < 1992-fvvoshnvrpp.txt > 10_Postleitzahl 16./map_plz 1995-voshnvrp.bin zip_simple_map.bin brutemap_input.bin < 1992-fvvoshnvrpp.txt > 1992_Q2/10_Postleitzahl
16
17# generate street name translation table from brutemap,
18# only taking into account similar street names
19# cut -f 3,4 brutemap.txt | tr '[:upper:]' '[:lower:]' | paste brutemap.txt - | cut -f 1-4,6,7 | ./jaro | cut -f 1-5 > brutemap_filtered.txt
20 17
21# generate street name translation table from brutemap, 18# generate street name translation table from brutemap,
22# only taking into account similar street names, new style 19# only taking into account similar street names, new style
@@ -29,4 +26,4 @@ cut -f 1-5 brutemap_simifiltered.txt | sort | uniq -c | sed -E $'s:^ *([[:digit:
29sort -u zip_mapfile.txt | tr '\n' '\0' > zip_simple_map.bin 26sort -u zip_mapfile.txt | tr '\n' '\0' > zip_simple_map.bin
30 27
31# Redo the mapping with the data from brutemap and zipmap 28# Redo the mapping with the data from brutemap and zipmap
32./map_plz 1995-voshnvrp.bin zip_simple_map.bin brutemap_input.bin < 1992-fvvoshnvrpp.txt > 10_Postleitzahl 29./map_plz 1995-voshnvrp.bin zip_simple_map.bin brutemap_input.bin < 1992-fvvoshnvrpp.txt > 1992_Q2/10_Postleitzahl