summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDirk Engling <erdgeist@erdgeist.org>2019-01-30 18:12:18 +0100
committerDirk Engling <erdgeist@erdgeist.org>2019-01-30 18:12:18 +0100
commita187241f4e4cf8a592e0a3cc0b61f949e6184a9e (patch)
treeee6adb8733dd81698f4a50bf75aeadbd30f68464
parent0150806fbf0cc64e60984f8a99aa45ca734e0735 (diff)
Add branch name mapper code for v3
-rw-r--r--Makefile9
-rwxr-xr-xmakecolumns.sh44
-rw-r--r--src/export/map_branches_v3.c79
-rw-r--r--src/export/map_branches_v4.c (renamed from src/export/map_branches.c)0
4 files changed, 115 insertions, 17 deletions
diff --git a/Makefile b/Makefile
index 089ae06..691675e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
1BINARIES=bin/extract_version_1 bin/extract_version_2 bin/extract_version_3 bin/extract_version_4 bin/split_version_2 bin/split_version_3 bin/map_coords bin/map_branches bin/convert_coords bin/join 1BINARIES=bin/extract_version_1 bin/extract_version_2 bin/extract_version_3 bin/extract_version_4 bin/split_version_2 bin/split_version_3 bin/map_coords bin/map_branches_v3 bin/map_branches_v4 bin/convert_coords bin/join
2CFLAGS += -W -Wall -Wextra -O3 # -Weverything -Wno-cast-align -Wno-padded 2CFLAGS += -W -Wall -Wextra -O3 # -Weverything -Wno-cast-align -Wno-padded
3 3
4all: $(BINARIES) 4all: $(BINARIES)
@@ -24,8 +24,11 @@ bin/split_version_2: src/export/split_version_2.c src/export/mystdlib.c
24bin/map_coords: src/export/map_coords.c src/export/mystdlib.c 24bin/map_coords: src/export/map_coords.c src/export/mystdlib.c
25 $(CC) $(CFLAGS) -o $@ src/export/map_coords.c src/export/mystdlib.c 25 $(CC) $(CFLAGS) -o $@ src/export/map_coords.c src/export/mystdlib.c
26 26
27bin/map_branches: src/export/map_branches.c 27bin/map_branches_v4: src/export/map_branches_v4.c
28 $(CC) $(CFLAGS) -o $@ src/export/map_branches.c 28 $(CC) $(CFLAGS) -o $@ src/export/map_branches_v4.c
29
30bin/map_branches_v3: src/export/map_branches_v3.c
31 $(CC) $(CFLAGS) -o $@ src/export/map_branches_v3.c
29 32
30bin/convert_coords: src/export/convert_coords.c 33bin/convert_coords: src/export/convert_coords.c
31 $(CC) $(CFLAGS) -o $@ src/export/convert_coords.c -lm 34 $(CC) $(CFLAGS) -o $@ src/export/convert_coords.c -lm
diff --git a/makecolumns.sh b/makecolumns.sh
index edd965c..4f4bebc 100755
--- a/makecolumns.sh
+++ b/makecolumns.sh
@@ -171,9 +171,21 @@ handle_format_version_2() {
171} 171}
172 172
173handle_format_version_3() { 173handle_format_version_3() {
174 echo "Working on $1. Detected pre-2004 Telefonbuch version." 174 # glob
175 teiln=`printf "%s" "$1"/[Dd][Aa][Tt]/[Tt][Ee][Ii][Ll][Nn].[Dd][Aa][Tt]`
176 braid=`printf "%s" "$1"/[Dd][Aa][Tt]/[Bb][Rr][Aa][Ii][Dd].[Dd][Aa][Tt]`
177 streets=`printf "%s" "$1"/[Dd][Aa][Tt]/[Ss][Tt][Rr][Aa][Ss][Ss][Ee][Nn].[Dd][Aa][Tt]`
178 karto=`printf "%s" "$1"/[Dd][Aa][Tt]/[Kk][Aa][Rr][Tt][Oo].[Dd][Aa][Tt]`
179
180 if [ -f "${braid}" ]; then
181 echo "Working on $1. Detected pre-2004 Yellow Pages version."
182 is_yp=true
183 else
184 echo "Working on $1. Detected pre-2004 Telefonbuch version."
185 unset is_yp
186 fi
175 # Extract teiln.dat 187 # Extract teiln.dat
176 do_decompress_version_3 "$1"/[Dd][Aa][Tt]/[Tt][Ee][Ii][Ll][Nn].[Dd][Aa][Tt] "teiln.dat" 188 do_decompress_version_3 "${teiln}" "teiln.dat"
177 189
178 # See how long each filename is 190 # See how long each filename is
179 export filename_len=$(( `ls | head -n 1 | wc -c` - 1 )) 191 export filename_len=$(( `ls | head -n 1 | wc -c` - 1 ))
@@ -205,8 +217,14 @@ handle_format_version_3() {
205# set -- `hexdump -n 8 -v -e '" " 1/4 "%u"' ${file}` 217# set -- `hexdump -n 8 -v -e '" " 1/4 "%u"' ${file}`
206# tail -c +$(( $2 + 1 )) ${file} 218# tail -c +$(( $2 + 1 )) ${file}
207# done | tr '\n\0' '\t\n' > 01_02_Flags_Nachname 219# done | tr '\n\0' '\t\n' > 01_02_Flags_Nachname
208 cut -c 1 < 01_unknown > 01_Flags 220 if [ "${is_yp}" ]; then
209 cut -c 2- < 01_unknown > 02_Nachname 221 cut -c 1 < 01_unknown > 01_Flags
222 cut -c 2-7 < 01_unknown > 09_Branchenindex
223 cut -c 8- < 01_unknown > 02_Nachname
224 else
225 cut -c 1 < 01_unknown > 01_Flags
226 cut -c 2- < 01_unknown > 02_Nachname
227 fi
210 rm 01_unknown 228 rm 01_unknown
211 printf "done.\n" 229 printf "done.\n"
212 230
@@ -259,7 +277,6 @@ handle_format_version_3() {
259 277
260 # If street names come in an extra file, extract 278 # If street names come in an extra file, extract
261 # street names first 279 # street names first
262 streets="$1"/[Dd][Aa][Tt]/[Ss][Tt][Rr][Aa][Ss][Ss][Ee][Nn].[Dd][Aa][Tt]
263 [ -f "${streets}" ] && do_processfile_version_3 "${streets}" "street name" 99_Strassenname convert_zeros 280 [ -f "${streets}" ] && do_processfile_version_3 "${streets}" "street name" 99_Strassenname convert_zeros
264 281
265 # extract street names if 07_unknown contains street indexes 282 # extract street names if 07_unknown contains street indexes
@@ -280,7 +297,6 @@ handle_format_version_3() {
280 tidy_streetnames 07_Strasse 297 tidy_streetnames 07_Strasse
281 fi 298 fi
282 299
283 karto="$1"/[Dd][Aa][Tt]/[Kk][Aa][Rr][Tt][Oo].[Dd][Aa][Tt]
284 if [ -f "${karto}" ]; then 300 if [ -f "${karto}" ]; then
285 do_processfile_version_3 "${karto}" "geo coordinates" 90_Geokoordinaten_hnr_raw 301 do_processfile_version_3 "${karto}" "geo coordinates" 90_Geokoordinaten_hnr_raw
286 302
@@ -290,6 +306,14 @@ handle_format_version_3() {
290 paste 10_Postleitzahl 11_Ort 07_Strasse 08_Hausnummer | map_coords 90_Geokoordinaten_hnr | convert_coords > 16_Koordinaten 306 paste 10_Postleitzahl 11_Ort 07_Strasse 08_Hausnummer | map_coords 90_Geokoordinaten_hnr | convert_coords > 16_Koordinaten
291 printf "done.\n" 307 printf "done.\n"
292 fi 308 fi
309
310 if [ -f "${braid}" ]; then
311 do_processfile_version_3 "${braid}" "branchen name index" 97_Branchenname convert_zeros
312
313 printf "Looking up branch names from codes ... "
314 map_branches_v3 97_Branchenname < 09_Branchenindex > 09_Branchen
315 printf "done.\n"
316 fi
293} 317}
294 318
295handle_format_version_4() { 319handle_format_version_4() {
@@ -387,17 +411,9 @@ handle_format_version_4() {
387 rm file_* 411 rm file_*
388 printf "done.\n" 412 printf "done.\n"
389 413
390 printf "Generating branch name index ... "
391 mkdir branchcodes/
392 while read index name; do
393 printf $name > branchcodes/${index}
394 done < 97_Branchenname
395 printf "done.\n"
396
397 printf "Looking up branch names from codes ... " 414 printf "Looking up branch names from codes ... "
398 map_branches 97_Branchenname < 09_Verweise > 09_Branchen 415 map_branches 97_Branchenname < 09_Verweise > 09_Branchen
399 printf "done.\n" 416 printf "done.\n"
400 rm -r branchcodes
401 fi 417 fi
402} 418}
403 419
diff --git a/src/export/map_branches_v3.c b/src/export/map_branches_v3.c
new file mode 100644
index 0000000..22d0036
--- /dev/null
+++ b/src/export/map_branches_v3.c
@@ -0,0 +1,79 @@
1#define _WITH_GETLINE
2#define _GNU_SOURCE
3#include <stdlib.h>
4#include <stdint.h>
5#include <stdio.h>
6#include <string.h>
7#include <ctype.h>
8
/* One entry of the branch lookup table: a numeric code and its name. */
typedef struct {
  long  code;   /* numeric branch code */
  char *name;   /* branch name belonging to that code */
} branchen_code;

/* Capacity of the statically allocated lookup table. */
enum { MAX_CODES = 128 * 1024 };
branchen_code g_codes[MAX_CODES];  /* lookup table, filled and sorted by main() */
long g_code_count;                 /* number of entries in use in g_codes */

/*
 * bsearch() comparator.
 *
 * key is NOT a pointer to an object: the caller passes the numeric code
 * itself, converted to a pointer value. bc points to a branchen_code.
 * Returns <0, 0 or >0 if the wanted code is less than, equal to or
 * greater than bc's code.
 *
 * A three-way comparison is used instead of a subtraction, because the
 * difference of two longs does not fit into an int in general.
 */
static int find_code( const void *key, const void *bc )
{
  long wanted = (long)(intptr_t)key;
  long have   = ((const branchen_code*)bc)->code;

  return ( wanted > have ) - ( wanted < have );
}

/*
 * qsort() comparator ordering branchen_code entries by ascending code.
 * Returns <0, 0 or >0; uses comparisons rather than a subtraction so that
 * large code differences can neither overflow nor be truncated.
 */
static int qsort_cmp( const void *a, const void *b )
{
  long lhs = ((const branchen_code*)a)->code;
  long rhs = ((const branchen_code*)b)->code;

  return ( lhs > rhs ) - ( lhs < rhs );
}
27
28int main( int argc, char ** args )
29{
30 FILE * map_file;
31 char *end_p, *input = malloc(1024);
32 size_t input_length = 1024;
33 ssize_t ll;
34
35 if( argc != 2 ) { fprintf( stderr, "Syntax: %s <branchcodes> < <branches_files>\n", args[0] ); exit(111); }
36
37 map_file = fopen( args[1], "r" );
38 if (!map_file || !input) { fprintf( stderr, "Error allocating resources\n" ); exit( 111 ); }
39
40 /* Fill array with maps */
41 while ( (ll = getline( &input, &input_length, map_file ) ) >= 0 ) {
42 char * r = strchr(input, 10);
43 if (r) *r = 0;
44 g_codes[g_code_count].code = strtoul(input, &end_p, 10);
45
46 if (input == end_p) break;
47 if (*end_p != ';') { fprintf( stderr, "Input error, line: %s\n", input); exit(1); }
48
49 r = strchr(end_p + 1, ';');
50 if (!r) { fprintf( stderr, "Input error, line: %s\n", input); exit(1); }
51 *r = 0;
52
53 asprintf(&g_codes[g_code_count].name, "%s", end_p + 1) ;
54 // printf( "%ld: %s\n", g_codes[g_code_count].code, g_codes[g_code_count].name);
55 g_code_count++;
56 }
57
58 qsort(g_codes, g_code_count, sizeof(branchen_code), qsort_cmp );
59
60 /* Now scan lines from 09_Verweise for semicolon separated branchen codes */
61 while ( (ll = getline( &input, &input_length, stdin ) ) >= 0 ) {
62 char *codes = input;
63 branchen_code *bc;
64 int multiple;
65 for (multiple = 0;; ++multiple) {
66 long code = strtoul(codes, &end_p, 10);
67 if (codes == end_p) break;
68 bc = (branchen_code*)bsearch((void *)(uintptr_t)code, g_codes, g_code_count, sizeof(branchen_code), find_code);
69 if (bc) {
70 if (multiple) putchar(';');
71 printf("%s", bc->name);
72 }
73 if (*end_p != ';') break;
74 codes = end_p + 1;
75 }
76 putchar(10);
77 }
78 return 0;
79}
diff --git a/src/export/map_branches.c b/src/export/map_branches_v4.c
index 160945d..160945d 100644
--- a/src/export/map_branches.c
+++ b/src/export/map_branches_v4.c