From a187241f4e4cf8a592e0a3cc0b61f949e6184a9e Mon Sep 17 00:00:00 2001
From: Dirk Engling
Date: Wed, 30 Jan 2019 18:12:18 +0100
Subject: Add branch name mapper code for v3

---
# Reviewer notes. Text between the "---" line and the first "diff --git" is
# commentary only; it is not part of any hunk.
#
# * makecolumns.sh, handle_format_version_4(): the unchanged context line still
#   runs `map_branches`, while the Makefile hunk below replaces bin/map_branches
#   with bin/map_branches_v3 and bin/map_branches_v4. Presumably the call should
#   become `map_branches_v4` -- TODO confirm nothing else provides the old name.
# * map_branches_v3.c and map_branches_v4.c: the table-loading loop stores into
#   g_codes[g_code_count] without comparing g_code_count against MAX_CODES.
# * find_code() and qsort_cmp() return the difference of two longs through an
#   int return type, so a large difference can be truncated.
# * The return value of asprintf() is not checked in either mapper.
# * map_branches_v3.c: a map line that does not start with a number ends the
#   loading loop via `break`; later lines of the map file are then not read.
# * Every "#include" line and the "Syntax:" usage string appear to be missing
#   their <...> parts in this copy (presumably stripped during extraction) --
#   TODO restore them from the repository before applying.
# * Line breaks, indentation and blank context lines in this copy were
#   re-derived from the hunk line counts and may differ from the original
#   whitespace.
 Makefile                     |  9 +++--
 makecolumns.sh               | 44 ++++++++++++++++--------
 src/export/map_branches.c    | 71 ---------------------------------------
 src/export/map_branches_v3.c | 79 ++++++++++++++++++++++++++++++++++++++++++++
 src/export/map_branches_v4.c | 71 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 186 insertions(+), 88 deletions(-)
 delete mode 100644 src/export/map_branches.c
 create mode 100644 src/export/map_branches_v3.c
 create mode 100644 src/export/map_branches_v4.c

diff --git a/Makefile b/Makefile
index 089ae06..691675e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-BINARIES=bin/extract_version_1 bin/extract_version_2 bin/extract_version_3 bin/extract_version_4 bin/split_version_2 bin/split_version_3 bin/map_coords bin/map_branches bin/convert_coords bin/join
+BINARIES=bin/extract_version_1 bin/extract_version_2 bin/extract_version_3 bin/extract_version_4 bin/split_version_2 bin/split_version_3 bin/map_coords bin/map_branches_v3 bin/map_branches_v4 bin/convert_coords bin/join
 CFLAGS += -W -Wall -Wextra -O3 # -Weverything -Wno-cast-align -Wno-padded
 
 all: $(BINARIES)
@@ -24,8 +24,11 @@ bin/split_version_2: src/export/split_version_2.c src/export/mystdlib.c
 bin/map_coords: src/export/map_coords.c src/export/mystdlib.c
 	$(CC) $(CFLAGS) -o $@ src/export/map_coords.c src/export/mystdlib.c
 
-bin/map_branches: src/export/map_branches.c
-	$(CC) $(CFLAGS) -o $@ src/export/map_branches.c
+bin/map_branches_v4: src/export/map_branches_v4.c
+	$(CC) $(CFLAGS) -o $@ src/export/map_branches_v4.c
+
+bin/map_branches_v3: src/export/map_branches_v3.c
+	$(CC) $(CFLAGS) -o $@ src/export/map_branches_v3.c
 
 bin/convert_coords: src/export/convert_coords.c
 	$(CC) $(CFLAGS) -o $@ src/export/convert_coords.c -lm
diff --git a/makecolumns.sh b/makecolumns.sh
index edd965c..4f4bebc 100755
--- a/makecolumns.sh
+++ b/makecolumns.sh
@@ -171,9 +171,21 @@ handle_format_version_2() {
 }
 
 handle_format_version_3() {
-    echo "Working on $1. Detected pre-2004 Telefonbuch version."
+    # glob
+    teiln=`printf "%s" "$1"/[Dd][Aa][Tt]/[Tt][Ee][Ii][Ll][Nn].[Dd][Aa][Tt]`
+    braid=`printf "%s" "$1"/[Dd][Aa][Tt]/[Bb][Rr][Aa][Ii][Dd].[Dd][Aa][Tt]`
+    streets=`printf "%s" "$1"/[Dd][Aa][Tt]/[Ss][Tt][Rr][Aa][Ss][Ss][Ee][Nn].[Dd][Aa][Tt]`
+    karto=`printf "%s" "$1"/[Dd][Aa][Tt]/[Kk][Aa][Rr][Tt][Oo].[Dd][Aa][Tt]`
+
+    if [ -f "${braid}" ]; then
+        echo "Working on $1. Detected pre-2004 Yellow Pages version."
+        is_yp=true
+    else
+        echo "Working on $1. Detected pre-2004 Telefonbuch version."
+        unset is_yp
+    fi
     # Extract teiln.dat
-    do_decompress_version_3 "$1"/[Dd][Aa][Tt]/[Tt][Ee][Ii][Ll][Nn].[Dd][Aa][Tt] "teiln.dat"
+    do_decompress_version_3 "${teiln}" "teiln.dat"
 
     # See how long each filename is
     export filename_len=$(( `ls | head -n 1 | wc -c` - 1 ))
@@ -205,8 +217,14 @@ handle_format_version_3() {
     # set -- `hexdump -n 8 -v -e '" " 1/4 "%u"' ${file}`
     # tail -c +$(( $2 + 1 )) ${file}
     # done | tr '\n\0' '\t\n' > 01_02_Flags_Nachname
-    cut -c 1 < 01_unknown > 01_Flags
-    cut -c 2- < 01_unknown > 02_Nachname
+    if [ "${is_yp}" ]; then
+        cut -c 1 < 01_unknown > 01_Flags
+        cut -c 2-7 < 01_unknown > 09_Branchenindex
+        cut -c 8- < 01_unknown > 02_Nachname
+    else
+        cut -c 1 < 01_unknown > 01_Flags
+        cut -c 2- < 01_unknown > 02_Nachname
+    fi
     rm 01_unknown
     printf "done.\n"
 
@@ -259,7 +277,6 @@ handle_format_version_3() {
 
     # If street names come in an extra file, extract
     # street names first
-    streets="$1"/[Dd][Aa][Tt]/[Ss][Tt][Rr][Aa][Ss][Ss][Ee][Nn].[Dd][Aa][Tt]
     [ -f "${streets}" ] && do_processfile_version_3 "${streets}" "street name" 99_Strassenname convert_zeros
 
     # extract street names if 07_unknown contains street indexes
@@ -280,7 +297,6 @@ handle_format_version_3() {
         tidy_streetnames 07_Strasse
     fi
 
-    karto="$1"/[Dd][Aa][Tt]/[Kk][Aa][Rr][Tt][Oo].[Dd][Aa][Tt]
     if [ -f "${karto}" ]; then
         do_processfile_version_3 "${karto}" "geo coordinates" 90_Geokoordinaten_hnr_raw
 
@@ -290,6 +306,14 @@ handle_format_version_3() {
         paste 10_Postleitzahl 11_Ort 07_Strasse 08_Hausnummer | map_coords 90_Geokoordinaten_hnr | convert_coords > 16_Koordinaten
         printf "done.\n"
     fi
+
+    if [ -f "${braid}" ]; then
+        do_processfile_version_3 "${braid}" "branchen name index" 97_Branchenname convert_zeros
+
+        printf "Looking up branch names from codes ... "
+        map_branches_v3 97_Branchenname < 09_Branchenindex > 09_Branchen
+        printf "done.\n"
+    fi
 }
 
 handle_format_version_4() {
@@ -387,17 +411,9 @@ handle_format_version_4() {
         rm file_*
         printf "done.\n"
 
-        printf "Generating branch name index ... "
-        mkdir branchcodes/
-        while read index name; do
-            printf $name > branchcodes/${index}
-        done < 97_Branchenname
-        printf "done.\n"
-
         printf "Looking up branch names from codes ... "
         map_branches 97_Branchenname < 09_Verweise > 09_Branchen
         printf "done.\n"
-        rm -r branchcodes
     fi
 }
 
diff --git a/src/export/map_branches.c b/src/export/map_branches.c
deleted file mode 100644
index 160945d..0000000
--- a/src/export/map_branches.c
+++ /dev/null
@@ -1,71 +0,0 @@
-#define _WITH_GETLINE
-#define _GNU_SOURCE
-#include
-#include
-#include
-#include
-#include
-
-typedef struct {
-  long code;
-  char *name;
-} branchen_code;
-
-enum { MAX_CODES = 128 * 1024 };
-branchen_code g_codes[MAX_CODES];
-long g_code_count;
-
-static int find_code( const void *key, const void *bc)
-{
-  return (long)key - ((branchen_code*)bc)->code;
-}
-
-static int qsort_cmp( const void *a, const void *b )
-{
-  return ((branchen_code*)a)->code - ((branchen_code*)b)->code;
-}
-
-int main( int argc, char ** args )
-{
-  FILE * map_file;
-  char *end_p, *input = malloc(1024);
-  size_t input_length = 1024;
-  ssize_t ll;
-
-  if( argc != 2 ) { fprintf( stderr, "Syntax: %s < \n", args[0] ); exit(111); }
-
-  map_file = fopen( args[1], "r" );
-  if (!map_file || !input) { fprintf( stderr, "Error allocating resources\n" ); exit( 111 ); }
-
-  /* Fill array with maps */
-  while ( (ll = getline( &input, &input_length, map_file ) ) >= 0 ) {
-    char * r = strchr(input, 10);
-    if (r) *r = 0;
-    g_codes[g_code_count].code = strtoul(input, &end_p, 10);
-    asprintf(&g_codes[g_code_count].name, "%s", end_p + 1) ;
-    // printf( "%ld: %s\n", g_codes[g_code_count].code, g_codes[g_code_count].name);
-    g_code_count++;
-  }
-
-  qsort(g_codes, g_code_count, sizeof(branchen_code), qsort_cmp );
-
-  /* Now scan lines from 09_Verweise for semicolon separated branchen codes */
-  while ( (ll = getline( &input, &input_length, stdin ) ) >= 0 ) {
-    char *codes = input;
-    branchen_code *bc;
-    int multiple;
-    for (multiple = 0;; ++multiple) {
-      long code = strtoul(codes, &end_p, 10);
-      if (codes == end_p) break;
-      bc = (branchen_code*)bsearch((void *)(uintptr_t)code, g_codes, g_code_count, sizeof(branchen_code), find_code);
-      if (bc) {
-        if (multiple) putchar(';');
-        printf("%s", bc->name);
-      }
-      if (*end_p != ';') break;
-      codes = end_p + 1;
-    }
-    putchar(10);
-  }
-  return 0;
-}
diff --git a/src/export/map_branches_v3.c b/src/export/map_branches_v3.c
new file mode 100644
index 0000000..22d0036
--- /dev/null
+++ b/src/export/map_branches_v3.c
@@ -0,0 +1,79 @@
+#define _WITH_GETLINE
+#define _GNU_SOURCE
+#include
+#include
+#include
+#include
+#include
+
+typedef struct {
+  long code;
+  char *name;
+} branchen_code;
+
+enum { MAX_CODES = 128 * 1024 };
+branchen_code g_codes[MAX_CODES];
+long g_code_count;
+
+static int find_code( const void *key, const void *bc)
+{
+  return (long)key - ((branchen_code*)bc)->code;
+}
+
+static int qsort_cmp( const void *a, const void *b )
+{
+  return ((branchen_code*)a)->code - ((branchen_code*)b)->code;
+}
+
+int main( int argc, char ** args )
+{
+  FILE * map_file;
+  char *end_p, *input = malloc(1024);
+  size_t input_length = 1024;
+  ssize_t ll;
+
+  if( argc != 2 ) { fprintf( stderr, "Syntax: %s < \n", args[0] ); exit(111); }
+
+  map_file = fopen( args[1], "r" );
+  if (!map_file || !input) { fprintf( stderr, "Error allocating resources\n" ); exit( 111 ); }
+
+  /* Fill array with maps */
+  while ( (ll = getline( &input, &input_length, map_file ) ) >= 0 ) {
+    char * r = strchr(input, 10);
+    if (r) *r = 0;
+    g_codes[g_code_count].code = strtoul(input, &end_p, 10);
+
+    if (input == end_p) break;
+    if (*end_p != ';') { fprintf( stderr, "Input error, line: %s\n", input); exit(1); }
+
+    r = strchr(end_p + 1, ';');
+    if (!r) { fprintf( stderr, "Input error, line: %s\n", input); exit(1); }
+    *r = 0;
+
+    asprintf(&g_codes[g_code_count].name, "%s", end_p + 1) ;
+    // printf( "%ld: %s\n", g_codes[g_code_count].code, g_codes[g_code_count].name);
+    g_code_count++;
+  }
+
+  qsort(g_codes, g_code_count, sizeof(branchen_code), qsort_cmp );
+
+  /* Now scan lines from 09_Verweise for semicolon separated branchen codes */
+  while ( (ll = getline( &input, &input_length, stdin ) ) >= 0 ) {
+    char *codes = input;
+    branchen_code *bc;
+    int multiple;
+    for (multiple = 0;; ++multiple) {
+      long code = strtoul(codes, &end_p, 10);
+      if (codes == end_p) break;
+      bc = (branchen_code*)bsearch((void *)(uintptr_t)code, g_codes, g_code_count, sizeof(branchen_code), find_code);
+      if (bc) {
+        if (multiple) putchar(';');
+        printf("%s", bc->name);
+      }
+      if (*end_p != ';') break;
+      codes = end_p + 1;
+    }
+    putchar(10);
+  }
+  return 0;
+}
diff --git a/src/export/map_branches_v4.c b/src/export/map_branches_v4.c
new file mode 100644
index 0000000..160945d
--- /dev/null
+++ b/src/export/map_branches_v4.c
@@ -0,0 +1,71 @@
+#define _WITH_GETLINE
+#define _GNU_SOURCE
+#include
+#include
+#include
+#include
+#include
+
+typedef struct {
+  long code;
+  char *name;
+} branchen_code;
+
+enum { MAX_CODES = 128 * 1024 };
+branchen_code g_codes[MAX_CODES];
+long g_code_count;
+
+static int find_code( const void *key, const void *bc)
+{
+  return (long)key - ((branchen_code*)bc)->code;
+}
+
+static int qsort_cmp( const void *a, const void *b )
+{
+  return ((branchen_code*)a)->code - ((branchen_code*)b)->code;
+}
+
+int main( int argc, char ** args )
+{
+  FILE * map_file;
+  char *end_p, *input = malloc(1024);
+  size_t input_length = 1024;
+  ssize_t ll;
+
+  if( argc != 2 ) { fprintf( stderr, "Syntax: %s < \n", args[0] ); exit(111); }
+
+  map_file = fopen( args[1], "r" );
+  if (!map_file || !input) { fprintf( stderr, "Error allocating resources\n" ); exit( 111 ); }
+
+  /* Fill array with maps */
+  while ( (ll = getline( &input, &input_length, map_file ) ) >= 0 ) {
+    char * r = strchr(input, 10);
+    if (r) *r = 0;
+    g_codes[g_code_count].code = strtoul(input, &end_p, 10);
+    asprintf(&g_codes[g_code_count].name, "%s", end_p + 1) ;
+    // printf( "%ld: %s\n", g_codes[g_code_count].code, g_codes[g_code_count].name);
+    g_code_count++;
+  }
+
+  qsort(g_codes, g_code_count, sizeof(branchen_code), qsort_cmp );
+
+  /* Now scan lines from 09_Verweise for semicolon separated branchen codes */
+  while ( (ll = getline( &input, &input_length, stdin ) ) >= 0 ) {
+    char *codes = input;
+    branchen_code *bc;
+    int multiple;
+    for (multiple = 0;; ++multiple) {
+      long code = strtoul(codes, &end_p, 10);
+      if (codes == end_p) break;
+      bc = (branchen_code*)bsearch((void *)(uintptr_t)code, g_codes, g_code_count, sizeof(branchen_code), find_code);
+      if (bc) {
+        if (multiple) putchar(';');
+        printf("%s", bc->name);
+      }
+      if (*end_p != ';') break;
+      codes = end_p + 1;
+    }
+    putchar(10);
+  }
+  return 0;
+}
-- 
cgit v1.2.3