From 4c3a31b1b03e72e65e080bfcb017ceb9619847a4 Mon Sep 17 00:00:00 2001
From: Dirk Engling
Date: Tue, 2 Jun 2015 19:44:12 +0200
Subject: Cleanup known broken input data, build join.c

---
 makecolumns.sh | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

(limited to 'makecolumns.sh')

diff --git a/makecolumns.sh b/makecolumns.sh
index 8131379..0854b32 100755
--- a/makecolumns.sh
+++ b/makecolumns.sh
@@ -112,6 +112,7 @@ handle_format_version_1() {
     lam 04_Namenszusatz 05_Adresszusatz | tr '\t' ' ' | sed -E s/' +'/' '/g > 04_Zusaetze
     printf "done.\n"
 
+    tidy_streetnames 07_Strasse
 }
 
 handle_format_version_2() {
@@ -158,6 +159,7 @@ handle_format_version_2() {
     lam 04_Namenszusatz 05_Adresszusatz | tr '\t' ' ' | sed -E s/' +'/' '/g > 04_Zusaetze
     printf "done.\n"
 
+    tidy_streetnames 07_Strasse
 }
 
 handle_format_version_3() {
@@ -243,7 +245,7 @@ handle_format_version_3() {
     printf "done.\n"
 
     printf "Normalizing zusaetze ... "
-    lam 04_Namenszusatz 05_Adresszusatz | tr '\t' ' ' | sed -E s/' +'/' '/g > 04_Zusaetze
+    lam 04_Namenszusatz 05_Adresszusatz | tr '\t' ' ' | sed -E -e 's/ +/ /g' -e 's/^ +//g' -e 's/ +$//g' > 04_Zusaetze
     printf "done.\n"
 
     # If street names come in an extra file, extract
@@ -260,10 +262,13 @@ handle_format_version_3() {
         # fix up known broken Strassennamen file
         [ `stat -f %z ${streets}` -eq 1642716 ] && printf '9. Str.\n91. Str.\n91er-Str.\n' >> 99_Strassenname
 
+        tidy_streetnames 99_Strassenname
+
         cut -d ';' -f 1 07_Strassenindex | ${EL} -0x 99_Strassenname > 07_Strasse
         printf "done.\n"
     else
         mv 07_unknown 07_Strasse
+        tidy_streetnames 07_Strasse
     fi
 
     karto=$1/[Dd][Aa][Tt]/[Kk][Aa][Rr][Tt][Oo].[Dd][Aa][Tt]
@@ -306,7 +311,13 @@ handle_format_version_4() {
     find . -name file_\* -delete
     printf "done.\n"
 
-    mv column_0 01_Flags
+    # the 'did not object to inverse search' flag is insane and needs to be reversed
+    if grep -q ^40 column_0; then
+        awk '{ a=substr($0,1,1); printf "%x%x\n",index("5670123cdef89ab4",a)%16 ,substr($0,2,1) }' < column_0 > 01_Flags
+        rm column_0
+    else
+        mv column_0 01_Flags
+    fi
     mv column_1 02_Nachname
     mv column_2 03_Vorname
     mv column_3 04_05_Namenszusatz_Addresszusatz
@@ -318,6 +329,8 @@ handle_format_version_4() {
     mv column_9 13_Rufnummer
     mv column_10 14_15_Email_Webadresse
 
+    tidy_streetnames 99_Strassenname
+
     printf "Looking up street names from indexes ... "
     cut -f 1 07_08_Strassenindex_Hausnummer | ${EL} -0 99_Strassenname > 07_Strasse
     printf "done.\n"
@@ -348,6 +361,15 @@ handle_format_version_4() {
         printf "done.\n"
     fi
     rm file_*
+
+}
+
+tidy_streetnames () {
+    streets=$1
+
+    # Replace any dots at end of line by a single one
+    # finish any str abbreviation without a period with a period
+    sed -Ei '' 's/\.+/./g;s/(S|s)tr( |:)?$/\1tr./' ${streets}
 }
 
 # After function definitions, main() can use them
-- 
cgit v1.2.3