#!/bin/sh
#
# Export the columns of a German "Telefonbuch" / Yellow Pages CD dump into
# one text file per column below work/<phonebook name>/.
#
# Usage: <this script> <phonebookdirectory>
#
# Must be started from the directory that holds the Makefile and bin/,
# because it runs "make all" and appends `pwd`/bin/ to PATH to find the
# extract_version_*, split_version_*, map_* and convert_coords helpers.
#
# NOTE: the script uses $'...' quoting in several sed/cut invocations, so
# /bin/sh has to be a shell that understands it (bash, ksh, FreeBSD sh).

export LANG=C
export LC_CTYPE=C
export LC_ALL=C
export PATH=${PATH}:`pwd`/bin/

# BSD find needs -E to enable extended regular expressions for -regex.
# FIND_E is expanded unquoted on purpose so that it vanishes when unset.
unset FIND_E
case `uname -s` in
  *BSD|Darwin) FIND_E="-E " ;;
esac

# Entry point.
#   $1  directory containing the phonebook dump (absolute or relative)
# Detects the on-disk format and dispatches to the matching handler. All
# output is written to work/<basename of $1 without a leading "white_">.
main() {
  # The handlers run inside the work directory, so every path they need
  # has to be absolute before we change into it.
  script_dir=$(CDPATH= cd -- "$(dirname -- "$0")" && pwd)

  [ -f /usr/local/bin/el ] && EL=/usr/local/bin/el
  [ -f "${script_dir}/../bin/el" ] && EL="${script_dir}/../bin/el"

  if [ -z "${EL}" ]; then
    echo "el not found. Get it at 'git clone git://erdgeist.org/el'"
    exit 1
  fi

  # An empty argument would make the cleanup below remove all of work/.
  if [ $# -ne 1 ] || [ -z "$1" ]; then
    echo "Syntax: $0 [phonebookdirectory]"
    exit 1
  fi

  case "$1" in
    /*) source_dir=$1 ;;
    *)  source_dir="$(pwd)/$1" ;;
  esac
  work_dir="work/$(basename "${1#white_}")"

  # Compile all the binaries
  make all

  printf "Cleaning up old working directory ... "
  rm -rf "${work_dir}"
  printf "done.\n"

  mkdir -p "${work_dir}"
  cd "${work_dir}" || exit 1

  if [ -f "${source_dir}/phonebook.db" ]; then
    handle_format_version_4 "${source_dir}"
  elif [ -f "${source_dir}"/[Dd][Aa][Tt]/[Tt][Ee][Ii][Ll][Nn].[Dd][Aa][Tt] ]; then
    handle_format_version_3 "${source_dir}"
  elif [ -n "`find "${source_dir}" -iname 'atb?dd00' -ls -quit`" ]; then
    handle_format_version_2 "${source_dir}"
  elif [ -n "`find "${source_dir}" -name 'dpr00000.005' -ls -quit`" ]; then
    handle_format_version_1 "${source_dir}"
  else
    echo "Not a recognized Telefonbuch folder"
  fi
  cd ../..
}

# Extract the LHA chunks of a version 3 container and unpack them into the
# current directory, printing progress in 5% steps.
#   $1  container file (absolute path)
#   $2  human readable name used in the progress messages
do_decompress_version_3() {
  printf "Extracting %s chunks ... " "$2"
  # Without this guard a failed mkdir/cd would make the "cd .." below
  # climb out of the working directory.
  if ! mkdir LHA || ! cd LHA; then
    echo "Cannot create temporary directory LHA" >&2
    exit 1
  fi
  extract_version_3 "${1}"
  cd ..
  printf "done.\n"

  printf "Decompressing %s chunks ... " "$2"
  # Counts the LHA directory itself as well, so this is archives + 1
  numfiles=`find LHA | wc -l`
  reported=0; processed=0
  for archive in LHA/*.lha; do
    # An unmatched glob is handed over literally; skip it
    [ -e "${archive}" ] || continue
    lha x "${archive}" > /dev/null
    processed=$(( processed + 1 ))
    # Progress is tracked in 20 steps of 5% each, at most one per archive
    if [ $(( processed * 20 / numfiles )) -gt "${reported}" ]; then
      reported=$(( reported + 1 ))
      printf "%d%% " $(( reported * 5 ))
    fi
  done
  rm -r LHA
  # The loop stops short of the 20th step, so finish the progress display
  if [ "${reported}" -lt 20 ]; then
    printf "100%% "
  fi
  printf "done.\n"
}

# Decompress a version 3 container and concatenate all of its chunks.
#   $1  container file (absolute path)
#   $2  human readable name used in the progress messages
#   $3  output file
#   $4  optional "convert_zeros": turn NUL separated records into lines
#       (existing newlines become tabs)
do_processfile_version_3() {
  working_on=$(basename "${1}")
  if ! mkdir "${working_on}" || ! cd "${working_on}"; then
    echo "Cannot create temporary directory ${working_on}" >&2
    exit 1
  fi
  do_decompress_version_3 "${1}" "${2}"
  cd ..

  printf "Combining %s into single file ... " "$2"
  if [ "${4}" = "convert_zeros" ]; then
    cat "${working_on}"/* | tr '\n\0' '\t\n' > "$3"
  else
    cat "${working_on}"/* > "$3"
  fi
  printf "done.\n"

  rm -rf "${working_on}"
}

# Print the size in bytes of a decompressed chunk.
#   $1  chunk number; zero padded to ${filename_len} digits to form the name
size() {
  case `uname -s` in
    *BSD|Darwin) stat -f %z "$(printf "%0${filename_len}d" "$1")" ;;
    *)           stat -c %s "$(printf "%0${filename_len}d" "$1")" ;;
  esac
}

# Print the first 32 bit word of a decompressed chunk as unsigned decimal
# (with a leading blank, as produced by the hexdump format).
#   $1  chunk number; zero padded to ${filename_len} digits to form the name
get_dword() {
  hexdump -n 4 -v -e '" " 1/4 "%u"' "$(printf "%0${filename_len}d" "${1}")"
}

# Export a 1992 Telefonbuch (dpr*.001 databases).
#   $1  phonebook directory (absolute path)
handle_format_version_1() {
  echo "Working on $1. Detected 1992 Telefonbuch version."
  # Extract all dpr database files
  printf "Extracting dpr databases ... "
  find "$1" -name 'dpr*.001' | extract_version_1
  printf "done.\n"

  # rename our extracted columns
  mv 01_unknown 01_Flags
  mv 02_unknown 02_Nachname
  mv 03_unknown 03_Vorname
  mv 05_unknown 06_Ortszusatz
  mv 06_unknown 10_Zustellamt_PLZOst
  mv 07_unknown 07_Strasse
  mv 08_unknown 08_Hausnummer
  mv 10_unknown 09_Verweise
  mv 11_unknown 12_Vorwahl
  mv 12_unknown 13_Rufnummer
  mv 13_unknown 11_Ort
  mv 14_unknown 10_Postleitzahl_West
  mv 15_unknown 12_Vorwahl_block

  printf "Splitting appartement to zusaetze ... "
  paste 07_Strasse 08_Hausnummer 09_unknown | sed -E $'s:^(.*)\;([0-9]+.*)\t(.*)\t.*$:\\1\t\\2\tWohnung \\3:;s:^(.*)tr(\t.*\t.*)$:\\1tr.\\2:' > tm_unknown
  cut -f 1 tm_unknown > 07_Strasse
  cut -f 2 tm_unknown > 08_Hausnummer
  printf "done.\n"

  printf "Normalizing zusaetze ... "
  cut -f 3 tm_unknown | sed -E -e 's:^, +:u. :' > 04_Namenszusatz
  sed -E -e 's:^, +:u. :' 04_unknown > 05_Adresszusatz
  paste 04_Namenszusatz 05_Adresszusatz | awk '{$1=$1};1' > 04_Zusaetze
  printf "done.\n"

  # For consistency, create files with empty lines
  tr -dC '\n' < 01_Flags > 14_Webadresse
  cp 14_Webadresse 15_Email
  sed $'s:.*:\t:' 01_Flags > 16_Koordinaten

  tidy_columns

  rm ??_unknown
}

# Export a pre 02/1996 Telefonbuch (pkware compressed atb?dd00 databases).
#   $1  phonebook directory (absolute path)
handle_format_version_2() {
  echo "Working on $1. Detected pre 02/1996 Telefonbuch version."
  # Extract all CD's pkware compressed databases
  printf "Extracting 3 pkware databases ...\n"
  disc_no=1
  # NOTE: relies on word splitting, so paths must not contain whitespace
  for database in `find "$1" -iname 'atb?dd00'`; do
    dir=$(dirname "${database}")
    base=$(basename "${database}")
    printf " %d/3 in %4s. Decompressing ..." "${disc_no}" "$(basename "${dir}")"
    extract_version_2 "${database}" > "${base}.dump"
    printf ", extracting ..."
    indexfile=$(find "${dir}" -iname 'atb?di00')
    split_version_2 "${base}.dump" "${indexfile}"
    printf ", cleaning up ... "
    rm "${base}.dump"
    printf "done.\n"
    disc_no=$(( disc_no + 1 ))
  done

  mv 01_unknown 01_Flags
  mv 16_unknown 02_Nachname
  mv 07_unknown 03_Vorname
  mv 12_unknown 06_Ortszusatz
  mv 08_unknown 07_Strasse
  mv 10_unknown 08_Hausnummer
  mv 13_unknown 09_Verweise
  mv 02_unknown 10_Postleitzahl
  mv 15_unknown 11_Ort
  mv 09_unknown 11_Ort_Gemeinde
  mv 05_unknown 12_Vorwahl
  mv 06_unknown 13_Rufnummer

  printf "Normalizing zusaetze ... "
  sed -E -e 's:^, +:u. :' 14_unknown > 04_Namenszusatz
  sed -E -e 's:^, +:u. :' 11_unknown > 05_Adresszusatz
  paste 04_Namenszusatz 05_Adresszusatz | awk '{$1=$1};1' > 04_Zusaetze
  printf "done.\n"

  # For consistency, create files with empty lines
  tr -dC '\n' < 01_Flags > 14_Webadresse
  cp 14_Webadresse 15_Email
  sed $'s:.*:\t:' 01_Flags > 16_Koordinaten

  tidy_columns

  rm ??_unknown
}

# Export a pre-2004 Telefonbuch or Yellow Pages (dat/teiln.dat layout).
#   $1  phonebook directory (absolute path)
handle_format_version_3() {
  # Let the shell glob resolve the actual upper/lower case spelling
  teiln=`printf "%s" "$1"/[Dd][Aa][Tt]/[Tt][Ee][Ii][Ll][Nn].[Dd][Aa][Tt]`
  braid=`printf "%s" "$1"/[Dd][Aa][Tt]/[Bb][Rr][Aa][Ii][Dd].[Dd][Aa][Tt]`
  streets=`printf "%s" "$1"/[Dd][Aa][Tt]/[Ss][Tt][Rr][Aa][Ss][Ss][Ee][Nn].[Dd][Aa][Tt]`
  karto=`printf "%s" "$1"/[Dd][Aa][Tt]/[Kk][Aa][Rr][Tt][Oo].[Dd][Aa][Tt]`

  if [ -f "${braid}" ]; then
    echo "Working on $1. Detected pre-2004 Yellow Pages version."
    is_yp=true
  else
    echo "Working on $1. Detected pre-2004 Telefonbuch version."
    unset is_yp
  fi

  # Extract teiln.dat
  do_decompress_version_3 "${teiln}" "teiln.dat"

  # See how long each filename is
  export filename_len=$(( `ls | head -n 1 | wc -c` - 1 ))

  # Get total amount of files, for reporting progress
  number_of_files=`find ${FIND_E} . -maxdepth 1 -regex '^\./[0123456789]+' | wc -l`

  # from 2000F on file 0+3*n is table, so make it default
  table_file=0; vname_file=2

  # if supposed vname file is larger than table file,
  # we're having a pre-2000F layout, so switch accordingly
  if [ `size ${table_file}` -lt `size ${vname_file}` ]; then
    table_file=2; nname_file=0; vname_file=1
  else
    nname_file=1
  fi

  # Table file has a table header with identical count
  # to nname file's header. Verify this
  # (substitutions stay unquoted: get_dword prints a leading blank)
  if [ `get_dword ${nname_file}` -ne `get_dword ${table_file}` ]; then
    echo "Unknown layout."
    exit 1
  fi

  # Now loop over all files and dump them; split_version_3 reads the list
  # of chunk file names from stdin
  printf "Splitting decompressed nname chunks into their columns ... "
  JOT "%0${filename_len}d" ${nname_file} $(( number_of_files - 1 )) 3 | split_version_3 1 1

  if [ "${is_yp}" ]; then
    cut -c 1 < 01_unknown > 01_Flags
    cut -c 2-7 < 01_unknown > 09_Branchenindex
    cut -c 8- < 01_unknown > 02_Nachname
  else
    cut -c 1 < 01_unknown > 01_Flags
    cut -c 2- < 01_unknown > 02_Nachname
  fi
  printf "done.\n"

  printf "Splitting decompress vname chunks into their columns ... "
  JOT "%0${filename_len}d" ${vname_file} $(( number_of_files - 1 )) 3 | xargs -n 128 cat | tr '\n\0' '\t\n' | tr -d '\377' | awk '{$1=$1};1' > 03_Vorname
  printf "done.\n"

  printf "Splitting decompress table file chunks into their columns ... "
  JOT "%0${filename_len}d" ${table_file} $(( number_of_files - 1 )) 3 | split_version_3 4 0
  printf "done.\n"

  # wipe all temporary extracted files
  printf "Cleaning up decompressed chunks ... "
  find ${FIND_E} . -maxdepth 1 -regex '^\./[0123456789]+' -delete
  printf "done.\n"

  # rename our columns extracted from the table file
  mv 06_unknown 06_Ortszusatz
  mv 08_unknown 08_Hausnummer
  mv 10_unknown 10_Postleitzahl
  mv 11_unknown 11_Ort
  mv 12_unknown 12_Vorwahl
  mv 13_unknown 13_Rufnummer

  # Older releases lack these columns; substitute files of empty lines
  if [ -f 14_unknown ]; then
    tr '\\' '/' < 14_unknown | iconv -f iso-8859-15 -t utf-8 > 15_Email
  else
    tr -dC '\n' < 01_Flags > 15_Email
  fi
  if [ -f 15_unknown ]; then
    tr '\\' '/' < 15_unknown | iconv -f iso-8859-15 -t utf-8 > 14_Webadresse
  else
    tr -dC '\n' < 01_Flags > 14_Webadresse
  fi

  printf "Normalizing flags ... "
  sed -i.bak -e 's:^1$:00:g' -e 's:^3$:01:g' -e 's:^2$:02:g' 01_Flags
  rm 01_Flags.bak
  printf "done.\n"

  printf "Normalizing zusaetze ... "
  sed -E -e 's:^, +:u. :' 04_unknown > 04_Namenszusatz
  sed -E -e 's:^, +:u. :' 05_unknown > 05_Adresszusatz
  paste 04_Namenszusatz 05_Adresszusatz | awk '{$1=$1};1' > 04_Zusaetze
  printf "done.\n"

  printf "Normalizing verweise ... "
  sed -E -e 's:^\|::g;s:\|$::g;s:\|:, :g' 09_unknown | awk '{$1=$1};1' > 09_Verweise
  printf "done.\n"

  # At least 2002_Q3 and 2003_Q1 are known to sport | -separated vname and nname fields
  # those fields are redundant, as they are being made explicit in 09_-column for these entries
  if grep -q '|' 03_Vorname; then
    printf "Treating vname fields with pipe separator ... (adds 3 minutes) ... "
    # Identify entries with | in nname and move content of 09_Verweise to 04_Zusaetze
    paste 03_Vorname 04_Zusaetze 09_Verweise | sed -E $'s:^([^|]*)\|.*\t(.*)\t(.*):\\1\t\\2 \\3\t:;' > 03_04_09_Temp
    cut -f 1 03_04_09_Temp | awk '{$1=$1};1' > 03_Vorname
    cut -f 2 03_04_09_Temp | awk '{$1=$1};1' > 04_Zusaetze
    cut -f 3 03_04_09_Temp | awk '{$1=$1};1' > 09_Verweise
    rm 03_04_09_Temp

    # Delete redundant nachnamen values
    cut -d '|' -f 1 02_Nachname > 02_Nachname.new
    mv 02_Nachname.new 02_Nachname
    printf "done.\n"
  fi

  # If street names come in an extra file, extract
  # street names first
  if [ -f "${streets}" ]; then
    printf "Extracting street name indexes ... "
    do_processfile_version_3 "${streets}" "street name" 99_Strassenname convert_zeros
    printf "done.\n"
  fi

  # extract street names if 07_unknown contains street indexes
  # instead of street names
  if [ -f 99_Strassenname ]; then
    mv 07_unknown 07_Strassenindex
    # fix up known broken Strassennamen file, identified by its size
    # (wc -c instead of stat, whose options differ between BSD and GNU)
    if [ $(( $(wc -c < "${streets}") )) -eq 1642716 ]; then
      printf '9. Str.\n91. Str.\n91er-Str.\n' >> 99_Strassenname
    fi
    printf "Looking up street names from indexes ... "
    cut -d ';' -f 1 07_Strassenindex | "${EL}" -0x 99_Strassenname > 07_Strasse
    printf "done.\n"
  else
    mv 07_unknown 07_Strasse
  fi

  if [ -f "${karto}" ]; then
    do_processfile_version_3 "${karto}" "geo coordinates" 90_Geokoordinaten_hnr_raw

    printf "Looking up geo coordinates for each phonebook entry ... "
    tr '\0;' '\n\t' < 90_Geokoordinaten_hnr_raw | cut -f "1,2,3,4,6,7" | tr '\n' '\0' > 90_Geokoordinaten_hnr
    rm 90_Geokoordinaten_hnr_raw
    paste 10_Postleitzahl 11_Ort 07_Strasse 08_Hausnummer | map_coords 90_Geokoordinaten_hnr | convert_coords > 16_Koordinaten
    printf "done.\n"
  else
    # No coordinates available: one lone tab per entry
    sed $'s:.*:\t:' 01_Flags > 16_Koordinaten
  fi

  if [ -f "${braid}" ]; then
    do_processfile_version_3 "${braid}" "branchen name index" 97_Branchenname convert_zeros
    printf "Looking up branch names from codes ... "
    map_branches_v3 97_Branchenname < 09_Branchenindex > 09_Branchen
    printf "done.\n"
  fi

  tidy_columns

  rm ??_unknown
}

# Export a post-2003 Telefonbuch or Yellow Pages (phonebook.db layout).
#   $1  phonebook directory (absolute path)
handle_format_version_4() {
  if [ -f "$1/branchcodes.tl" ]; then
    is_yp=true
    echo "Working on $1. Detected post-2003 Yellow Pages version."
  else
    unset is_yp
    echo "Working on $1. Detected post-2003 Telefonbuch version."
  fi

  printf "Extracting street names ... "
  extract_version_4 "$1/streets.tl"

  cat file_* | tr '\n\0' '\t\n' > 99_Strassenname
  rm file_*
  printf "done.\n"

  printf "Extracting phonebook.db ... "
  extract_version_4 "$1/phonebook.db"

  rows=`find . -name 'file_*' | wc -l`
  printf "done.\n"

  # Chunk n holds column n % 11; column 0 is binary and dumped as hex
  printf "Splitting decompressed chunks into their columns (11 total) ... 1, "
  JOT "file_%05X" 0 $(( rows - 1 )) 11 | xargs -n 128 cat | xxd -ps -c1 > column_0

  for col in 1 2 3 4 5 6 7 8 9 10; do
    printf "%d, " $(( col + 1 ))
    JOT "file_%05X" ${col} $(( rows - 1 )) 11 | xargs -n 128 cat | tr '\n\0' '\t\n' > "column_${col}"
  done
  printf "done.\n"

  printf "Cleaning up decompressed chunks ... "
  find . -name 'file_*' -delete
  printf "done.\n"

  # the 'did not object to inverse search' flag is insane and needs to be reversed
  if grep -q ^40 column_0; then
    printf "Cleaning up inverted reverse search flags ... "
    awk '{ a=substr($0,1,1); printf "%x%x\n",index("5670123cdef89ab4",a)%16 ,substr($0,2,1) }' < column_0 > 01_Flags
    printf "done\n"
  else
    mv column_0 01_Flags
  fi

  mv column_1 02_Nachname
  mv column_2 03_Vorname
  mv column_3 04_05_Namenszusatz_Addresszusatz
  mv column_5 07_08_Strassenindex_Hausnummer
  mv column_6 12_Vorwahl
  mv column_7 10_Postleitzahl
  mv column_8 11_Ort
  mv column_9 13_Rufnummer
  mv column_10 14_15_Email_Webadresse

  printf "Looking up street names from indexes ... "
  cut -f 1 07_08_Strassenindex_Hausnummer | "${EL}" -0 99_Strassenname > 07_Strasse
  printf "done.\n"

  printf "Splitting house numbers ... "
  # The appended tab guarantees a (possibly empty) second field
  sed -E $'s:$:\t:' < 07_08_Strassenindex_Hausnummer | cut -f 2 > 08_Hausnummer
  printf "done.\n"

  printf "Normalizing zusaetze ... "
  sed -E -e $'s:(^|\t),: u. :g' 04_05_Namenszusatz_Addresszusatz | awk '{$1=$1};1' > 04_Zusaetze
  printf "done.\n"

  printf "Normalizing verweise ... "
  sed -E -e $'s:^\|+::g;s:\|+$::g;s:\|:, :g' column_4 | awk '{$1=$1};1' > 09_Verweise
  printf "done.\n"

  printf "Splitting webaddress ... "
  cut -d $'\t' -f 1 14_15_Email_Webadresse | tr '\\' '/' | iconv -f iso-8859-15 -t utf-8 > 14_Webadresse
  printf "done.\n"

  printf "Splitting email ... "
  sed $'s:$:\t:' < 14_15_Email_Webadresse | cut -sd $'\t' -f 2 | tr '\\' '/' | iconv -f iso-8859-15 -t utf-8 > 15_Email
  printf "done.\n"

  if [ -f "$1/zip-streets-hn-geo.tl" ]; then
    printf "Extracting geo coordinates (precision: house number) ... "
    extract_version_4 "$1/zip-streets-hn-geo.tl"
    cat file_* > 90_Geokoordinaten_hnr
    printf "done.\n"
    printf "Looking up geo coordinates for each phonebook entry ... "
    paste 10_Postleitzahl 07_Strasse 08_Hausnummer | map_coords 90_Geokoordinaten_hnr | convert_coords > 16_Koordinaten
    printf "done.\n"
  elif [ -f "$1/zip-streets-geo.tl" ]; then
    printf "Extracting geo coordinates (precision: street) ... "
    extract_version_4 "$1/zip-streets-geo.tl"
    cat file_* > 91_Geokoordinaten_str
    printf "done.\n"
    printf "Looking up geo coordinates for each phonebook entry ... "
    paste 10_Postleitzahl 07_Strasse | map_coords 91_Geokoordinaten_str | convert_coords > 16_Koordinaten
    printf "done.\n"
  else
    # No coordinates available: one lone tab per entry
    sed $'s:.*:\t:' 01_Flags > 16_Koordinaten
  fi
  # -f: nothing was extracted when there is no geo file at all
  rm -f file_*

  if [ "${is_yp}" ]; then
    printf "Extracting branch names ... "
    extract_version_4 "$1/branchcodes.tl"
    cat file_* | tr '\n\0' '\t\n' > 97_Branchenname
    rm file_*
    printf "done.\n"
    printf "Looking up branch names from codes ... "
    map_branches_v4 97_Branchenname < 09_Verweise > 09_Branchen
    printf "done.\n"
  fi

  tidy_columns

  rm column_*
}

# Convert one column file from ISO-8859-15 to UTF-8 in place.
#   $1  progress message (without the trailing " ... ")
#   $2  column file
unicode_column() {
  printf "%s ... " "$1"
  iconv -f iso-8859-15 -t utf-8 "$2" > "$2.new"
  mv "$2.new" "$2"
  printf "done.\n"
}

# Final clean up shared by all formats: converts the exported columns to
# UTF-8 and normalizes names, street names, house numbers and Ort.
tidy_columns () {
  printf "Removing backslashes from Nachnamen ... "
  sed -E -e 's:\\::g' 02_Nachname | awk '{$1=$1};1' | iconv -f iso-8859-15 -t utf-8 > 02_Nachname.new
  mv 02_Nachname.new 02_Nachname
  printf "done.\n"

  unicode_column "Unicoding Vornamen" 03_Vorname
  unicode_column "Unicoding Zusaetze" 04_Zusaetze

  printf "Tidying up streetnames ... "
  # Replace any dots at end of line by a single one
  # finish any str abbreviation without a period with a period
  sed -E 's/\.+/./g;s/(S|s)tr( |:)?$/\1tr./;s/(.*)-(.*) -/\1-\2-Str./;s/ -$/ Str./;s/-$/str./;s/^(.*-.*) Str\.?$/\1-Str./' 07_Strasse | iconv -f iso-8859-15 -t utf-8 > 07_Strasse.new
  mv 07_Strasse.new 07_Strasse
  printf "done.\n"

  printf "Normalizing house numbers ... "
  sed -E -e 's:^([[:digit:]]+) *([A-Za-z])$:\1 \2:' -e 's: a$: A:;s: b$: B:;s: c$: C:;s: d$: D:;s: e$: E:;s: f$: F:;s: g$: G:;s: h$: H:;s: i$: I:;s: j$: J:;s: k$: K:;s: l$: L:;s: m$: M:;s: n$: N:;s: o$: O:;' 08_Hausnummer | iconv -f iso-8859-15 -t utf-8 > 08_Hausnummer.new
  mv 08_Hausnummer.new 08_Hausnummer
  printf "done.\n"

  unicode_column "Unicoding Verweise" 09_Verweise
  unicode_column "Unicoding Postleitzahl" 10_Postleitzahl

  printf "Removing trailing * from Ort ... "
  sed -E -e 's:\*$::' 11_Ort | iconv -f iso-8859-15 -t utf-8 > 11_Ort.new
  mv 11_Ort.new 11_Ort
  printf "done.\n"

  unicode_column "Unicoding Vorwahl" 12_Vorwahl
  unicode_column "Unicoding Rufnummer" 13_Rufnummer
}

# Portable replacement for BSD jot: print a formatted number sequence.
#   $1  printf format applied to each number
#   $2  first number
#   $3  last number (inclusive upper bound)
#   $4  step
JOT () {
  case `uname -s` in
    *BSD|Darwin)
      jot -w "$1" - "$2" "$3" "$4"
      ;;
    *)
      for x in `seq "$2" "$4" "$3"`; do
        printf "$1\n" "$x"
      done
      ;;
  esac
}

# After function definitions, main() can use them
main "$@"