#!/bin/sh
#
# Splits the data files of a Telefonbuch (phone book) directory into one
# text file per column, written to work/<directory name>/ below the current
# working directory.
#
# Usage: <this script> phonebookdirectory
#
# Requires the "el" tool plus the extract_*/split_*/map_* helper binaries
# (expected in ./bin, which is appended to PATH, after "make all").
# Four on-disk layouts are detected by probing for characteristic files;
# see main().

export LANG=C
export LC_CTYPE=C
export LC_ALL=C
export PATH="${PATH}:$(pwd)/bin/"

# BSD find needs -E to enable the extended regular expressions used below.
# The variable is expanded unquoted on purpose so it vanishes when unset.
unset FIND_E
case "$(uname -s)" in
  *BSD|Darwin) FIND_E="-E " ;;
esac

#######################################
# Entry point: validates the environment, prepares work/<name>/ and
# dispatches to the handler for the detected phone book layout.
# Arguments: $1 - phone book directory
# Returns:   0 on success, 1 if the layout is not recognized
#            (exits 1 on usage or setup errors)
#######################################
main() {
  # Resolve the script location up front: we change directory further down,
  # which would invalidate a relative $0.
  script_dir=$(cd "$(dirname "$0")" && pwd)

  [ -f /usr/local/bin/el ] && EL=/usr/local/bin/el
  [ -n "${script_dir}" ] && [ -f "${script_dir}/../bin/el" ] && EL="${script_dir}/../bin/el"

  if [ -z "${EL}" ]; then
    echo "el not found. Get it at 'cvs -d :pserver:anoncvs@cvs.erdgeist.org:/home/cvsroot co el'"
    exit 1
  fi

  if [ $# -ne 1 ]; then
    echo "Syntax: $0 [phonebookdirectory]"
    exit 1
  fi

  # All handlers run from inside work/<name>/, so a path relative to the
  # starting directory must be made absolute. A relative path that does not
  # exist from here is passed through untouched (it may have been given
  # relative to the work directory).
  case "$1" in
    /*) phonebook_dir=$1 ;;
    *)
      if [ -d "$1" ]; then
        phonebook_dir="$(pwd)/$1"
      else
        phonebook_dir=$1
      fi
      ;;
  esac

  work_name=$(basename "${1#white_}")

  # Compile all the binaries
  make all

  printf "Cleaning up old working directory ... "
  # ${work_name:?} aborts instead of wiping all of work/ on an empty name
  rm -rf "work/${work_name:?}"
  printf "done.\n"

  mkdir -p "work/${work_name}"
  cd "work/${work_name}" || exit 1

  status=0
  if [ -f "${phonebook_dir}/phonebook.db" ]; then
    handle_format_version_4 "${phonebook_dir}"
  elif first_existing_file "${phonebook_dir}"/[Dd][Aa][Tt]/[Tt][Ee][Ii][Ll][Nn].[Dd][Aa][Tt] > /dev/null; then
    handle_format_version_3 "${phonebook_dir}"
  elif [ -n "$(find "${phonebook_dir}" -iname 'atb?dd00' -ls -quit)" ]; then
    handle_format_version_2 "${phonebook_dir}"
  elif [ -n "$(find "${phonebook_dir}" -name dpr00000.005 -ls -quit)" ]; then
    handle_format_version_1 "${phonebook_dir}"
  else
    echo "Not a recognized Telefonbuch folder"
    status=1
  fi

  cd ../..
  return "${status}"
}

#######################################
# Prints the first argument that names an existing regular file.
# Intended to be called with an unquoted glob so the shell expands it;
# plain variable assignments do not perform pathname expansion.
# Arguments: candidate paths
# Outputs:   the first existing file, if any
# Returns:   0 if a file was found, 1 otherwise
#######################################
first_existing_file() {
  for candidate in "$@"; do
    if [ -f "${candidate}" ]; then
      printf '%s\n' "${candidate}"
      return 0
    fi
  done
  return 1
}

#######################################
# Prints the size in bytes of a file, using the stat dialect of the
# running platform.
# Arguments: $1 - path of the file
#######################################
file_size() {
  case "$(uname -s)" in
    *BSD|Darwin) stat -f %z "$1" ;;
    *)           stat -c %s "$1" ;;
  esac
}

#######################################
# Runs extract_version_3 on a file and unpacks every resulting *.lha
# archive in the current directory, printing progress in 5% steps.
# Arguments: $1 - file to extract
#            $2 - human readable name used in progress messages
#######################################
do_decompress_version_3() {
  printf "Extracting %s chunks ... " "$2"
  extract_version_3 "${1}"
  printf "done.\n"

  printf "Decompressing %s chunks ... " "$2"
  # $(( )) strips the padding some wc implementations emit
  numfiles=$(( $(find . -name \*.lha | wc -l) ))
  reported=0
  processed=0
  for archive in *.lha; do
    # An unmatched glob stays literal; skip it (also avoids dividing by 0)
    [ -e "${archive}" ] || continue
    lha x "${archive}" > /dev/null
    rm "${archive}"
    processed=$(( processed + 1 ))
    # "reported" counts 5% steps, at most one step is printed per archive
    if [ $(( processed * 20 / numfiles )) -gt "${reported}" ]; then
      reported=$(( reported + 1 ))
      printf "%d%% " $(( reported * 5 ))
    fi
  done
  # With fewer than 20 archives the loop never reaches step 20 (= 100%)
  [ "${reported}" -lt 20 ] && printf "100%% "
  printf "done.\n"
}

#######################################
# Decompresses one data file in a scratch directory named after it and
# concatenates all resulting chunks into a single output file.
# Arguments: $1 - data file to process
#            $2 - human readable name used in progress messages
#            $3 - output file
#            $4 - "convert_zeros" to turn newlines into tabs and NUL bytes
#                 into newlines while combining (optional)
# Returns:   1 if the scratch directory cannot be created or entered
#######################################
do_processfile_version_3() {
  working_on=$(basename "${1}")
  # Bail out early: a later "cd .." without a successful cd would leave
  # the work directory.
  if ! mkdir "${working_on}" || ! cd "${working_on}"; then
    return 1
  fi
  do_decompress_version_3 "${1}" "${2}"
  cd ..

  printf "Combining %s into single file ... " "$2"
  if [ "${4}" = "convert_zeros" ]; then
    cat "${working_on}"/* | tr '\n\0' '\t\n' > "$3"
  else
    cat "${working_on}"/* > "$3"
  fi
  printf "done.\n"

  rm -rf "${working_on}"
}

#######################################
# Prints the size in bytes of a numbered chunk file.
# Globals:   filename_len (read) - digits in a chunk file name
# Arguments: $1 - chunk number (zero padded to filename_len digits)
#######################################
size() {
  file_size "$(printf "%0${filename_len}d" "$1")"
}

#######################################
# Prints the first 4 bytes of a numbered chunk file as one unsigned
# decimal number (preceded by a blank, as produced by the hexdump format).
# Globals:   filename_len (read)
# Arguments: $1 - chunk number (zero padded to filename_len digits)
#######################################
get_dword() {
  hexdump -n 4 -v -e '" " 1/4 "%u"' "$(printf "%0${filename_len}d" "${1}")"
}

#######################################
# Handles the layout announced as "1992 Telefonbuch version"
# (detected through a dpr00000.005 file).
# Arguments: $1 - phone book directory
#######################################
handle_format_version_1() {
  echo "Working on $1. Detected 1992 Telefonbuch version."

  # Extract all dpr database files
  printf "Extracting dpr databases ... "
  find "$1" -name dpr\*.001 | extract_version_1
  printf "done.\n"

  # rename our extracted columns
  mv 01_unknown 01_Flags
  mv 02_unknown 02_Nachname
  mv 03_unknown 03_Vorname
  mv 04_unknown 05_Adresszusatz
  mv 05_unknown 06_Ortszusatz
  mv 06_unknown 10_Zustellamt_PLZOst
  mv 07_unknown 07_Strasse
  mv 08_unknown 08_Hausnummer
  mv 09_unknown 04_Namenszusatz
  mv 10_unknown 09_Fax_Verweise
  mv 11_unknown 12_Vorwahl
  mv 12_unknown 13_Rufnummer
  mv 13_unknown 11_Ort
  mv 14_unknown 10_Postleitzahl

  printf "Normalizing zusaetze ... "
  paste 04_Namenszusatz 05_Adresszusatz | tr '\t' ' ' | sed -E 's/ +/ /g' > 04_Zusaetze
  printf "done.\n"

  tidy_streetnames 07_Strasse
}

#######################################
# Handles the layout announced as "pre 02/1996 Telefonbuch version"
# (detected through atb?dd00 files).
# Arguments: $1 - phone book directory
#######################################
handle_format_version_2() {
  echo "Working on $1. Detected pre 02/1996 Telefonbuch version."

  # Extract all CD's pkware compressed databases
  printf "Extracting 3 pkware databases ...\n"
  disc=1
  # NOTE: the find output is word-split, so paths containing whitespace
  # are not supported here.
  for database in $(find "$1" -iname 'atb?dd00'); do
    dir=$(dirname "${database}")
    base=$(basename "${database}")
    printf " %d/3 in %4s. Decompressing ... " "${disc}" "$(basename "${dir}")"
    extract_version_2 "${database}" > "${base}.dump"
    printf ", extracting ... "
    indexfile=$(find "${dir}" -iname 'atb?di00')
    split_version_2 "${base}.dump" "${indexfile}"
    printf ", cleaning up ... "
    rm "${base}.dump"
    printf "done.\n"
    disc=$(( disc + 1 ))
  done

  mv 01_unknown 01_Flags
  mv 16_unknown 02_Nachname
  mv 07_unknown 03_Vorname
  mv 14_unknown 04_Namenszusatz
  mv 11_unknown 05_Adresszusatz
  mv 12_unknown 06_Ortszusatz
  mv 08_unknown 07_Strasse
  mv 10_unknown 08_Hausnummer
  mv 13_unknown 09_Fax_Verweise
  mv 02_unknown 10_Postleitzahl
  mv 15_unknown 11_Ort
  mv 09_unknown 11_Ort_Gemeinde
  mv 05_unknown 12_Vorwahl
  mv 06_unknown 13_Rufnummer

  # remove entries that are for searching only
  rm 03_unknown 04_unknown

  printf "Normalizing zusaetze ... "
  paste 04_Namenszusatz 05_Adresszusatz | tr '\t' ' ' | sed -E 's/ +/ /g' > 04_Zusaetze
  printf "done.\n"

  tidy_streetnames 07_Strasse
}

#######################################
# Handles the layout announced as "pre-2004 Telefonbuch version"
# (detected through dat/teiln.dat, matched case-insensitively).
# Globals:   filename_len (written, exported), EL (read), FIND_E (read)
# Arguments: $1 - phone book directory
#######################################
handle_format_version_3() {
  echo "Working on $1. Detected pre-2004 Telefonbuch version."

  # Extract teiln.dat
  teiln_file=$(first_existing_file "$1"/[Dd][Aa][Tt]/[Tt][Ee][Ii][Ll][Nn].[Dd][Aa][Tt])
  do_decompress_version_3 "${teiln_file}" "teiln.dat"

  # See how long each filename is
  export filename_len=$(( $(ls | head -n 1 | wc -c) - 1 ))

  # Get total amount of files, for reporting progress
  number_of_files=$(( $(find ${FIND_E} . -maxdepth 1 -regex '^\./[0123456789]+' | wc -l) ))

  # from 2000F on file 0+3*n is table, so make it default
  table_file=0; vname_file=2

  # if supposed vname file is larger than table file,
  # we're having a pre-2000F layout, so switch accordingly
  if [ "$(size ${table_file})" -lt "$(size ${vname_file})" ]; then
    table_file=2; nname_file=0; vname_file=1
  else
    nname_file=1
  fi

  # Table file has a table header with identical count
  # to nname file's header. Verify this
  nname_count=$(get_dword ${nname_file})
  table_count=$(get_dword ${table_file})
  # Left unquoted on purpose: get_dword output starts with a blank that
  # word splitting removes. Differing defaults make two failed reads mismatch.
  if [ ${nname_count:-0} -ne ${table_count:-1} ]; then
    echo "Unknown layout."
    exit 1
  fi

  # Now loop over all files and dump them
  printf "Splitting decompressed nname chunks into their columns ... "
  JOT "%0${filename_len}d" ${nname_file} $(( number_of_files - 1 )) 3 | split_version_3 1 1
  # Former shell implementation, kept for reference:
  #   set -- `hexdump -n 8 -v -e '" " 1/4 "%u"' ${file}`
  #   tail -c +$(( $2 + 1 )) ${file}
  # done | tr '\n\0' '\t\n' > 01_02_Flags_Nachname

  # First character of each line is the flag, the rest is the last name
  cut -c 1 < 01_unknown > 01_Flags
  cut -c 2- < 01_unknown > 02_Nachname
  rm 01_unknown
  printf "done.\n"

  printf "Splitting decompress vname chunks into their columns ... "
  JOT "%0${filename_len}d" ${vname_file} $(( number_of_files - 1 )) 3 | xargs -n 128 cat | tr '\n\0' '\t\n' | tr -d '\377' > 03_Vorname
  printf "done.\n"

  printf "Splitting decompress table file chunks into their columns ... "
  JOT "%0${filename_len}d" ${table_file} $(( number_of_files - 1 )) 3 | split_version_3 4 0
  # Former shell implementation, kept for reference:
  # for file in `jot -w %0${filename_len}d - ${table_file} $(( number_of_files - 1 )) 3`; do
  #   # Offset into first table entry tells us how many
  #   # fields are in table file
  #   set -- `hexdump -n 64 -v -e '" " 1/4 "%u"' ${file}`
  #   count=$1; table_entries=$(( $2 / 4 - 1 )); shift
  #
  #   # Now iterate over all entries in the table file
  #   for idx in `jot ${table_entries}`; do
  #     tail -c +$(( $1 + 1 )) ${file} | tr '\n\0' '\t\n' | head -n ${count} >> `printf %02d_unknown $(( idx + 3 ))`
  #     shift
  #   done
  # done
  printf "done.\n"

  # wipe all temporary extracted files
  printf "Cleaning up decompressed chunks ... "
  find ${FIND_E} . -maxdepth 1 -regex '^\./[0123456789]+' -delete
  printf "done.\n"

  # rename our columns extracted from the table file
  mv 04_unknown 04_Namenszusatz
  mv 05_unknown 05_Adresszusatz
  mv 06_unknown 06_Ortszusatz
  mv 08_unknown 08_Hausnummer
  mv 09_unknown 09_Verweise
  mv 10_unknown 10_Postleitzahl
  mv 11_unknown 11_Ort
  mv 12_unknown 12_Vorwahl
  mv 13_unknown 13_Rufnummer

  [ -f 14_unknown ] && mv 14_unknown 14_Email
  [ -f 15_unknown ] && mv 15_unknown 15_Webadresse

  printf "Normalizing flags ... "
  sed -i.bak -e 's:^1$:00:g' -e 's:^3$:01:g' -e 's:^2$:02:g' 01_Flags
  rm 01_Flags.bak
  printf "done.\n"

  printf "Normalizing zusaetze ... "
  paste 04_Namenszusatz 05_Adresszusatz | tr '\t' ' ' | sed -E -e 's/ +/ /g' -e 's/^ +//g' -e 's/ +$//g' > 04_Zusaetze
  printf "done.\n"

  # If street names come in an extra file, extract
  # street names first. The glob has to be expanded as command arguments;
  # assigning it to a variable would keep the literal pattern.
  streets_file=$(first_existing_file "$1"/[Dd][Aa][Tt]/[Ss][Tt][Rr][Aa][Ss][Ss][Ee][Nn].[Dd][Aa][Tt])
  if [ -n "${streets_file}" ]; then
    do_processfile_version_3 "${streets_file}" "street name" 99_Strassenname convert_zeros
  fi

  # extract street names if 07_unknown contains street indexes
  # instead of street names
  if [ -f 99_Strassenname ]; then
    mv 07_unknown 07_Strassenindex
    printf "Looking up street names from indexes ... "
    # fix up known broken Strassennamen file
    [ "$(file_size "${streets_file}")" -eq 1642716 ] && printf '9. Str.\n91. Str.\n91er-Str.\n' >> 99_Strassenname
    tidy_streetnames 99_Strassenname
    cut -d ';' -f 1 07_Strassenindex | "${EL}" -0x 99_Strassenname > 07_Strasse
    printf "done.\n"
  else
    mv 07_unknown 07_Strasse
    tidy_streetnames 07_Strasse
  fi

  karto_file=$(first_existing_file "$1"/[Dd][Aa][Tt]/[Kk][Aa][Rr][Tt][Oo].[Dd][Aa][Tt])
  if [ -n "${karto_file}" ]; then
    do_processfile_version_3 "${karto_file}" "geo coordinates" 90_Geokoordinaten_hnr_raw

    printf "Looking up geo coordinates for each phonebook entry ... "
    # Keep fields 1-4, 6 and 7 of each NUL terminated, ';' separated record
    tr '\0' '\n' < 90_Geokoordinaten_hnr_raw | tr ';' '\t' | cut -f "1,2,3,4,6,7" | tr '\n' '\0' > 90_Geokoordinaten_hnr
    rm 90_Geokoordinaten_hnr_raw
    paste 10_Postleitzahl 11_Ort 07_Strasse 08_Hausnummer | map_coords 90_Geokoordinaten_hnr | convert_coords > 16_Koordinaten
    printf "done.\n"
  fi
}

#######################################
# Handles the layout announced as "post-2003" (detected through
# phonebook.db). The presence of branchcodes.tl selects the
# "Yellow Pages" variant.
# Globals:   is_yp (written), EL (read)
# Arguments: $1 - phone book directory
#######################################
handle_format_version_4() {
  if [ -f "$1/branchcodes.tl" ]; then
    is_yp=true
    echo "Working on $1. Detected post-2003 Yellow Pages version."
  else
    unset is_yp
    echo "Working on $1. Detected post-2003 Telefonbuch version."
  fi

  printf "Extracting street names ... "
  extract_version_4 "$1/streets.tl"
  cat file_* | tr '\n\0' '\t\n' > 99_Strassenname
  rm file_*
  printf "done.\n"

  printf "Extracting phonebook.db ... "
  extract_version_4 "$1/phonebook.db"
  rows=$(( $(find . -name file_\* | wc -l) ))
  printf "done.\n"

  # Chunk files are visited with a stride of 11: file n belongs to column
  # n modulo 11. Column 0 is dumped as one hex byte per line.
  printf "Splitting decompressed chunks into their columns (11 total) ... 1, "
  JOT "file_%05X" 0 $(( rows - 1 )) 11 | xargs -n 128 cat | xxd -ps -c1 > column_0
  for col in 1 2 3 4 5 6 7 8 9 10; do
    printf "%d, " $(( col + 1 ))
    JOT "file_%05X" ${col} $(( rows - 1 )) 11 | xargs -n 128 cat | tr '\n\0' '\t\n' > column_${col}
  done
  printf "done.\n"

  printf "Cleaning up decompressed chunks ... "
  find . -name file_\* -delete
  printf "done.\n"

  # the 'did not object to inverse search' flag is insane and needs to be reversed
  if grep -q ^40 column_0; then
    printf "Cleanung up inverted reverse search flags ... "
    # Remap the first hex digit through the lookup string; the second digit
    # is copied as text (%s) so that a-f survive unchanged.
    awk '{ a=substr($0,1,1); printf "%x%s\n",index("5670123cdef89ab4",a)%16 ,substr($0,2,1) }' < column_0 > 01_Flags
    rm column_0
    printf "done\n"
  else
    mv column_0 01_Flags
  fi

  mv column_1 02_Nachname
  mv column_2 03_Vorname
  mv column_3 04_05_Namenszusatz_Addresszusatz
  mv column_4 09_Verweise
  mv column_5 07_08_Strassenindex_Hausnummer
  mv column_6 12_Vorwahl
  mv column_7 10_Postleitzahl
  mv column_8 11_Ort
  mv column_9 13_Rufnummer
  mv column_10 14_15_Email_Webadresse

  tidy_streetnames 99_Strassenname

  printf "Looking up street names from indexes ... "
  cut -f 1 07_08_Strassenindex_Hausnummer | "${EL}" -0 99_Strassenname > 07_Strasse
  printf "done.\n"

  printf "Splitting house numbers ... "
  # Append a tab to every line so that cut always finds a second field.
  # A literal tab is produced with printf; $'\t' is not available in sh.
  tab=$(printf '\t')
  sed -E "s:\$:${tab}:" < 07_08_Strassenindex_Hausnummer | cut -f 2 > 08_Hausnummer
  printf "done.\n"

  printf "Normalizing zusaetze ... "
  tr '\t' ' ' < 04_05_Namenszusatz_Addresszusatz | sed -E 's/ +/ /g' > 04_Zusaetze
  printf "done.\n"

  if [ -f "$1/zip-streets-hn-geo.tl" ]; then
    printf "Extracting geo coordinates (precision: house number) ... "
    extract_version_4 "$1/zip-streets-hn-geo.tl"
    cat file_* > 90_Geokoordinaten_hnr
    printf "done.\n"
    printf "Looking up geo coordinates for each phonebook entry ... "
    paste 10_Postleitzahl 07_Strasse 08_Hausnummer | map_coords 90_Geokoordinaten_hnr | convert_coords > 16_Koordinaten
    printf "done.\n"
  elif [ -f "$1/zip-streets-geo.tl" ]; then
    printf "Extracting geo coordinates (precision: street) ... "
    extract_version_4 "$1/zip-streets-geo.tl"
    cat file_* > 91_Geokoordinaten_str
    printf "done.\n"
    printf "Looking up geo coordinates for each phonebook entry ... "
    paste 10_Postleitzahl 07_Strasse | map_coords 91_Geokoordinaten_str | convert_coords > 16_Koordinaten
    printf "done.\n"
  fi
  # -f: nothing is left to remove when neither geo file was present
  rm -f file_*

  if [ "${is_yp}" ]; then
    printf "Extracting branch names ... "
    extract_version_4 "$1/branchcodes.tl"
    cat file_* | tr '\n\0' '\t\n' > 97_Branchenname
    rm file_*
    printf "done.\n"

    printf "Generating branch name index ... "
    mkdir branchcodes/
    # One file per branch code, containing the branch name
    while read -r index name; do
      [ -n "${index}" ] || continue
      # The name is data, never a format string
      printf '%s' "${name}" > "branchcodes/${index}"
    done < 97_Branchenname
    printf "done.\n"

    printf "Looking up branch names from codes ... "
    map_branches 97_Branchenname < 09_Verweise > 09_Branchen
    printf "done.\n"
    rm -r branchcodes
  fi
}

#######################################
# Normalizes a street name file in place.
# Arguments: $1 - file with one street name per line
#######################################
tidy_streetnames() {
  streetnames_file="$1"
  # Collapse every run of dots into a single one, then finish a trailing
  # "Str"/"str" abbreviation (optionally followed by a blank or colon)
  # with a period.
  sed -E -i.bak 's/\.+/./g;s/(S|s)tr( |:)?$/\1tr./' "${streetnames_file}"
  rm "${streetnames_file}".bak
}

#######################################
# Portable stand-in for BSD jot: prints a sequence of formatted numbers,
# one per line.
# Arguments: $1 - printf format for each number
#            $2 - first number
#            $3 - upper bound
#            $4 - step
#######################################
JOT() {
  case "$(uname -s)" in
    *BSD|Darwin)
      jot -w "$1" - "$2" "$3" "$4"
      ;;
    *)
      for x in $(seq "$2" "$4" "$3"); do
        printf "$1\n" "$x"
      done
      ;;
  esac
}

# After function definitions, main() can use them
main "$@"