#!/bin/sh
#
# makecolumns.sh - split a Telefonbuch phone book directory into one file
# per column inside a fresh ../work_<name> directory.
#
# Usage: makecolumns.sh [phonebookdirectory]
#
# Provenance: added by commit 2a185f889470f9bfa049b1610900536309aded5a
# (Dirk Engling, Mon, 24 Feb 2014 03:14:53 +0100,
# "Use make the way it was intended").
#
# The script relies on BSD userland tools (jot, lam, stat -f, find -E) and on
# helper programs that are looked up via PATH: extract_version_1/2/3,
# split_version_2, map_coords, convert_coords (presumably the result of
# `make binaries` - confirm against the Makefile), plus lha, xxd and el.

export LANG=C
export LC_CTYPE=C
export LC_ALL=C
PATH="${PATH}:$(pwd)/../bin/"
export PATH

# A literal tab character; used instead of the non-POSIX $'\t' quoting.
TAB=$(printf '\t')

# Print the first argument that names an existing regular file.
# Arguments: candidate paths (typically the expansion of a glob)
# Returns:   0 and prints the path if one exists, 1 otherwise
first_existing_file() {
  for candidate in "$@"; do
    if [ -f "${candidate}" ]; then
      printf '%s\n' "${candidate}"
      return 0
    fi
  done
  return 1
}

# Entry point: locate `el`, validate arguments, build the helper binaries,
# recreate the working directory and dispatch on the detected format.
# Arguments: $1 - phone book directory
main() {
  script_dir=$(dirname "$0")
  [ -f /usr/local/bin/el ] && EL=/usr/local/bin/el
  [ -f "${script_dir}/../bin/el" ] && EL="${script_dir}/../bin/el"

  if [ -z "${EL}" ]; then
    echo "el not found. Get it at 'cvs -d :pserver:anoncvs@cvs.erdgeist.org:/home/cvsroot co el'"
    exit 1
  fi

  if [ $# -ne 1 ]; then
    echo "Syntax: $0 [phonebookdirectory]"
    exit 1
  fi

  # Compile all the binaries
  make binaries

  # Name of the working directory, computed once and reused below.
  work_dir=../work_$(basename "${1#white_}")

  printf "Cleaning up old working directory ... "
  rm -rf "${work_dir}"
  printf "done.\n"
  mkdir -p "${work_dir}"
  cd "${work_dir}" || exit 1

  # Format detection. The case-insensitive glob has to stay outside the
  # quotes, otherwise it is never expanded and format 2 is never detected.
  if [ -f "$1/phonebook.db" ]; then
    handle_format_version_3 "${1}"
  elif first_existing_file "${1}"/[Dd][Aa][Tt]/[Tt][Ee][Ii][Ll][Nn].[Dd][Aa][Tt] > /dev/null; then
    handle_format_version_2 "${1}"
  elif [ -n "$(find "${1}" -name dpr00000.005 -ls -quit)" ]; then
    handle_format_version_1 "${1}"
  else
    echo "Not a recognized Telefonbuch folder"
  fi
  cd ..
}

# Extract a version 2 container into the current directory and unpack every
# resulting .lha archive, printing progress in 5% steps.
# Arguments: $1 - container file handed to extract_version_2
#            $2 - human readable name used in progress messages
do_decompress_version_2() {
  printf 'Extracting %s chunks ... ' "$2"
  extract_version_2 "${1}"
  printf "done.\n"

  printf 'Decompressing %s chunks ... ' "$2"
  # The arithmetic expansion strips the padding BSD wc puts around the count.
  numfiles=$(( $(find . -name \*.lha | wc -l) ))
  reported=0; processed=0
  for archive in *.lha; do
    # An unmatched glob stays literal; skipping it also avoids dividing by a
    # zero numfiles below.
    [ -f "${archive}" ] || continue
    lha x "${archive}" > /dev/null
    rm "${archive}"
    # Advance the progress display by one 5% step whenever the processed
    # fraction (in twentieths) exceeds what has been reported so far.
    [ 1 -eq $(( ( ( (processed+=1) * 20 ) / numfiles ) > reported )) ] && printf "%d%% " $(( (reported+=1) * 5 ))
  done
  # With fewer than 20 archives the loop cannot reach the twentieth step.
  if [ "${reported}" -lt 20 ]; then
    printf "100%% "
  fi
  printf "done.\n"
}

# Decompress one version 2 container in a scratch directory and concatenate
# its chunks into a single output file.
# Arguments: $1 - container file
#            $2 - human readable name used in progress messages
#            $3 - output file
#            $4 - optional "convert_zeros": map newline to tab and NUL to
#                 newline while concatenating
# Returns:   1 if the scratch directory cannot be created or entered
do_processfile_version_2() {
  working_on=$(basename "${1}")

  # Resolve the container path before changing directory, so a relative
  # path still points at the same file from inside the scratch directory.
  case ${1} in
    /*) container=${1} ;;
    *)  container=$(pwd)/${1} ;;
  esac

  mkdir "${working_on}" || return 1
  cd "${working_on}" || return 1
  do_decompress_version_2 "${container}" "${2}"
  cd ..

  printf 'Combining %s into single file ... ' "$2"
  if [ "${4}" = "convert_zeros" ]; then
    cat "${working_on}"/* | tr '\n\0' '\t\n' > "$3"
  else
    cat "${working_on}"/* > "$3"
  fi
  printf "done.\n"

  rm -rf "${working_on}"
}

# Print the size in bytes of a numbered chunk file.
# Arguments: $1 - chunk number, zero padded to ${filename_len} digits
# Note: `stat -f %z` is the BSD spelling.
size() {
  stat -f %z "$(printf "%0${filename_len}d" "$1")"
}

# Print the first four bytes of a numbered chunk file as one unsigned
# decimal, preceded by a blank (as produced by the hexdump format string).
# Arguments: $1 - chunk number, zero padded to ${filename_len} digits
get_dword() {
  hexdump -n 4 -v -e '" " 1/4 "%u"' "$(printf "%0${filename_len}d" "${1}")"
}

# Handle the oldest layout (dpr database files).
# Arguments: $1 - phone book directory
handle_format_version_1() {
  echo "Working on $1. Detected pre-02/1996 Telefonbuch version."
  # Extract all dpr database files
  printf "Extracting dpr databases ... "
  find "$1" -name dpr\*.001 | extract_version_1
  printf "done.\n"

  # rename our extracted columns
  mv 01_unknown 01_Flags
  mv 02_unknown 02_Nachname
  mv 03_unknown 03_Vorname
  mv 04_unknown 05_Adresszusatz
  mv 05_unknown 06_Ortszusatz
  mv 06_unknown 10_Zustellamt_PLZOst
  mv 07_unknown 07_Strasse
  mv 08_unknown 08_Hausnummer
  mv 09_unknown 04_Namenszusatz
  mv 10_unknown 09_Fax_Verweise
  mv 11_unknown 12_Vorwahl
  mv 12_unknown 13_Rufnummer
  mv 13_unknown 11_Ort
  mv 14_unknown 10_Postleitzahl
}

# Handle the dat/teiln.dat layout.
# Arguments: $1 - phone book directory
# Exits with status 1 if the chunk layout cannot be identified.
handle_format_version_2() {
  echo "Working on $1. Detected pre-2004 Telefonbuch version."

  # Extract teiln.dat
  if ! teiln=$(first_existing_file "$1"/[Dd][Aa][Tt]/[Tt][Ee][Ii][Ll][Nn].[Dd][Aa][Tt]); then
    echo "Not a recognized Telefonbuch folder"
    return 1
  fi
  do_decompress_version_2 "${teiln}" "teiln.dat"

  # See how long each filename is
  filename_len=$(( $(ls | head -n 1 | wc -c) - 1 ))
  export filename_len

  # Get total amount of files, for reporting progress
  number_of_files=$(find -E . -depth 1 -regex '^\./[0123456789]+' | wc -l)

  # from 2000F on file 0+3*n is table, so make it default
  table_file=0; vname_file=2

  # if supposed vname file is larger than table file,
  # we're having a pre-2000F layout, so switch accordingly
  table_size=$(size "${table_file}")
  vname_size=$(size "${vname_file}")
  if [ "${table_size}" -lt "${vname_size}" ]; then
    table_file=2; nname_file=0; vname_file=1
  else
    nname_file=1
  fi

  # Table file has a table header with identical count
  # to nname file's header. Verify this
  # shellcheck disable=SC2046 # unquoted on purpose: get_dword output starts with a blank
  if [ $(get_dword "${nname_file}") -ne $(get_dword "${table_file}") ]; then
    echo "Unknown layout."
    exit 1
  fi

  # Now loop over all files and dump them
  printf "Splitting decompressed nname chunks into their columns ... "
  jot -w "%0${filename_len}d" - "${nname_file}" $(( number_of_files - 1 )) 3 | split_version_2 1 1
  # Earlier shell-only variant, kept for reference:
#    set -- `hexdump -n 8 -v -e '" " 1/4 "%u"' ${file}`
#    tail -c +$(( $2 + 1 )) ${file}
#  done | tr '\n\0' '\t\n' > 01_02_Flags_Nachname
  # The first character of each nname record is the flag, the rest the name.
  cut -c 1 < 01_unknown > 01_Flags
  cut -c 2- < 01_unknown > 02_Nachname
  rm 01_unknown
  printf "done.\n"

  printf "Splitting decompress vname chunks into their columns ... "
  jot -w "%0${filename_len}d" - "${vname_file}" $(( number_of_files - 1 )) 3 \
    | xargs cat | tr '\n\0' '\t\n' | tr -d '\377' > 03_Vorname
  printf "done.\n"

  printf "Splitting decompress table file chunks into their columns ... "
  jot -w "%0${filename_len}d" - "${table_file}" $(( number_of_files - 1 )) 3 | split_version_2 4 0
  # Earlier shell-only variant, kept for reference:
#  for file in `jot -w %0${filename_len}d - ${table_file} $(( number_of_files - 1 )) 3`; do
#    # Offset into first table entry tells us how many
#    # fields are in table file
#    set -- `hexdump -n 64 -v -e '" " 1/4 "%u"' ${file}`
#    count=$1; table_entries=$(( $2 / 4 - 1 )); shift
#
#    # Now iterate over all entries in the table file
#    for idx in `jot ${table_entries}`; do
#      tail -c +$(( $1 + 1 )) ${file} | tr '\n\0' '\t\n' | head -n ${count} >> `printf %02d_unknown $(( idx + 3 ))`
#      shift
#    done
#  done
  printf "done.\n"

  # wipe all temporary extracted files
  printf "Cleaning up decompressed chunks ... "
  find -E . -depth 1 -regex '^\./[0123456789]+' -delete
  printf "done.\n"

  # rename our columns extracted from the table file
  mv 04_unknown 04_Namenszusatz
  mv 05_unknown 05_Adresszusatz
  mv 06_unknown 06_Ortszusatz
  mv 08_unknown 08_Hausnummer
  mv 09_unknown 09_Verweise
  mv 10_unknown 10_Postleitzahl
  mv 11_unknown 11_Ort
  mv 12_unknown 12_Vorwahl
  mv 13_unknown 13_Rufnummer
  [ -f 14_unknown ] && mv 14_unknown 14_Email
  [ -f 15_unknown ] && mv 15_unknown 15_Webadresse

  # If street names come in an extra file, extract
  # street names first
  if streets=$(first_existing_file "$1"/[Dd][Aa][Tt]/[Ss][Tt][Rr][Aa][Ss][Ss][Ee][Nn].[Dd][Aa][Tt]); then
    do_processfile_version_2 "${streets}" "street name" 99_Strassenname convert_zeros
  fi

  # extract street names if 07_unknown contains street indexes
  # instead of street names
  if [ -f 99_Strassenname ]; then
    mv 07_unknown 07_Strassenindex
    printf "Looking up street names from indexes ... "
    cut -d ';' -f 1 07_Strassenindex | "${EL}" -0x 99_Strassenname > 07_Strasse
    printf "done.\n"
  else
    mv 07_unknown 07_Strasse
  fi

  if karto=$(first_existing_file "$1"/[Dd][Aa][Tt]/[Kk][Aa][Rr][Tt][Oo].[Dd][Aa][Tt]); then
    do_processfile_version_2 "${karto}" "geo coordinates" 90_Geokoordinaten_hnr_raw

    printf "Looking up geo coordinates for each phonebook entry ... "
    # Keep fields 1-4, 6 and 7 of each ';'-separated, NUL-terminated record.
    tr '\0' '\n' < 90_Geokoordinaten_hnr_raw | tr ';' '\t' | cut -f "1,2,3,4,6,7" | tr '\n' '\0' > 90_Geokoordinaten_hnr
    rm 90_Geokoordinaten_hnr_raw
    lam 10_Postleitzahl -s "${TAB}" 11_Ort -s "${TAB}" 07_Strasse -s "${TAB}" 08_Hausnummer \
      | map_coords 90_Geokoordinaten_hnr | convert_coords > 16_Koordinaten
    printf "done.\n"
  fi
}

# Handle the phonebook.db layout.
# Arguments: $1 - phone book directory
handle_format_version_3() {
  echo "Working on $1. Detected post-2003 Telefonbuch version."
  printf "Extracting street names ... "
  extract_version_3 "$1/streets.tl"

  cat file_* | tr '\n\0' '\t\n' > 99_Strassenname
  rm file_*
  printf "done.\n"

  printf "Extracting phonebook.db ... "
  extract_version_3 "$1/phonebook.db"

  rows=$(find . -name file_\* | wc -l)
  printf "done.\n"

  # Chunks are interleaved: every 11th file, starting at the column number,
  # belongs to the same column. Column 0 is dumped as hex bytes, one per line.
  printf "Splitting decompressed chunks into their columns (11 total) ... 1, "
  jot -w "file_%05X" - 0 $(( rows - 1 )) 11 | xargs cat | xxd -ps -c1 > column_0

  for col in 1 2 3 4 5 6 7 8 9 10; do
    printf "%d, " $(( col + 1 ))
    jot -w "file_%05X" - "${col}" $(( rows - 1 )) 11 | xargs cat | tr '\n\0' '\t\n' > "column_${col}"
  done
  printf "done.\n"

  printf "Cleaning up decompressed chunks ... "
  find . -name file_\* -delete
  printf "done.\n"

  mv column_0 01_Flags
  mv column_1 02_Nachname
  mv column_2 03_Vorname
  mv column_3 04_05_Namenszusatz_Addresszusatz
  mv column_4 09_Verweise
  mv column_5 07_08_Strassenindex_Hausnummer
  mv column_6 12_Vorwahl
  mv column_7 10_Postleitzahl
  mv column_8 11_Ort
  mv column_9 13_Rufnummer
  mv column_10 14_15_Email_Webadresse

  printf "Looking up street names from indexes ... "
  cut -f 1 07_08_Strassenindex_Hausnummer | "${EL}" -0 99_Strassenname > 07_Strasse
  printf "done.\n"

  printf "Splitting house numbers ... "
  # Append a tab to every line so that cut emits an empty second field
  # (instead of the whole line) for entries without a house number.
  sed -E "s:\$:${TAB}:" < 07_08_Strassenindex_Hausnummer | cut -f 2 > 08_Hausnummer
  printf "done.\n"

  if [ -f "$1/zip-streets-hn-geo.tl" ]; then
    printf "Extracting geo coordinates (precision: house number) ... "
    extract_version_3 "$1/zip-streets-hn-geo.tl"
    cat file_* > 90_Geokoordinaten_hnr
    printf "done.\n"
    printf "Looking up geo coordinates for each phonebook entry ... "
    lam 10_Postleitzahl -s "${TAB}" 07_Strasse -s "${TAB}" 08_Hausnummer \
      | map_coords 90_Geokoordinaten_hnr | convert_coords > 16_Koordinaten
    printf "done.\n"
  elif [ -f "$1/zip-streets-geo.tl" ]; then
    printf "Extracting geo coordinates (precision: street) ... "
    extract_version_3 "$1/zip-streets-geo.tl"
    cat file_* > 91_Geokoordinaten_str
    printf "done.\n"
    printf "Looking up geo coordinates for each phonebook entry ... "
    lam 10_Postleitzahl -s "${TAB}" 07_Strasse \
      | map_coords 91_Geokoordinaten_str | convert_coords > 16_Koordinaten
    printf "done.\n"
  fi
  # -f: when no geo file was extracted there are no chunks left to remove.
  rm -f file_*
}

# After function definitions, main() can use them
main "$@"