From 9c46deb628e21991606bbf2a23ecb678a40cd243 Mon Sep 17 00:00:00 2001 From: Dirk Engling Date: Tue, 11 Feb 2014 17:12:51 +0100 Subject: Reworked code to split old telefonbuch distributions, the old version was too slow --- src/makecolumns.sh | 104 +++++++++++++++++++++++++---------------------------- 1 file changed, 49 insertions(+), 55 deletions(-) (limited to 'src') diff --git a/src/makecolumns.sh b/src/makecolumns.sh index 2df65c9..5d2d90b 100755 --- a/src/makecolumns.sh +++ b/src/makecolumns.sh @@ -77,10 +77,8 @@ size() { } get_dword() { - # $1 file, $2 offset - file=`printf %0${filename_len}d ${1}` - set -- `od -tu4 -N4 -j$(( 4*${2:-0} )) ${file}` - printf "%d\n" $2 + # $1 file + hexdump -n 4 -v -e '" " 1/4 "%u"' `printf %0${filename_len}d ${1}` } handle_old_format() { @@ -113,38 +111,34 @@ handle_old_format() { fi # Now loop over all files and dump them - printf "Splitting decompressed chunks into their columns ... " - reported=0 - while [ -f `printf %0${filename_len}d ${nname_file}` ]; do - # Get number of entries in this round - count=`get_dword ${nname_file}` - - # Get offset into first nname - nname_off=$(( `get_dword ${nname_file} 1` + 1 )) - - # Now get the flags before the nnames - tail -c +${nname_off} `printf %0${filename_len}d ${nname_file}` | tr '\n\0' '\t\n' | head -n ${count} | cut -c -1 >> 01_Flags - tail -c +${nname_off} `printf %0${filename_len}d ${nname_file}` | tr '\n\0' '\t\n' | head -n ${count} | cut -c 2- >> 02_Nachname - - # Extract the vnames - tr '\n\0' '\t\n' < `printf %0${filename_len}d ${vname_file}` | head -n ${count} >> 03_Vorname - - # Offset into first table entry tells us how many - # fields are in table file - table_entries=$(( `get_dword ${table_file} 1` / 4 - 1 )) - - # Now iterate over all entries in the table file - for table_index in `jot ${table_entries}`; do - table_off=`get_dword ${table_file} ${table_index}` - tail -c +$(( table_off + 1 )) `printf %0${filename_len}d ${table_file}` | tr '\n\0' '\t\n' | head -n ${count} >> `printf %02d_unknown $(( table_index + 3 ))` - done - - # Advance the filenames. - nname_file=$(( nname_file+3 )) - vname_file=$(( vname_file+3 )) - table_file=$(( table_file+3 )) - [ 1 -eq $(( ( ( table_file * 20 ) / number_of_files ) > reported )) ] && printf "%d%% " $(( (reported+=1) * 5 )) - done + printf "Splitting decompressed nname chunks into their columns ... " + for file in `jot -w %0${filename_len}d - ${nname_file} $(( number_of_files - 1 )) 3`; do + set -- `hexdump -n 8 -v -e '" " 1/4 "%u"' ${file}` + tail -c +$(( $2 + 1 )) ${file} + done | tr '\n\0' '\t\n' > 01_02_Flags_Nachname + cut -c 1 < 01_02_Flags_Nachname > 01_Flags + cut -c 2- < 01_02_Flags_Nachname > 02_Nachname + rm 01_02_Flags_Nachname + printf "done.\n" + + printf "Splitting decompress vname chunks into their columns ... " + jot -w "%0${filename_len}d" - ${vname_file} $(( number_of_files - 1 )) 3 | xargs cat | tr '\n\0' '\t\n' > 03_Vorname + printf "done.\n" + + printf "Splitting decompress table file chunks into their columns ... " + jot -w %0${filename_len}d - ${table_file} $(( number_of_files - 1 )) 3 | splitold +# for file in `jot -w %0${filename_len}d - ${table_file} $(( number_of_files - 1 )) 3`; do +# # Offset into first table entry tells us how many +# # fields are in table file +# set -- `hexdump -n 64 -v -e '" " 1/4 "%u"' ${file}` +# count=$1; table_entries=$(( $2 / 4 - 1 )); shift +# +# # Now iterate over all entries in the table file +# for idx in `jot ${table_entries}`; do +# tail -c +$(( $1 + 1 )) ${file} | tr '\n\0' '\t\n' | head -n ${count} >> `printf %02d_unknown $(( idx + 3 ))` +# shift +# done +# done printf "done.\n" # wipe all temporary extracted files @@ -153,17 +147,19 @@ handle_old_format() { printf "done.\n" # rename our columns extracted from the table file - mv 04_unknown 04_Namenszusatz - mv 05_unknown 05_Adresszusatz - mv 06_unknown 06_Ortszusatz - mv 08_unknown 08_Hausnummer - mv 09_unknown 09_Verweise - mv 10_unknown 10_Postleitzahl - mv 11_unknown 11_Ort - mv 12_unknown 12_Vorwahl - mv 13_unknown 13_Rufnummer - [ -f 14_unknown ] && mv 14_unknown 14_Email - [ -f 15_unknown ] && mv 15_unknown 15_Webadresse + printf "Converting string terminators to line newlines ... " + tr '\0' '\n' < 04_unknown > 04_Namenszusatz + tr '\0' '\n' < 05_unknown > 05_Adresszusatz + tr '\0' '\n' < 06_unknown > 06_Ortszusatz + tr '\0' '\n' < 08_unknown > 08_Hausnummer + tr '\0' '\n' < 09_unknown > 09_Verweise + tr '\0' '\n' < 10_unknown > 10_Postleitzahl + tr '\0' '\n' < 11_unknown > 11_Ort + tr '\0' '\n' < 12_unknown > 12_Vorwahl + tr '\0' '\n' < 13_unknown > 13_Rufnummer + [ -f 14_unknown ] && tr '\0' '\n' < 14_unknown > 14_Email + [ -f 15_unknown ] && tr '\0' '\n' < 15_unknown > 15_Webadresse + printf "done.\n" # If street names come in an extra file, extract # street names first @@ -173,13 +169,14 @@ handle_old_format() { # extract street names if 07_unknown contains street indexes # instead of street names if [ -f 99_Strassenname ]; then - mv 07_unknown 07_Strassenindex + tr '\0' '\n' < 07_unknown > 07_Strassenindex printf "Looking up street names from indexes ... " cut -d ';' -f 1 07_Strassenindex | ${EL} -0x 99_Strassenname > 07_Strasse printf "done.\n" else - mv 07_unknown 07_Strasse + tr '\0' '\n' < 07_unknown > 07_Strasse fi + rm ??_unknown karto=$1/[Dd][Aa][Tt]/[Kk][Aa][Rr][Tt][Oo].[Dd][Aa][Tt] [ -f ${karto} ] && do_processfile_old ${karto} "geo coordinates" 90_Geokoordinaten_hnr_raw @@ -187,7 +184,7 @@ handle_old_format() { printf "Looking up geo coordinates for each phonebook entry ... " tr '\0' '\n' < 90_Geokoordinaten_hnr_raw | tr ';' '\t' | cut -f "1,2,3,4,6,7" | tr '\n' '\0' > 90_Geokoordinaten_hnr rm 90_Geokoordinaten_hnr_raw - lam 10_Postleitzahl -s $'\t' 07_Strasse -s $'\t' 08_Hausnummer | mapcoords 90_Geokoordinaten_hnr | convertcoords > 16_Koordinaten + lam 10_Postleitzahl -s $'\t' 11_Ort -s $'\t' 07_Strasse -s $'\t' 08_Hausnummer | mapcoords 90_Geokoordinaten_hnr | convertcoords > 16_Koordinaten printf "done.\n" } @@ -204,17 +201,14 @@ handle_new_format() { decompress $1/phonebook.db rows=`find . -name file_\* | wc -l` - rows=$(( rows / 11 )) printf "done.\n" - # Do enumerations with builtin shell tools. Unfortunally neither - # jot nor seq are standards printf "Splitting decompressed chunks into their columns (11 total) ... 1, " - f=-1; while [ $f -lt $rows ]; do printf "file_%05X " $(( (f+=1) * 11)); done | xargs cat | xxd -ps -c1 > column_0 + jot -w "file_%05X" - 0 $rows 11 | xargs cat | xxd -ps -c1 > column_0 for col in 1 2 3 4 5 6 7 8 9 10; do printf "%d, " $(( col + 1 )) - f=-1; while [ $f -lt $rows ]; do printf "file_%05X " $(( col + (f+=1) * 11 )); done | xargs cat | tr '\n\0' '\t\n' > column_${col} + jot -w "file_%05X" - ${col} ${rows} 11 | xargs cat | tr '\n\0' '\t\n' > column_${col} done printf "done.\n" -- cgit v1.2.3