diff options
author | Dirk Engling <erdgeist@erdgeist.org> | 2014-02-11 17:12:51 +0100 |
---|---|---|
committer | Dirk Engling <erdgeist@erdgeist.org> | 2014-02-11 17:12:51 +0100 |
commit | 9c46deb628e21991606bbf2a23ecb678a40cd243 (patch) | |
tree | 5055efc5715f2f66e2e4c658cd8bb536c724e57c /src | |
parent | 31741d636811d5a0ac5a83f3ccce6875d2a92d78 (diff) |
Reworked code to split old telefonbuch distributions; the old version was too slow
Diffstat (limited to 'src')
-rwxr-xr-x | src/makecolumns.sh | 104 |
1 files changed, 49 insertions, 55 deletions
diff --git a/src/makecolumns.sh b/src/makecolumns.sh | |||
index 2df65c9..5d2d90b 100755 | |||
--- a/src/makecolumns.sh | |||
+++ b/src/makecolumns.sh | |||
@@ -77,10 +77,8 @@ size() { | |||
77 | } | 77 | } |
78 | 78 | ||
79 | get_dword() { | 79 | get_dword() { |
80 | # $1 file, $2 offset | 80 | # $1 file |
81 | file=`printf %0${filename_len}d ${1}` | 81 | hexdump -n 4 -v -e '" " 1/4 "%u"' `printf %0${filename_len}d ${1}` |
82 | set -- `od -tu4 -N4 -j$(( 4*${2:-0} )) ${file}` | ||
83 | printf "%d\n" $2 | ||
84 | } | 82 | } |
85 | 83 | ||
86 | handle_old_format() { | 84 | handle_old_format() { |
@@ -113,38 +111,34 @@ handle_old_format() { | |||
113 | fi | 111 | fi |
114 | 112 | ||
115 | # Now loop over all files and dump them | 113 | # Now loop over all files and dump them |
116 | printf "Splitting decompressed chunks into their columns ... " | 114 | printf "Splitting decompressed nname chunks into their columns ... " |
117 | reported=0 | 115 | for file in `jot -w %0${filename_len}d - ${nname_file} $(( number_of_files - 1 )) 3`; do |
118 | while [ -f `printf %0${filename_len}d ${nname_file}` ]; do | 116 | set -- `hexdump -n 8 -v -e '" " 1/4 "%u"' ${file}` |
119 | # Get number of entries in this round | 117 | tail -c +$(( $2 + 1 )) ${file} |
120 | count=`get_dword ${nname_file}` | 118 | done | tr '\n\0' '\t\n' > 01_02_Flags_Nachname |
121 | 119 | cut -c 1 < 01_02_Flags_Nachname > 01_Flags | |
122 | # Get offset into first nname | 120 | cut -c 2- < 01_02_Flags_Nachname > 02_Nachname |
123 | nname_off=$(( `get_dword ${nname_file} 1` + 1 )) | 121 | rm 01_02_Flags_Nachname |
124 | 122 | printf "done.\n" | |
125 | # Now get the flags before the nnames | 123 | |
126 | tail -c +${nname_off} `printf %0${filename_len}d ${nname_file}` | tr '\n\0' '\t\n' | head -n ${count} | cut -c -1 >> 01_Flags | 124 | printf "Splitting decompress vname chunks into their columns ... " |
127 | tail -c +${nname_off} `printf %0${filename_len}d ${nname_file}` | tr '\n\0' '\t\n' | head -n ${count} | cut -c 2- >> 02_Nachname | 125 | jot -w "%0${filename_len}d" - ${vname_file} $(( number_of_files - 1 )) 3 | xargs cat | tr '\n\0' '\t\n' > 03_Vorname |
128 | 126 | printf "done.\n" | |
129 | # Extract the vnames | 127 | |
130 | tr '\n\0' '\t\n' < `printf %0${filename_len}d ${vname_file}` | head -n ${count} >> 03_Vorname | 128 | printf "Splitting decompress table file chunks into their columns ... " |
131 | 129 | jot -w %0${filename_len}d - ${table_file} $(( number_of_files - 1 )) 3 | splitold | |
132 | # Offset into first table entry tells us how many | 130 | # for file in `jot -w %0${filename_len}d - ${table_file} $(( number_of_files - 1 )) 3`; do |
133 | # fields are in table file | 131 | # # Offset into first table entry tells us how many |
134 | table_entries=$(( `get_dword ${table_file} 1` / 4 - 1 )) | 132 | # # fields are in table file |
135 | 133 | # set -- `hexdump -n 64 -v -e '" " 1/4 "%u"' ${file}` | |
136 | # Now iterate over all entries in the table file | 134 | # count=$1; table_entries=$(( $2 / 4 - 1 )); shift |
137 | for table_index in `jot ${table_entries}`; do | 135 | # |
138 | table_off=`get_dword ${table_file} ${table_index}` | 136 | # # Now iterate over all entries in the table file |
139 | tail -c +$(( table_off + 1 )) `printf %0${filename_len}d ${table_file}` | tr '\n\0' '\t\n' | head -n ${count} >> `printf %02d_unknown $(( table_index + 3 ))` | 137 | # for idx in `jot ${table_entries}`; do |
140 | done | 138 | # tail -c +$(( $1 + 1 )) ${file} | tr '\n\0' '\t\n' | head -n ${count} >> `printf %02d_unknown $(( idx + 3 ))` |
141 | 139 | # shift | |
142 | # Advance the filenames. | 140 | # done |
143 | nname_file=$(( nname_file+3 )) | 141 | # done |
144 | vname_file=$(( vname_file+3 )) | ||
145 | table_file=$(( table_file+3 )) | ||
146 | [ 1 -eq $(( ( ( table_file * 20 ) / number_of_files ) > reported )) ] && printf "%d%% " $(( (reported+=1) * 5 )) | ||
147 | done | ||
148 | printf "done.\n" | 142 | printf "done.\n" |
149 | 143 | ||
150 | # wipe all temporary extracted files | 144 | # wipe all temporary extracted files |
@@ -153,17 +147,19 @@ handle_old_format() { | |||
153 | printf "done.\n" | 147 | printf "done.\n" |
154 | 148 | ||
155 | # rename our columns extracted from the table file | 149 | # rename our columns extracted from the table file |
156 | mv 04_unknown 04_Namenszusatz | 150 | printf "Converting string terminators to line newlines ... " |
157 | mv 05_unknown 05_Adresszusatz | 151 | tr '\0' '\n' < 04_unknown > 04_Namenszusatz |
158 | mv 06_unknown 06_Ortszusatz | 152 | tr '\0' '\n' < 05_unknown > 05_Adresszusatz |
159 | mv 08_unknown 08_Hausnummer | 153 | tr '\0' '\n' < 06_unknown > 06_Ortszusatz |
160 | mv 09_unknown 09_Verweise | 154 | tr '\0' '\n' < 08_unknown > 08_Hausnummer |
161 | mv 10_unknown 10_Postleitzahl | 155 | tr '\0' '\n' < 09_unknown > 09_Verweise |
162 | mv 11_unknown 11_Ort | 156 | tr '\0' '\n' < 10_unknown > 10_Postleitzahl |
163 | mv 12_unknown 12_Vorwahl | 157 | tr '\0' '\n' < 11_unknown > 11_Ort |
164 | mv 13_unknown 13_Rufnummer | 158 | tr '\0' '\n' < 12_unknown > 12_Vorwahl |
165 | [ -f 14_unknown ] && mv 14_unknown 14_Email | 159 | tr '\0' '\n' < 13_unknown > 13_Rufnummer |
166 | [ -f 15_unknown ] && mv 15_unknown 15_Webadresse | 160 | [ -f 14_unknown ] && tr '\0' '\n' < 14_unknown > 14_Email |
161 | [ -f 15_unknown ] && tr '\0' '\n' < 15_unknown > 15_Webadresse | ||
162 | printf "done.\n" | ||
167 | 163 | ||
168 | # If street names come in an extra file, extract | 164 | # If street names come in an extra file, extract |
169 | # street names first | 165 | # street names first |
@@ -173,13 +169,14 @@ handle_old_format() { | |||
173 | # extract street names if 07_unknown contains street indexes | 169 | # extract street names if 07_unknown contains street indexes |
174 | # instead of street names | 170 | # instead of street names |
175 | if [ -f 99_Strassenname ]; then | 171 | if [ -f 99_Strassenname ]; then |
176 | mv 07_unknown 07_Strassenindex | 172 | tr '\0' '\n' < 07_unknown > 07_Strassenindex |
177 | printf "Looking up street names from indexes ... " | 173 | printf "Looking up street names from indexes ... " |
178 | cut -d ';' -f 1 07_Strassenindex | ${EL} -0x 99_Strassenname > 07_Strasse | 174 | cut -d ';' -f 1 07_Strassenindex | ${EL} -0x 99_Strassenname > 07_Strasse |
179 | printf "done.\n" | 175 | printf "done.\n" |
180 | else | 176 | else |
181 | mv 07_unknown 07_Strasse | 177 | tr '\0' '\n' < 07_unknown > 07_Strasse |
182 | fi | 178 | fi |
179 | rm ??_unknown | ||
183 | 180 | ||
184 | karto=$1/[Dd][Aa][Tt]/[Kk][Aa][Rr][Tt][Oo].[Dd][Aa][Tt] | 181 | karto=$1/[Dd][Aa][Tt]/[Kk][Aa][Rr][Tt][Oo].[Dd][Aa][Tt] |
185 | [ -f ${karto} ] && do_processfile_old ${karto} "geo coordinates" 90_Geokoordinaten_hnr_raw | 182 | [ -f ${karto} ] && do_processfile_old ${karto} "geo coordinates" 90_Geokoordinaten_hnr_raw |
@@ -187,7 +184,7 @@ handle_old_format() { | |||
187 | printf "Looking up geo coordinates for each phonebook entry ... " | 184 | printf "Looking up geo coordinates for each phonebook entry ... " |
188 | tr '\0' '\n' < 90_Geokoordinaten_hnr_raw | tr ';' '\t' | cut -f "1,2,3,4,6,7" | tr '\n' '\0' > 90_Geokoordinaten_hnr | 185 | tr '\0' '\n' < 90_Geokoordinaten_hnr_raw | tr ';' '\t' | cut -f "1,2,3,4,6,7" | tr '\n' '\0' > 90_Geokoordinaten_hnr |
189 | rm 90_Geokoordinaten_hnr_raw | 186 | rm 90_Geokoordinaten_hnr_raw |
190 | lam 10_Postleitzahl -s $'\t' 07_Strasse -s $'\t' 08_Hausnummer | mapcoords 90_Geokoordinaten_hnr | convertcoords > 16_Koordinaten | 187 | lam 10_Postleitzahl -s $'\t' 11_Ort -s $'\t' 07_Strasse -s $'\t' 08_Hausnummer | mapcoords 90_Geokoordinaten_hnr | convertcoords > 16_Koordinaten |
191 | printf "done.\n" | 188 | printf "done.\n" |
192 | } | 189 | } |
193 | 190 | ||
@@ -204,17 +201,14 @@ handle_new_format() { | |||
204 | decompress $1/phonebook.db | 201 | decompress $1/phonebook.db |
205 | 202 | ||
206 | rows=`find . -name file_\* | wc -l` | 203 | rows=`find . -name file_\* | wc -l` |
207 | rows=$(( rows / 11 )) | ||
208 | printf "done.\n" | 204 | printf "done.\n" |
209 | 205 | ||
210 | # Do enumerations with builtin shell tools. Unfortunally neither | ||
211 | # jot nor seq are standards | ||
212 | printf "Splitting decompressed chunks into their columns (11 total) ... 1, " | 206 | printf "Splitting decompressed chunks into their columns (11 total) ... 1, " |
213 | f=-1; while [ $f -lt $rows ]; do printf "file_%05X " $(( (f+=1) * 11)); done | xargs cat | xxd -ps -c1 > column_0 | 207 | jot -w "file_%05X" - 0 $rows 11 | xargs cat | xxd -ps -c1 > column_0 |
214 | 208 | ||
215 | for col in 1 2 3 4 5 6 7 8 9 10; do | 209 | for col in 1 2 3 4 5 6 7 8 9 10; do |
216 | printf "%d, " $(( col + 1 )) | 210 | printf "%d, " $(( col + 1 )) |
217 | f=-1; while [ $f -lt $rows ]; do printf "file_%05X " $(( col + (f+=1) * 11 )); done | xargs cat | tr '\n\0' '\t\n' > column_${col} | 211 | jot -w "file_%05X" - ${col} ${rows} 11 | xargs cat | tr '\n\0' '\t\n' > column_${col} |
218 | done | 212 | done |
219 | printf "done.\n" | 213 | printf "done.\n" |
220 | 214 | ||