parse (12060B)
1 #!/usr/bin/env zsh 2 # 3 # Jaro Mail, your humble and faithful electronic postman 4 # 5 # a tool to easily and privately handle your e-mail communication 6 # 7 # Copyleft (C) 2010-2015 Denis Roio <jaromil@dyne.org> 8 # 9 # This source code is free software; you can redistribute it and/or 10 # modify it under the terms of the GNU Public License as published by 11 # the Free Software Foundation; either version 3 of the License, or 12 # (at your option) any later version. 13 # 14 # This source code is distributed in the hope that it will be useful, 15 # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 17 # Please refer to the GNU Public License for more details. 18 # 19 # You should have received a copy of the GNU Public License along with 20 # this source code; if not, write to: 21 # Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 22 23 24 25 26 # extract all addresses found in a list of email files from stdin 27 extract_mails() { 28 fn extract_mails $* 29 30 mailpaths=( ${=stdin} ) 31 _tot=${#mailpaths} 32 33 act "$_tot emails to parse" 34 35 36 typeset -a _match 37 38 [[ ${_tot} -gt 100 ]] && { 39 act "operation will take a while, showing progress" 40 _prog=0 41 c=0 42 } 43 44 # learn from senders, recipients or all 45 _action=${1:-all} 46 # optional second argument limits parsing to header fields 47 [[ "$_action" = "all" ]] || _arg="-x $_action" 48 49 act "parsing $_action fields" 50 _match=() 51 52 for m in ${mailpaths}; do 53 [[ $global_quit = 1 ]] && break 54 # use RFC822 parser in fetchaddr 55 _parsed=`hdr $m | ${WORKDIR}/bin/fetchaddr ${=_arg} -a` 56 for _p in ${(f)_parsed}; do 57 58 _e="${(Q)_p[(ws:,:)1]:l}" 59 # check if an email address was found 60 isemail "$_e" || continue 61 # extract also the name using comma separator 62 _n="${(Q)_p[(ws:,:)2]}" 63 64 func "match: ${_n} <$_e>" 65 _match+=("${_n} <$_e>") 66 done 67 68 [[ $_tot -gt 100 ]] && { 69 c=$(( $c + 1 )) 70 [[ $c -gt 99 ]] && { 71 _prog=$(( $_prog + $c )) 72 act "$_prog / $_tot processed so far" 73 c=1 74 } 75 } 76 done 77 78 _found=0 79 for _l in ${_match}; do 80 print - "$_l" 81 _found=$(( $_found + 1 )) 82 done 83 84 notice "${#_match} addresses extracted (including duplicates)" 85 } 86 87 # extract all addresses found into a maildir 88 extract_maildir() { 89 fn extract_maildir $* 90 md="$1" 91 req=(md) 92 ckreq || return 1 93 94 ## first arg is a directory 95 md="$1" 96 func "extract maildir: $md" 97 ## extract from a maildir 98 maildircheck "$md" || return 1 99 100 _action="$2" 101 case $_action in 102 all) ;; 103 recipient) ;; 104 sender) ;; 105 *) _action="all" ;; 106 esac 107 108 # search files 109 _mails=`find $md -type f` 110 # search symlinks 111 _mails+=`find $md -type l` 112 113 stdin="$_mails"; extract_mails "$_action" 114 115 return 0 116 } 117 118 read_stdin() { # fills global stdin 119 fn read_stdin $* 120 121 stdin="" 122 stdin=`cat` 123 bytesread=${#stdin} 124 _res=$? 125 126 func "read ${bytesread} bytes from stdin" 127 128 129 # hard limit: read in max 10MB 130 # sysread -t 10 -c bytesread -s 10000000 stdin 131 # case $_res in 132 # 1) warning "read_stdin: there was an error in the parameters to the command." ;; 133 # 2) warning "read_stdin: there was an error on the read, or on polling the input file descriptor for a timeout." ;; 134 # 4) warning "read_stdin: the attempt to read timed out." ;; 135 # 5) warning "read_stdin: no system error occurred, but zero bytes were read." ;; 136 # 0) return 0 ;; 137 # esac 138 139 } 140 141 stdin_is_muttpipe() { 142 fn stdin_is_muttpipe $* 143 req=(stdin) 144 ckreq || return 1 145 146 if [[ "${stdin[(w)1]}" = "Date:" ]]; then 147 return 0 148 else 149 return 1 150 fi 151 } 152 153 stdin_is_pathlist() { 154 fn stdin_is_pathlist $* 155 req=(stdin) 156 ckreq || return 1 157 158 if [[ -r "${stdin[(w)1]}" ]]; then 159 return 0 160 else 161 return 1 162 fi 163 } 164 165 # extract all entries in addressbook or all addresses in a pgp keyring 166 # or all signatures on a pgp key (even without importing it) 167 extract_addresses() { 168 fn extract_addresses $* 169 170 # without arguments just list all entries in the active list 171 # default is whitelist 172 173 arg=${1} 174 175 # no arg means parse from stdin 176 stdin=0 177 [[ "$arg" = "" ]] && stdin=1 178 [[ "$arg" = "stdin" ]] && stdin=1 179 [[ "$arg" = "in" ]] && stdin=1 180 181 182 [[ $stdin = 1 ]] && { 183 read_stdin 184 185 # Extract all entries found in stdin. Supports two formats (autodetected) 186 # 1) list of complete paths to filenames as returned by search 187 # 2) mbox format big file with special jaromail separator as produced by mutt tagging 188 189 # take first word 190 if stdin_is_muttpipe; then 191 192 act "stdin seems an email or stream of emails" 193 194 _headers=`print - $stdin | awk ' 195 BEGIN { header=1 } 196 /JAROMAIL_PIPE_SEPARATOR/ { header=1; next } 197 /^$/ { header=0; print "\n" } 198 { if(header==1) { print $0 } } 199 '` 200 201 e_addr=() 202 _nextline=0 203 _gotit="" 204 205 for h in ${(f)_headers}; do 206 207 [[ "${h[(w)1]}" = "From:" ]] && _nextline=1 208 [[ "${h[(w)1]}" = "Subject:" ]] && { 209 _nextline=0 210 print - ${_gotit} | e_parse 211 212 _gotit="" 213 } 214 [[ $_nextline = 1 ]] && _gotit+="$h\n" 215 216 done 217 218 for i in ${(k)e_addr}; do 219 print - "${e_addr[$i]} <$i>" 220 done 221 222 223 elif stdin_is_pathlist; then 224 act "stdin seems a stream of full paths to single email files inside maildirs" 225 # is a list of files 226 extract_mails "$2" 227 _res=$? 228 else 229 error "Cannot process stream from stdin, unknown format" 230 return 1 231 fi 232 return $_res 233 } 234 235 [[ -r "$arg" ]] && { 236 # if first arg is a file, could be a maildir, a gpg keyring, 237 # a gpg pubkey or a vcard 238 239 # if first arg is a directory then extract from maildir 240 [[ -d "$arg" ]] && { 241 notice "Extracting $2 addresses from maildir $1" 242 extract_maildir "$1" "$2" 243 return $? 244 } 245 246 func "testing argument with file magic" 247 _magic=`file "$arg"` 248 _recognized=0 249 250 # a map to eliminate duplicates 251 typeset -A result 252 253 ######### GPG 254 # first arg is a GnuPG key ring 255 [[ "$_magic" =~ "GPG key public ring" ]] && { _recognized=1 256 notice "Extracting addresses found in GPG keyring: $arg" 257 _addrs=`gpg --list-keys --with-colons | awk -F: '{print $10}'` 258 for i in ${(f)_addrs}; do 259 [[ $global_quit = 1 ]] && break 260 _parsed=`print "From: $i" | ${WORKDIR}/bin/fetchaddr -a -x from` 261 _e="${_parsed[(ws:,:)1]:l}" 262 isemail "$_e" 263 [[ $? = 0 ]] || continue 264 # check if the email is not already parsed 265 [[ "${result[$_e]}" = "" ]] && { 266 _n="${_parsed[(ws:,:)2]}" 267 result+=("$_e" "$_n") 268 print - "$_n <$_e>" 269 } 270 done 271 } 272 273 # first arg is a GnuPG public key 274 [[ "$_magic" =~ "PGP public key" ]] && { _recognized=1 275 notice "Extracting addresses from sigs on GPG key $arg" 276 _gpg="gpg --no-default-keyring --keyring $MAILDIRS/cache/pubkey.gpg --batch --with-colons" 277 ${=rm} $MAILDIRS/cache/pubkey.gpg 278 ${=_gpg} --import "$arg" 279 # first make sure all unknown keys are imported 280 _addrs=`${=_gpg} --list-sigs | awk -F: '{print $5 " " $10}'` 281 for i in ${(f)_addrs}; do 282 [[ $global_quit = 1 ]] && break 283 284 [[ "$i" =~ "[User ID not found]" ]] && { 285 act "looking up: $i" 286 o ${=_gpg} --recv-key ${i[(w)1]} 287 } 288 done 289 290 _addrs=`${=_gpg} --list-sigs | awk -F: '{print $10}'` 291 for i in ${(f)_addrs}; do 292 [[ $global_quit = 1 ]] && break 293 294 _parsed=`print "From: $i" | ${WORKDIR}/bin/fetchaddr -a -x from` 295 _e="${_parsed[(ws:,:)1]:l}" 296 isemail "$_e" 297 [[ $? = 0 ]] || continue 298 # check if the email is not already parsed 299 [[ "${result[$_e]}" = "" ]] && { 300 _n="${_parsed[(ws:,:)2]}" 301 result+=("$_e" "$_n") 302 print - "$_n <$_e>" 303 } 304 done 305 } 306 307 [[ "$_magic" =~ "vCard" ]] && { _recognized=1 308 # parse the vcard and print a simple name and email list 309 # each value on a single line, entry tuples followed by a # 310 # we skip entries that don't have an email 311 addresses=`awk ' 312 BEGIN { newcard=0; c=0; name=""; email=""; } 313 /^BEGIN:VCARD/ { newcard=1 } 314 /^FN:/ { if(newcard = 1) name=$0 } 315 /^EMAIL/ { if(newcard = 1) email=$0 } 316 /^END:VCARD/ { 317 if(newcard = 1) { 318 newcard=0 319 if(email != "") { 320 c+=1 321 print name 322 print email 323 print "# " c 324 } 325 email="" 326 next 327 } 328 } 329 ' $arg | cut -d: -f2` 330 newa=1; _name=""; _email="" 331 for a in ${(f)addresses}; do 332 [[ $global_quit = 1 ]] && break 333 334 [[ "${a[1]}" = "#" ]] && { 335 newa=1; # its the end of the entry 336 337 # handle lines with multiple emails in vcard 338 for ee in ${=_email}; do 339 # check if we have this email already 340 _e=`print ${ee//\^M/} | extract_emails` 341 isemail "$_e" 342 [[ $? = 0 ]] || continue 343 # check if the email is not already parsed 344 [[ "${result[$_e]}" = "" ]] && { 345 _n="${_name//\^M/}" 346 result+=("$_e" "$_n") 347 print - "$_n <$_e>" 348 } 349 done 350 351 continue 352 } 353 if [[ $newa = 1 ]]; then 354 # (V) makes special chars visible, we need to remove them.. 355 _name="${(V)a[(ws:^:)1]}"; newa=0; continue 356 elif [[ $newa = 0 ]]; then 357 _email="${(V)a[(ws:^:)1]}" 358 fi 359 360 done 361 362 } 363 364 [[ $_recognized = 1 ]] && { 365 notice "Unique addresses found: ${#result}" 366 # act "calculating known and new addresses..." 367 # # counts which addresses are known to us 368 # _known=0 369 # for i in ${(k)result}; do 370 # [[ $global_quit = 1 ]] && break 371 372 # lookup_email ${i} 373 # [[ $? = 0 ]] || { 374 # _known=$(( $_known + 1 )) } 375 # done 376 # act "new addresses: $_known" 377 return 0 378 } 379 380 } # closes condition in which arg is a file 381 382 # final fallback 383 # if no file is recognized, use string as search query 384 error "cannot extract any address from $option_params" 385 [[ "$_magic" = "" ]] || { 386 error "file format not supported: ${_magic[(ws@:@)2]}" } 387 return 1 388 } 389 390 391 extract_headers() { 392 fn extract_headers $* 393 # use cat directly, faster than read_stdin 394 for i in `cat`; do 395 [[ -r "$i" ]] || { 396 warning "cannot extract headers, not a file: $i" } 397 _folder=${i[(ws:/:)-3]} 398 hdr $i | awk -v folder=$_folder ' 399 BEGIN { date=""; from=""; subj="" } 400 /^From:/ { from=$NF } 401 /^Date:/ { date=sprintf("%02d %s %s", $3, $4, $5)} 402 /^Subject:/ { subj=$0} 403 END { printf("%s :%s:\t%s\t%s\n", date, folder, from, subj) }' 404 done 405 }