jaromail

a commandline tool to easily and privately handle your e-mail
git clone git://parazyd.org/jaromail.git
Log | Files | Refs | Submodules | README

parse (12060B)


      1 #!/usr/bin/env zsh
      2 #
      3 # Jaro Mail, your humble and faithful electronic postman
      4 #
      5 # a tool to easily and privately handle your e-mail communication
      6 #
      7 # Copyleft (C) 2010-2015 Denis Roio <jaromil@dyne.org>
      8 #
      9 # This source  code is free  software; you can redistribute  it and/or
     10 # modify it under the terms of  the GNU Public License as published by
     11 # the Free  Software Foundation; either  version 3 of the  License, or
     12 # (at your option) any later version.
     13 #
     14 # This source code is distributed in  the hope that it will be useful,
     15 # but  WITHOUT ANY  WARRANTY;  without even  the  implied warranty  of
     16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
     17 # Please refer to the GNU Public License for more details.
     18 #
     19 # You should have received a copy of the GNU Public License along with
     20 # this source code; if not, write to:
     21 # Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
     22 
     23 
     24 
     25 
     26 # extract all addresses found in a list of email files from stdin
     27 extract_mails() {
     28 	fn extract_mails $*
     29 
     30     mailpaths=( ${=stdin} )
     31     _tot=${#mailpaths}
     32 
     33     act "$_tot emails to parse"
     34 
     35 
     36     typeset -a _match
     37 
     38     [[ ${_tot} -gt 100 ]] && {
     39         act "operation will take a while, showing progress"
     40         _prog=0
     41         c=0
     42     }
     43 
     44     # learn from senders, recipients or all
     45     _action=${1:-all}
     46     # optional second argument limits parsing to header fields
     47     [[ "$_action" = "all" ]] || _arg="-x $_action"
     48 
     49     act "parsing $_action fields"
     50     _match=()
     51 
     52     for m in ${mailpaths}; do
     53         [[ $global_quit = 1 ]] && break
     54         # use RFC822 parser in fetchaddr
     55         _parsed=`hdr $m | ${WORKDIR}/bin/fetchaddr ${=_arg} -a`
     56         for _p in ${(f)_parsed}; do
     57 
     58             _e="${(Q)_p[(ws:,:)1]:l}"
     59             # check if an email address was found
     60             isemail "$_e" || continue
     61             # extract also the name using comma separator
     62             _n="${(Q)_p[(ws:,:)2]}"
     63             
     64             func "match: ${_n} <$_e>"
     65             _match+=("${_n} <$_e>")
     66         done
     67 
     68         [[ $_tot -gt 100 ]] && {
     69             c=$(( $c + 1 ))
     70             [[ $c -gt 99 ]] && {
     71                 _prog=$(( $_prog + $c ))
     72                 act "$_prog / $_tot processed so far"
     73                 c=1
     74             }
     75         }
     76     done
     77 
     78     _found=0
     79     for _l in ${_match}; do
     80         print - "$_l"
     81         _found=$(( $_found + 1 ))
     82     done
     83 
     84     notice "${#_match} addresses extracted (including duplicates)"
     85 }
     86 
     87 # extract all addresses found into a maildir
     88 extract_maildir() {
     89 	fn extract_maildir $*
     90     md="$1"
     91 	req=(md)
     92 	ckreq || return 1
     93 
     94     ## first arg is a directory
     95     md="$1"
     96     func "extract maildir: $md"
     97     ## extract from a maildir
     98     maildircheck "$md" || return 1
     99     
    100     _action="$2"
    101     case $_action in
    102         all) ;;
    103         recipient) ;;
    104         sender) ;;
    105         *) _action="all" ;;
    106     esac
    107     
    108     # search files
    109     _mails=`find $md -type f`
    110     # search symlinks
    111     _mails+=`find $md -type l`
    112 
    113     stdin="$_mails"; extract_mails "$_action"
    114 
    115     return 0
    116 }
    117 
    118 read_stdin() { # fills global stdin
    119     fn read_stdin $*
    120 
    121     stdin=""
    122     stdin=`cat`
    123     bytesread=${#stdin}
    124     _res=$?
    125 
    126     func "read ${bytesread} bytes from stdin"
    127 
    128 
    129     # hard limit: read in max 10MB
    130     #    sysread -t 10 -c bytesread -s 10000000 stdin
    131     # case $_res in
    132     #     1) warning "read_stdin: there was an error in the parameters to the command." ;;
    133     #     2) warning "read_stdin: there was an error on the read, or on polling the input file descriptor for a timeout." ;;
    134     #     4) warning "read_stdin: the attempt to read timed out." ;;
    135     #     5) warning "read_stdin: no system error occurred, but zero bytes were read." ;;
    136     #     0) return 0 ;;
    137     # esac
    138            
    139 }
    140 
    141 stdin_is_muttpipe() {
    142     fn stdin_is_muttpipe $*
    143     req=(stdin)
    144     ckreq || return 1
    145 
    146     if [[ "${stdin[(w)1]}" = "Date:" ]]; then
    147         return 0
    148     else
    149         return 1
    150     fi
    151 }
    152 
    153 stdin_is_pathlist() {
    154     fn stdin_is_pathlist $*
    155     req=(stdin)
    156     ckreq || return 1
    157 
    158     if [[ -r "${stdin[(w)1]}" ]]; then
    159         return 0
    160     else
    161         return 1
    162     fi
    163 }
    164 
    165 # extract all entries in addressbook or all addresses in a pgp keyring
    166 # or all signatures on a pgp key (even without importing it)
    167 extract_addresses() {
    168     fn extract_addresses $*
    169 
    170     # without arguments just list all entries in the active list
    171     # default is whitelist
    172 
    173     arg=${1}
    174 
    175     # no arg means parse from stdin
    176     stdin=0
    177     [[ "$arg" = "" ]]      && stdin=1
    178     [[ "$arg" = "stdin" ]] && stdin=1
    179     [[ "$arg" = "in" ]]    && stdin=1
    180     
    181     
    182     [[ $stdin = 1 ]] && { 
    183         read_stdin
    184 
    185         # Extract all entries found in stdin. Supports two formats (autodetected)
    186         # 1) list of complete paths to filenames as returned by search
    187         # 2) mbox format big file with special jaromail separator as produced by mutt tagging
    188 
    189         # take first word
    190         if stdin_is_muttpipe; then
    191 
    192             act "stdin seems an email or stream of emails"
    193 
    194             _headers=`print - $stdin | awk '
    195 BEGIN { header=1 }
    196 /JAROMAIL_PIPE_SEPARATOR/ { header=1; next }
    197 /^$/ { header=0; print "\n" }
    198 { if(header==1) { print $0 } }
    199 '`
    200 
    201             e_addr=()
    202             _nextline=0
    203             _gotit=""
    204 
    205             for h in ${(f)_headers}; do
    206 
    207                 [[ "${h[(w)1]}" = "From:" ]] && _nextline=1
    208                 [[ "${h[(w)1]}" = "Subject:" ]] && {
    209                     _nextline=0
    210                     print - ${_gotit} | e_parse
    211 
    212                     _gotit=""
    213                 }
    214                 [[ $_nextline = 1 ]] && _gotit+="$h\n"
    215 
    216             done
    217 
    218             for i in ${(k)e_addr}; do
    219                 print - "${e_addr[$i]} <$i>"
    220             done
    221 
    222 
    223         elif stdin_is_pathlist; then
    224             act "stdin seems a stream of full paths to single email files inside maildirs"
    225             # is a list of files
    226             extract_mails "$2"
    227             _res=$?
    228         else
    229             error "Cannot process stream from stdin, unknown format"
    230             return 1
    231         fi
    232         return $_res
    233     }
    234 
    235     [[ -r "$arg" ]] && {
    236         # if first arg is a file, could be a maildir, a gpg keyring,
    237         # a gpg pubkey or a vcard
    238         
    239         # if first arg is a directory then extract from maildir
    240         [[ -d "$arg" ]] && {
    241             notice "Extracting $2 addresses from maildir $1"
    242             extract_maildir "$1" "$2"
    243             return $?
    244         }
    245 
    246         func "testing argument with file magic"
    247         _magic=`file "$arg"`
    248         _recognized=0
    249 
    250         # a map to eliminate duplicates
    251         typeset -A result
    252 
    253         ######### GPG
    254         # first arg is a GnuPG key ring
    255         [[ "$_magic" =~ "GPG key public ring" ]] && { _recognized=1
    256             notice "Extracting addresses found in GPG keyring: $arg"
    257             _addrs=`gpg --list-keys --with-colons | awk -F: '{print $10}'`
    258             for i in ${(f)_addrs}; do
    259                 [[ $global_quit = 1 ]] && break
    260                 _parsed=`print "From: $i" | ${WORKDIR}/bin/fetchaddr -a -x from`
    261                 _e="${_parsed[(ws:,:)1]:l}"
    262                 isemail "$_e"
    263                 [[ $? = 0 ]] || continue
    264                 # check if the email is not already parsed
    265                 [[ "${result[$_e]}" = "" ]] && {
    266                     _n="${_parsed[(ws:,:)2]}"
    267                     result+=("$_e" "$_n")
    268                     print - "$_n <$_e>"
    269                 }
    270             done
    271         }
    272 
    273         # first arg is a GnuPG public key
    274         [[ "$_magic" =~ "PGP public key" ]] && { _recognized=1
    275             notice "Extracting addresses from sigs on GPG key $arg"
    276             _gpg="gpg --no-default-keyring --keyring $MAILDIRS/cache/pubkey.gpg --batch --with-colons"
    277             ${=rm} $MAILDIRS/cache/pubkey.gpg
    278             ${=_gpg} --import "$arg"
    279             # first make sure all unknown keys are imported
    280             _addrs=`${=_gpg} --list-sigs | awk -F: '{print $5 " " $10}'`
    281             for i in ${(f)_addrs}; do
    282                 [[ $global_quit = 1 ]] && break
    283 
    284                 [[ "$i" =~ "[User ID not found]" ]] && {
    285                     act "looking up: $i"
    286 o                    ${=_gpg} --recv-key ${i[(w)1]}
    287                 }
    288             done
    289             
    290             _addrs=`${=_gpg} --list-sigs | awk -F: '{print $10}'`
    291             for i in ${(f)_addrs}; do
    292                 [[ $global_quit = 1 ]] && break
    293 
    294                 _parsed=`print "From: $i" | ${WORKDIR}/bin/fetchaddr -a -x from`
    295                 _e="${_parsed[(ws:,:)1]:l}"
    296                 isemail "$_e"
    297                 [[ $? = 0 ]] || continue
    298                 # check if the email is not already parsed
    299                 [[ "${result[$_e]}" = "" ]] && {
    300                     _n="${_parsed[(ws:,:)2]}"
    301                     result+=("$_e" "$_n")
    302                     print - "$_n <$_e>"
    303                 }
    304             done
    305         }
    306 
    307         [[ "$_magic" =~ "vCard" ]] && { _recognized=1
    308             # parse the vcard and print a simple name and email list
    309             # each value on a single line, entry tuples followed by a #
    310             # we skip entries that don't have an email
    311             addresses=`awk '
    312 BEGIN { newcard=0; c=0; name=""; email=""; }
    313 /^BEGIN:VCARD/ { newcard=1 }
    314 /^FN:/ { if(newcard = 1) name=$0 }
    315 /^EMAIL/ { if(newcard = 1) email=$0 }
    316 /^END:VCARD/ {
    317   if(newcard = 1) {
    318     newcard=0
    319     if(email != "") {
    320       c+=1
    321       print name
    322       print email
    323       print "# " c
    324     }
    325     email=""
    326     next
    327   }
    328 }
    329 ' $arg | cut -d: -f2`
    330             newa=1; _name=""; _email=""
    331             for a in ${(f)addresses}; do
    332                 [[ $global_quit = 1 ]] && break
    333                 
    334                 [[ "${a[1]}" = "#" ]] && {
    335                     newa=1; # its the end of the entry
    336                     
    337                     # handle lines with multiple emails in vcard
    338                     for ee in ${=_email}; do
    339                         # check if we have this email already
    340                         _e=`print ${ee//\^M/} | extract_emails`
    341                         isemail "$_e"
    342                         [[ $? = 0 ]] || continue
    343                         # check if the email is not already parsed
    344                         [[ "${result[$_e]}" = "" ]] && {
    345                             _n="${_name//\^M/}"
    346                             result+=("$_e" "$_n")
    347                             print - "$_n <$_e>"
    348                         }
    349                     done
    350                     
    351                     continue
    352                 }
    353                 if [[ $newa = 1 ]]; then
    354                     # (V) makes special chars visible, we need to remove them..
    355                     _name="${(V)a[(ws:^:)1]}"; newa=0; continue
    356                 elif [[ $newa = 0 ]]; then
    357                     _email="${(V)a[(ws:^:)1]}"
    358                 fi
    359 
    360             done
    361             
    362          }
    363 
    364          [[ $_recognized = 1 ]] && {
    365              notice "Unique addresses found: ${#result}"
    366              # act "calculating known and new addresses..."
    367              # # counts which addresses are known to us
    368              # _known=0
    369              # for i in ${(k)result}; do
    370              #     [[ $global_quit = 1 ]] && break
    371 
    372              #     lookup_email ${i}
    373              #     [[ $? = 0 ]] || {
    374              #         _known=$(( $_known + 1 )) }
    375              # done
    376              # act "new addresses: $_known"
    377              return 0
    378          }
    379 
    380     } # closes condition in which arg is a file
    381 
    382     # final fallback
    383     # if no file is recognized, use string as search query
    384     error "cannot extract any address from $option_params"
    385     [[ "$_magic" = "" ]] || {
    386         error "file format not supported: ${_magic[(ws@:@)2]}" }
    387     return 1
    388 }
    389 
    390 
    391 extract_headers() {
    392     fn extract_headers $*
    393     # use cat directly, faster than read_stdin
    394     for i in `cat`; do
    395         [[ -r "$i" ]] || {
    396             warning "cannot extract headers, not a file: $i" }
    397         _folder=${i[(ws:/:)-3]}
    398         hdr $i | awk -v folder=$_folder '
    399 BEGIN { date=""; from=""; subj="" }
    400 /^From:/ { from=$NF }
    401 /^Date:/ { date=sprintf("%02d %s %s", $3, $4, $5)}
    402 /^Subject:/ { subj=$0}
    403 END { printf("%s :%s:\t%s\t%s\n", date, folder, from, subj) }'
    404     done
    405 }