Mercurial > hg > egg-tcls
changeset 656:7192d94f8c28
fetch_feeds: Copy improved HTTP request code from urllog script.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Fri, 19 Feb 2021 19:30:30 +0200 |
parents | 4b985abf5aba |
children | f46c152183a2 |
files | fetch_feeds.tcl |
diffstat | 1 files changed, 61 insertions(+), 12 deletions(-) [+] |
line wrap: on
line diff
--- a/fetch_feeds.tcl Fri Feb 19 19:29:51 2021 +0200 +++ b/fetch_feeds.tcl Fri Feb 19 19:30:30 2021 +0200 @@ -46,30 +46,79 @@ upvar $urlData udata upvar $urlMeta umeta - if {[catch {set utoken [::http::geturl $urlStr -timeout 6000 -binary 1 -headers {Accept-Encoding identity}]} uerrmsg]} { + set urlHeaders {} + lappend urlHeaders "Accept-Encoding" "identity" + #lappend urlHeaders "Connection" "keep-alive" + + ### Perform request + if {[catch {set utoken [::http::geturl $urlStr -timeout 6000 -binary 1 -headers $urlHeaders]} uerrmsg]} { puts "HTTP request failed: $uerrmsg" return 0 } + ### Check status set ustatus [::http::status $utoken] - if {$ustatus == "timeout"} { - puts "HTTP request timed out ($urlStr)" + set uscode [::http::code $utoken] + set ucode [::http::ncode $utoken] + + if {$ustatus != "ok"} { + puts "Error in HTTP request: $ustatus / $uscode ($urlStr)" return 0 } - if {$ustatus != "ok"} { - puts "Error in HTTP transaction: [::http::error $utoken] ($urlStr)" - return 0 - } - - set ustatus [::http::status $utoken] - set uscode [::http::code $utoken] - set ucode [::http::ncode $utoken] + ### Get data set udata [::http::data $utoken] array set umeta [::http::meta $utoken] ::http::cleanup $utoken - return 1 + ### Sanitize the metadata KEYS + foreach {ukey uvalue} [array get umeta] { + set ukey [string tolower $ukey] + set umeta($ukey) $uvalue + } + + ### Perform encoding conversion if necessary + if {$ucode >= 200 && $ucode <= 205} { + set uenc_doc "" + set uenc_http "" + set uencoding "" + + if {[info exists umeta(content-type)] && [regexp -nocase {charset\s*=\s*([a-z0-9._-]+)} $umeta(content-type) -> uenc_http]} { + # Found character set encoding information in HTTP headers + } + + if {[regexp -nocase -- "<meta.\*\?content=\"text/html.\*\?charset=(\[^\"\]*)\".\*\?/\?>" $udata -> uenc_doc]} { + # Found old style HTML meta tag with character set information + } elseif {[regexp -nocase -- "<meta.\*\?charset=\"(\[^\"\]*)\".\*\?/\?>" $udata -> uenc_doc]} { + # Found HTML5 style meta tag with character set information + } + + # Make sanitized versions of the encoding strings + set uenc_http2 [fetch_sanitize_encoding $uenc_http] + set uenc_doc2 [fetch_sanitize_encoding $uenc_doc] + + # Check if the document has specified encoding + set uencoding $uenc_http2 + if {$uencoding == "" && $uenc_doc2 != ""} { + set uencoding $uenc_doc2 + } elseif {$uencoding == ""} { + # If _NO_ known encoding of any kind, assume the default of iso8859-1 + set uencoding "iso8859-1" + } + + #puts "Charsets: http='$uenc_http', doc='$uenc_doc' / sanitized http='$uenc_http2', doc='$uenc_doc2' -> '$uencoding'" + + # Get the document title, if any + if {$uencoding != ""} { + if {[catch {set udata [encoding convertfrom $uencoding $udata]} cerrmsg]} { + puts "Error in charset conversion: $urlStr: $cerrmsg" + return 0 + } + } + return 1 + } else { + return 0 + } }