// -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- // Author: Jose M. Vidal // $Id: iwebstream.C,v 1.17 2003/05/11 18:58:50 jmvidal Exp $ // This code is copyright of Jose M. Vidal and released under // the GNU General Public License // // A class thats is meant to look like an istream, regardless of // whether it opens a local file or an http object. // Sometime later I might also add ftp. // #include #include //#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "iwebstream.H" //#define DEBUG //#define DEBUG_TIMEOUT // The constructors. iwebstream::iwebstream(string url, int timeout) : defaultTimeout(timeout), data(""), position(0), http_proxy(""), http_proxy_port(0), http_proxy_user(""), http_proxy_password("") { init(url); } iwebstream::iwebstream(const string& url, const string& http_proxy, int http_proxy_port, const string& http_proxy_user, const string& http_proxy_password, int timeout) : defaultTimeout(timeout), data(""), position(0), http_proxy(http_proxy), http_proxy_port(http_proxy_port), http_proxy_user(http_proxy_user), http_proxy_password(http_proxy_password) { init(url); } // A method doing initializations common for all constructors void iwebstream::init(const string& origurl) { string url = origurl; string method; string host; string filename; //directory string portString; int port; if (url.substr(0,5) == "http:"){ method = "http"; url = url.substr(5, url.size() - 5); } else if (url.substr(0,5) == "file:") { method = "file"; url = url.substr(5, url.size() - 5); } else if (url.substr(0,5) == "data:") { //a special url "data:stuff" means just use "stuff" as contents. data = url.substr(5,url.length() - 5); return; } else method = "file"; // cout << "url=" << url << endl; if (url.substr(0,2) == "//") { url = url.substr(2, url.size() - 2); string::size_type end = url.size(); if (url.find(":") != string::npos) end = url.find(":"); else if (url.find("/") != string::npos) end = url.find("/"); host = url.substr(0,end); if (end == url.size()){ url = ""; //substr with len 0 core dumps } else { url = url.substr(end, url.size() - end); } } else{ host = ""; if (method == "http") cerr << "Malformed URL = " << origurl << endl; } // cout << "url=" << url << endl; if ((url != "") && (url[0] == ':')) { string::size_type end = url.size(); if (url.find("/") != string::npos) end = url.find("/"); portString = url.substr(1,end -1); port = atoi(portString.c_str()); if (url.size() >= end) url = ""; else url = url.substr(end, url.size() - end); } else { port = 80; portString = "80"; } // cout << "url=" << url << endl; filename = url; if (filename == "") filename = "/"; if (method == "file") { readFile(filename); } else if (method == "http") { //an http request string theHost; int thePort; if (http_proxy == "") { theHost = host; thePort = port; } else { theHost = http_proxy; thePort = (http_proxy_port != 0) ? http_proxy_port : port; } if (connect(theHost, thePort, defaultTimeout) < 0) { cerr << "Could not connect to " << theHost << endl; return; } int count = 0; while ((get(host, portString, filename, data) == -1) && (count++ < 5)) { cerr << "Error downloading, trying again (" << count << "/5)..." << endl; data = ""; if (connect(theHost, thePort, defaultTimeout) < 0){ cerr << "Could not connect to " << theHost << endl; return; } } // cout << "Done with get" << endl; } else { cerr << "Unknown method " << method << " in " << origurl << endl; } } void iwebstream::readFile(string & filename) { ifstream ifs (filename.c_str()); if (ifs == 0) { cerr << "Could not open " << filename << endl; return; } char buffer[4097]; buffer[4096] = 0; // We need a null terminated buffer since the // operator "+" we use takes append the content of // a null terminated C string to a STL string. while ( ifs.read(buffer, sizeof(buffer) - 1) ) { data += buffer; } // Don't forget the last line read! if ( ifs.eof() ) { std::streamsize bytesRead = ifs.gcount(); buffer[bytesRead] = 0; data += buffer; } ifs.close(); } // The connect() method accepts a hostname and port of a WWW server to connect // to, as well as a timeout to be used during gets of pages. It returns 0 // if successful, -1 if not. int iwebstream::connect(string hostName, unsigned short int port, int timeOut) { struct sockaddr_in name; struct hostent *hp; // cout << "connect : " << hostName << ":" << port << " " << timeOut << endl; // Create TCP socket if ((socketDesc = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) { perror("INET Domain Socket"); exit(1); } // Create structure that points to the WWW server name.sin_family = (short int)AF_INET; // A 1 here appears as port 256. name.sin_port = htons(port); // Resolve hostname to an IP address hp = gethostbyname(hostName.c_str()); if (hp == NULL) { socketDesc = -1; cerr << "Could not resolve host name= " << hostName << endl; return (-1); } memcpy(&name.sin_addr.s_addr, hp->h_addr, hp->h_length); // cerr << "h_addr= " << (int)(char)hp->h_addr[0] << "." << (char)hp->h_addr[1] << endl; // cerr << "h_length= " << hp->h_length << endl; // Connect to the host at the specified port if (::connect(socketDesc, (struct sockaddr *) &name, sizeof(name)) < 0) { perror("connect"); socketDesc = -1; cerr << "Could not connect to the host at the given port." << endl; return (-1); } // cerr << "Connected." << endl; // Initialize structures used during the select() system call in the // get() method FD_ZERO(&read_map); FD_SET(socketDesc, &read_map); tval.tv_sec = timeOut; tval.tv_usec = 0; return (0); } const string version = VERSION; // The get() method accepts the path to a WWW page and gets that page from // the specified server. The page is returned by reference in the pageContents // string. // Return -1 if there was an error int iwebstream::get(const string hostname, const string port, const string path, string& pageContents) { string request; char buf[10000]; int socketsReady; int numRead = 0; bool timedOut = false; // Build the request and send it to the WWW server request = "GET "; if ( http_proxy != "" ) { request += "http://" + hostname + ":" + port + path; } else { request += path; } request += " HTTP/1.0\r\nUser-Agent: bk2site/" + version + " (http://bk2site.sourceforge.net/)\r\n" + "Host: " + hostname + ":" + port +"\r\n" + "Accept: */*"; if ( (http_proxy_user != "") ) { string tmp = http_proxy_user + ":" + http_proxy_password; string password = string2base64(tmp); request += "Proxy-Authorization: Basic " + password; } request += "\r\n\r\n"; write(socketDesc, request.c_str(), request.length()); #ifdef DEBUG cout << "----------" << endl << request << "-------------" << endl; cout << "HTTP commands sent, waiting for reply..." << endl; cout << "Timeout = " << tval.tv_sec << endl; #endif // Keep reading data until error, timeout, or we read all the needed bytes. int totalRead = 0; int totalSize = 100000000; int maxReadZero = 100; //max number of times we read 0 bytes before closing connection int numReadZero = 0; //number of times we have have read 0 bytes string lengthKeyword = "Content-Length:"; bool foundHeader = false; string::size_type endHeader; do { do { socketsReady = select(socketDesc + 1, &read_map, NULL, NULL, &tval); } while(socketDesc == -1 && errno == EINTR); // cout << "socketsReady= " << socketsReady << endl; if (socketsReady > 0) { // Data is available on the socket do{ numRead = read(socketDesc, buf, sizeof(buf)); } while (numRead == -1 && errno == EINTR); //I got this from wget source code. if (numRead == 0) numReadZero++; // cout << numRead << "." << numReadZero << endl; if ((numRead != 0) && (numRead != EOF)) { totalRead += numRead; // Append data to the string holding the page contents string newstuff(buf,numRead); // cout << "---" << newstuff; pageContents += newstuff; if (((endHeader = pageContents.find("\r\n\r\n")) != string::npos) && !foundHeader){ foundHeader = true; totalRead -= endHeader; totalRead -= 4; string::size_type lstart = pageContents.find(" "); string resultCode = pageContents.substr(lstart+1,3); if (resultCode != "200"){ //we are only happy with 200, for now lstart = pageContents.find("\r"); cerr << "ERROR: " << pageContents.substr(0,lstart) << endl; timedOut = true; break; } lstart = pageContents.find(lengthKeyword); if (lstart != string::npos){ lstart += lengthKeyword.size(); string::size_type lend = pageContents.find("\r",lstart); string length = pageContents.substr(lstart, lend - lstart); totalSize = atoi(length.c_str()); } } } } else { cerr << "Connection timed out." << endl; // cout << pageContents << endl; timedOut = true; } // cout << "numRead=" << numRead << " totalRead=" << totalRead << " totalSize=" << totalSize << endl << // " numReadZero=" << numReadZero << " maxReadZero=" << maxReadZero << endl; } while ((numRead != -1) && (totalRead < totalSize) && (numReadZero < maxReadZero) && !timedOut); // cout << "Done reading timedOut=" << timedOut << endl; if (numRead == -1){ perror("ERROR"); pageContents = ""; return -1; } if (timedOut) //we return nothing instead of returning part of a page pageContents = ""; ::close(socketDesc); string::size_type pos = pageContents.find("\r\n\r\n"); //get rid of initial cruft if (pos != string::npos){ pos += 4; pageContents = pageContents.substr(pos, pageContents.size() - pos); } return 0; } iwebstream & iwebstream::operator>>(int & t){ string s; operator>>(s); //read it as string t = atoi(s.c_str()); return *this; } iwebstream & iwebstream::operator>>(long int & t){ string s; operator>>(s); //read it as string t = atoi(s.c_str()); return *this; } iwebstream & iwebstream::operator>>(string & s){ s = ""; if (position >= data.size()) //return 0? return *this; while (data[position] != ' ' && data[position] != '\n' && data[position] != '\t') { s += data[position++]; if (position >= data.size()) return *this; } position++; //move beyond whitespace return *this; } void iwebstream::getline(string & s){ s = ""; if (eof()) return; while ((data[position] != '\n') && (position < data.size())) s += data[position++]; position++; //move past /n return; } void iwebstream::putback(char & c){ if (position <= 0) return; data[--position] = c; } char iwebstream::get(){ if (position >= data.size()) return 0; return data[position++]; } bool iwebstream::eof(){ if (position >= data.size()) return true; return false; } //ignore x, assume its a comparison with 0. bool iwebstream::operator==(int x){ if (data == "") return true; else return false; } //from Stroustrup book // int iwebstream::cmpNC(const string& s, const string &s2) // { // string::const_iterator p = s.begin(); // string::const_iterator p2 = s2.begin(); // while (p != s.end() && p2 != s2.end()) { // if (toupper(*p) != toupper(*p2)) // return (toupper(*p) < toupper(*p2)) ? -1 : 1; // ++p; // ++p2; // } // return (s2.size() == s.size()) ? 0 : (s.size() < s2.size()) ? -1 : 1; // } //find s, starting at position p, ignoring case. // return data.size() if not found unsigned int iwebstream::findNC(const string & s, unsigned int p){ unsigned int matchpos = 0; for (; p < data.size() && matchpos < s.size() ; p++) { if (toupper(data[p]) == toupper(s[matchpos])) matchpos++; else matchpos = 0; } if (matchpos == 0) return p; else return p - s.size(); } bool iwebstream::find(const string & s){ position = findNC(s, position); if (position == data.size()) return false; return true; } bool iwebstream::findAndPass(const string & s){ position = findNC(s, position) + s.size(); if (position == data.size()) return false; return true; } //return whatever is between the next begin & end, which must be between the next // "after" and "before". // Case is ignored in comparisions. // If begin is not found, return "". Move position to point to char after end; // If end == "" then just find begin and place position just after it string iwebstream::findTag(const string & begin, const string & end, const string & after, const string & before) { // cout << "findTag begin=" << begin << "=" << endl // << "end=" << end << "=" << endl // << "after=" << after << "=" << endl // << "before=" << before << "=" << endl; unsigned int pos = position; unsigned int afterPos = position; if (after != "") pos = findNC(after, pos); if (pos == data.size()) return ""; afterPos = pos; pos = findNC(begin, pos); if (pos == data.size()) return ""; unsigned int temp = findNC(before, afterPos); if ((before != "") && (pos > findNC(before, afterPos))) //if the begin..end block is after "before". return ""; if (temp == data.size()) return ""; //afterPos not found pos += begin.size(); if (end == "") return ""; unsigned int posend = findNC(end, pos); if (posend == data.size()) return ""; //end not found string result = data.substr(pos, posend-pos); return result; } void iwebstream::string2base64Helper(unsigned char *what, unsigned char *where) { static char convTable[] = { 'A','B','C','D','E','F','G','H', 'I','J','K','L','M','N','O','P', 'Q','R','S','T','U','V','W','X', 'Y','Z','a','b','c','d','e','f', 'g','h','i','j','k','l','m','n', 'o','p','q','r','s','t','u','v', 'w','x','y','z','0','1','2','3', '4','5','6','7','8','9','+','/' }; *where = (*what >> 2) & 63; *(where+1) = ((*what << 4) | (*(what+1) >> 4)) & 63; *(where+2) = ((*(what+1) << 2) | (*(what+2) >> 6)) & 63; *(where+3) = *(what+2) & 63; for (int i = 0; i < 4; ++i) { where[i] = convTable[where[i]]; }; }; string iwebstream::string2base64(const string &in) { char *what = strdup(in.c_str()); int len = strlen(what); int len2 = 0; if ( len % 3 ) { len2 = (len/3 +1) * 4 + 1; } else { len2 = (len/3) * 4 + 1; } char *result = new char[len2]; unsigned char *tmp = (unsigned char *)result; unsigned char four[4]; while ( len >= 3 ) { string2base64Helper((unsigned char *)what, four); for ( int i = 0; i < 4; ++i) { *(tmp++) = four[i]; } len -= 3; what += 3; } if ( len ) { unsigned char three[3] = { 0, 0, 0 }; for (int i = 0; i < len; ++i) { three[i]=*((unsigned char*)what); what++; } string2base64Helper(three ,four); for (int i = 3 - (len==1); i < 4; ++i) { four[i] = '='; } for (int i = 0; i < 4; ++i) { *(tmp++) = four[i]; } } *tmp = 0; return(string(result)); }