#include "pdfparser.h"
#include "util/stringutil.h"

#include <algorithm>
#include <istream>
#include <stdexcept>
#include <string>
#include <vector>

// Throws std::runtime_error, tagged with the source file and line, when
// `expr` evaluates to false.  Wrapped in do/while(0) so that it behaves like
// a single statement (safe inside an unbraced if/else), and deliberately not
// called `assert` so it cannot collide with the macro from <cassert>.
#define PDF_CHECK(expr) \
  do { \
    if(!bool(expr)) \
      throw std::runtime_error("PDF reading error in " __FILE__ " on line " + tostr(__LINE__)); \
  } while(0)

using namespace std;

namespace {

  // True when `ch` is the end-of-file marker returned by get()/peek().
  bool atEnd(int ch)
  {
    return ch == std::istream::traits_type::eof();
  }

  // True for the PDF white-space characters (NUL, HT, LF, FF, CR, SP).
  bool isWhitespace(char ch)
  {
    return ch == '\0' || ch == '\t' || ch == '\n'
        || ch == '\f' || ch == '\r' || ch == ' ';
  }

  // True for every character that terminates a bare token or a name:
  // white-space, the PDF delimiters, and '#' (the name escape introducer).
  bool isIrregular(char ch)
  {
    switch(ch) {
      case '(': case ')': case '[': case ']': case '{': case '}':
      case '<': case '>': case '/': case '%': case '#':
        return true;
      default:
        return isWhitespace(ch);
    }
  }

  // Value (0..15) of a hexadecimal digit, or -1 when `ch` is not one
  // (this includes the end-of-file marker).
  int hexValue(int ch)
  {
    if(ch >= '0' && ch <= '9') return ch - '0';
    if(ch >= 'a' && ch <= 'f') return ch - 'a' + 10;
    if(ch >= 'A' && ch <= 'F') return ch - 'A' + 10;
    return -1;
  }

  // Consumes a comment: everything up to and including the next CR or LF
  // (or up to the end of the input).
  void skipComment(std::istream& in)
  {
    char ch;
    while(in.get(ch) && ch != '\n' && ch != '\r')
      ;
  }

  // Skips white-space and any number of comments, leaving the stream at the
  // first character of the next token.
  void skipFiller(std::istream& in)
  {
    in >> std::ws;
    while(in.peek() == '%') {
      skipComment(in);
      in >> std::ws;
    }
  }

  // Casts `obj` to T::Ptr; yields a null pointer when `obj` is null or is of
  // a different type.  Guards the cast so it is never applied to a null Ptr.
  template<class T>
  typename T::Ptr castOrNull(PDF::Object::Ptr obj)
  {
    if(!obj) return typename T::Ptr();
    return obj.dyn_cast<T>();
  }

  // Returns the value of the number `obj`, truncated to int.
  // Throws std::runtime_error when `obj` is not a PDF::Real.
  int toInt(PDF::Object::Ptr obj, const char* what)
  {
    PDF::Real::Ptr number = castOrNull<PDF::Real>(obj);
    if(!number)
      throw std::runtime_error(std::string("Expected a number for ") + what + " in pdf");
    return int(number->get_value());
  }

  // Number of cross-reference entries announced by the trailer's /Size
  // entry.  Throws std::runtime_error when it is missing, not a number, or
  // negative.
  int declaredObjectCount(PDF::Dictionary::Ptr trailer)
  {
    PDF_CHECK(trailer);
    const int count = toInt(trailer->get_entry("Size"), "trailer /Size");
    if(count < 0)
      throw std::runtime_error("Negative /Size in pdf trailer");
    return count;
  }

  // Handles the keyword "R": replaces the trailing "num gen" pair of `items`
  // by a single indirect reference.  Throws std::runtime_error unless the
  // last two items are both numbers.
  void collapseReference(std::vector<PDF::Object::Ptr>& items)
  {
    if(items.size() < 2)
      throw std::runtime_error("Malformed reference in pdf: \"R\" must follow two numbers");
    const int gen = toInt(items[items.size() - 1], "reference generation");
    const int num = toInt(items[items.size() - 2], "reference object number");
    items.pop_back();
    items.pop_back();
    items.push_back(PDF::Ref::create(num, gen));
  }

  // Builds the exception reported for an unrecognised bare token.  The
  // stream is expected to be positioned just behind `token`.
  std::runtime_error illegalToken(std::istream& in, const std::string& token)
  {
    return std::runtime_error
      ("Illegal token \"" + token + "\" at pos " +
       tostr(in.tellg() - std::streamoff(token.length())) + " in pdf");
  }

  // Decodes one backslash escape inside a literal string; the backslash
  // itself has already been consumed.  Supports \n \r \t \b \f, up to three
  // octal digits, and backslash + end-of-line (line continuation, which
  // produces nothing).  For any other character the backslash is dropped,
  // which also covers \( \) and \\.
  void appendEscapedChar(std::istream& in, std::string& out)
  {
    const int next = in.get();
    if(atEnd(next))
      throw std::runtime_error("Unterminated string in pdf");

    switch(next) {
      case 'n': out += '\n'; return;
      case 'r': out += '\r'; return;
      case 't': out += '\t'; return;
      case 'b': out += '\b'; return;
      case 'f': out += '\f'; return;
      case '\n': return;
      case '\r': if(in.peek() == '\n') in.get(); return;
      default: break;
    }

    if(next >= '0' && next <= '7') {
      int value = next - '0';
      for(int digits = 1; digits < 3; ++digits) {
        const int digit = in.peek();
        if(digit < '0' || digit > '7') break;
        in.get();
        value = value * 8 + (digit - '0');
      }
      // Only the low byte is kept; overflow of a 3-digit escape is ignored.
      out += std::istream::traits_type::to_char_type(value & 0xFF);
      return;
    }

    out += std::istream::traits_type::to_char_type(next);
  }

  // Reads a literal string "( ... )".  Balanced, unescaped parentheses are
  // part of the string.  Throws std::runtime_error when the input ends
  // before the closing parenthesis (the loop used to spin forever there).
  PDF::String::Ptr readString(std::istream& in)
  {
    PDF_CHECK(in.get() == '(');

    int depth = 0;
    std::string result;
    for(;;) {
      const int next = in.get();
      if(atEnd(next))
        throw std::runtime_error("Unterminated string in pdf");

      const char ch = std::istream::traits_type::to_char_type(next);
      switch(ch) {
        case '(':
          ++depth;
          result += ch;
          break;
        case ')':
          if(--depth < 0)
            return PDF::String::create(result);
          result += ch;
          break;
        case '\\':
          appendEscapedChar(in, result);
          break;
        default:
          result += ch;
      }
    }
  }

  // Reads a hexadecimal string "< ... >".  White-space between digits is
  // ignored; with an odd number of digits the missing last digit counts as 0.
  // Throws std::runtime_error on a non-hex character or when the input ends
  // before the closing '>'.
  PDF::String::Ptr readHexString(std::istream& in)
  {
    PDF_CHECK(in.get() == '<');

    std::string result;
    bool highNibble = true;
    char ch;
    while(in.get(ch)) {
      if(ch == '>')
        return PDF::String::create(result);
      if(isWhitespace(ch))
        continue;

      const int value = hexValue(static_cast<unsigned char>(ch));
      if(value < 0)
        throw std::runtime_error("Illegal character in hex string in pdf");

      if(highNibble)
        result += static_cast<char>(value * 16);
      else {
        char& last = result[result.length() - 1];
        last = static_cast<char>(static_cast<unsigned char>(last) | value);
      }
      highNibble = !highNibble;
    }
    throw std::runtime_error("Read failed while reading hex string from PDF");
  }

  // Reads a bare token (such as "null" or "true") into `token`, stopping in
  // front of the first irregular character.  Uses peek() so that a token
  // ending exactly at the end of the input does not put the stream into a
  // failed state.
  std::istream& getstring(std::istream& in, std::string& token)
  {
    token.clear();
    for(;;) {
      const int next = in.peek();
      if(atEnd(next))
        return in;
      const char ch = std::istream::traits_type::to_char_type(next);
      if(isIrregular(ch))
        return in;
      in.get();
      token += ch;
    }
  }

  // Reads a name "/Name", decoding "#xx" escapes, and stops in front of the
  // first other irregular character (or at the end of the input).
  // Throws std::runtime_error on a malformed "#xx" escape.
  PDF::Name::Ptr readName(std::istream& in)
  {
    PDF_CHECK(in.get() == '/');

    std::string name;
    for(;;) {
      const int next = in.peek();
      if(atEnd(next))
        break;

      const char ch = std::istream::traits_type::to_char_type(next);
      if(ch == '#') {
        in.get();
        // Two separate statements: the digits must be read in order.
        const int high = hexValue(in.get());
        const int low = hexValue(in.get());
        if(high < 0 || low < 0)
          throw std::runtime_error("Illegal #-escape in pdf name");
        name += static_cast<char>(high * 16 + low);
      }
      else if(isIrregular(ch))
        break;
      else {
        in.get();
        name += ch;
      }
    }
    return PDF::Name::create(name);
  }

  // Reads an array "[ ... ]".  A "num gen R" triple is folded into a single
  // indirect reference.
  PDF::Array::Ptr readArray(std::istream& in)
  {
    PDF_CHECK(in.get() == '[');

    std::vector<PDF::Object::Ptr> items;
    for(;;) {
      skipFiller(in);
      switch(in.peek()) {
        case ']': {
          in.get();
          PDF::Array::Ptr array = PDF::Array::create();
          for(std::vector<PDF::Object::Ptr>::const_iterator it = items.begin();
              it != items.end(); ++it)
            array->push_back(*it);
          return array;
        }
        case 'R':
          in.get();
          collapseReference(items);
          break;
        default:
          // readObject either consumes input or throws (also at the end of
          // the input), so this loop always terminates.
          items.push_back(ParsePDF::readObject(in));
      }
    }
  }

  // Reads a number.  Throws std::runtime_error when no number can be
  // extracted (for instance a lone sign).
  PDF::Real::Ptr readNumber(std::istream& in)
  {
    // Todo: create a PDF::Integer if the number is an integer
    // NOTE(review): float cannot represent every integer above 2^24, which
    // matters for large offsets/lengths; widening depends on the signature
    // of PDF::Real::create, which is not visible from this file.
    float num = 0;
    if(!(in >> num))
      throw std::runtime_error("Malformed number in pdf");
    return PDF::Real::create(num);
  }

}

// Opens `filename`, locates and parses the trailer dictionary, and loads the
// cross-reference table the trailer points to.
// Throws std::runtime_error when the file cannot be opened or is malformed.
// Only a single cross-reference subsection is read, and in-use entries must
// have generation 0.
PDFParser::PDFParser(const std::string& filename)
  : source(filename.c_str())
{
  if(!source)
    throw std::runtime_error("Cannot open PDF file: " + filename);

  // Note; this might be bogus for multi-xref documents: it seems I should
  // only look for startxref at the end, the trailer should follow that xref
  // section.
  ParsePDF::findTrailer(source);

  std::string token;
  PDF_CHECK(source >> token >> std::ws && token == "trailer");
  trailer = ParsePDF::readDictionary(source);

  PDF_CHECK(source >> token && token == "startxref");
  int startxref;
  PDF_CHECK(source >> startxref);
  PDF_CHECK(source >> token && token == "%%EOF");

  const int numxref = declaredObjectCount(trailer);

  // The table is collected in a local vector first, so nothing leaks when a
  // check below throws.  Entries the table does not mention stay 0, the
  // value getObject treats as "no such object".
  std::vector<int> offsets(numxref, 0);

  // Reading "%%EOF" may have hit the end of the file; without clearing the
  // state the seek fails on pre-C++11 libraries.
  source.clear();
  source.seekg(startxref);

  int idx, num;
  PDF_CHECK(source >> token && token == "xref" && source >> idx >> num);
  for(int i = 0; i < num; ++i, ++idx) {
    int offset, generation;
    char status;
    PDF_CHECK(source >> offset >> generation >> status);
    if(status == 'n')
      PDF_CHECK(generation == 0);
    PDF_CHECK(idx >= 0 && idx < numxref);
    // For free ('f') entries the first field is the number of the next free
    // object, not a byte offset, so it must not be stored as one.
    offsets[idx] = (status == 'n') ? offset : 0;
  }

  xrefs = new int[numxref];
  std::copy(offsets.begin(), offsets.end(), xrefs);
}

// NOTE(review): the class owns `xrefs` through a raw pointer; copying a
// PDFParser would free the table twice unless the header disables copying.
PDFParser::~PDFParser()
{
  delete[] xrefs;
}

// Resolves an indirect reference.
// Returns `refobj` itself when it is not a reference; otherwise parses and
// returns the referenced object, which is a PDF::Stream (dictionary plus
// data) when the object carries a stream.
// Throws std::runtime_error for unknown object numbers and malformed input.
PDF::Object::Ptr PDFParser::getObject(PDF::Object::Ptr refobj)
{
  PDF::Ref::Ptr ref = castOrNull<PDF::Ref>(refobj);
  if(!ref)
    return refobj;

  const int num = ref->get_num();
  // The object number comes from the file; never index outside the table.
  PDF_CHECK(num >= 0 && num < declaredObjectCount(trailer));
  PDF_CHECK(xrefs[num] != 0);

  source.clear();
  source.seekg(xrefs[num]);

  int objnum, generation;
  std::string token;
  PDF_CHECK(source >> objnum >> generation >> token >> std::ws
            && objnum == num && generation == 0 && token == "obj");

  PDF::Object::Ptr result = ParsePDF::readObject(source);

  // No ">> ws" here: behind "stream" it would eat leading white-space bytes
  // of the stream data, and behind an "endobj" at the very end of the file
  // it would put the stream into a failed state.
  PDF_CHECK(source >> token);
  if(token == "endobj")
    return result;

  if(token == "stream") {
    PDF::Dictionary::Ptr dict = castOrNull<PDF::Dictionary>(result);
    PDF_CHECK(dict /* a stream must be a dictionary */);

    // The keyword is followed by one end-of-line marker (CRLF or LF); the
    // data starts directly behind it.
    if(source.peek() == '\r')
      source.get();
    if(source.peek() == '\n')
      source.get();

    PDF::Stream::Ptr stream = PDF::Stream::create();

    // Copy the dictionary
    for(PDF::Dictionary::const_iterator entry = dict->begin();
        entry != dict->end(); ++entry)
      stream->set_entry(entry->first, entry->second);

    // Copy Length bytes from the input
    PDF::Object::Ptr len_obj = stream->get_entry("Length");
    if(PDF::Ref::Ptr len_ref = castOrNull<PDF::Ref>(len_obj)) {
      // Resolving the reference moves the read position; restore it.
      const std::istream::pos_type data_start = source.tellg();
      len_obj = getObject(len_ref);
      source.clear();
      source.seekg(data_start);
    }

    int len = 0;
    if(PDF::Real::Ptr len_value = castOrNull<PDF::Real>(len_obj))
      len = int(len_value->get_value());
    else
      throw std::runtime_error("Failed to get stream length");
    if(len < 0)
      throw std::runtime_error("Negative stream length in pdf");

    // Todo: This should be a lot more efficient!
    for(int copied = 0; copied < len; ++copied) {
      char ch;
      if(!source.get(ch))
        throw std::runtime_error("Unexpected end of file in pdf stream data");
      stream->data() << ch;
    }
    return stream;
  }

  throw std::runtime_error("Unexpected token: " + token);
}

// Positions `in` at the start of the last "trailer" keyword of the file.
// The tail of the file is searched first (128 bytes), and the search window
// is doubled until the keyword is found or the whole file has been searched.
// Throws std::runtime_error when the keyword is absent or `in` cannot seek.
void ParsePDF::findTrailer(std::istream& in)
{
  static const char keyword[] = "trailer";
  const char* const keyword_end = keyword + sizeof(keyword) - 1;

  in.clear();
  in.seekg(0, std::ios_base::end);
  const std::streamoff file_size = in.tellg();
  if(!in || file_size < 0)
    throw std::runtime_error("Failed to find PDF trailer");

  std::vector<char> window;
  std::streamoff span = 128;  // Start searching last 128 bytes
  for(;;) {
    // Clamp to the start of the file; seeking before it would fail and hide
    // a trailer located in the part that was not searched yet.
    const std::streamoff start = (span < file_size) ? file_size - span : 0;

    window.assign(static_cast<std::vector<char>::size_type>(file_size - start), '\0');
    std::streamsize got = 0;
    in.clear();
    if(!window.empty() && in.seekg(start, std::ios_base::beg)) {
      in.read(&window[0], static_cast<std::streamsize>(window.size()));
      got = in.gcount();
    }
    in.clear();

    // find_end yields the LAST occurrence, i.e. the most recent trailer.
    const std::vector<char>::iterator stop = window.begin() + got;
    const std::vector<char>::iterator hit =
      std::find_end(window.begin(), stop, keyword, keyword_end);
    if(hit != stop) {
      in.seekg(start + std::streamoff(hit - window.begin()), std::ios_base::beg);
      return;
    }

    if(start == 0)
      break;  // searched the entire file
    span *= 2;  // Nothing found, search wider
  }

  throw std::runtime_error("Failed to find PDF trailer");
}

// Reads one object from `in`, which must be positioned at the object's first
// character (leading comments are skipped).
// Returns a null Ptr for the keyword "null".
// Throws std::runtime_error on an unknown token or at the end of the input.
PDF::Object::Ptr ParsePDF::readObject(std::istream& in)
{
  // Skip comments; the object is whatever follows them.
  while(in.peek() == '%') {
    skipComment(in);
    in >> std::ws;
  }

  switch(in.peek()) {
    case '(':
      return readString(in);

    case '/':
      return readName(in);

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
    case '+': case '-': case '.':
      // What about references?
      return readNumber(in);

    case '<': {
      in.get();  // peek one more char ahead
      const int second = in.peek();
      in.putback('<');
      if(second == '<')
        return readDictionary(in);
      return readHexString(in);
    }

    case '[':
      return readArray(in);

    case 'n': {
      std::string token;
      if(getstring(in, token) && token == "null")
        return PDF::Object::Ptr();
      throw illegalToken(in, token);
    }

    case 't':
    case 'f': {
      std::string token;
      PDF_CHECK(getstring(in, token));
      if(token == "true") return PDF::Boolean::create(true);
      if(token == "false") return PDF::Boolean::create(false);
      throw illegalToken(in, token);
    }

    default: {
      // Read the character first and query the position afterwards; inside
      // one expression the order of the two calls would be unspecified.
      const int bad = in.get();
      if(atEnd(bad))
        throw std::runtime_error("Unexpected end of input in pdf");
      const std::string shown(1, std::istream::traits_type::to_char_type(bad));
      throw std::runtime_error
        (std::string("Illegal token \"") + shown + "\" at pos " +
         tostr(in.tellg() - std::streamoff(1)) + " in pdf");
    }
  }
}

// Reads a dictionary "<< /Key value ... >>".  A "num gen R" triple is folded
// into a single indirect reference.
// Throws std::runtime_error when a key is not a name or lacks a value.
PDF::Dictionary::Ptr ParsePDF::readDictionary(std::istream& in)
{
  PDF_CHECK(in.get() == '<' && in.get() == '<');

  typedef std::vector<PDF::Object::Ptr> RVec;
  RVec items;
  for(;;) {
    skipFiller(in);
    switch(in.peek()) {
      case '>': {
        PDF_CHECK(in.get() == '>' && in.get() == '>');

        // End of the dictionary; return it
        if(items.size() % 2 != 0)
          throw std::runtime_error("Malformed dictionary: key without a value");

        PDF::Dictionary::Ptr dict = PDF::Dictionary::create();
        for(RVec::size_type k = 0; k < items.size(); k += 2) {
          PDF::Name::Ptr name = castOrNull<PDF::Name>(items[k]);
          if(!name)
            throw std::runtime_error
              ("Malformed dictionary: keys must be names");
          dict->set_entry(name->get_name(), items[k + 1]);
        }
        return dict;
      }

      case 'R':
        in.get();
        collapseReference(items);
        break;

      default:
        items.push_back(ParsePDF::readObject(in));
    }
  }
}