2 * Licensed to the University Corporation for Advanced Internet
3 * Development, Inc. (UCAID) under one or more contributor license
4 * agreements. See the NOTICE file distributed with this work for
5 * additional information regarding copyright ownership.
7 * UCAID licenses this file to you under the Apache License,
8 * Version 2.0 (the "License"); you may not use this file except
9 * in compliance with the License. You may obtain a copy of the
12 * http://www.apache.org/licenses/LICENSE-2.0
14 * Unless required by applicable law or agreed to in writing,
15 * software distributed under the License is distributed on an
16 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
17 * either express or implied. See the License for the specific
18 * language governing permissions and limitations under the License.
24 * A thread-safe pool of parsers that share characteristics.
28 #include "exceptions.h"
30 #include "util/CurlURLInputStream.h"
32 #include "util/PathResolver.h"
33 #include "util/ParserPool.h"
34 #include "util/Threads.h"
35 #include "util/XMLHelper.h"
37 #include <sys/types.h>
41 #include <boost/algorithm/string.hpp>
42 #include <boost/bind.hpp>
43 #include <xercesc/util/PlatformUtils.hpp>
44 #include <xercesc/util/XMLUniDefs.hpp>
45 #include <xercesc/sax/SAXException.hpp>
46 #include <xercesc/framework/MemBufInputSource.hpp>
47 #include <xercesc/framework/LocalFileInputSource.hpp>
48 #include <xercesc/framework/Wrapper4InputSource.hpp>
50 using namespace xmltooling::logging;
51 using namespace xmltooling;
52 using namespace xercesc;
53 using namespace boost;
// DOMErrorHandler implementation that writes every reported DOMError to the
// XMLTOOLING_LOGCAT ".ParserPool" log category, including the line and column
// obtained from the error's locator.
58 class MyErrorHandler : public DOMErrorHandler {
// Starts with the error counter at zero.
62 MyErrorHandler() : errors(0) {}
// Logs one DOMError, choosing the log stream and wording from its severity.
64 bool handleError(const DOMError& e)
67 xmltooling::NDC ndc("handleError");
69 Category& log=Category::getInstance(XMLTOOLING_LOGCAT ".ParserPool");
// NOTE(review): locator is dereferenced in each log statement below; confirm
// getLocation() cannot return null here.
71 DOMLocator* locator=e.getLocation();
// Narrow-character form of the message for logging (presumably transcoded by auto_ptr_char).
72 auto_ptr_char temp(e.getMessage());
74 switch (e.getSeverity()) {
// Warnings are written to the warn stream.
75 case DOMError::DOM_SEVERITY_WARNING:
76 log.warnStream() << "warning on line " << locator->getLineNumber()
77 << ", column " << locator->getColumnNumber()
78 << ", message: " << temp.get() << logging::eol;
// Errors are written to the error stream.
81 case DOMError::DOM_SEVERITY_ERROR:
83 log.errorStream() << "error on line " << locator->getLineNumber()
84 << ", column " << locator->getColumnNumber()
85 << ", message: " << temp.get() << logging::eol;
// Fatal errors also use the error stream, with a distinct prefix.
88 case DOMError::DOM_SEVERITY_FATAL_ERROR:
90 log.errorStream() << "fatal error on line " << locator->getLineNumber()
91 << ", column " << locator->getColumnNumber()
92 << ", message: " << temp.get() << logging::eol;
// Remaining case: reported on the error stream as an undefined error type.
97 log.errorStream() << "undefined error type on line " << locator->getLineNumber()
98 << ", column " << locator->getColumnNumber()
99 << ", message: " << temp.get() << logging::eol;
// Constructs a pool, recording the namespace/schema awareness flags and
// creating the pool mutex and a SecurityManager.
// The SecurityManager's entity expansion limit is taken from the
// XMLTOOLING_ENTITY_EXPANSION_LIMIT environment variable or from the
// macro of the same name.
106 ParserPool::ParserPool(bool namespaceAware, bool schemaAware)
107 : m_namespaceAware(namespaceAware), m_schemaAware(schemaAware), m_lock(Mutex::create()), m_security(new SecurityManager()) {
// Runtime override of the limit, read from the process environment.
110 const char* env = getenv("XMLTOOLING_ENTITY_EXPANSION_LIMIT");
// NOTE(review): atoi() yields 0 for non-numeric text; confirm such values fall back to the macro default.
112 expLimit = atoi(env);
// Built-in default supplied by the XMLTOOLING_ENTITY_EXPANSION_LIMIT macro.
115 expLimit = XMLTOOLING_ENTITY_EXPANSION_LIMIT;
116 m_security->setEntityExpansionLimit(expLimit);
// Tears down the pool by calling release() on the parsers it still holds.
119 ParserPool::~ParserPool()
// Loop until the parser stack is empty.
121 while(!m_pool.empty()) {
122 m_pool.top()->release();
// Creates a new DOMDocument using the DOMImplementation that the registry
// returns for a null feature string.
127 DOMDocument* ParserPool::newDocument()
129 return DOMImplementationRegistry::getDOMImplementation(nullptr)->createDocument();
132 #ifdef XMLTOOLING_XERCESC_COMPLIANT_DOMLS
// Parses the supplied DOMLSInput into a DOMDocument using a DOMLSParser
// checked out of the pool. Parsing problems are raised as XMLParserException.
134 DOMDocument* ParserPool::parse(DOMLSInput& domsrc)
// Borrow a parser; the janitor guards it until it is explicitly released
// (presumably so it is freed if it never makes it back into the pool).
136 DOMLSParser* parser=checkoutBuilder();
137 XercesJanitor<DOMLSParser> janitor(parser);
// Route parser diagnostics to the local error handler for this parse.
140 parser->getDomConfig()->setParameter(XMLUni::fgDOMErrorHandler, dynamic_cast<DOMErrorHandler*>(&deh));
141 DOMDocument* doc=parser->parse(&domsrc);
// Raised for XML errors found during the parse; details are in the log.
145 throw XMLParserException("XML error(s) during parsing, check log for specifics");
// Normal completion: detach the error handler, re-assert that the caller
// adopts the document, and hand the parser back to the pool.
147 parser->getDomConfig()->setParameter(XMLUni::fgDOMErrorHandler, (void*)nullptr);
148 parser->getDomConfig()->setParameter(XMLUni::fgXercesUserAdoptsDOMDocument, true);
149 checkinBuilder(janitor.release());
// Xerces exception: restore the parser and check it in, then rethrow as an
// XMLParserException carrying the Xerces message (or "no message").
152 catch (XMLException& ex) {
153 parser->getDomConfig()->setParameter(XMLUni::fgDOMErrorHandler, (void*)nullptr);
154 parser->getDomConfig()->setParameter(XMLUni::fgXercesUserAdoptsDOMDocument, true);
155 checkinBuilder(janitor.release());
156 auto_ptr_char temp(ex.getMessage());
157 throw XMLParserException(string("Xerces error during parsing: ") + (temp.get() ? temp.get() : "no message"));
// Library exception (presumably including the XMLParserException thrown
// above): restore the parser and check it in.
159 catch (XMLToolingException&) {
160 parser->getDomConfig()->setParameter(XMLUni::fgDOMErrorHandler, (void*)nullptr);
161 parser->getDomConfig()->setParameter(XMLUni::fgXercesUserAdoptsDOMDocument, true);
162 checkinBuilder(janitor.release());
// Parses the supplied DOMInputSource into a DOMDocument using a DOMBuilder
// checked out of the pool. Parsing problems are raised as XMLParserException.
169 DOMDocument* ParserPool::parse(DOMInputSource& domsrc)
// Borrow a parser; the janitor guards it until it is explicitly released
// (presumably so it is freed if it never makes it back into the pool).
171 DOMBuilder* parser=checkoutBuilder();
172 XercesJanitor<DOMBuilder> janitor(parser);
// Route parser diagnostics to the local error handler for this parse.
175 parser->setErrorHandler(&deh);
176 DOMDocument* doc=parser->parse(domsrc);
// Raised for XML errors found during the parse; details are in the log.
180 throw XMLParserException("XML error(s) during parsing, check log for specifics");
// Normal completion: detach the error handler, re-assert that the caller
// adopts the document, and hand the parser back to the pool.
182 parser->setErrorHandler(nullptr);
183 parser->setFeature(XMLUni::fgXercesUserAdoptsDOMDocument, true);
184 checkinBuilder(janitor.release());
// Xerces exception: restore the parser and check it in, then rethrow as an
// XMLParserException carrying the Xerces message (or "no message").
187 catch (XMLException& ex) {
188 parser->setErrorHandler(nullptr);
189 parser->setFeature(XMLUni::fgXercesUserAdoptsDOMDocument, true);
190 checkinBuilder(janitor.release());
191 auto_ptr_char temp(ex.getMessage());
192 throw XMLParserException(string("Xerces error during parsing: ") + (temp.get() ? temp.get() : "no message"));
// Library exception (presumably including the XMLParserException thrown
// above): restore the parser and check it in.
194 catch (XMLToolingException&) {
195 parser->setErrorHandler(nullptr);
196 parser->setFeature(XMLUni::fgXercesUserAdoptsDOMDocument, true);
197 checkinBuilder(janitor.release());
// Convenience overload: parses XML read from a std::istream by wrapping the
// stream in a StreamInputSource and delegating to the input-source overload.
204 DOMDocument* ParserPool::parse(istream& is)
206 StreamInputSource src(is);
// Second argument false: the wrapper must not adopt src, which is a local object.
207 Wrapper4InputSource domsrc(&src,false);
208 return parse(domsrc);
// Used with for_each over a map<T,T> to build a schema-location style string:
// for every entry the KEY is appended twice, each copy followed by the
// separator. The mapped value (s.second) is not used.
211 // Functor to double its argument separated by a character and append to a buffer
212 template <class T> class doubleit {
// t is the buffer appended to; s is the separator (kept by reference, so it must outlive the functor).
214 doubleit(T& t, const typename T::value_type& s) : temp(t), sep(s) {}
// Appends "<key><sep><key><sep>" to the buffer.
215 void operator() (const pair<const T,T>& s) { temp += s.first + sep + s.first + sep; }
217 const typename T::value_type& sep;
// Registers a schema file for a namespace.
// nsURI is the namespace used as the map key; pathname is the schema file,
// which is checked for existence with stat before being registered.
// On a failed check an error is logged; otherwise the resolved path is stored
// in m_schemaLocMap and the m_schemaLocations string is rebuilt.
220 bool ParserPool::loadSchema(const XMLCh* nsURI, const XMLCh* pathname)
222 // Just check the pathname and then directly register the pair into the map.
224 auto_ptr_char p(pathname);
// Two spellings of the same existence check (_stat / stat).
// NOTE(review): presumably selected per platform by a preprocessor conditional - confirm.
226 struct _stat stat_buf;
227 if (_stat(p.get(), &stat_buf) != 0)
229 struct stat stat_buf;
230 if (stat(p.get(), &stat_buf) != 0)
234 xmltooling::NDC ndc("loadSchema");
236 Category& log=Category::getInstance(XMLTOOLING_LOGCAT ".ParserPool");
237 auto_ptr_char n(nsURI);
238 log.error("failed to load schema for (%s), file not found (%s)",n.get(),p.get());
242 // Roundtrip to local code page and back to translate path as needed.
243 string topath(p.get());
244 XMLToolingConfig::getConfig().getPathResolver()->resolve(topath, PathResolver::XMLTOOLING_XML_FILE);
245 auto_ptr_XMLCh temp(topath.c_str());
// Record (or overwrite) the namespace -> resolved path mapping.
248 m_schemaLocMap[nsURI] = temp.get();
// Rebuild the schema location string from scratch from the whole map.
249 m_schemaLocations.erase();
250 for_each(m_schemaLocMap.begin(), m_schemaLocMap.end(), doubleit<xstring>(m_schemaLocations,chSpace));
// Loads several XML catalogs named in a single string.
// pathnames is split on any character of PATH_SEPARATOR_STR (adjacent
// separators are compressed) and loadCatalog(const char*) is called on each piece.
// The return value reflects only whether the split produced any entries;
// the results of the individual loadCatalog calls are not inspected.
255 bool ParserPool::loadCatalogs(const char* pathnames)
// NOTE(review): constructing a string from a null pointer is undefined; confirm callers never pass null.
257 string temp(pathnames);
259 vector<string> catpaths;
260 split(catpaths, temp, is_any_of(PATH_SEPARATOR_STR), algorithm::token_compress_on);
// Pointer-to-member selects the const char* overload of loadCatalog for bind.
261 static bool (ParserPool::* lc)(const char*) = &ParserPool::loadCatalog;
262 for_each(catpaths.begin(), catpaths.end(), boost::bind(lc, this, boost::bind(&string::c_str, _1)));
// NOTE(review): boost::split appears to always yield at least one (possibly empty) token, so this may always be true - confirm.
263 return !catpaths.empty();
// Narrow-character overload: resolves the path through the configured
// PathResolver, converts it to XMLCh, and delegates to the XMLCh overload,
// returning that overload's result.
266 bool ParserPool::loadCatalog(const char* pathname)
269 XMLToolingConfig::getConfig().getPathResolver()->resolve(p, PathResolver::XMLTOOLING_XML_FILE);
270 auto_ptr_XMLCh temp(p.c_str());
271 return loadCatalog(temp.get());
// Loads an OASIS XML catalog file and registers its <system> mappings.
// The file is parsed with the parser from XMLToolingConfig, the root element
// is checked against the catalog namespace, and each <system> element's
// systemId -> uri pair is stored in m_schemaLocMap (with the uri run through
// the PathResolver). The m_schemaLocations string is then rebuilt.
// Exceptions derived from std::exception are caught and logged.
274 bool ParserPool::loadCatalog(const XMLCh* pathname)
277 xmltooling::NDC ndc("loadCatalog");
279 Category& log=Category::getInstance(XMLTOOLING_LOGCAT ".ParserPool");
// Element/attribute names and the catalog namespace as XMLCh literals.
282 static const XMLCh catalog[] = UNICODE_LITERAL_7(c,a,t,a,l,o,g);
283 static const XMLCh system[] = UNICODE_LITERAL_6(s,y,s,t,e,m);
284 static const XMLCh systemId[] = UNICODE_LITERAL_8(s,y,s,t,e,m,I,d);
285 static const XMLCh uri[] = UNICODE_LITERAL_3(u,r,i);
// Spells "urn:oasis:names:tc:entity:xmlns:xml:catalog".
286 static const XMLCh CATALOG_NS[] = {
287 chLatin_u, chLatin_r, chLatin_n, chColon,
288 chLatin_o, chLatin_a, chLatin_s, chLatin_i, chLatin_s, chColon,
289 chLatin_n, chLatin_a, chLatin_m, chLatin_e, chLatin_s, chColon,
290 chLatin_t, chLatin_c, chColon,
291 chLatin_e, chLatin_n, chLatin_t, chLatin_i, chLatin_t, chLatin_y, chColon,
292 chLatin_x, chLatin_m, chLatin_l, chLatin_n, chLatin_s, chColon,
293 chLatin_x, chLatin_m, chLatin_l, chColon,
294 chLatin_c, chLatin_a, chLatin_t, chLatin_a, chLatin_l, chLatin_o, chLatin_g, chNull
297 // Parse the catalog with the internal parser pool.
// Only transcode the path for the message when debug logging is on.
299 if (log.isDebugEnabled()) {
300 auto_ptr_char temp(pathname);
301 log.debug("loading XML catalog from %s", temp.get());
// The wrapper does not adopt fsrc (second argument false); fsrc is a local object.
304 LocalFileInputSource fsrc(nullptr,pathname);
305 Wrapper4InputSource domsrc(&fsrc,false);
// The janitor takes charge of the parsed document for the rest of this scope.
307 DOMDocument* doc=XMLToolingConfig::getConfig().getParser().parse(domsrc);
308 XercesJanitor<DOMDocument> janitor(doc);
310 // Check root element.
311 const DOMElement* root=doc->getDocumentElement();
312 if (!XMLHelper::isNodeNamed(root,CATALOG_NS,catalog)) {
313 auto_ptr_char temp(pathname);
314 log.error("unknown root element, failed to load XML catalog from %s", temp.get());
318 // Fetch all the <system> elements.
319 DOMNodeList* mappings = root->getElementsByTagNameNS(CATALOG_NS,system);
// root is reused below as the cursor over each <system> element.
321 for (XMLSize_t i = 0; i < mappings->getLength(); i++) {
322 root = static_cast<DOMElement*>(mappings->item(i));
323 const XMLCh* from = root->getAttributeNS(nullptr,systemId);
324 const XMLCh* to = root->getAttributeNS(nullptr,uri);
326 // Roundtrip to local code page and back to translate path as needed.
327 auto_ptr_char temp(to);
328 string topath(temp.get());
329 XMLToolingConfig::getConfig().getPathResolver()->resolve(topath, PathResolver::XMLTOOLING_XML_FILE);
330 auto_ptr_XMLCh temp2(topath.c_str());
// Record (or overwrite) the systemId -> resolved location mapping.
332 m_schemaLocMap[from] = temp2.get();
// Rebuild the schema location string from the updated map.
334 m_schemaLocations.erase();
335 for_each(m_schemaLocMap.begin(), m_schemaLocMap.end(), doubleit<xstring>(m_schemaLocations,chSpace));
// Failures surfacing as std::exception are logged here.
337 catch (std::exception& e) {
338 log.error("catalog loader caught exception: %s", e.what());
// Entity/resource resolution callback installed on the pooled parsers.
// Two signatures are provided (resolveResource for the DOMLS API,
// resolveEntity otherwise) in front of the same lookup logic:
//   1. exact match of systemId against the keys of m_schemaLocMap;
//   2. a map entry whose mapped location ends with systemId;
//   3. any systemId containing neither '/' nor '\\', resolved against baseURI;
//   4. otherwise the request is blocked by returning an empty in-memory source.
// Each result is a newly allocated Wrapper4InputSource.
345 #ifdef XMLTOOLING_XERCESC_COMPLIANT_DOMLS
346 DOMLSInput* ParserPool::resolveResource(
347 const XMLCh *const resourceType,
348 const XMLCh *const namespaceUri,
349 const XMLCh *const publicId,
350 const XMLCh *const systemId,
351 const XMLCh *const baseURI
354 DOMInputSource* ParserPool::resolveEntity(
355 const XMLCh* const publicId, const XMLCh* const systemId, const XMLCh* const baseURI
360 xmltooling::NDC ndc("resolveEntity");
// NOTE(review): building an xstring from systemId requires it to be non-null; confirm it is checked first.
364 xstring sysId(systemId);
366 Category& log=Category::getInstance(XMLTOOLING_LOGCAT ".ParserPool");
// Inside this block the narrow-character sysId shadows the xstring declared above.
367 if (log.isDebugEnabled()) {
368 auto_ptr_char sysId(systemId);
369 auto_ptr_char base(baseURI);
370 log.debug("asked to resolve %s with baseURI %s",sysId.get(),base.get() ? base.get() : "(null)");
373 // Find well-known schemas in the specified location.
374 map<xstring,xstring>::const_iterator i = m_schemaLocMap.find(sysId);
375 if (i != m_schemaLocMap.end())
376 return new Wrapper4InputSource(new LocalFileInputSource(baseURI, i->second.c_str()));
378 // Check for entity as a suffix of a value in the map.
// Function pointer pins the ends_with overload taking two xstrings so it can be bound.
379 bool (*p_ends_with)(const xstring&, const xstring&) = ends_with;
381 m_schemaLocMap.begin(), m_schemaLocMap.end(),
382 boost::bind(p_ends_with, boost::bind(&map<xstring,xstring>::value_type::second, _1), boost::ref(sysId))
384 if (i != m_schemaLocMap.end())
385 return new Wrapper4InputSource(new LocalFileInputSource(baseURI, i->second.c_str()));
387 // We'll allow anything without embedded slashes.
388 if (XMLString::indexOf(systemId, chForwardSlash) == -1 && XMLString::indexOf(systemId, chBackSlash) == -1)
389 return new Wrapper4InputSource(new LocalFileInputSource(baseURI, systemId));
391 // Shortcircuit the request.
392 auto_ptr_char temp(systemId);
393 log.debug("unauthorized entity request (%s), blocking it", temp.get());
// Zero-length buffer: the parser receives an empty source for the blocked request.
394 static const XMLByte nullbuf[] = {0};
395 return new Wrapper4InputSource(new MemBufInputSource(nullbuf, 0, systemId));
398 #ifdef XMLTOOLING_XERCESC_COMPLIANT_DOMLS
// Creates and configures a new synchronous DOMLSParser for the pool:
// namespace handling, schema validation options, the external schema
// location hint, and this pool as resource resolver and security policy source.
400 DOMLSParser* ParserPool::createBuilder()
// "LS" selects the Load/Save DOM implementation.
402 static const XMLCh impltype[] = { chLatin_L, chLatin_S, chNull };
403 DOMImplementation* impl=DOMImplementationRegistry::getDOMImplementation(impltype);
404 DOMLSParser* parser=static_cast<DOMImplementationLS*>(impl)->createLSParser(DOMImplementationLS::MODE_SYNCHRONOUS,nullptr);
405 parser->getDomConfig()->setParameter(XMLUni::fgDOMNamespaces, m_namespaceAware);
// NOTE(review): the schema/validation parameters below are presumably applied
// only when m_schemaAware is set - confirm the guarding condition.
407 parser->getDomConfig()->setParameter(XMLUni::fgDOMNamespaces, true);
408 parser->getDomConfig()->setParameter(XMLUni::fgXercesSchema, true);
409 parser->getDomConfig()->setParameter(XMLUni::fgDOMValidate, true);
410 parser->getDomConfig()->setParameter(XMLUni::fgXercesCacheGrammarFromParse, true);
412 // We build a "fake" schema location hint that binds each namespace to itself.
413 // This ensures the entity resolver will be given the namespace as a systemId it can check.
414 parser->getDomConfig()->setParameter(XMLUni::fgXercesSchemaExternalSchemaLocation, const_cast<XMLCh*>(m_schemaLocations.c_str()));
// Caller owns parsed documents; default entity resolution is disabled and
// this object is installed as the resource resolver instead.
416 parser->getDomConfig()->setParameter(XMLUni::fgXercesUserAdoptsDOMDocument, true);
417 parser->getDomConfig()->setParameter(XMLUni::fgXercesDisableDefaultEntityResolution, true);
418 parser->getDomConfig()->setParameter(XMLUni::fgDOMResourceResolver, dynamic_cast<DOMLSResourceResolver*>(this));
// Applies the entity expansion limit held by m_security.
419 parser->getDomConfig()->setParameter(XMLUni::fgXercesSecurityManager, m_security.get());
// Obtains a parser for one parse: a freshly created one when the pool is
// empty, otherwise the parser on top of the pool stack.
423 DOMLSParser* ParserPool::checkoutBuilder()
426 if (m_pool.empty()) {
427 DOMLSParser* builder=createBuilder();
430 DOMLSParser* p=m_pool.top();
// Refresh the schema location hint on the reused parser from the current m_schemaLocations.
433 p->getDomConfig()->setParameter(XMLUni::fgXercesSchemaExternalSchemaLocation, const_cast<XMLCh*>(m_schemaLocations.c_str()));
// Returns a parser to the pool by pushing it onto the stack for reuse.
437 void ParserPool::checkinBuilder(DOMLSParser* builder)
441 m_pool.push(builder);
// Creates and configures a new synchronous DOMBuilder for the pool:
// namespace handling, schema validation options, the external schema
// location hint, the security manager, and this pool as entity resolver.
447 DOMBuilder* ParserPool::createBuilder()
// "LS" selects the Load/Save DOM implementation.
449 static const XMLCh impltype[] = { chLatin_L, chLatin_S, chNull };
450 DOMImplementation* impl=DOMImplementationRegistry::getDOMImplementation(impltype);
451 DOMBuilder* parser=static_cast<DOMImplementationLS*>(impl)->createDOMBuilder(DOMImplementationLS::MODE_SYNCHRONOUS,0);
452 parser->setFeature(XMLUni::fgDOMNamespaces, m_namespaceAware);
// NOTE(review): the schema/validation features below are presumably applied
// only when m_schemaAware is set - confirm the guarding condition.
454 parser->setFeature(XMLUni::fgDOMNamespaces, true);
455 parser->setFeature(XMLUni::fgXercesSchema, true);
456 parser->setFeature(XMLUni::fgDOMValidation, true);
457 parser->setFeature(XMLUni::fgXercesCacheGrammarFromParse, true);
459 // We build a "fake" schema location hint that binds each namespace to itself.
460 // This ensures the entity resolver will be given the namespace as a systemId it can check.
461 parser->setProperty(XMLUni::fgXercesSchemaExternalSchemaLocation,const_cast<XMLCh*>(m_schemaLocations.c_str()));
// Applies the entity expansion limit held by m_security.
463 parser->setProperty(XMLUni::fgXercesSecurityManager, m_security.get());
// Caller owns parsed documents; default entity resolution is disabled and
// this object is installed as the entity resolver instead.
464 parser->setFeature(XMLUni::fgXercesUserAdoptsDOMDocument, true);
465 parser->setFeature(XMLUni::fgXercesDisableDefaultEntityResolution, true);
466 parser->setEntityResolver(this);
// Obtains a parser for one parse: a freshly created one when the pool is
// empty, otherwise the parser on top of the pool stack.
470 DOMBuilder* ParserPool::checkoutBuilder()
473 if (m_pool.empty()) {
474 DOMBuilder* builder=createBuilder();
477 DOMBuilder* p=m_pool.top();
// Refresh the schema location hint on the reused parser from the current m_schemaLocations.
480 p->setProperty(XMLUni::fgXercesSchemaExternalSchemaLocation,const_cast<XMLCh*>(m_schemaLocations.c_str()));
// Returns a parser to the pool by pushing it onto the stack for reuse.
484 void ParserPool::checkinBuilder(DOMBuilder* builder)
488 m_pool.push(builder);
// Wraps a std::istream as a Xerces InputSource. The stream is held by
// reference in m_is, so it must outlive this object; systemId is passed to the base class.
494 StreamInputSource::StreamInputSource(istream& is, const char* systemId) : InputSource(systemId), m_is(is)
// Creates a new StreamBinInputStream that reads from the wrapped istream.
// The returned object is heap-allocated for the caller.
498 BinInputStream* StreamInputSource::makeStream() const
500 return new StreamBinInputStream(m_is);
// Binds the adapter to the given istream (by reference) and starts the
// position counter at zero.
503 StreamInputSource::StreamBinInputStream::StreamBinInputStream(istream& is) : m_is(is), m_pos(0)
// Position accessor for the stream adapter.
// NOTE(review): the return type appears to be selected by the
// XMLTOOLING_XERCESC_64BITSAFE conditional, and the value is presumably m_pos - confirm.
507 #ifdef XMLTOOLING_XERCESC_64BITSAFE
512 StreamInputSource::StreamBinInputStream::curPos() const
// Content type accessor, compiled when XMLTOOLING_XERCESC_64BITSAFE is defined.
517 #ifdef XMLTOOLING_XERCESC_64BITSAFE
518 const XMLCh* StreamInputSource::StreamBinInputStream::getContentType() const
// Reads up to maxToRead bytes from the wrapped istream into toFill.
// The count actually read (istream::gcount) is added to both m_pos and the
// local bytes_read tally. An ios_base::failure is caught and logged at
// critical level under the ".StreamInputSource" category.
524 xsecsize_t StreamInputSource::StreamBinInputStream::readBytes(XMLByte* const toFill, const xsecsize_t maxToRead)
526 XMLByte* target=toFill;
527 xsecsize_t bytes_read=0,request=maxToRead;
529 // Fulfill the rest by reading from the stream.
// Skip the read for a zero-length request or a stream already at EOF/failed.
530 if (request && !m_is.eof() && !m_is.fail()) {
532 m_is.read(reinterpret_cast<char* const>(target),request);
533 m_pos+=m_is.gcount();
534 bytes_read+=m_is.gcount();
536 catch(ios_base::failure& e) {
537 Category::getInstance(XMLTOOLING_LOGCAT ".StreamInputSource").critStream()
538 << "XML::StreamInputSource::StreamBinInputStream::readBytes caught an exception: " << e.what()
547 #ifdef XMLTOOLING_LITE
// XMLTOOLING_LITE variant: stores the URL in m_url and passes systemId to
// the InputSource base. cacheTag does not appear in the initializer list.
549 URLInputSource::URLInputSource(const XMLCh* url, const char* systemId, string* cacheTag) : InputSource(systemId), m_url(url)
// Constructs the source from a DOM element: the URL is read from the
// element's "url" attribute, falling back to "uri" when that is missing or empty.
// Throws IOException when no URL is supplied.
553 URLInputSource::URLInputSource(const DOMElement* e, const char* systemId, string* cacheTag) : InputSource(systemId)
555 static const XMLCh uri[] = UNICODE_LITERAL_3(u,r,i);
556 static const XMLCh url[] = UNICODE_LITERAL_3(u,r,l);
// Preferred attribute first.
558 const XMLCh* attr = e->getAttributeNS(nullptr, url);
559 if (!attr || !*attr) {
// Fallback attribute.
560 attr = e->getAttributeNS(nullptr, uri);
562 throw IOException("No URL supplied via DOM to URLInputSource constructor.");
// Returns the input stream produced by m_url.makeNewStream().
568 BinInputStream* URLInputSource::makeStream() const
570 // Ask the URL to create us an appropriate input stream
571 return m_url.makeNewStream();
// Constructs the source from a URL string: records the cache tag pointer and
// the URL, and leaves m_root null so makeStream() uses the URL.
576 URLInputSource::URLInputSource(const XMLCh* url, const char* systemId, string* cacheTag)
577 : InputSource(systemId), m_cacheTag(cacheTag), m_url(url), m_root(nullptr)
// Constructs the source from a DOM element: records the cache tag pointer and
// keeps the element in m_root so makeStream() builds the stream from it.
581 URLInputSource::URLInputSource(const DOMElement* e, const char* systemId, string* cacheTag)
582 : InputSource(systemId), m_cacheTag(cacheTag), m_root(e)
// Creates a new CurlURLInputStream, built from the DOM element when one was
// supplied (m_root non-null) and from the stored URL otherwise. The cache tag
// pointer is passed through in both cases.
586 BinInputStream* URLInputSource::makeStream() const
588 return m_root ? new CurlURLInputStream(m_root, m_cacheTag) : new CurlURLInputStream(m_url.get(), m_cacheTag);
// The element name "URLInputSourceStatus" in narrow-character and XMLCh form.
// The two definitions must spell the same name.
593 const char URLInputSource::asciiStatusCodeElementName[] = "URLInputSourceStatus";
595 const XMLCh URLInputSource::utf16StatusCodeElementName[] = UNICODE_LITERAL_20(U,R,L,I,n,p,u,t,S,o,u,r,c,e,S,t,a,t,u,s);