001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software GmbH & Co. KG, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.workplace.tools.database;
029
030import org.opencms.file.CmsPropertyDefinition;
031import org.opencms.i18n.CmsEncoder;
032import org.opencms.main.CmsLog;
033import org.opencms.util.CmsStringUtil;
034
035import java.io.ByteArrayInputStream;
036import java.io.ByteArrayOutputStream;
037import java.io.IOException;
038import java.io.InputStream;
039import java.io.PrintWriter;
040import java.io.Reader;
041import java.io.StringReader;
042import java.io.StringWriter;
043import java.io.UnsupportedEncodingException;
044import java.io.Writer;
045import java.util.HashSet;
046import java.util.Hashtable;
047import java.util.StringTokenizer;
048import java.util.regex.Matcher;
049import java.util.regex.Pattern;
050
051import org.w3c.dom.Document;
052import org.w3c.dom.NamedNodeMap;
053import org.w3c.dom.Node;
054import org.w3c.dom.NodeList;
055import org.w3c.tidy.Tidy;
056
057/**
058 * This class implements Html-converting routines based on tidy to modify the
059 * Html code of the imported Html pages.<p>
060 *
061 * @since 6.0.0
062 */
063public class CmsHtmlImportConverter {
064
065    /** defintition of the alt attribute. */
066    private static final String ATTRIB_ALT = "alt";
067
068    /** defintition of the content attribute. */
069    private static final String ATTRIB_CONTENT = "content";
070
071    /** defintition of the href attribute.  */
072    private static final String ATTRIB_HREF = "href";
073
074    /** defintition of the name attribute. */
075    private static final String ATTRIB_NAME = "name";
076
077    /** defintition of the src attribute. */
078    private static final String ATTRIB_SRC = "src";
079
080    /** defintition of the &lt;BODY&gt;&lt;/BODY&gt; node. */
081    private static final String NODE_BODY = "body";
082
083    /** defintition of the &lt;HEAD&gt;&lt;/HEAD&gt; node. */
084    private static final String NODE_HEAD = "head";
085
086    /** defintition of the &lt;A&gt;&lt;/A&gt; node. */
087    private static final String NODE_HREF = "a";
088
089    /** defintition of the &lt;HTML&gt;&lt;/HTML&gt; node. */
090    private static final String NODE_HTML = "html";
091
092    /** defintition of the &lt;IMG&gt;&lt;/IMG&gt; node. */
093    private static final String NODE_IMG = "img";
094
095    /** defintition of the &lt;META&gt;&lt;/META&gt; node. */
096    private static final String NODE_META = "meta";
097
098    /** defintition of the &lt;TITLE&gt;&lt;/TITLE&gt;  node. */
099    private static final String NODE_TITLE = "title";
100
101    /**
102     * HashMap stores tag names, after the end-tag, a "\n" is added to the output.<p>
103     */
104    private HashSet m_enterTags = new HashSet();
105
106    /**
107     * the absolute path in the real filesystem of the file to convert.
108     */
109    private String m_filename;
110
111    /**
112     * reference to the HtmlImport object, required to access the link translation.
113     */
114    private CmsHtmlImport m_htmlImport;
115
116    /**
117     * temporary buffer used in transformation method.
118     */
119    private StringBuffer m_tempString;
120
121    /** instance of JTidy. */
122    private Tidy m_tidy = new Tidy();
123
124    /** flag to write the output. */
125    private boolean m_write;
126
127    /**
128     * Default constructor, creates a new HtmlConverter.<p>
129     *
130     * @param htmlImport reference to the htmlimport
131     * @param xmlMode switch for setting the import to HTML or XML mode
132     */
133    public CmsHtmlImportConverter(CmsHtmlImport htmlImport, boolean xmlMode) {
134
135        m_tidy.setTidyMark(false);
136        m_tidy.setShowWarnings(false);
137        m_tidy.setQuiet(true);
138        m_tidy.setForceOutput(true);
139
140        if (xmlMode) {
141            m_tidy.setXmlTags(xmlMode);
142            m_tidy.setXmlSpace(true);
143        }
144
145        initialiseTags();
146        m_htmlImport = htmlImport;
147    }
148
149    /**
150     * Extracts the content of a HTML page.<p>
151     *
152     * This method should be pretty robust and work even if the input HTML does not contains
153     * the specified matchers.<p>
154     *
155     * @param content the content to extract the body from
156     * @param startpoint the point where matching starts
157     * @param endpoint the point where matching ends
158     * @return the extracted body tag content
159     */
160    public static String extractHtml(String content, String startpoint, String endpoint) {
161
162        /** Regex that matches a start body tag. */
163        Pattern startPattern = Pattern.compile(startpoint, Pattern.CASE_INSENSITIVE);
164
165        /** Regex that matches an end body tag. */
166        Pattern endPattern = Pattern.compile(endpoint, Pattern.CASE_INSENSITIVE);
167
168        Matcher startMatcher = startPattern.matcher(content);
169        Matcher endMatcher = endPattern.matcher(content);
170
171        int start = 0;
172        int end = content.length();
173
174        if (startMatcher.find()) {
175            start = startMatcher.end();
176        }
177
178        if (endMatcher.find(start)) {
179            end = endMatcher.start();
180        }
181
182        return content.substring(start, end);
183    }
184
185    /**
186     * Transforms HTML code into user defined output.<p>
187     *
188     * @param input Reader with HTML code
189     * @param output Writer with transformed code
190     * @param startPattern the start pattern definition for content extracting
191     * @param endPattern the end pattern definition for content extracting
192     * @param properties the file properties
193     */
194    public void convertHTML(Reader input, Writer output, String startPattern, String endPattern, Hashtable properties) {
195
196        /* local variables */
197        StringBuffer htmlString = new StringBuffer();
198        Node node;
199        String outString = "";
200
201        try {
202            /* write InputStream input in StringBuffer htmlString */
203            int c;
204            while ((c = input.read()) != -1) {
205                htmlString.append((char)c);
206            }
207        } catch (IOException e) {
208            if (CmsLog.INIT.isWarnEnabled()) {
209                CmsLog.INIT.warn(
210                    Messages.get().getBundle().key(
211                        Messages.LOG_HTMLIMPORT_CONVERSION_ERROR_0,
212                        e.getLocalizedMessage()));
213            }
214            return;
215        }
216        outString = htmlString.toString();
217        // extract from html if even both patterns are defined
218        if (CmsStringUtil.isNotEmpty(startPattern) && CmsStringUtil.isNotEmpty(endPattern)) {
219            String extractMain = extractHtml(outString, startPattern, endPattern);
220            if (extractMain.length() != outString.length()) {
221                String extractHead = extractHtml(outString, "<html>", CmsStringUtil.BODY_START_REGEX);
222                //String extractHead = extractHtml(extractMain, "<html>", CmsStringUtil.C_BODY_START_REGEX);
223                StringBuffer buffer = new StringBuffer(extractHead.length() + extractMain.length() + 255);
224                buffer.append("<html>");
225                buffer.append(extractHead);
226                buffer.append("<body>");
227                buffer.append(extractMain);
228                buffer.append("</body></html>");
229                outString = buffer.toString();
230            }
231        }
232
233        /* convert htmlString in InputStream for parseDOM */
234        InputStream in;
235        try {
236            in = new ByteArrayInputStream(outString.getBytes(CmsEncoder.ENCODING_UTF_8));
237        } catch (UnsupportedEncodingException e) {
238            // this should never happen since UTF-8 is always supported
239            in = new ByteArrayInputStream(outString.getBytes());
240        }
241        m_tidy.setInputEncoding(CmsEncoder.ENCODING_UTF_8);
242        m_tidy.setOutputEncoding(CmsEncoder.ENCODING_UTF_8);
243
244        // hold tidy error information into a new PrintWriter Object
245        PrintWriter errorLog = new PrintWriter(new ByteArrayOutputStream(), true);
246        m_tidy.setErrout(errorLog);
247
248        node = m_tidy.parseDOM(in, null);
249        /* check if html code has errors */
250        if (m_tidy.getParseErrors() != 0) {
251            if (CmsLog.INIT.isWarnEnabled()) {
252                CmsLog.INIT.warn(Messages.get().getBundle().key(Messages.LOG_HTMLIMPORT_CONVERSION_ERROR_0));
253            }
254        }
255        /* second step: create transformed output with printDocument from DOM */
256        printDocument(node, properties);
257
258        try {
259            String content = m_tempString.toString();
260            content = CmsStringUtil.substitute(content, "<br></br>", "<br>");
261            content = CmsStringUtil.substitutePerl(content, "</a>(\\w+)", "</a> $1", "g");
262            output.write(content);
263            output.close();
264
265        } catch (IOException e) {
266            if (CmsLog.INIT.isWarnEnabled()) {
267                CmsLog.INIT.warn(
268                    Messages.get().getBundle().key(
269                        Messages.LOG_HTMLIMPORT_CONVERSION_ERROR_1,
270                        e.getLocalizedMessage()));
271            }
272            return;
273        }
274    }
275
276    /**
277     * Transforms HTML code into user defined output.<p>
278     *
279     * @param filename the absolute path in the real filesystem of the file to convert
280     * @param inString String with HTML code
281     * @param startPattern the start pattern definition for content extracting
282     * @param endPattern the end pattern definition for content extracting
283     * @param properties the file properties
284     * @return String with transformed code
285     */
286    public String convertHTML(
287        String filename,
288        String inString,
289        String startPattern,
290        String endPattern,
291        Hashtable properties) {
292
293        m_tempString = new StringBuffer();
294        m_write = true;
295        m_filename = filename.replace('\\', '/');
296        Reader in = new StringReader(inString);
297        Writer out = new StringWriter();
298        convertHTML(in, out, startPattern, endPattern, properties);
299        return out.toString();
300    }
301
302    /**
303     * Initialises Vector m_enterTags with tag names.<p>
304     */
305    private void initialiseTags() {
306
307        StringTokenizer T = new StringTokenizer(
308            "p,table,tr,td,body,head,script,pre,title,style,h1,h2,h3,h4,h5,h6,ul,ol,li",
309            ",");
310        while (T.hasMoreTokens()) {
311            m_enterTags.add(T.nextToken());
312        }
313    }
314
315    /**
316     * Private method to parse DOM and create user defined output.<p>
317     *
318     * @param node Node of DOM from HTML code
319     * @param properties the file properties
320     */
321    private void printDocument(Node node, Hashtable properties) {
322
323        // if node is empty do nothing... (Recursion)
324        if (node == null) {
325            return;
326        }
327        // initialise local variables
328        int type = node.getNodeType();
329        String name = node.getNodeName();
330
331        // detect node type
332        switch (type) {
333            case Node.DOCUMENT_NODE:
334
335                printDocument(((Document)node).getDocumentElement(), properties);
336                break;
337            case Node.ELEMENT_NODE:
338
339                // check if its the <head> node. Nothing inside the <head> node
340                // must be
341                // part of the output, but we must scan the content of this
342                // node to get all
343                // <meta> tags
344                if (name.equals(NODE_HEAD)) {
345                    m_write = false;
346                }
347                // scan element node; if a block has to be removed or replaced,
348                // break and discard child nodes
349                transformStartElement(node, properties);
350
351                // test if node has children
352                NodeList children = node.getChildNodes();
353                if (children != null) {
354                    int len = children.getLength();
355                    for (int i = 0; i < len; i++) {
356                        // recursively call printDocument with all child nodes
357                        printDocument(children.item(i), properties);
358                    }
359                }
360                break;
361            case Node.TEXT_NODE:
362
363                // replace subStrings in text nodes
364                transformTextNode(node);
365                break;
366            default:
367
368                break;
369        }
370        // end of recursion, add eventual endtags and suffixes
371        switch (type) {
372            case Node.ELEMENT_NODE:
373                // analyse endtags and add them to output
374                transformEndElement(node);
375                if (node.getNodeName().equals(NODE_HEAD)) {
376                    m_write = true;
377                }
378                break;
379            case Node.DOCUMENT_NODE:
380                break;
381            default:
382                break;
383        }
384    }
385
386    /**
387     * Transform element nodes and create end tags in output.<p>
388     *
389     * @param node actual element node
390     */
391    private void transformEndElement(Node node) {
392
393        // check hat kind of node we have
394        String nodeName = node.getNodeName();
395
396        // the <HTML> and <BODY> node must be skipped
397        if (nodeName.equals(NODE_HTML) || nodeName.equals(NODE_BODY)) {
398            // do nothing here
399        } else {
400            // only do some output if we are in writing mode
401            if (m_write) {
402                m_tempString.append("</");
403                m_tempString.append(nodeName);
404                m_tempString.append(">");
405
406                // append a "\n" to output String if possible
407                if (m_enterTags.contains(node.getNodeName())) {
408                    m_tempString.append("\n");
409                }
410            }
411        }
412    }
413
414    /**
415     * Transforms element nodes and create start tags in output. <p>
416     *
417     * @param node actual element node
418     * @param properties the file properties
419     */
420    private void transformStartElement(Node node, Hashtable properties) {
421
422        // check hat kind of node we have
423        String nodeName = node.getNodeName();
424
425        // the <HTML> and <BODY> node must be skipped
426        if (nodeName.equals(NODE_HTML) || nodeName.equals(NODE_BODY)) {
427            // the <TITLE> node must be read and its value set as properties to
428            // the imported file
429
430        } else if (nodeName.equals(NODE_TITLE)) {
431
432            writeTitleProperty(node, properties);
433
434        } else if (nodeName.equals(NODE_META)) {
435
436            writeMetaTagProperty(node, properties);
437
438        } else if (nodeName.equals(NODE_HREF)) {
439
440            // only do some output if we are in writing mode
441            if (m_write) {
442                m_tempString.append("<");
443                m_tempString.append(nodeName);
444                NamedNodeMap attrs = node.getAttributes();
445                // look through all attribs to find the reference
446                for (int i = attrs.getLength() - 1; i >= 0; i--) {
447                    String name = attrs.item(i).getNodeName();
448                    String value = attrs.item(i).getNodeValue();
449
450                    if (name.equals(ATTRIB_HREF)) {
451
452                        // check if this is an external link
453                        if (value.indexOf("://") > 0) {
454                            // store it for later creation of an entry in the
455                            // link gallery
456                            String externalLinkFile = m_htmlImport.storeExternalLink(value);
457                            if (externalLinkFile != null) {
458                                value = m_htmlImport.getLinkGallery() + externalLinkFile;
459                            }
460                        } else if (!value.startsWith("mailto:") && !value.startsWith("javascript:")) {
461
462                            // save an existing anchor link for later use
463                            //                            if (value.indexOf("#") > 0) {
464                            //                                String anchor = value.substring(value.indexOf("#"), value.length());
465                            //                            }
466                            // get the new link into the VFS
467                            String internalUri = m_htmlImport.getAbsoluteUri(
468                                value,
469                                m_filename.substring(0, m_filename.lastIndexOf("/") + 1));
470
471                            value = m_htmlImport.translateLink(internalUri);
472                        }
473                    }
474
475                    m_tempString.append(" ");
476                    m_tempString.append(name);
477                    m_tempString.append("=\"");
478                    m_tempString.append(value);
479                    m_tempString.append("\"");
480                }
481                m_tempString.append(">");
482            }
483
484            // this is a imasge, its reference must be converted
485        } else if (nodeName.equals(NODE_IMG)) {
486
487            // only do some output if we are in writing mode
488            if (m_write) {
489                m_tempString.append("<");
490                m_tempString.append(nodeName);
491                NamedNodeMap attrs = node.getAttributes();
492                // look through all attribs to find the src and alt attributes
493                String imagename = "";
494                String altText = "";
495                for (int i = attrs.getLength() - 1; i >= 0; i--) {
496                    String name = attrs.item(i).getNodeName();
497                    String value = attrs.item(i).getNodeValue();
498                    if (name.equals(ATTRIB_SRC)) {
499                        // we found the src. now check if it refers to an
500                        // external image.
501                        // if not, we must get the correct location in the VFS
502                        if (value.indexOf("://") <= 0) {
503                            imagename = m_htmlImport.getAbsoluteUri(
504                                value,
505                                m_filename.substring(0, m_filename.lastIndexOf("/") + 1));
506                            value = m_htmlImport.translateLink(imagename);
507                        }
508                    } else if (name.equals(ATTRIB_ALT)) {
509                        altText = value;
510                    }
511
512                    m_tempString.append(" ");
513                    m_tempString.append(name);
514                    m_tempString.append("=\"");
515                    m_tempString.append(value);
516                    m_tempString.append("\"");
517                }
518
519                //store the alt tag of this image for later use
520                m_htmlImport.storeImageInfo(imagename, altText);
521
522                m_tempString.append(">");
523            }
524        } else {
525
526            // only do some output if we are in writing mode
527            if (m_write) {
528
529                m_tempString.append("<");
530                m_tempString.append(nodeName);
531                NamedNodeMap attrs = node.getAttributes();
532                for (int i = attrs.getLength() - 1; i >= 0; i--) {
533                    m_tempString.append(" " + attrs.item(i).getNodeName() + "=" + "\"");
534                    /* scan attribute values and replace subStrings */
535                    m_tempString.append(attrs.item(i).getNodeValue() + "\"");
536                }
537                m_tempString.append(">");
538            }
539        }
540    }
541
542    /**
543     * Private method to transform text nodes.<p>
544     *
545     * @param node actual text node
546     */
547    private void transformTextNode(Node node) {
548
549        // only do some output if we are in writing mode
550        if (m_write) {
551            String helpString = node.getNodeValue();
552            m_tempString.append(helpString);
553        }
554    }
555
556    /**
557     * Writes meta tags as cms properties by analyzing the meta tags nodes.<p>
558     *
559     * @param node the meta tag node in html document
560     * @param properties the properties hashtable
561     */
562    private void writeMetaTagProperty(Node node, Hashtable properties) {
563
564        NamedNodeMap attrs = node.getAttributes();
565        String metaName = "";
566        String metaContent = "";
567        // look through all attribs to find the name and content attributes
568        for (int i = attrs.getLength() - 1; i >= 0; i--) {
569            String name = attrs.item(i).getNodeName();
570            String value = attrs.item(i).getNodeValue();
571            if (name.equals(ATTRIB_NAME)) {
572                metaName = value;
573            } else if (name.equals(ATTRIB_CONTENT)) {
574                metaContent = value;
575            }
576        }
577        // check if we have valid entries for this <META> node, store them
578        // in the properties
579        if ((metaName.length() > 0) && (metaContent.length() > 0)) {
580            properties.put(metaName, CmsStringUtil.substitute(metaContent, "{subst}", "&#"));
581        }
582    }
583
584    /**
585     * Sets the Property title by analyzing the title node.<p>
586     *
587     * @param node the title node in html document
588     * @param properties the properties hashtable
589     */
590    private void writeTitleProperty(Node node, Hashtable properties) {
591
592        String title = "";
593        // the title string is stored in the first child node
594        NodeList children = node.getChildNodes();
595        if (children != null) {
596            Node titleNode = children.item(0);
597            if (titleNode != null) {
598                title = titleNode.getNodeValue();
599            }
600        }
601        // add the title property if we have one
602        if ((title != null) && (title.length() > 0)) {
603
604            properties.put(CmsPropertyDefinition.PROPERTY_TITLE, CmsStringUtil.substitute(title, "{subst}", "&#"));
605            // the title will be used as navtext if no other navtext is
606            // given
607            if (properties.get(CmsPropertyDefinition.PROPERTY_NAVTEXT) == null) {
608                properties.put(
609                    CmsPropertyDefinition.PROPERTY_NAVTEXT,
610                    CmsStringUtil.substitute(title, "{subst}", "&#"));
611            }
612        }
613
614    }
615
616}