001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software GmbH & Co. KG, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.util;
029
030import org.opencms.main.CmsLog;
031
032import java.io.ByteArrayInputStream;
033import java.io.ByteArrayOutputStream;
034import java.io.UnsupportedEncodingException;
035import java.util.Arrays;
036import java.util.Collections;
037import java.util.List;
038import java.util.Properties;
039import java.util.regex.Pattern;
040
041import org.apache.commons.logging.Log;
042
043import org.w3c.tidy.Tidy;
044
045/**
046 * HTML cleaner and pretty printer using JTidy.<p>
047 *
048 * Used to clean up HTML code (e.g. remove word tags) and optionally create XHTML from HTML.<p>
049 *
050 * @since 6.0.0
051 */
052public class CmsHtmlConverterJTidy extends A_CmsHtmlConverter {
053
054    /** The log object for this class. */
055    private static final Log LOG = CmsLog.getLog(CmsHtmlConverterJTidy.class);
056
057    /** Regular expression for cleanup. */
058    String[] m_cleanupPatterns = {
059        "<o:p>.*(\\r\\n)*.*</o:p>",
060        "<o:p>.*(\\r\\n)*.*</O:p>",
061        "<\\?xml:.*(\\r\\n).*/>",
062        "<\\?xml:.*(\\r\\n).*(\\r\\n).*/\\?>",
063        "<\\?xml:.*(\\r\\n).*(\\r\\n).*/>",
064        "<\\?xml:(.*(\\r\\n)).*/\\?>",
065        "<o:SmartTagType.*(\\r\\n)*.*/>",
066        "<o:smarttagtype.*(\\r\\n)*.*/>"};
067
068    /** Patterns for cleanup. */
069    Pattern[] m_clearStyle;
070
071    /** Regular expressions for paragraph replacements -- additionally remove leading and trailing breaks. */
072    String[] m_replaceParagraphPatterns = {
073        "</ul>\n<br />",
074        "</ol>\n<br />",
075        "<p><br />",
076        "<p>",
077        "<br />(\\s)*&nbsp;(\\s)*</p>",
078        "<br /></p>",
079        "</p>",
080        "^<br />",
081        "<br />$"};
082
083    /** Values for paragraph replacements. */
084    String[] m_replaceParagraphValues = {"</ul>", "</ol>", "<br />", "<br />", "<br />", "<br />", "<br />", "", ""};
085
086    /** Regular expression for replace. */
087    String[] m_replacePatterns = {
088        "&#160;",
089        "(\\r\\n){2,}",
090        "\u2013",
091        "(\\n){2,}",
092        "\\(\\r\\n<",
093        "\\(\\n<",
094        "\\(\\r\\n(\\ ){1,}<",
095        "\\(\\n(\\ ){1,}<",
096        "\\r\\n<span",
097        "\\n<span"};
098
099    /** Patterns for replace. */
100    Pattern[] m_replaceStyle;
101
102    /** Values for replace. */
103    String[] m_replaceValues = {"&nbsp;", "", "&ndash;", "", "(<", "(<", "(<", "(<", "<span", "<span"};
104
105    /** The tidy to use. */
106    Tidy m_tidy;
107
108    /** The length of the line separator. */
109    private int m_lineSeparatorLength;
110
111    /** Indicates if this converter is enabled or not. */
112    private boolean m_modeEnabled;
113
114    /** Indicates if paragraph replacement mode is enabled or not. */
115    private boolean m_modeReplaceParagraphs;
116
117    /** Indicates if word cleanup mode is enabled or not. */
118    private boolean m_modeWord;
119
120    /** Indicates if XHTML conversion mode is enabled or not. */
121    private boolean m_modeXhtml;
122
123    /** List of default modes if none were specified explicitly. */
124    private static final List<String> MODES_DEFAULT = Collections.unmodifiableList(
125        Arrays.asList(new String[] {CmsHtmlConverter.PARAM_ENABLED}));
126
127    /**
128     * Constructor, creates a new CmsHtmlConverterJTidy.<p>
129     */
130    public CmsHtmlConverterJTidy() {
131
132        super(null, MODES_DEFAULT);
133    }
134
135    /**
136     * Constructor, creates a new CmsHtmlConverterJTidy.<p>
137     *
138     * Possible values for the conversion mode are:<ul>
139     * <li>{@link CmsHtmlConverter#PARAM_DISABLED}: The conversion is disabled.
140     * <li>{@link CmsHtmlConverter#PARAM_ENABLED}: Conversion is enabled without transformation, so HTML is pretty printed only.
141     * <li>{@link CmsHtmlConverter#PARAM_XHTML}: Conversion from HTML to XHTML is enabled.
142     * <li>{@link CmsHtmlConverter#PARAM_WORD}: Cleanup of word like HTML tags is enabled.
143     * <li>{@link CmsHtmlConverter#PARAM_REPLACE_PARAGRAPHS}: Cleanup of paragraphs and leading/trailing line breaks is enabled.
144     *
145     * </ul>
146     *
147     * @param encoding the encoding used for the HTML code conversion
148     * @param modes the conversion modes to use
149     */
150    public CmsHtmlConverterJTidy(String encoding, List<String> modes) {
151
152        super(encoding, modes);
153    }
154
155    /**
156     * Converts the given HTML code according to the settings of this converter.<p>
157     *
158     * @param htmlInput HTML input stored in a string
159     * @return string containing the converted HTML
160     *
161     * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported
162     */
163    @Override
164    public String convertToString(String htmlInput) throws UnsupportedEncodingException {
165
166        // initialize the modes
167        initModes();
168        // only do parsing if the mode is not set to disabled
169        if (m_modeEnabled) {
170
171            // do a maximum of 10 loops
172            int max = m_modeWord ? 10 : 1;
173            int count = 0;
174
175            // we may have to do several parsing runs until all tags are removed
176            int oldSize = htmlInput.length();
177            String workHtml = regExp(htmlInput);
178            while (count < max) {
179                count++;
180
181                // first add the optional header if in word mode
182                if (m_modeWord) {
183                    workHtml = adjustHtml(workHtml);
184                }
185                // now use tidy to parse and format the HTML
186                workHtml = parse(workHtml);
187                if (m_modeWord) {
188                    // cut off the line separator, which is always appended
189                    workHtml = workHtml.substring(0, workHtml.length() - m_lineSeparatorLength);
190                }
191
192                if (workHtml.length() == oldSize) {
193                    // no change in HTML code after last processing loop
194                    workHtml = regExp(workHtml);
195                    break;
196                }
197                oldSize = workHtml.length();
198                workHtml = regExp(workHtml);
199            }
200            if (LOG.isDebugEnabled()) {
201                LOG.debug(
202                    Messages.get().getBundle().key(
203                        Messages.LOG_PARSING_RUNS_2,
204                        this.getClass().getName(),
205                        new Integer(count)));
206            }
207            htmlInput = workHtml;
208        }
209
210        return htmlInput;
211    }
212
213    /**
214     * Adjusts the HTML input code in WORD mode if necessary.<p>
215     *
216     * When in WORD mode, the HTML tag must contain the xmlns:o="urn:schemas-microsoft-com:office:office"
217     * attribute, otherwise tide will not remove the WORD tags from the document.
218     *
219     * @param htmlInput the HTML input
220     * @return adjusted HTML input
221     */
222    private String adjustHtml(String htmlInput) {
223
224        // check if we have some opening and closing HTML tags
225        if ((htmlInput.toLowerCase().indexOf("<html>") == -1) && (htmlInput.toLowerCase().indexOf("</html>") == -1)) {
226            // add a correct HTML tag for word generated HTML
227            StringBuffer tmp = new StringBuffer();
228            tmp.append("<html xmlns:o=\"\"><body>");
229            tmp.append(htmlInput);
230            tmp.append("</body></html>");
231            htmlInput = tmp.toString();
232        }
233        return htmlInput;
234    }
235
236    /**
237     * Initializes the JTidy modes.<p>
238     */
239    private void initModes() {
240
241        // set all internal modes to disabled
242        m_modeEnabled = false;
243        m_modeReplaceParagraphs = false;
244        m_modeWord = false;
245        m_modeXhtml = false;
246
247        // extract all operation modes
248        List<String> modes = getModes();
249
250        // configure the tidy depending on the operation mode
251        if (modes.contains(CmsHtmlConverter.PARAM_ENABLED)) {
252            m_modeEnabled = true;
253        }
254        if (modes.contains(CmsHtmlConverter.PARAM_XHTML)) {
255            m_modeEnabled = true;
256            m_modeXhtml = true;
257        }
258        if (modes.contains(CmsHtmlConverter.PARAM_WORD)) {
259            m_modeEnabled = true;
260            m_modeWord = true;
261        }
262        if (modes.contains(CmsHtmlConverter.PARAM_REPLACE_PARAGRAPHS)) {
263            m_modeEnabled = true;
264            m_modeReplaceParagraphs = true;
265        }
266
267        // get line separator length
268        m_lineSeparatorLength = System.getProperty("line.separator").length();
269
270        // we need this only if the conversion is enabled
271        if (m_modeEnabled) {
272
273            // create the main tidy object
274            m_tidy = new Tidy();
275
276            // set specified word, XHTML conversion settings
277            m_tidy.setXHTML(m_modeXhtml);
278            m_tidy.setWord2000(m_modeWord);
279
280            // add additional tags
281            // those are required to handle word 2002 (and newer) documents
282            Properties additionalTags = new Properties();
283            additionalTags.put("new-empty-tags", "o:smarttagtype");
284            additionalTags.put("new-inline-tags", "o:smarttagtype");
285            m_tidy.getConfiguration().addProps(additionalTags);
286
287            // set the default tidy configuration
288
289            // set the tidy encoding
290            m_tidy.setInputEncoding(getEncoding());
291            m_tidy.setOutputEncoding(getEncoding());
292
293            // disable the tidy meta element in output
294            m_tidy.setTidyMark(false);
295            // disable clean mode
296            m_tidy.setMakeClean(false);
297            // enable numeric entities
298            m_tidy.setNumEntities(true);
299            // create output of the body only
300            m_tidy.setPrintBodyOnly(true);
301            // force output creation even if there are tidy errors
302            m_tidy.setForceOutput(true);
303            // set tidy to quiet mode to prevent output
304            m_tidy.setQuiet(true);
305            // disable warning output
306            m_tidy.setShowWarnings(false);
307            // allow comments in the output
308            m_tidy.setHideComments(false);
309            // set no line break before a <br>
310            m_tidy.setBreakBeforeBR(false);
311            // don't wrap attribute values
312            m_tidy.setWrapAttVals(false);
313            // warp lines after 100 chars
314            m_tidy.setWraplen(100);
315            // no indentation
316            m_tidy.setSpaces(0);
317
318            if (m_modeWord) {
319                // create the regular expression for cleanup, only used in word clean mode
320                m_clearStyle = new Pattern[m_cleanupPatterns.length];
321                for (int i = 0; i < m_cleanupPatterns.length; i++) {
322                    m_clearStyle[i] = Pattern.compile(m_cleanupPatterns[i]);
323                }
324            }
325
326            // add paragraph replacement regular expression and values if needed
327            if (m_modeReplaceParagraphs) {
328                // add the regular expression and values for paragraph replacements
329                String[] newPatterns = new String[m_replacePatterns.length + m_replaceParagraphPatterns.length];
330                String[] newValues = new String[m_replacePatterns.length + m_replaceParagraphPatterns.length];
331                System.arraycopy(m_replacePatterns, 0, newPatterns, 0, m_replacePatterns.length);
332                System.arraycopy(
333                    m_replaceParagraphPatterns,
334                    0,
335                    newPatterns,
336                    m_replacePatterns.length,
337                    m_replaceParagraphPatterns.length);
338                System.arraycopy(m_replaceValues, 0, newValues, 0, m_replacePatterns.length);
339                System.arraycopy(
340                    m_replaceParagraphValues,
341                    0,
342                    newValues,
343                    m_replacePatterns.length,
344                    m_replaceParagraphPatterns.length);
345                m_replacePatterns = newPatterns;
346                m_replaceValues = newValues;
347            }
348
349            // create the regular expression for replace
350            m_replaceStyle = new Pattern[m_replacePatterns.length];
351            for (int i = 0; i < m_replacePatterns.length; i++) {
352                m_replaceStyle[i] = Pattern.compile(m_replacePatterns[i]);
353            }
354        }
355    }
356
357    /**
358     * Parses a byte array containing HTML code with different parsing modes.<p>
359     *
360     * @param htmlInput a byte array containing raw HTML code
361     *
362     * @return parsed and cleared HTML code
363     *
364     * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported
365     */
366    private String parse(String htmlInput) throws UnsupportedEncodingException {
367
368        // prepare the streams
369        ByteArrayInputStream in = new ByteArrayInputStream(htmlInput.getBytes(getEncoding()));
370        ByteArrayOutputStream out = new ByteArrayOutputStream();
371        // do the parsing
372        m_tidy.parse(in, out);
373        // return the result
374        byte[] result = out.toByteArray();
375        return new String(result, getEncoding());
376    }
377
378    /**
379     * Parses the htmlInput with regular expressions for cleanup purposes.<p>
380     *
381     * @param htmlInput the HTML input
382     *
383     * @return the processed HTML
384     */
385    private String regExp(String htmlInput) {
386
387        String parsedHtml = htmlInput.trim();
388
389        if (m_modeWord) {
390            // process all cleanup regular expressions
391            for (int i = 0; i < m_cleanupPatterns.length; i++) {
392                parsedHtml = m_clearStyle[i].matcher(parsedHtml).replaceAll("");
393            }
394        }
395
396        // process all replace regular expressions
397        for (int i = 0; i < m_replacePatterns.length; i++) {
398            parsedHtml = m_replaceStyle[i].matcher(parsedHtml).replaceAll(m_replaceValues[i]);
399        }
400
401        return parsedHtml;
402    }
403}