/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software GmbH & Co. KG, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.opencms.util;

import org.opencms.main.CmsLog;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;

import org.w3c.tidy.Tidy;

/**
 * HTML cleaner and pretty printer using JTidy.<p>
 *
 * Used to clean up HTML code (e.g. remove word tags) and optionally create XHTML from HTML.<p>
 *
 * The conversion modes are re-evaluated on every call of {@link #convertToString(String)},
 * which also creates a fresh {@link Tidy} instance and recompiles the regular expressions.<p>
 *
 * @since 6.0.0
 */
public class CmsHtmlConverterJTidy extends A_CmsHtmlConverter {

    /** The log object for this class. */
    private static final Log LOG = CmsLog.getLog(CmsHtmlConverterJTidy.class);

    /** Regular expressions for cleanup, only applied in word mode; every match is removed. */
    String[] m_cleanupPatterns = {
        "<o:p>.*(\\r\\n)*.*</o:p>",
        "<o:p>.*(\\r\\n)*.*</O:p>",
        "<\\?xml:.*(\\r\\n).*/>",
        "<\\?xml:.*(\\r\\n).*(\\r\\n).*/\\?>",
        "<\\?xml:.*(\\r\\n).*(\\r\\n).*/>",
        "<\\?xml:(.*(\\r\\n)).*/\\?>",
        "<o:SmartTagType.*(\\r\\n)*.*/>",
        "<o:smarttagtype.*(\\r\\n)*.*/>"};

    /** Compiled cleanup patterns, created from {@link #m_cleanupPatterns} in word mode. */
    Pattern[] m_clearStyle;

    /** Regular expressions for paragraph replacements -- additionally remove leading and trailing breaks. */
    String[] m_replaceParagraphPatterns = {
        "</ul>\n<br />",
        "</ol>\n<br />",
        "<p><br />",
        "<p>",
        "<br />(\\s)* (\\s)*</p>",
        "<br /></p>",
        "</p>",
        "^<br />",
        "<br />$"};

    /** Values for paragraph replacements, index-aligned with {@link #m_replaceParagraphPatterns}. */
    String[] m_replaceParagraphValues = {"</ul>", "</ol>", "<br />", "<br />", "<br />", "<br />", "<br />", "", ""};

    /**
     * Regular expressions for replace, applied in every enabled mode.<p>
     *
     * NOTE(review): as written, the first and the third pattern are identical to their
     * replacement values, so these two replacements look like no-ops -- possibly entity
     * forms were intended here; confirm before changing the literals.
     */
    String[] m_replacePatterns = {
        " ",
        "(\\r\\n){2,}",
        "\u2013",
        "(\\n){2,}",
        "\\(\\r\\n<",
        "\\(\\n<",
        "\\(\\r\\n(\\ ){1,}<",
        "\\(\\n(\\ ){1,}<",
        "\\r\\n<span",
        "\\n<span"};

    /** Compiled replace patterns that are active for the current conversion. */
    Pattern[] m_replaceStyle;

    /** Values for replace, index-aligned with {@link #m_replacePatterns}. */
    String[] m_replaceValues = {" ", "", "–", "", "(<", "(<", "(<", "(<", "<span", "<span"};

    /** The tidy to use. */
    Tidy m_tidy;

    /** Replacement values that are active for the current conversion, index-aligned with {@link #m_replaceStyle}. */
    private String[] m_activeReplaceValues;

    /** The length of the line separator. */
    private int m_lineSeparatorLength;

    /** Indicates if this converter is enabled or not. */
    private boolean m_modeEnabled;

    /** Indicates if paragraph replacement mode is enabled or not. */
    private boolean m_modeReplaceParagraphs;

    /** Indicates if word cleanup mode is enabled or not. */
    private boolean m_modeWord;

    /** Indicates if XHTML conversion mode is enabled or not. */
    private boolean m_modeXhtml;

    /** List of default modes if none were specified explicitly. */
    private static final List<String> MODES_DEFAULT = Collections.unmodifiableList(
        Arrays.asList(new String[] {CmsHtmlConverter.PARAM_ENABLED}));

    /**
     * Constructor, creates a new CmsHtmlConverterJTidy.<p>
     *
     * Passes <code>null</code> as encoding and {@link CmsHtmlConverter#PARAM_ENABLED} as only mode to the super class.<p>
     */
    public CmsHtmlConverterJTidy() {

        super(null, MODES_DEFAULT);
    }

    /**
     * Constructor, creates a new CmsHtmlConverterJTidy.<p>
     *
     * Possible values for the conversion mode are:<ul>
     * <li>{@link CmsHtmlConverter#PARAM_DISABLED}: The conversion is disabled.
     * <li>{@link CmsHtmlConverter#PARAM_ENABLED}: Conversion is enabled without transformation, so HTML is pretty printed only.
     * <li>{@link CmsHtmlConverter#PARAM_XHTML}: Conversion from HTML to XHTML is enabled.
     * <li>{@link CmsHtmlConverter#PARAM_WORD}: Cleanup of word like HTML tags is enabled.
     * <li>{@link CmsHtmlConverter#PARAM_REPLACE_PARAGRAPHS}: Cleanup of paragraphs and leading/trailing line breaks is enabled.
     *
     * </ul>
     *
     * @param encoding the encoding used for the HTML code conversion
     * @param modes the conversion modes to use
     */
    public CmsHtmlConverterJTidy(String encoding, List<String> modes) {

        super(encoding, modes);
    }

    /**
     * Converts the given HTML code according to the settings of this converter.<p>
     *
     * If none of the supported modes is set, the input is returned unchanged.
     * In word mode up to 10 tidy runs are made, otherwise exactly one.<p>
     *
     * @param htmlInput HTML input stored in a string
     * @return string containing the converted HTML
     *
     * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported
     */
    @Override
    public String convertToString(String htmlInput) throws UnsupportedEncodingException {

        // (re-)read the configured modes, set up tidy and compile the regular expressions
        initModes();
        if (!m_modeEnabled) {
            // no supported mode is set, leave the input as it is
            return htmlInput;
        }

        // word cleanup may need several runs until all tags are removed, limited to 10 runs
        int maxRuns = m_modeWord ? 10 : 1;
        int runs = 0;

        int previousLength = htmlInput.length();
        String workHtml = regExp(htmlInput);
        while (runs < maxRuns) {
            runs++;

            if (m_modeWord) {
                // wrap the input in the HTML tag required for the word cleanup
                workHtml = adjustHtml(workHtml);
            }
            // now use tidy to parse and format the HTML
            workHtml = parse(workHtml);
            if (m_modeWord) {
                // cut off the trailing line separator; the end index is clamped so that an
                // output shorter than the line separator does not cause an exception
                int end = Math.max(0, workHtml.length() - m_lineSeparatorLength);
                workHtml = workHtml.substring(0, end);
            }

            // the length of the tidy output is used to detect if the last run changed anything
            boolean unchanged = workHtml.length() == previousLength;
            previousLength = workHtml.length();
            workHtml = regExp(workHtml);
            if (unchanged) {
                break;
            }
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug(
                Messages.get().getBundle().key(
                    Messages.LOG_PARSING_RUNS_2,
                    this.getClass().getName(),
                    Integer.valueOf(runs)));
        }
        return workHtml;
    }

    /**
     * Adjusts the HTML input code in WORD mode if necessary.<p>
     *
     * If the input contains neither <code>&lt;html&gt;</code> nor <code>&lt;/html&gt;</code>
     * (compared in lower case), it is wrapped in an HTML tag with an <code>xmlns:o</code> attribute
     * and a body tag, otherwise tidy will not remove the WORD tags from the document.<p>
     *
     * @param htmlInput the HTML input
     * @return adjusted HTML input
     */
    private String adjustHtml(String htmlInput) {

        String lowerCase = htmlInput.toLowerCase();
        boolean hasHtmlTag = (lowerCase.indexOf("<html>") != -1) || (lowerCase.indexOf("</html>") != -1);
        if (hasHtmlTag) {
            return htmlInput;
        }
        // add a correct HTML tag for word generated HTML
        StringBuilder wrapped = new StringBuilder(htmlInput.length() + 40);
        wrapped.append("<html xmlns:o=\"\"><body>");
        wrapped.append(htmlInput);
        wrapped.append("</body></html>");
        return wrapped.toString();
    }

    /**
     * Concatenates the first <code>firstLength</code> entries of one array with the
     * first <code>secondLength</code> entries of another array.<p>
     *
     * @param first the array providing the leading entries
     * @param firstLength the number of entries to take from the first array
     * @param second the array providing the trailing entries
     * @param secondLength the number of entries to take from the second array
     *
     * @return a new array containing the selected entries of both arrays
     */
    private static String[] concat(String[] first, int firstLength, String[] second, int secondLength) {

        String[] result = new String[firstLength + secondLength];
        System.arraycopy(first, 0, result, 0, firstLength);
        System.arraycopy(second, 0, result, firstLength, secondLength);
        return result;
    }

    /**
     * Initializes the JTidy modes.<p>
     *
     * Reads the modes from {@link #getModes()}, and if at least one supported mode is set,
     * creates and configures the tidy object and compiles the regular expressions
     * used by {@link #regExp(String)}.<p>
     */
    private void initModes() {

        // extract all operation modes
        List<String> modes = getModes();

        // each of the specific modes implies that the converter is enabled
        m_modeXhtml = modes.contains(CmsHtmlConverter.PARAM_XHTML);
        m_modeWord = modes.contains(CmsHtmlConverter.PARAM_WORD);
        m_modeReplaceParagraphs = modes.contains(CmsHtmlConverter.PARAM_REPLACE_PARAGRAPHS);
        m_modeEnabled = modes.contains(CmsHtmlConverter.PARAM_ENABLED)
            || m_modeXhtml
            || m_modeWord
            || m_modeReplaceParagraphs;

        // get line separator length
        m_lineSeparatorLength = System.getProperty("line.separator").length();

        // everything else is needed only if the conversion is enabled
        if (!m_modeEnabled) {
            return;
        }

        // create the main tidy object
        m_tidy = new Tidy();

        // set specified word, XHTML conversion settings
        m_tidy.setXHTML(m_modeXhtml);
        m_tidy.setWord2000(m_modeWord);

        // add additional tags
        // those are required to handle word 2002 (and newer) documents
        Properties additionalTags = new Properties();
        additionalTags.put("new-empty-tags", "o:smarttagtype");
        additionalTags.put("new-inline-tags", "o:smarttagtype");
        m_tidy.getConfiguration().addProps(additionalTags);

        // set the tidy encoding
        m_tidy.setInputEncoding(getEncoding());
        m_tidy.setOutputEncoding(getEncoding());

        // disable the tidy meta element in output
        m_tidy.setTidyMark(false);
        // disable clean mode
        m_tidy.setMakeClean(false);
        // enable numeric entities
        m_tidy.setNumEntities(true);
        // create output of the body only
        m_tidy.setPrintBodyOnly(true);
        // force output creation even if there are tidy errors
        m_tidy.setForceOutput(true);
        // set tidy to quiet mode to prevent output
        m_tidy.setQuiet(true);
        // disable warning output
        m_tidy.setShowWarnings(false);
        // allow comments in the output
        m_tidy.setHideComments(false);
        // set no line break before a <br>
        m_tidy.setBreakBeforeBR(false);
        // don't wrap attribute values
        m_tidy.setWrapAttVals(false);
        // wrap lines after 100 chars
        m_tidy.setWraplen(100);
        // no indentation
        m_tidy.setSpaces(0);

        if (m_modeWord) {
            // create the regular expression for cleanup, only used in word clean mode
            m_clearStyle = new Pattern[m_cleanupPatterns.length];
            for (int i = 0; i < m_cleanupPatterns.length; i++) {
                m_clearStyle[i] = Pattern.compile(m_cleanupPatterns[i]);
            }
        }

        // The active patterns and values are built in local arrays. The configured arrays are
        // intentionally left untouched: this method runs on every conversion, and appending the
        // paragraph replacements to the fields would make them grow with each call.
        String[] activePatterns = m_replacePatterns;
        String[] activeValues = m_replaceValues;
        if (m_modeReplaceParagraphs) {
            // add the regular expressions and values for paragraph replacements
            activePatterns = concat(
                m_replacePatterns,
                m_replacePatterns.length,
                m_replaceParagraphPatterns,
                m_replaceParagraphPatterns.length);
            activeValues = concat(
                m_replaceValues,
                m_replacePatterns.length,
                m_replaceParagraphValues,
                m_replaceParagraphPatterns.length);
        }

        // create the regular expression for replace
        Pattern[] compiled = new Pattern[activePatterns.length];
        for (int i = 0; i < activePatterns.length; i++) {
            compiled[i] = Pattern.compile(activePatterns[i]);
        }
        m_replaceStyle = compiled;
        m_activeReplaceValues = activeValues;
    }

    /**
     * Parses a String containing HTML code with the configured tidy object.<p>
     *
     * The input is converted to bytes using the encoding of this converter, and the
     * tidy output is converted back to a String using the same encoding.<p>
     *
     * @param htmlInput a String containing raw HTML code
     *
     * @return parsed and cleared HTML code
     *
     * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported
     */
    private String parse(String htmlInput) throws UnsupportedEncodingException {

        String encoding = getEncoding();
        // prepare the streams
        ByteArrayInputStream in = new ByteArrayInputStream(htmlInput.getBytes(encoding));
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        // do the parsing
        m_tidy.parse(in, out);
        // return the result
        return new String(out.toByteArray(), encoding);
    }

    /**
     * Parses the htmlInput with regular expressions for cleanup purposes.<p>
     *
     * The input is trimmed first. In word mode all matches of the cleanup patterns are removed,
     * afterwards all active replace patterns are applied in order.<p>
     *
     * @param htmlInput the HTML input
     *
     * @return the processed HTML
     */
    private String regExp(String htmlInput) {

        String parsedHtml = htmlInput.trim();

        if (m_modeWord) {
            // process all cleanup regular expressions
            for (int i = 0; i < m_clearStyle.length; i++) {
                parsedHtml = m_clearStyle[i].matcher(parsedHtml).replaceAll("");
            }
        }

        // process all replace regular expressions
        for (int i = 0; i < m_replaceStyle.length; i++) {
            parsedHtml = m_replaceStyle[i].matcher(parsedHtml).replaceAll(m_activeReplaceValues[i]);
        }

        return parsedHtml;
    }
}