/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software GmbH & Co. KG, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

package org.opencms.i18n;

import org.opencms.json.JSONArray;
import org.opencms.json.JSONException;
import org.opencms.main.CmsLog;
import org.opencms.main.OpenCms;
import org.opencms.util.CmsStringUtil;

import java.io.UnsupportedEncodingException;
import java.net.IDN;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;

import com.google.common.collect.Lists;

/**
 * The OpenCms CmsEncoder class provides static methods to decode and encode data.<p>
 *
 * The methods in this class are substitutes for <code>java.net.URLEncoder.encode()</code> and
 * <code>java.net.URLDecoder.decode()</code>. Use the methods from this class in all OpenCms
 * core classes to ensure the encoding is always handled the same way.<p>
 *
 * The de- and encoding uses the same coding mechanism as JavaScript, special characters are
 * replaced with <code>%hex</code> where hex is a two digit hex number.<p>
 *
 * <b>Note:</b> On the client side (browser) instead of using the deprecated <code>escape</code>
 * and <code>unescape</code> JavaScript functions, always use the <code>encodeURIComponent</code> and
 * <code>decodeURIComponent</code> functions. Only these work properly with unicode characters.<p>
 *
 * @since 6.0.0
 */
public final class CmsEncoder {

    /** Non-alphanumeric characters used for Base64 encoding. */
    public static final String BASE64_EXTRA = "+/=";

    /** Characters used as replacements for non-alphanumeric Base64 characters when using Base64 for request parameters. */
    public static final String BASE64_EXTRA_REPLACEMENTS = "-_.";

    /** Constant for the standard <code>ISO-8859-1</code> encoding. */
    public static final String ENCODING_ISO_8859_1 = "ISO-8859-1";

    /** Constant for the standard <code>US-ASCII</code> encoding. */
    public static final String ENCODING_US_ASCII = "US-ASCII";

    /**
     * Constant for the standard <code>UTF-8</code> encoding.<p>
     *
     * Default encoding for JavaScript decodeUriComponent methods is <code>UTF-8</code> by w3c standard.
     */
    public static final String ENCODING_UTF_8 = "UTF-8";

    /** The regex pattern to match numeric HTML entities; group 1 holds the decimal digits. */
    private static final Pattern ENTITY_PATTERN = Pattern.compile("\\&#(\\d+);");

    /** The prefix for HTML entities. */
    private static final String ENTITY_PREFIX = "&#";

    /** The replacement for HTML entity prefix in parameters. */
    private static final String ENTITY_REPLACEMENT = "$$";

    /**
     * Matches the text between <code>&amp;</code> and <code>;</code> of an entity that
     * {@link #escapeHtml(String)} treats as already escaped.<p>
     *
     * The misspelled name <code>quote</code> is still accepted because earlier versions of the
     * check matched it instead of the real entity name <code>quot</code>.
     */
    private static final Pattern ESCAPED_ENTITY_BODY = Pattern.compile("#[0-9]+|lt|gt|amp|quote?");

    /** The log object for this class. */
    private static final Log LOG = CmsLog.getLog(CmsEncoder.class);

    /**
     * A cache for encoding name lookup.<p>
     *
     * NOTE(review): this is a plain <code>HashMap</code> that is written from the static method
     * {@link #lookupEncoding(String, String)} without synchronization - confirm whether
     * concurrent callers are possible.
     */
    private static final Map<String, String> m_encodingCache = new HashMap<String, String>(16);

    /** The random number generator providing the key byte for {@link #obfuscateBytes(byte[])}. */
    private static final Random m_random = new Random();

    /** Matches the text between <code>&amp;</code> and <code>;</code> of a numeric entity. */
    private static final Pattern NUMERIC_ENTITY_BODY = Pattern.compile("#[0-9]+");

    /** The plus entity. */
    private static final String PLUS_ENTITY = ENTITY_PREFIX + "043;";

    /** The UTF-8 charset, used where a charset name lookup would only add an impossible checked exception. */
    private static final Charset UTF_8 = Charset.forName(ENCODING_UTF_8);

    /**
     * Constructor.<p>
     */
    private CmsEncoder() {

        // empty
    }

    /**
     * Adjusts the given String by making sure all characters that can be displayed
     * in the given charset are contained as chars, whereas all other non-displayable
     * characters are converted to HTML entities.<p>
     *
     * Just calls {@link #decodeHtmlEntities(String, String)} first and feeds the result
     * to {@link #encodeHtmlEntities(String, String)}.<p>
     *
     * @param input the input to adjust the HTML encoding for
     * @param encoding the charset to encode the result with
     *
     * @return the input with the decoded/encoded HTML entities
     */
    public static String adjustHtmlEncoding(String input, String encoding) {

        return encodeHtmlEntities(decodeHtmlEntities(input, encoding), encoding);
    }

    /**
     * Changes the encoding of a byte array that represents a String.<p>
     *
     * The input is returned unchanged if one of the encoding names is <code>null</code>,
     * if both names are equal (ignoring case and surrounding white space), or if one of
     * the encodings is not supported.<p>
     *
     * @param input the byte array to convert
     * @param oldEncoding the current encoding of the byte array
     * @param newEncoding the new encoding of the byte array
     *
     * @return the byte array encoded in the new encoding
     */
    public static byte[] changeEncoding(byte[] input, String oldEncoding, String newEncoding) {

        if ((oldEncoding == null) || (newEncoding == null)) {
            return input;
        }
        if (oldEncoding.trim().equalsIgnoreCase(newEncoding.trim())) {
            return input;
        }
        byte[] result = input;
        try {
            result = (new String(input, oldEncoding)).getBytes(newEncoding);
        } catch (UnsupportedEncodingException e) {
            // deliberately ignored: the return value will be the input value
        }
        return result;
    }

    /**
     * Converts the host of an URI to Punycode.<p>
     *
     * This is needed when we want to do redirects to hosts with host names containing international characters like umlauts.<p>
     *
     * The input is returned unchanged if it is <code>null</code>, contains no colon, has no
     * authority part (for example <code>mailto:</code> URIs), or can not be parsed / converted
     * (in which case the problem is logged).<p>
     *
     * @param uriString the URI
     * @return the converted URI
     */
    public static String convertHostToPunycode(String uriString) {

        if ((uriString == null) || (uriString.indexOf(':') < 0)) {
            return uriString;
        }
        try {
            URI uri = new URI(uriString);
            String authority = uri.getAuthority(); // getHost won't work when we have special characters
            if (authority == null) {
                // opaque or authority-less URI, there is no host to convert
                return uriString;
            }
            int colonPos = authority.indexOf(':');
            if (colonPos >= 0) {
                // only convert the part before the port separator
                authority = IDN.toASCII(authority.substring(0, colonPos)) + authority.substring(colonPos);
            } else {
                authority = IDN.toASCII(authority);
            }
            URI uriWithCorrectedHost = new URI(
                uri.getScheme(),
                authority,
                uri.getPath(),
                uri.getQuery(),
                uri.getFragment());
            return uriWithCorrectedHost.toASCIIString();
        } catch (URISyntaxException e) {
            LOG.error(e.getLocalizedMessage(), e);
        } catch (IllegalArgumentException e) {
            // thrown by IDN.toASCII if the authority can not be converted
            LOG.error(e.getLocalizedMessage(), e);
        }
        return uriString;
    }

    /**
     * Creates a String out of a byte array with the specified encoding, falling back
     * to the system default in case the encoding name is not valid.<p>
     *
     * Use this method as a replacement for <code>new String(byte[], encoding)</code>
     * to avoid possible encoding problems.<p>
     *
     * @param bytes the bytes to decode
     * @param encoding the encoding scheme to use for decoding the bytes
     *
     * @return the bytes decoded to a String
     */
    public static String createString(byte[] bytes, String encoding) {

        String enc = encoding;
        // the configured default encoding is used directly, anything else is resolved first
        if (!enc.equals(OpenCms.getSystemInfo().getDefaultEncoding())) {
            enc = lookupEncoding(enc, null);
        }
        if (enc != null) {
            try {
                return new String(bytes, enc);
            } catch (UnsupportedEncodingException e) {
                // this can _never_ happen since the charset was looked up first
            }
        } else {
            if (LOG.isWarnEnabled()) {
                LOG.warn(Messages.get().getBundle().key(Messages.ERR_UNSUPPORTED_VM_ENCODING_1, encoding));
            }
            enc = OpenCms.getSystemInfo().getDefaultEncoding();
            try {
                return new String(bytes, enc);
            } catch (UnsupportedEncodingException e) {
                // this can also _never_ happen since the default encoding is always valid
            }
        }
        // this code is unreachable in practice
        LOG.error(Messages.get().getBundle().key(Messages.ERR_ENCODING_ISSUES_1, encoding));
        return null;
    }

    /**
     * Decodes a String using UTF-8 encoding, which is the standard for http data transmission
     * with GET and POST requests.<p>
     *
     * @param source the String to decode
     *
     * @return String the decoded source String
     */
    public static String decode(String source) {

        return decode(source, ENCODING_UTF_8);
    }

    /**
     * This method is a substitute for <code>URLDecoder.decode()</code>.
     * Use this in all OpenCms core classes to ensure the encoding is
     * always handled the same way.<p>
     *
     * In case you don't know what encoding to use, set the value of
     * the <code>encoding</code> parameter to <code>null</code>.
     * This method will then default to UTF-8 encoding, which is probably the right one.<p>
     *
     * @param source The string to decode
     * @param encoding The encoding to use (if <code>null</code> or not supported, UTF-8 is used)
     *
     * @return The decoded source String, or <code>null</code> if the source was <code>null</code>
     */
    public static String decode(String source, String encoding) {

        if (source == null) {
            return null;
        }
        if (encoding != null) {
            try {
                return URLDecoder.decode(source, encoding);
            } catch (java.io.UnsupportedEncodingException e) {
                // will fallback to default
            }
        }
        // fallback to default decoding
        try {
            return URLDecoder.decode(source, ENCODING_UTF_8);
        } catch (java.io.UnsupportedEncodingException e) {
            // ignore
        }
        return source;
    }

    /**
     * Decodes HTML entity references like <code>&amp;#8364;</code> that are contained in the
     * String to a regular character, but only if that character is contained in the given
     * encodings charset.<p>
     *
     * Entities that do not denote a valid Unicode code point, or whose character can not be
     * encoded in the given charset, are left in the result unchanged.<p>
     *
     * @param input the input to decode the HTML entities in
     * @param encoding the charset to decode the input for
     * @return the input with the decoded HTML entities
     *
     * @see #encodeHtmlEntities(String, String)
     */
    public static String decodeHtmlEntities(String input, String encoding) {

        Matcher matcher = ENTITY_PATTERN.matcher(input);
        // Matcher#appendReplacement(StringBuffer, String) is the variant available on all Java versions
        StringBuffer result = new StringBuffer(input.length());
        Charset charset = Charset.forName(encoding);
        CharsetEncoder encoder = charset.newEncoder();

        while (matcher.find()) {
            // by default the entity is kept as it is
            String replacement = matcher.group();
            int codePoint = parseCodePoint(matcher.group(1));
            if ((codePoint >= 0) && (codePoint < 128)) {
                // first 128 chars are contained in almost every charset,
                // this is intended as performance improvement since
                // the canEncode() operation appears quite CPU heavy
                replacement = String.valueOf((char)codePoint);
            } else if (codePoint >= 128) {
                // may be a surrogate pair for characters outside of the basic multilingual plane
                String decoded = new String(Character.toChars(codePoint));
                if (encoder.canEncode(decoded)) {
                    replacement = decoded;
                }
            }
            // quote the replacement, otherwise a decoded '$' or '\' is interpreted as group reference
            matcher.appendReplacement(result, Matcher.quoteReplacement(replacement));
        }
        matcher.appendTail(result);
        return result.toString();
    }

    /**
     * Decodes a string used as parameter in an uri in a way independent of other encodings/decodings applied before.<p>
     *
     * @param input the encoded parameter string
     *
     * @return the decoded parameter string
     *
     * @see #encodeParameter(String)
     */
    public static String decodeParameter(String input) {

        String result = CmsStringUtil.substitute(input, ENTITY_REPLACEMENT, ENTITY_PREFIX);
        return CmsEncoder.decodeHtmlEntities(result, OpenCms.getSystemInfo().getDefaultEncoding());
    }

    /**
     * Decodes a parameter which has been encoded from a string list using encodeStringsAsBase64Parameter.<p>
     *
     * @param data the data to decode
     * @return the list of strings
     *
     * @throws IllegalArgumentException if the data can not be decoded to a list of strings
     */
    public static List<String> decodeStringsFromBase64Parameter(String data) {

        // undo the replacement of the characters which are problematic in request parameters
        String base64 = StringUtils.replaceChars(data, BASE64_EXTRA_REPLACEMENTS, BASE64_EXTRA);
        byte[] decoded = Base64.decodeBase64(base64);
        if ((decoded == null) || (decoded.length == 0)) {
            // at least the key byte written by obfuscateBytes must be present
            throw new IllegalArgumentException("Decoding failed: " + base64);
        }
        byte[] bytes = deobfuscateBytes(decoded);
        try {
            JSONArray json = new JSONArray(new String(bytes, UTF_8));
            List<String> result = Lists.newArrayList();
            for (int i = 0; i < json.length(); i++) {
                result.add(json.getString(i));
            }
            return result;
        } catch (JSONException e) {
            throw new IllegalArgumentException("Decoding failed: " + base64, e);
        }
    }

    /**
     * Encodes a String using UTF-8 encoding, which is the standard for http data transmission
     * with GET and POST requests.<p>
     *
     * @param source the String to encode
     *
     * @return String the encoded source String
     */
    public static String encode(String source) {

        return encode(source, ENCODING_UTF_8);
    }

    /**
     * This method is a substitute for <code>URLEncoder.encode()</code>.
     * Use this in all OpenCms core classes to ensure the encoding is
     * always handled the same way.<p>
     *
     * In case you don't know what encoding to use, set the value of
     * the <code>encoding</code> parameter to <code>null</code>.
     * This method will then default to UTF-8 encoding, which is probably the right one.<p>
     *
     * @param source the String to encode
     * @param encoding the encoding to use (if <code>null</code> or not supported, UTF-8 is used)
     *
     * @return the encoded source String, or <code>null</code> if the source was <code>null</code>
     */
    public static String encode(String source, String encoding) {

        if (source == null) {
            return null;
        }
        if (encoding != null) {
            try {
                return URLEncoder.encode(source, encoding);
            } catch (java.io.UnsupportedEncodingException e) {
                // will fallback to default
            }
        }
        // fallback to default encoding
        try {
            return URLEncoder.encode(source, ENCODING_UTF_8);
        } catch (java.io.UnsupportedEncodingException e) {
            // ignore
        }
        return source;
    }

    /**
     * Encodes all characters that are contained in the String which can not be displayed
     * in the given encodings charset with HTML entity references
     * like <code>&amp;#8364;</code>.<p>
     *
     * This is required since a Java String is
     * internally always stored as Unicode, meaning it can contain almost every character, but
     * the HTML charset used might not support all such characters.<p>
     *
     * Characters outside of the basic multilingual plane (stored as surrogate pair in the String)
     * are written as one entity holding the code point.<p>
     *
     * @param input the input to encode for HTML
     * @param encoding the charset to encode the result with
     *
     * @return the input with the encoded HTML entities
     *
     * @see #decodeHtmlEntities(String, String)
     */
    public static String encodeHtmlEntities(String input, String encoding) {

        StringBuilder result = new StringBuilder(input.length() * 2);
        Charset charset = Charset.forName(encoding);
        CharsetEncoder encoder = charset.newEncoder();
        int length = input.length();
        int pos = 0;
        while (pos < length) {
            int codePoint = input.codePointAt(pos);
            // 2 for a valid surrogate pair, 1 for everything else
            int charCount = Character.charCount(codePoint);
            if (codePoint < 128) {
                // first 128 chars are contained in almost every charset,
                // this is intended as performance improvement since
                // the canEncode() operation appears quite CPU heavy
                result.append((char)codePoint);
            } else {
                boolean encodable = (charCount == 1)
                ? encoder.canEncode((char)codePoint)
                : encoder.canEncode(input.subSequence(pos, pos + charCount));
                if (encodable) {
                    // encoder can encode this character
                    result.append(input, pos, pos + charCount);
                } else {
                    // append HTML entity reference
                    result.append(ENTITY_PREFIX);
                    result.append(codePoint);
                    result.append(';');
                }
            }
            pos += charCount;
        }
        return result.toString();
    }

    /**
     * Encodes all characters that are contained in the String which can not be displayed
     * in the given encodings charset with Java escaping like <code>\u20ac</code>.<p>
     *
     * This can be used to escape values used in Java property files.<p>
     *
     * @param input the input to encode for Java
     * @param encoding the charset to encode the result with
     *
     * @return the input with the encoded Java entities
     */
    public static String encodeJavaEntities(String input, String encoding) {

        StringBuilder result = new StringBuilder(input.length() * 2);
        CharBuffer buffer = CharBuffer.wrap(input.toCharArray());
        Charset charset = Charset.forName(encoding);
        CharsetEncoder encoder = charset.newEncoder();
        for (int i = 0; i < buffer.length(); i++) {
            int c = buffer.get(i);
            if (c < 128) {
                // first 128 chars are contained in almost every charset,
                // this is intended as performance improvement since
                // the canEncode() operation appears quite CPU heavy
                result.append((char)c);
            } else if (encoder.canEncode((char)c)) {
                // encoder can encode this char
                result.append((char)c);
            } else {
                // append Java entity reference, the hex value is left-padded with zeros to 4 digits
                result.append("\\u");
                String hex = Integer.toHexString(c);
                int pad = 4 - hex.length();
                for (int p = 0; p < pad; p++) {
                    result.append('0');
                }
                result.append(hex);
            }
        }
        return result.toString();
    }

    /**
     * Encodes a string used as parameter in an uri in a way independent of other encodings/decodings applied later.<p>
     *
     * Used to ensure that GET parameters are not wrecked by wrong or incompatible configuration settings.
     * In order to ensure this, the String is first encoded with html entities for any character that cannot be encoded
     * in US-ASCII; additionally, the plus sign is also encoded to avoid problems with the white-space replacer.
     * Finally, the entity prefix is replaced with characters not used as delimiters in urls.<p>
     *
     * @param input the parameter string
     *
     * @return the encoded parameter string
     */
    public static String encodeParameter(String input) {

        String result = CmsEncoder.encodeHtmlEntities(input, CmsEncoder.ENCODING_US_ASCII);
        result = CmsStringUtil.substitute(result, "+", PLUS_ENTITY);
        return CmsStringUtil.substitute(result, ENTITY_PREFIX, ENTITY_REPLACEMENT);
    }

    /**
     * Encode a list of strings as base64 data to be used in a request parameter.<p>
     *
     * @param strings the strings to encode
     * @return the resulting base64 data
     */
    public static String encodeStringsAsBase64Parameter(List<String> strings) {

        JSONArray array = new JSONArray();
        for (String string : strings) {
            array.put(string);
        }
        // use obfuscateBytes here to make the output look more random
        byte[] bytes = obfuscateBytes(array.toString().getBytes(UTF_8));
        String result = Base64.encodeBase64String(bytes);
        // replace the Base64 characters which are problematic in request parameters
        result = StringUtils.replaceChars(result, BASE64_EXTRA, BASE64_EXTRA_REPLACEMENTS);
        return result;
    }

    /**
     * Encodes a String in a way similar to the JavaScript "encodeURIcomponent" function,
     * using "UTF-8" for character encoding.<p>
     *
     * JavaScript "decodeURIcomponent" can decode Strings that have been encoded using this method.<p>
     *
     * <b>Directly exposed for JSP EL</b>, not through {@link org.opencms.jsp.util.CmsJspElFunctions}.<p>
     *
     * @param source The text to be encoded
     *
     * @return The encoded string
     *
     * @see #escape(String, String)
     */
    public static String escape(String source) {

        return escape(source, ENCODING_UTF_8);
    }

    /**
     * Encodes a String in a way similar to the JavaScript "encodeURIcomponent" function.<p>
     *
     * JavaScript "decodeURIcomponent" can decode Strings that have been encoded using this method,
     * provided "UTF-8" has been used as encoding.<p>
     *
     * <b>Directly exposed for JSP EL</b>, not through {@link org.opencms.jsp.util.CmsJspElFunctions}.<p>
     *
     * @param source The text to be encoded
     * @param encoding the encoding type
     *
     * @return The encoded string
     */
    public static String escape(String source, String encoding) {

        // the blank is encoded into "+" not "%20" when using standard encode call
        return CmsStringUtil.substitute(encode(source, encoding), "+", "%20");
    }

    /**
     * Escapes special characters in a HTML-String with their number-based
     * entity representation, for example &amp; becomes &amp;#38;.<p>
     *
     * A character <code>ch</code> is replaced if<br>
     * <code>((ch != 32) &amp;&amp; ((ch &gt; 122) || (ch &lt; 48) || (ch == 60) || (ch == 62)))</code><p>
     *
     * Numeric entities and the named entities <code>lt</code>, <code>gt</code>, <code>amp</code>
     * and <code>quot</code> that are already present in the source are copied unchanged.
     * Characters outside of the basic multilingual plane are written as one entity holding
     * the code point.<p>
     *
     * @param source the String to escape
     *
     * @return String the escaped String
     *
     * @see #escapeXml(String)
     */
    public static String escapeHtml(String source) {

        if (source == null) {
            return null;
        }
        StringBuilder result = new StringBuilder(source.length() * 2);
        int length = source.length();
        int pos = 0;
        while (pos < length) {
            int ch = source.codePointAt(pos);
            // avoid escaping already escaped characters
            if (ch == '&') {
                int terminatorIndex = source.indexOf(';', pos);
                if ((terminatorIndex > 0)
                    && ESCAPED_ENTITY_BODY.matcher(source.substring(pos + 1, terminatorIndex)).matches()) {
                    result.append(source, pos, terminatorIndex + 1);
                    // skip remaining chars up to (and including) ";"
                    pos = terminatorIndex + 1;
                    continue;
                }
            }
            if ((ch != 32) && ((ch > 122) || (ch < 48) || (ch == 60) || (ch == 62))) {
                result.append(ENTITY_PREFIX);
                result.append(ch);
                result.append(';');
            } else {
                // only reached for values up to 122, so the cast is safe
                result.append((char)ch);
            }
            pos += Character.charCount(ch);
        }
        return result.toString();
    }

    /**
     * Escapes all characters above the ISO-8859-1 range in a HTML-String with their number-based
     * entity representation, for example the Euro sign becomes &amp;#8364;.<p>
     *
     * A character <code>ch</code> is replaced if<br>
     * <code>(ch &gt; 255)</code><p>
     *
     * Characters outside of the basic multilingual plane are written as one entity holding
     * the code point.<p>
     *
     * @param source the String to escape
     *
     * @return String the escaped String
     *
     * @see #escapeXml(String)
     */
    public static String escapeNonAscii(String source) {

        if (source == null) {
            return null;
        }
        StringBuilder result = new StringBuilder(source.length() * 2);
        int length = source.length();
        int pos = 0;
        while (pos < length) {
            int ch = source.codePointAt(pos);
            if (ch > 255) {
                result.append(ENTITY_PREFIX);
                result.append(ch);
                result.append(';');
            } else {
                result.append((char)ch);
            }
            pos += Character.charCount(ch);
        }
        return result.toString();
    }

    /**
     * A simple method to avoid injection.<p>
     *
     * Replaces all single quotes to double single quotes in the value parameter of the SQL statement.<p>
     *
     * @param source the String to escape SQL from
     * @return the escaped value of the parameter source
     */
    public static String escapeSql(String source) {

        // literal replacement, no regular expression required
        return source.replace("'", "''");
    }

    /**
     * Escapes the wildcard characters in a string which will be used as the pattern for a SQL LIKE clause.<p>
     *
     * The escape character itself is escaped first, followed by <code>%</code> and <code>_</code>.<p>
     *
     * @param pattern the pattern
     * @param escapeChar the character which should be used as the escape character
     *
     * @return the escaped pattern
     */
    public static String escapeSqlLikePattern(String pattern, char escapeChar) {

        char[] special = new char[] {escapeChar, '%', '_'};
        String escape = String.valueOf(escapeChar);
        String result = pattern;
        for (char charToEscape : special) {
            // literal replacement, so escape characters which are special in regular
            // expressions (like the backslash) are handled correctly
            result = result.replace(String.valueOf(charToEscape), escape + charToEscape);
        }
        return result;
    }

    /**
     * Encodes a String in a way similar to the JavaScript "encodeURIcomponent" function.<p>
     *
     * Multiple blanks are encoded _multiply_ with <code>%20</code>.<p>
     *
     * @param source The text to be encoded
     * @param encoding the encoding type
     *
     * @return The encoded String
     */
    public static String escapeWBlanks(String source, String encoding) {

        if (CmsStringUtil.isEmpty(source)) {
            return source;
        }
        StringBuilder ret = new StringBuilder(source.length() * 2);

        // URLEncode the text string
        // this produces a very similar encoding to JavaScript encoding,
        // except the blank which is encoded into "+" instead of "%20"

        String enc = encode(source, encoding);
        for (int z = 0; z < enc.length(); z++) {
            char c = enc.charAt(z);
            if (c == '+') {
                ret.append("%20");
            } else {
                ret.append(c);
            }
        }
        return ret.toString();
    }

    /**
     * Escapes a String so it may be printed as text content or attribute
     * value in a HTML page or an XML file.<p>
     *
     * This method replaces the following characters in a String:
     * <ul>
     * <li><b>&lt;</b> with &amp;lt;
     * <li><b>&gt;</b> with &amp;gt;
     * <li><b>&amp;</b> with &amp;amp;
     * <li><b>&quot;</b> with &amp;quot;
     * </ul><p>
     *
     * @param source the string to escape
     *
     * @return the escaped string
     *
     * @see #escapeHtml(String)
     */
    public static String escapeXml(String source) {

        return escapeXml(source, false);
    }

    /**
     * Escapes a String so it may be printed as text content or attribute
     * value in a HTML page or an XML file.<p>
     *
     * This method replaces the following characters in a String:
     * <ul>
     * <li><b>&lt;</b> with &amp;lt;
     * <li><b>&gt;</b> with &amp;gt;
     * <li><b>&amp;</b> with &amp;amp;
     * <li><b>&quot;</b> with &amp;quot;
     * </ul><p>
     *
     * @param source the string to escape
     * @param doubleEscape if <code>false</code>, the ampersand of numeric entities that already
     *      are escaped (like <code>&amp;#8364;</code>) is left untouched
     *
     * @return the escaped string
     *
     * @see #escapeHtml(String)
     */
    public static String escapeXml(String source, boolean doubleEscape) {

        if (source == null) {
            return null;
        }
        StringBuilder result = new StringBuilder(source.length() * 2);

        for (int i = 0; i < source.length(); ++i) {
            char ch = source.charAt(i);
            switch (ch) {
                case '<':
                    result.append("&lt;");
                    break;
                case '>':
                    result.append("&gt;");
                    break;
                case '&':
                    // don't escape already escaped international and special characters
                    if (!doubleEscape && startsNumericEntity(source, i)) {
                        result.append(ch);
                    } else {
                        result.append("&amp;");
                    }
                    break;
                case '"':
                    result.append("&quot;");
                    break;
                default:
                    result.append(ch);
            }
        }
        return result.toString();
    }

    /**
     * Checks if a given encoding name is actually supported, and if so
     * resolves it to its canonical name, if not it returns the given fallback
     * value.<p>
     *
     * Charsets have a set of aliases. For example, valid aliases for "UTF-8"
     * are "UTF8", "utf-8" or "utf8". This method resolves any given valid charset name
     * to its "canonical" form, so that simple String comparison can be used
     * when checking charset names internally later.<p>
     *
     * Please see <a href="http://www.iana.org/assignments/character-sets">http://www.iana.org/assignments/character-sets</a>
     * for a list of valid charset alias names.<p>
     *
     * @param encoding the encoding to check and resolve
     * @param fallback the fallback encoding scheme
     *
     * @return the resolved encoding name, or the fallback value
     */
    public static String lookupEncoding(String encoding, String fallback) {

        String result = m_encodingCache.get(encoding);
        if (result != null) {
            return result;
        }

        try {
            result = Charset.forName(encoding).name();
            m_encodingCache.put(encoding, result);
            return result;
        } catch (RuntimeException e) {
            // illegal, unsupported or null charset name, we will use the default value as fallback
        }

        return fallback;
    }

    /**
     * Re-decodes a String that has not been correctly decoded and thus has scrambled
     * character bytes.<p>
     *
     * This is an equivalent to the JavaScript "decodeURIComponent" function.
     * It converts from the default "UTF-8" to the currently selected system encoding.<p>
     *
     * NOTE(review): both the bytes of the input and the resulting String are created with the
     * platform default charset of the VM, so the result depends on that setting - confirm
     * this is intended.<p>
     *
     * @param input the String to convert
     *
     * @return String the converted String
     */
    public static String redecodeUriComponent(String input) {

        if (input == null) {
            return input;
        }
        return new String(
            changeEncoding(input.getBytes(), ENCODING_UTF_8, OpenCms.getSystemInfo().getDefaultEncoding()));
    }

    /**
     * Decodes a String in a way similar to the JavaScript "decodeURIcomponent" function,
     * using "UTF-8" for character encoding.<p>
     *
     * This method can decode Strings that have been encoded in JavaScript with "encodeURIcomponent".<p>
     *
     * <b>Directly exposed for JSP EL</b>, not through {@link org.opencms.jsp.util.CmsJspElFunctions}.<p>
     *
     * @param source The String to be decoded
     *
     * @return The decoded String
     */
    public static String unescape(String source) {

        return unescape(source, ENCODING_UTF_8);
    }

    /**
     * Decodes a String in a way similar to the JavaScript "decodeURIcomponent" function.<p>
     *
     * This method can decode Strings that have been encoded in JavaScript with "encodeURIcomponent",
     * provided "UTF-8" is used as encoding.<p>
     *
     * <b>Directly exposed for JSP EL</b>, not through {@link org.opencms.jsp.util.CmsJspElFunctions}.<p>
     *
     * @param source The String to be decoded
     * @param encoding the encoding type
     *
     * @return The decoded String
     */
    public static String unescape(String source, String encoding) {

        if (source == null) {
            return null;
        }
        int len = source.length();
        // to use standard decoder we need to replace '+' with "%20" (space)
        StringBuilder preparedSource = new StringBuilder(len);
        for (int i = 0; i < len; i++) {
            char c = source.charAt(i);
            if (c == '+') {
                preparedSource.append("%20");
            } else {
                preparedSource.append(c);
            }
        }
        return decode(preparedSource.toString(), encoding);
    }

    /**
     * Reverts the obfuscation of a byte array obfuscated with 'obfuscateBytes'.<p>
     *
     * The first byte of the source is the key, all following bytes are XOR-ed with it.<p>
     *
     * @param source the source, which must contain at least the key byte
     * @return the result
     */
    private static byte[] deobfuscateBytes(byte[] source) {

        byte[] result = new byte[source.length - 1];
        System.arraycopy(source, 1, result, 0, source.length - 1);
        for (int i = 0; i < result.length; i++) {
            result[i] = (byte)(0xFF & (result[i] ^ source[0]));
        }
        return result;
    }

    /**
     * Simple "obfuscation" for byte arrays using random numbers.<p>
     *
     * The result starts with a random key byte, followed by the source bytes XOR-ed with that key.<p>
     *
     * @param source the source array
     * @return the result
     */
    private static byte[] obfuscateBytes(byte[] source) {

        byte[] s = new byte[1];
        m_random.nextBytes(s);
        byte[] result = new byte[source.length + 1];
        System.arraycopy(source, 0, result, 1, source.length);
        result[0] = s[0];
        for (int i = 1; i < result.length; i++) {
            result[i] = (byte)(0xFF & (result[i] ^ s[0]));
        }
        return result;
    }

    /**
     * Parses the decimal digits of a numeric HTML entity.<p>
     *
     * @param digits the digits of the entity
     * @return the code point, or <code>-1</code> if the digits do not denote a valid Unicode code point
     */
    private static int parseCodePoint(String digits) {

        try {
            int value = Integer.parseInt(digits);
            return Character.isValidCodePoint(value) ? value : -1;
        } catch (NumberFormatException e) {
            // more digits than fit into an int, this can not be a valid code point
            return -1;
        }
    }

    /**
     * Checks if the ampersand at the given position starts a numeric entity like <code>&amp;#8364;</code>.<p>
     *
     * @param source the String to check
     * @param ampersandIndex the position of the ampersand in the source
     * @return <code>true</code> if only <code>#</code> and digits are found up to the next semicolon
     */
    private static boolean startsNumericEntity(String source, int ampersandIndex) {

        int terminatorIndex = source.indexOf(';', ampersandIndex);
        return (terminatorIndex > 0)
            && NUMERIC_ENTITY_BODY.matcher(source.substring(ampersandIndex + 1, terminatorIndex)).matches();
    }

}