001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software GmbH & Co. KG, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.i18n;
029
030import org.opencms.json.JSONArray;
031import org.opencms.json.JSONException;
032import org.opencms.main.CmsLog;
033import org.opencms.main.OpenCms;
034import org.opencms.util.CmsStringUtil;
035
036import java.io.UnsupportedEncodingException;
037import java.net.IDN;
038import java.net.URI;
039import java.net.URISyntaxException;
040import java.net.URLDecoder;
041import java.net.URLEncoder;
042import java.nio.CharBuffer;
043import java.nio.charset.Charset;
044import java.nio.charset.CharsetEncoder;
045import java.util.HashMap;
046import java.util.List;
047import java.util.Map;
048import java.util.Random;
049import java.util.regex.Matcher;
050import java.util.regex.Pattern;
051
052import org.apache.commons.codec.binary.Base64;
053import org.apache.commons.lang3.StringUtils;
054import org.apache.commons.logging.Log;
055
056import com.google.common.collect.Lists;
057
058/**
059 * The OpenCms CmsEncoder class provides static methods to decode and encode data.<p>
060 *
061 * The methods in this class are substitutes for <code>java.net.URLEncoder.encode()</code> and
062 * <code>java.net.URLDecoder.decode()</code>. Use the methods from this class in all OpenCms
063 * core classes to ensure the encoding is always handled the same way.<p>
064 *
065 * The de- and encoding uses the same coding mechanism as JavaScript, special characters are
066 * replaced with <code>%hex</code> where hex is a two digit hex number.<p>
067 *
 * <b>Note:</b> On the client side (browser) instead of using the deprecated <code>escape</code>
 * and <code>unescape</code> JavaScript functions, always use the <code>encodeURIComponent</code> and
 * <code>decodeURIComponent</code> functions. Only these work properly with unicode characters.<p>
071 *
072 * @since 6.0.0
073 */
074public final class CmsEncoder {
075
076    /** Non-alphanumeric characters used for Base64 encoding. */
077    public static final String BASE64_EXTRA = "+/=";
078
079    /** Characters used as replacements for non-alphanumeric Base64 characters when using Base64 for request parameters. */
080    public static final String BASE64_EXTRA_REPLACEMENTS = "-_.";
081
082    /** Constant for the standard <code>ISO-8859-1</code> encoding. */
083    public static final String ENCODING_ISO_8859_1 = "ISO-8859-1";
084
085    /** Constant for the standard <code>US-ASCII</code> encoding. */
086    public static final String ENCODING_US_ASCII = "US-ASCII";
087
088    /**
089     * Constant for the standard <code>UTF-8</code> encoding.<p>
090     *
091     * Default encoding for JavaScript decodeUriComponent methods is <code>UTF-8</code> by w3c standard.
092     */
093    public static final String ENCODING_UTF_8 = "UTF-8";
094
095    /** The regex pattern to match HTML entities. */
096    private static final Pattern ENTITIY_PATTERN = Pattern.compile("\\&#\\d+;");
097
098    /** The prefix for HTML entities. */
099    private static final String ENTITY_PREFIX = "&#";
100
101    /** The replacement for HTML entity prefix in parameters. */
102    private static final String ENTITY_REPLACEMENT = "$$";
103
104    /** The log object for this class. */
105    private static final Log LOG = CmsLog.getLog(CmsEncoder.class);
106
107    /** A cache for encoding name lookup. */
108    private static Map<String, String> m_encodingCache = new HashMap<String, String>(16);
109
110    private static Random m_random = new Random();
111
112    /** The plus entity. */
113    private static final String PLUS_ENTITY = ENTITY_PREFIX + "043;";
114
115    /**
116     * Constructor.<p>
117     */
118    private CmsEncoder() {
119
120        // empty
121    }
122
123    /**
124     * Adjusts the given String by making sure all characters that can be displayed
125     * in the given charset are contained as chars, whereas all other non-displayable
126     * characters are converted to HTML entities.<p>
127     *
128     * Just calls {@link #decodeHtmlEntities(String, String)} first and feeds the result
129     * to {@link #encodeHtmlEntities(String, String)}. <p>
130     *
131     * @param input the input to adjust the HTML encoding for
132     * @param encoding the charset to encode the result with\
133     *
134     * @return the input with the decoded/encoded HTML entities
135     */
136    public static String adjustHtmlEncoding(String input, String encoding) {
137
138        return encodeHtmlEntities(decodeHtmlEntities(input, encoding), encoding);
139    }
140
141    /**
142     * Changes the encoding of a byte array that represents a String.<p>
143     *
144     * @param input the byte array to convert
145     * @param oldEncoding the current encoding of the byte array
146     * @param newEncoding the new encoding of the byte array
147     *
148     * @return the byte array encoded in the new encoding
149     */
150    public static byte[] changeEncoding(byte[] input, String oldEncoding, String newEncoding) {
151
152        if ((oldEncoding == null) || (newEncoding == null)) {
153            return input;
154        }
155        if (oldEncoding.trim().equalsIgnoreCase(newEncoding.trim())) {
156            return input;
157        }
158        byte[] result = input;
159        try {
160            result = (new String(input, oldEncoding)).getBytes(newEncoding);
161        } catch (UnsupportedEncodingException e) {
162            // return value will be input value
163        }
164        return result;
165    }
166
167    /**
168     * Converts the host of an URI to Punycode.<p>
169     *
170     * This is needed when we want to do redirects to hosts with host names containing international characters like umlauts.<p>
171     *
172     * @param uriString the URI
173     * @return the converted URI
174     */
175    public static String convertHostToPunycode(String uriString) {
176
177        if (uriString.indexOf(":") >= 0) {
178            try {
179                URI uri = new URI(uriString);
180                String authority = uri.getAuthority(); // getHost won't work when we have special characters
181                int colonPos = authority.indexOf(':');
182                if (colonPos >= 0) {
183                    authority = IDN.toASCII(authority.substring(0, colonPos)) + authority.substring(colonPos);
184                } else {
185                    authority = IDN.toASCII(authority);
186                }
187                URI uriWithCorrectedHost = new URI(
188                    uri.getScheme(),
189                    authority,
190                    uri.getPath(),
191                    uri.getQuery(),
192                    uri.getFragment());
193                uriString = uriWithCorrectedHost.toASCIIString();
194            } catch (URISyntaxException e) {
195                LOG.error(e.getLocalizedMessage(), e);
196            }
197        }
198        return uriString;
199    }
200
201    /**
202     * Creates a String out of a byte array with the specified encoding, falling back
203     * to the system default in case the encoding name is not valid.<p>
204     *
205     * Use this method as a replacement for <code>new String(byte[], encoding)</code>
206     * to avoid possible encoding problems.<p>
207     *
208     * @param bytes the bytes to decode
209     * @param encoding the encoding scheme to use for decoding the bytes
210     *
211     * @return the bytes decoded to a String
212     */
213    public static String createString(byte[] bytes, String encoding) {
214
215        String enc = encoding.intern();
216        if (enc != OpenCms.getSystemInfo().getDefaultEncoding()) {
217            enc = lookupEncoding(enc, null);
218        }
219        if (enc != null) {
220            try {
221                return new String(bytes, enc);
222            } catch (UnsupportedEncodingException e) {
223                // this can _never_ happen since the charset was looked up first
224            }
225        } else {
226            if (LOG.isWarnEnabled()) {
227                LOG.warn(Messages.get().getBundle().key(Messages.ERR_UNSUPPORTED_VM_ENCODING_1, encoding));
228            }
229            enc = OpenCms.getSystemInfo().getDefaultEncoding();
230            try {
231                return new String(bytes, enc);
232            } catch (UnsupportedEncodingException e) {
233                // this can also _never_ happen since the default encoding is always valid
234            }
235        }
236        // this code is unreachable in practice
237        LOG.error(Messages.get().getBundle().key(Messages.ERR_ENCODING_ISSUES_1, encoding));
238        return null;
239    }
240
241    /**
242     * Decodes a String using UTF-8 encoding, which is the standard for http data transmission
243     * with GET ant POST requests.<p>
244     *
245     * @param source the String to decode
246     *
247     * @return String the decoded source String
248     */
249    public static String decode(String source) {
250
251        return decode(source, ENCODING_UTF_8);
252    }
253
254    /**
255     * This method is a substitute for <code>URLDecoder.decode()</code>.
256     * Use this in all OpenCms core classes to ensure the encoding is
257     * always handled the same way.<p>
258     *
259     * In case you don't know what encoding to use, set the value of
260     * the <code>encoding</code> parameter to <code>null</code>.
261     * This method will then default to UTF-8 encoding, which is probably the right one.<p>
262     *
263     * @param source The string to decode
264     * @param encoding The encoding to use (if null, the system default is used)
265     *
266     * @return The decoded source String
267     */
268    public static String decode(String source, String encoding) {
269
270        if (source == null) {
271            return null;
272        }
273        if (encoding != null) {
274            try {
275                return URLDecoder.decode(source, encoding);
276            } catch (java.io.UnsupportedEncodingException e) {
277                // will fallback to default
278            }
279        }
280        // fallback to default decoding
281        try {
282            return URLDecoder.decode(source, ENCODING_UTF_8);
283        } catch (java.io.UnsupportedEncodingException e) {
284            // ignore
285        }
286        return source;
287    }
288
289    /**
290     * Decodes HTML entity references like <code>&amp;#8364;</code> that are contained in the
291     * String to a regular character, but only if that character is contained in the given
292     * encodings charset.<p>
293     *
294     * @param input the input to decode the HTML entities in
295     * @param encoding the charset to decode the input for
296     * @return the input with the decoded HTML entities
297     *
298     * @see #encodeHtmlEntities(String, String)
299     */
300    public static String decodeHtmlEntities(String input, String encoding) {
301
302        Matcher matcher = ENTITIY_PATTERN.matcher(input);
303        StringBuffer result = new StringBuffer(input.length());
304        Charset charset = Charset.forName(encoding);
305        CharsetEncoder encoder = charset.newEncoder();
306
307        while (matcher.find()) {
308            String entity = matcher.group();
309            String value = entity.substring(2, entity.length() - 1);
310            int c = Integer.valueOf(value).intValue();
311            if (c < 128) {
312                // first 128 chars are contained in almost every charset
313                entity = new String(new char[] {(char)c});
314                // this is intended as performance improvement since
315                // the canEncode() operation appears quite CPU heavy
316            } else if (encoder.canEncode((char)c)) {
317                // encoder can encode this char
318                entity = new String(new char[] {(char)c});
319            }
320            matcher.appendReplacement(result, entity);
321        }
322        matcher.appendTail(result);
323        return result.toString();
324    }
325
326    /**
327     * Decodes a string used as parameter in an uri in a way independent of other encodings/decodings applied before.<p>
328     *
329     * @param input the encoded parameter string
330     *
331     * @return the decoded parameter string
332     *
333     * @see #encodeParameter(String)
334     */
335    public static String decodeParameter(String input) {
336
337        String result = CmsStringUtil.substitute(input, ENTITY_REPLACEMENT, ENTITY_PREFIX);
338        return CmsEncoder.decodeHtmlEntities(result, OpenCms.getSystemInfo().getDefaultEncoding());
339    }
340
341    /**
342     * Decodes a parameter which has been encoded from a string list using encodeStringsAsBase64Parameter.<p>
343     *
344     * @param data the data to decode
345     * @return the list of strings
346     */
347    public static List<String> decodeStringsFromBase64Parameter(String data) {
348
349        data = StringUtils.replaceChars(data, BASE64_EXTRA_REPLACEMENTS, BASE64_EXTRA);
350        byte[] bytes = deobfuscateBytes(Base64.decodeBase64(data));
351        try {
352            JSONArray json = new JSONArray(new String(bytes, "UTF-8"));
353            List<String> result = Lists.newArrayList();
354            for (int i = 0; i < json.length(); i++) {
355                result.add(json.getString(i));
356            }
357            return result;
358        } catch (UnsupportedEncodingException e) {
359            // TODO Auto-generated catch block
360            e.printStackTrace();
361        } catch (JSONException e) {
362            throw new IllegalArgumentException("Decoding failed: " + data, e);
363        }
364        return null;
365    }
366
367    /**
368     * Encodes a String using UTF-8 encoding, which is the standard for http data transmission
369     * with GET ant POST requests.<p>
370     *
371     * @param source the String to encode
372     *
373     * @return String the encoded source String
374     */
375    public static String encode(String source) {
376
377        return encode(source, ENCODING_UTF_8);
378    }
379
380    /**
381     * This method is a substitute for <code>URLEncoder.encode()</code>.
382     * Use this in all OpenCms core classes to ensure the encoding is
383     * always handled the same way.<p>
384     *
385     * In case you don't know what encoding to use, set the value of
386     * the <code>encoding</code> parameter to <code>null</code>.
387     * This method will then default to UTF-8 encoding, which is probably the right one.<p>
388     *
389     * @param source the String to encode
390     * @param encoding the encoding to use (if null, the system default is used)
391     *
392     * @return the encoded source String
393     */
394    public static String encode(String source, String encoding) {
395
396        if (source == null) {
397            return null;
398        }
399        if (encoding != null) {
400            try {
401                return URLEncoder.encode(source, encoding);
402            } catch (java.io.UnsupportedEncodingException e) {
403                // will fallback to default
404            }
405        }
406        // fallback to default encoding
407        try {
408            return URLEncoder.encode(source, ENCODING_UTF_8);
409        } catch (java.io.UnsupportedEncodingException e) {
410            // ignore
411        }
412        return source;
413    }
414
415    /**
416     * Encodes all characters that are contained in the String which can not displayed
417     * in the given encodings charset with HTML entity references
418     * like <code>&amp;#8364;</code>.<p>
419     *
420     * This is required since a Java String is
421     * internally always stored as Unicode, meaning it can contain almost every character, but
422     * the HTML charset used might not support all such characters.<p>
423     *
424     * @param input the input to encode for HTML
425     * @param encoding the charset to encode the result with
426     *
427     * @return the input with the encoded HTML entities
428     *
429     * @see #decodeHtmlEntities(String, String)
430     */
431    public static String encodeHtmlEntities(String input, String encoding) {
432
433        StringBuffer result = new StringBuffer(input.length() * 2);
434        CharBuffer buffer = CharBuffer.wrap(input.toCharArray());
435        Charset charset = Charset.forName(encoding);
436        CharsetEncoder encoder = charset.newEncoder();
437        for (int i = 0; i < buffer.length(); i++) {
438            int c = buffer.get(i);
439            if (c < 128) {
440                // first 128 chars are contained in almost every charset
441                result.append((char)c);
442                // this is intended as performance improvement since
443                // the canEncode() operation appears quite CPU heavy
444            } else if (encoder.canEncode((char)c)) {
445                // encoder can encode this char
446                result.append((char)c);
447            } else {
448                // append HTML entity reference
449                result.append(ENTITY_PREFIX);
450                result.append(c);
451                result.append(";");
452            }
453        }
454        return result.toString();
455    }
456
457    /**
458     * Encodes all characters that are contained in the String which can not displayed
459     * in the given encodings charset with Java escaping like <code>\u20ac</code>.<p>
460     *
461     * This can be used to escape values used in Java property files.<p>
462     *
463     * @param input the input to encode for Java
464     * @param encoding the charset to encode the result with
465     *
466     * @return the input with the encoded Java entities
467     */
468    public static String encodeJavaEntities(String input, String encoding) {
469
470        StringBuffer result = new StringBuffer(input.length() * 2);
471        CharBuffer buffer = CharBuffer.wrap(input.toCharArray());
472        Charset charset = Charset.forName(encoding);
473        CharsetEncoder encoder = charset.newEncoder();
474        for (int i = 0; i < buffer.length(); i++) {
475            int c = buffer.get(i);
476            if (c < 128) {
477                // first 128 chars are contained in almost every charset
478                result.append((char)c);
479                // this is intended as performance improvement since
480                // the canEncode() operation appears quite CPU heavy
481            } else if (encoder.canEncode((char)c)) {
482                // encoder can encode this char
483                result.append((char)c);
484            } else {
485                // append Java entity reference
486                result.append("\\u");
487                String hex = Integer.toHexString(c);
488                int pad = 4 - hex.length();
489                for (int p = 0; p < pad; p++) {
490                    result.append('0');
491                }
492                result.append(hex);
493            }
494        }
495        return result.toString();
496    }
497
498    /**
499     * Encodes a string used as parameter in an uri in a way independent of other encodings/decodings applied later.<p>
500     *
501     * Used to ensure that GET parameters are not wrecked by wrong or incompatible configuration settings.
502     * In order to ensure this, the String is first encoded with html entities for any character that cannot encoded
503     * in US-ASCII; additionally, the plus sign is also encoded to avoid problems with the white-space replacer.
504     * Finally, the entity prefix is replaced with characters not used as delimiters in urls.<p>
505     *
506     * @param input the parameter string
507     *
508     * @return the encoded parameter string
509     */
510    public static String encodeParameter(String input) {
511
512        String result = CmsEncoder.encodeHtmlEntities(input, CmsEncoder.ENCODING_US_ASCII);
513        result = CmsStringUtil.substitute(result, "+", PLUS_ENTITY);
514        return CmsStringUtil.substitute(result, ENTITY_PREFIX, ENTITY_REPLACEMENT);
515    }
516
517    /**
518     * Encode a list of strings as base64 data to be used in a request parameter.<p>
519     *
520     * @param strings the strings to encode
521     * @return the resulting base64 data
522     */
523    public static String encodeStringsAsBase64Parameter(List<String> strings) {
524
525        JSONArray array = new JSONArray();
526        for (String string : strings) {
527            array.put(string);
528        }
529        byte[] bytes;
530        try {
531            // use obfuscateBytes here to to make the output look more random
532            bytes = obfuscateBytes(array.toString().getBytes("UTF-8"));
533        } catch (UnsupportedEncodingException e) {
534            // should never happen
535            e.printStackTrace();
536            throw new RuntimeException(e);
537        }
538        String result = Base64.encodeBase64String(bytes);
539        result = StringUtils.replaceChars(result, BASE64_EXTRA, BASE64_EXTRA_REPLACEMENTS);
540        return result;
541    }
542
543    /**
544     * Encodes a String in a way similar to the JavaScript "encodeURIcomponent" function,
545     * using "UTF-8" for character encoding encoding.<p>
546     *
547     * JavaScript "decodeURIcomponent" can decode Strings that have been encoded using this method.<p>
548     *
549     * <b>Directly exposed for JSP EL<b>, not through {@link org.opencms.jsp.util.CmsJspElFunctions}.<p>
550     *
551     * @param source The text to be encoded
552     *
553     * @return The encoded string
554     *
555     * @see #escape(String, String)
556     */
557    public static String escape(String source) {
558
559        return escape(source, ENCODING_UTF_8);
560    }
561
562    /**
563     * Encodes a String in a way similar to the JavaScript "encodeURIcomponent" function.<p>
564     *
565     * JavaScript "decodeURIcomponent" can decode Strings that have been encoded using this method,
566     * provided "UTF-8" has been used as encoding.<p>
567     *
568     * <b>Directly exposed for JSP EL<b>, not through {@link org.opencms.jsp.util.CmsJspElFunctions}.<p>
569     *
570     * @param source The text to be encoded
571     * @param encoding the encoding type
572     *
573     * @return The encoded string
574     */
575    public static String escape(String source, String encoding) {
576
577        // the blank is encoded into "+" not "%20" when using standard encode call
578        return CmsStringUtil.substitute(encode(source, encoding), "+", "%20");
579    }
580
581    /**
582     * Escapes special characters in a HTML-String with their number-based
583     * entity representation, for example &amp; becomes &amp;#38;.<p>
584     *
585     * A character <code>num</code> is replaced if<br>
586     * <code>((ch != 32) && ((ch > 122) || (ch < 48) || (ch == 60) || (ch == 62)))</code><p>
587     *
588     * @param source the String to escape
589     *
590     * @return String the escaped String
591     *
592     * @see #escapeXml(String)
593     */
594    public static String escapeHtml(String source) {
595
596        if (source == null) {
597            return null;
598        }
599        StringBuffer result = new StringBuffer(source.length() * 2);
600        for (int i = 0; i < source.length(); i++) {
601            int ch = source.charAt(i);
602            // avoid escaping already escaped characters
603            if (ch == 38) {
604                int terminatorIndex = source.indexOf(";", i);
605                if (terminatorIndex > 0) {
606                    if (source.substring(i + 1, terminatorIndex).matches("#[0-9]+|lt|gt|amp|quote")) {
607                        result.append(source.substring(i, terminatorIndex + 1));
608                        // Skip remaining chars up to (and including) ";"
609                        i = terminatorIndex;
610                        continue;
611                    }
612                }
613            }
614            if ((ch != 32) && ((ch > 122) || (ch < 48) || (ch == 60) || (ch == 62))) {
615                result.append(ENTITY_PREFIX);
616                result.append(ch);
617                result.append(";");
618            } else {
619                result.append((char)ch);
620            }
621        }
622        return new String(result);
623    }
624
625    /**
626     * Escapes non ASCII characters in a HTML-String with their number-based
627     * entity representation, for example &amp; becomes &amp;#38;.<p>
628     *
629     * A character <code>num</code> is replaced if<br>
630     * <code>(ch > 255)</code><p>
631     *
632     * @param source the String to escape
633     *
634     * @return String the escaped String
635     *
636     * @see #escapeXml(String)
637     */
638    public static String escapeNonAscii(String source) {
639
640        if (source == null) {
641            return null;
642        }
643        StringBuffer result = new StringBuffer(source.length() * 2);
644        for (int i = 0; i < source.length(); i++) {
645            int ch = source.charAt(i);
646            if (ch > 255) {
647                result.append(ENTITY_PREFIX);
648                result.append(ch);
649                result.append(";");
650            } else {
651                result.append((char)ch);
652            }
653        }
654        return new String(result);
655    }
656
657    /**
658     * A simple method to avoid injection.<p>
659     *
660     * Replaces all single quotes to double single quotes in the value parameter of the SQL statement.<p>
661     *
662     * @param source the String to escape SQL from
663     * @return the escaped value of the parameter source
664     */
665    public static String escapeSql(String source) {
666
667        return source.replaceAll("'", "''");
668    }
669
670    /**
671     * Escapes the wildcard characters in a string which will be used as the pattern for a SQL LIKE clause.<p>
672     *
673     * @param pattern the pattern
674     * @param escapeChar the character which should be used as the escape character
675     *
676     * @return the escaped pattern
677     */
678    public static String escapeSqlLikePattern(String pattern, char escapeChar) {
679
680        char[] special = new char[] {escapeChar, '%', '_'};
681        String result = pattern;
682        for (char charToEscape : special) {
683            result = result.replaceAll("" + charToEscape, "" + escapeChar + charToEscape);
684        }
685        return result;
686    }
687
688    /**
689     * Encodes a String in a way similar JavaScript "encodeURIcomponent" function.<p>
690     *
691     * Multiple blanks are encoded _multiply_ with <code>%20</code>.<p>
692     *
693     * @param source The text to be encoded
694     * @param encoding the encoding type
695     *
696     * @return The encoded String
697     */
698    public static String escapeWBlanks(String source, String encoding) {
699
700        if (CmsStringUtil.isEmpty(source)) {
701            return source;
702        }
703        StringBuffer ret = new StringBuffer(source.length() * 2);
704
705        // URLEncode the text string
706        // this produces a very similar encoding to JavaSscript encoding,
707        // except the blank which is not encoded into "%20" instead of "+"
708
709        String enc = encode(source, encoding);
710        for (int z = 0; z < enc.length(); z++) {
711            char c = enc.charAt(z);
712            if (c == '+') {
713                ret.append("%20");
714            } else {
715                ret.append(c);
716            }
717        }
718        return ret.toString();
719    }
720
721    /**
722     * Escapes a String so it may be printed as text content or attribute
723     * value in a HTML page or an XML file.<p>
724     *
725     * This method replaces the following characters in a String:
726     * <ul>
727     * <li><b>&lt;</b> with &amp;lt;
728     * <li><b>&gt;</b> with &amp;gt;
729     * <li><b>&amp;</b> with &amp;amp;
730     * <li><b>&quot;</b> with &amp;quot;
731     * </ul><p>
732     *
733     * @param source the string to escape
734     *
735     * @return the escaped string
736     *
737     * @see #escapeHtml(String)
738     */
739    public static String escapeXml(String source) {
740
741        return escapeXml(source, false);
742    }
743
744    /**
745     * Escapes a String so it may be printed as text content or attribute
746     * value in a HTML page or an XML file.<p>
747     *
748     * This method replaces the following characters in a String:
749     * <ul>
750     * <li><b>&lt;</b> with &amp;lt;
751     * <li><b>&gt;</b> with &amp;gt;
752     * <li><b>&amp;</b> with &amp;amp;
753     * <li><b>&quot;</b> with &amp;quot;
754     * </ul><p>
755     *
756     * @param source the string to escape
757     * @param doubleEscape if <code>false</code>, all entities that already are escaped are left untouched
758     *
759     * @return the escaped string
760     *
761     * @see #escapeHtml(String)
762     */
763    public static String escapeXml(String source, boolean doubleEscape) {
764
765        if (source == null) {
766            return null;
767        }
768        StringBuffer result = new StringBuffer(source.length() * 2);
769
770        for (int i = 0; i < source.length(); ++i) {
771            char ch = source.charAt(i);
772            switch (ch) {
773                case '<':
774                    result.append("&lt;");
775                    break;
776                case '>':
777                    result.append("&gt;");
778                    break;
779                case '&':
780                    // don't escape already escaped international and special characters
781                    if (!doubleEscape) {
782                        int terminatorIndex = source.indexOf(";", i);
783                        if (terminatorIndex > 0) {
784                            if (source.substring(i + 1, terminatorIndex).matches("#[0-9]+")) {
785                                result.append(ch);
786                                break;
787                            }
788                        }
789                    }
790                    // note that to other "break" in the above "if" block
791                    result.append("&amp;");
792                    break;
793                case '"':
794                    result.append("&quot;");
795                    break;
796                default:
797                    result.append(ch);
798            }
799        }
800        return new String(result);
801    }
802
803    /**
804     * Checks if a given encoding name is actually supported, and if so
805     * resolves it to it's canonical name, if not it returns the given fallback
806     * value.<p>
807     *
808     * Charsets have a set of aliases. For example, valid aliases for "UTF-8"
809     * are "UTF8", "utf-8" or "utf8". This method resolves any given valid charset name
810     * to it's "canonical" form, so that simple String comparison can be used
811     * when checking charset names internally later.<p>
812     *
813     * Please see <a href="http://www.iana.org/assignments/character-sets">http://www.iana.org/assignments/character-sets</a>
814     * for a list of valid charset alias names.<p>
815     *
816     * @param encoding the encoding to check and resolve
817     * @param fallback the fallback encoding scheme
818     *
819     * @return the resolved encoding name, or the fallback value
820     */
821    public static String lookupEncoding(String encoding, String fallback) {
822
823        String result = m_encodingCache.get(encoding);
824        if (result != null) {
825            return result;
826        }
827
828        try {
829            result = Charset.forName(encoding).name();
830            m_encodingCache.put(encoding, result);
831            return result;
832        } catch (Throwable t) {
833            // we will use the default value as fallback
834        }
835
836        return fallback;
837    }
838
839    /**
840     * Re-decodes a String that has not been correctly decoded and thus has scrambled
841     * character bytes.<p>
842     *
843     * This is an equivalent to the JavaScript "decodeURIComponent" function.
844     * It converts from the default "UTF-8" to the currently selected system encoding.<p>
845     *
846     * @param input the String to convert
847     *
848     * @return String the converted String
849     */
850    public static String redecodeUriComponent(String input) {
851
852        if (input == null) {
853            return input;
854        }
855        return new String(
856            changeEncoding(input.getBytes(), ENCODING_UTF_8, OpenCms.getSystemInfo().getDefaultEncoding()));
857    }
858
859    /**
860     * Decodes a String in a way similar to the JavaScript "decodeURIcomponent" function,
861     * using "UTF-8" for character encoding.<p>
862     *
863     * This method can decode Strings that have been encoded in JavaScript with "encodeURIcomponent".<p>
864     *
865     * <b>Directly exposed for JSP EL<b>, not through {@link org.opencms.jsp.util.CmsJspElFunctions}.<p>
866     *
867     * @param source The String to be decoded
868     *
869     * @return The decoded String
870     */
871    public static String unescape(String source) {
872
873        return unescape(source, ENCODING_UTF_8);
874    }
875
876    /**
877     * Decodes a String in a way similar to the JavaScript "decodeURIcomponent" function.<p>
878     *
879     * This method can decode Strings that have been encoded in JavaScript with "encodeURIcomponent",
880     * provided "UTF-8" is used as encoding.<p>
881     *
882     * <b>Directly exposed for JSP EL<b>, not through {@link org.opencms.jsp.util.CmsJspElFunctions}.<p>
883     *
884     * @param source The String to be decoded
885     * @param encoding the encoding type
886     *
887     * @return The decoded String
888     */
889    public static String unescape(String source, String encoding) {
890
891        if (source == null) {
892            return null;
893        }
894        int len = source.length();
895        // to use standard decoder we need to replace '+' with "%20" (space)
896        StringBuffer preparedSource = new StringBuffer(len);
897        for (int i = 0; i < len; i++) {
898            char c = source.charAt(i);
899            if (c == '+') {
900                preparedSource.append("%20");
901            } else {
902                preparedSource.append(c);
903            }
904        }
905        return decode(preparedSource.toString(), encoding);
906    }
907
908    /**
909     * Decrypts a byte array obfuscated with 'obfuscateBytes'.<p>
910     *
911     * @param source the source
912     * @return the resuvlt
913     */
914    private static byte[] deobfuscateBytes(byte[] source) {
915
916        byte[] result = new byte[source.length - 1];
917        System.arraycopy(source, 1, result, 0, source.length - 1);
918        for (int i = 0; i < result.length; i++) {
919            result[i] = (byte)(0xFF & (result[i] ^ source[0]));
920        }
921        return result;
922    }
923
924    /**
925     * Simple "obfuscation" for byte arrays using random numbers.<p>
926     *
927     * @param source the source array
928     * @return the result
929     */
930    private static byte[] obfuscateBytes(byte[] source) {
931
932        byte[] s = new byte[1];
933        m_random.nextBytes(s);
934        byte[] result = new byte[source.length + 1];
935        System.arraycopy(source, 0, result, 1, source.length);
936        result[0] = s[0];
937        for (int i = 1; i < result.length; i++) {
938            result[i] = (byte)(0xFF & (result[i] ^ s[0]));
939        }
940        return result;
941    }
942
943}