Source code

001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software GmbH & Co. KG, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.i18n;
029
030import org.opencms.json.JSONArray;
031import org.opencms.json.JSONException;
032import org.opencms.main.CmsLog;
033import org.opencms.main.OpenCms;
034import org.opencms.util.CmsStringUtil;
035
036import java.io.UnsupportedEncodingException;
037import java.net.IDN;
038import java.net.URI;
039import java.net.URISyntaxException;
040import java.net.URLDecoder;
041import java.net.URLEncoder;
042import java.nio.CharBuffer;
043import java.nio.charset.Charset;
044import java.nio.charset.CharsetEncoder;
045import java.util.HashMap;
046import java.util.List;
047import java.util.Map;
048import java.util.Random;
049import java.util.regex.Matcher;
050import java.util.regex.Pattern;
051
052import org.apache.commons.codec.binary.Base64;
053import org.apache.commons.lang3.StringUtils;
054import org.apache.commons.logging.Log;
055import org.apache.http.client.utils.URIBuilder;
056
057import com.google.common.collect.Lists;
058
059/**
060 * The OpenCms CmsEncoder class provides static methods to decode and encode data.<p>
061 *
062 * The methods in this class are substitutes for <code>java.net.URLEncoder.encode()</code> and
063 * <code>java.net.URLDecoder.decode()</code>. Use the methods from this class in all OpenCms
064 * core classes to ensure the encoding is always handled the same way.<p>
065 *
066 * The de- and encoding uses the same coding mechanism as JavaScript, special characters are
067 * replaced with <code>%hex</code> where hex is a two digit hex number.<p>
068 *
069 * <b>Note:</b> On the client side (browser) instead of using the deprecated <code>escape</code>
070 * and <code>unescape</code> JavaScript functions, always the use <code>encodeURIComponent</code> and
071 * <code>decodeURIComponent</code> functions. Only these work properly with unicode characters.<p>
072 *
073 * @since 6.0.0
074 */
075public final class CmsEncoder {
076
077    /** Non-alphanumeric characters used for Base64 encoding. */
078    public static final String BASE64_EXTRA = "+/=";
079
080    /** Characters used as replacements for non-alphanumeric Base64 characters when using Base64 for request parameters. */
081    public static final String BASE64_EXTRA_REPLACEMENTS = "-_.";
082
083    /** Constant for the standard <code>ISO-8859-1</code> encoding. */
084    public static final String ENCODING_ISO_8859_1 = "ISO-8859-1";
085
086    /** Constant for the standard <code>US-ASCII</code> encoding. */
087    public static final String ENCODING_US_ASCII = "US-ASCII";
088
089    /**
090     * Constant for the standard <code>UTF-8</code> encoding.<p>
091     *
092     * Default encoding for JavaScript decodeUriComponent methods is <code>UTF-8</code> by w3c standard.
093     */
094    public static final String ENCODING_UTF_8 = "UTF-8";
095
096    /** The regex pattern to match HTML entities. */
097    private static final Pattern ENTITIY_PATTERN = Pattern.compile("\\&#(\\d+);");
098
099    /** The prefix for HTML entities. */
100    private static final String ENTITY_PREFIX = "&#";
101
102    /** The replacement for HTML entity prefix in parameters. */
103    private static final String ENTITY_REPLACEMENT = "$$";
104
105    /** The log object for this class. */
106    private static final Log LOG = CmsLog.getLog(CmsEncoder.class);
107
108    /** A cache for encoding name lookup. */
109    private static Map<String, String> m_encodingCache = new HashMap<String, String>(16);
110
111    private static Random m_random = new Random();
112
113    /** The plus entity. */
114    private static final String PLUS_ENTITY = ENTITY_PREFIX + "043;";
115
116    /**
117     * Constructor.<p>
118     */
119    private CmsEncoder() {
120
121        // empty
122    }
123
124    /**
125     * Adjusts the given String by making sure all characters that can be displayed
126     * in the given charset are contained as chars, whereas all other non-displayable
127     * characters are converted to HTML entities.<p>
128     *
129     * Just calls {@link #decodeHtmlEntities(String)} first and feeds the result
130     * to {@link #encodeHtmlEntities(String, String)}. <p>
131     *
132     * @param input the input to adjust the HTML encoding for
133     * @param encoding the charset to encode the result with\
134     *
135     * @return the input with the decoded/encoded HTML entities
136     */
137    public static String adjustHtmlEncoding(String input, String encoding) {
138
139        return encodeHtmlEntities(decodeHtmlEntities(input), encoding);
140    }
141
142    /**
143     * Changes the encoding of a byte array that represents a String.<p>
144     *
145     * @param input the byte array to convert
146     * @param oldEncoding the current encoding of the byte array
147     * @param newEncoding the new encoding of the byte array
148     *
149     * @return the byte array encoded in the new encoding
150     */
151    public static byte[] changeEncoding(byte[] input, String oldEncoding, String newEncoding) {
152
153        if ((oldEncoding == null) || (newEncoding == null)) {
154            return input;
155        }
156        if (oldEncoding.trim().equalsIgnoreCase(newEncoding.trim())) {
157            return input;
158        }
159        byte[] result = input;
160        try {
161            result = (new String(input, oldEncoding)).getBytes(newEncoding);
162        } catch (UnsupportedEncodingException e) {
163            // return value will be input value
164        }
165        return result;
166    }
167
168    /**
169     * Converts the host of an URI to Punycode.<p>
170     *
171     * This is needed when we want to do redirects to hosts with host names containing international characters like umlauts.<p>
172     *
173     * @param uriString the URI
174     * @return the converted URI
175     */
176    public static String convertHostToPunycode(String uriString) {
177
178        if (uriString.indexOf(":") >= 0) {
179            try {
180                URI uri = new URI(uriString);
181                String authority = uri.getAuthority(); // getHost won't work when we have special characters
182                int colonPos = authority.indexOf(':');
183                if (colonPos >= 0) {
184                    authority = IDN.toASCII(authority.substring(0, colonPos)) + authority.substring(colonPos);
185                } else {
186                    authority = IDN.toASCII(authority);
187                }
188                URI uriWithCorrectedHost = new URI(uri.getScheme(), authority, null, null, null);
189                URIBuilder builder = new URIBuilder(uri);
190                builder.setHost(uriWithCorrectedHost.getHost());
191                builder.setPort(uriWithCorrectedHost.getPort());
192                builder.setUserInfo(uriWithCorrectedHost.getUserInfo());
193                uriString = builder.build().toASCIIString();
194            } catch (URISyntaxException e) {
195                LOG.error(e.getLocalizedMessage(), e);
196            }
197        }
198        return uriString;
199    }
200
201    /**
202     * Creates a String out of a byte array with the specified encoding, falling back
203     * to the system default in case the encoding name is not valid.<p>
204     *
205     * Use this method as a replacement for <code>new String(byte[], encoding)</code>
206     * to avoid possible encoding problems.<p>
207     *
208     * @param bytes the bytes to decode
209     * @param encoding the encoding scheme to use for decoding the bytes
210     *
211     * @return the bytes decoded to a String
212     */
213    public static String createString(byte[] bytes, String encoding) {
214
215        String enc = encoding.intern();
216        if (enc != OpenCms.getSystemInfo().getDefaultEncoding()) {
217            enc = lookupEncoding(enc, null);
218        }
219        if (enc != null) {
220            try {
221                return new String(bytes, enc);
222            } catch (UnsupportedEncodingException e) {
223                // this can _never_ happen since the charset was looked up first
224            }
225        } else {
226            if (LOG.isWarnEnabled()) {
227                LOG.warn(Messages.get().getBundle().key(Messages.ERR_UNSUPPORTED_VM_ENCODING_1, encoding));
228            }
229            enc = OpenCms.getSystemInfo().getDefaultEncoding();
230            try {
231                return new String(bytes, enc);
232            } catch (UnsupportedEncodingException e) {
233                // this can also _never_ happen since the default encoding is always valid
234            }
235        }
236        // this code is unreachable in practice
237        LOG.error(Messages.get().getBundle().key(Messages.ERR_ENCODING_ISSUES_1, encoding));
238        return null;
239    }
240
241    /**
242     * Decodes a String using UTF-8 encoding, which is the standard for http data transmission
243     * with GET ant POST requests.<p>
244     *
245     * @param source the String to decode
246     *
247     * @return String the decoded source String
248     */
249    public static String decode(String source) {
250
251        return decode(source, ENCODING_UTF_8);
252    }
253
254    /**
255     * This method is a substitute for <code>URLDecoder.decode()</code>.
256     * Use this in all OpenCms core classes to ensure the encoding is
257     * always handled the same way.<p>
258     *
259     * In case you don't know what encoding to use, set the value of
260     * the <code>encoding</code> parameter to <code>null</code>.
261     * This method will then default to UTF-8 encoding, which is probably the right one.<p>
262     *
263     * @param source The string to decode
264     * @param encoding The encoding to use (if null, the system default is used)
265     *
266     * @return The decoded source String
267     */
268    public static String decode(String source, String encoding) {
269
270        if (source == null) {
271            return null;
272        }
273        if (encoding != null) {
274            try {
275                return URLDecoder.decode(source, encoding);
276            } catch (java.io.UnsupportedEncodingException e) {
277                // will fallback to default
278            }
279        }
280        // fallback to default decoding
281        try {
282            return URLDecoder.decode(source, ENCODING_UTF_8);
283        } catch (java.io.UnsupportedEncodingException e) {
284            // ignore
285        }
286        return source;
287    }
288
289    /**
290     * Decodes HTML entity references like <code>&amp;#8364;</code>.
291     *
292     * @param input the input to decode the HTML entities in
293     * @return the input with the decoded HTML entities
294     *
295     * @see #encodeHtmlEntities(String, String)
296     */
297    public static String decodeHtmlEntities(String input) {
298
299        Matcher matcher = ENTITIY_PATTERN.matcher(input);
300        StringBuffer result = new StringBuffer(input.length());
301        while (matcher.find()) {
302            String value = matcher.group(1);
303            int c = Integer.valueOf(value).intValue();
304            String replacement = new String(Character.toChars(c));
305            matcher.appendReplacement(result, replacement);
306        }
307        matcher.appendTail(result);
308        return result.toString();
309    }
310
311    /**
312     * Decodes HTML entity references like <code>&amp;#8364;</code> that are contained in the
313     * String to a regular character, but only if that character is contained in the given
314     * encodings charset.<p>
315     *
316     * @param input the input to decode the HTML entities in
317     * @param encoding the charset to decode the input for
318     * @return the input with the decoded HTML entities
319     *
320     * @see #encodeHtmlEntities(String, String)
321     */
322    @Deprecated
323    public static String decodeHtmlEntities(String input, String encoding) {
324
325        Matcher matcher = ENTITIY_PATTERN.matcher(input);
326        StringBuffer result = new StringBuffer(input.length());
327        Charset charset = Charset.forName(encoding);
328        CharsetEncoder encoder = charset.newEncoder();
329
330        while (matcher.find()) {
331            String entity = matcher.group();
332            String value = entity.substring(2, entity.length() - 1);
333            int c = Integer.valueOf(value).intValue();
334
335            if (c < 128) {
336                // first 128 chars are contained in almost every charset
337                entity = new String(new char[] {(char)c});
338                // this is intended as performance improvement since
339                // the canEncode() operation appears quite CPU heavy
340            } else if (encoder.canEncode((char)c)) {
341                // encoder can encode this char
342                entity = new String(new char[] {(char)c});
343            }
344            matcher.appendReplacement(result, entity);
345        }
346        matcher.appendTail(result);
347        return result.toString();
348    }
349
350    /**
351     * Decodes a string used as parameter in an uri in a way independent of other encodings/decodings applied before.<p>
352     *
353     * @param input the encoded parameter string
354     *
355     * @return the decoded parameter string
356     *
357     * @see #encodeParameter(String)
358     */
359    public static String decodeParameter(String input) {
360
361        String result = CmsStringUtil.substitute(input, ENTITY_REPLACEMENT, ENTITY_PREFIX);
362        return CmsEncoder.decodeHtmlEntities(result, OpenCms.getSystemInfo().getDefaultEncoding());
363    }
364
365    /**
366     * Decodes a parameter which has been encoded from a string list using encodeStringsAsBase64Parameter.<p>
367     *
368     * @param data the data to decode
369     * @return the list of strings
370     */
371    public static List<String> decodeStringsFromBase64Parameter(String data) {
372
373        data = StringUtils.replaceChars(data, BASE64_EXTRA_REPLACEMENTS, BASE64_EXTRA);
374        byte[] bytes = deobfuscateBytes(Base64.decodeBase64(data));
375        try {
376            JSONArray json = new JSONArray(new String(bytes, "UTF-8"));
377            List<String> result = Lists.newArrayList();
378            for (int i = 0; i < json.length(); i++) {
379                result.add(json.getString(i));
380            }
381            return result;
382        } catch (UnsupportedEncodingException e) {
383            // TODO Auto-generated catch block
384            e.printStackTrace();
385        } catch (JSONException e) {
386            throw new IllegalArgumentException("Decoding failed: " + data, e);
387        }
388        return null;
389    }
390
391    /**
392     * Encodes a String using UTF-8 encoding, which is the standard for http data transmission
393     * with GET ant POST requests.<p>
394     *
395     * @param source the String to encode
396     *
397     * @return String the encoded source String
398     */
399    public static String encode(String source) {
400
401        return encode(source, ENCODING_UTF_8);
402    }
403
404    /**
405     * This method is a substitute for <code>URLEncoder.encode()</code>.
406     * Use this in all OpenCms core classes to ensure the encoding is
407     * always handled the same way.<p>
408     *
409     * In case you don't know what encoding to use, set the value of
410     * the <code>encoding</code> parameter to <code>null</code>.
411     * This method will then default to UTF-8 encoding, which is probably the right one.<p>
412     *
413     * @param source the String to encode
414     * @param encoding the encoding to use (if null, the system default is used)
415     *
416     * @return the encoded source String
417     */
418    public static String encode(String source, String encoding) {
419
420        if (source == null) {
421            return null;
422        }
423        if (encoding != null) {
424            try {
425                return URLEncoder.encode(source, encoding);
426            } catch (java.io.UnsupportedEncodingException e) {
427                // will fallback to default
428            }
429        }
430        // fallback to default encoding
431        try {
432            return URLEncoder.encode(source, ENCODING_UTF_8);
433        } catch (java.io.UnsupportedEncodingException e) {
434            // ignore
435        }
436        return source;
437    }
438
439    /**
440     * Encodes all characters that are contained in the String which can not displayed
441     * in the given encodings charset with HTML entity references
442     * like <code>&amp;#8364;</code>.<p>
443     *
444     * This is required since a Java String is
445     * internally always stored as Unicode, meaning it can contain almost every character, but
446     * the HTML charset used might not support all such characters.<p>
447     *
448     * @param input the input to encode for HTML
449     * @param encoding the charset to encode the result with
450     *
451     * @return the input with the encoded HTML entities
452     *
453     * @see #decodeHtmlEntities(String, String)
454     */
455    public static String encodeHtmlEntities(String input, String encoding) {
456
457        StringBuffer result = new StringBuffer(input.length() * 2);
458        Charset charset = Charset.forName(encoding);
459        CharsetEncoder encoder = charset.newEncoder();
460        input.codePoints().forEach(codepoint -> {
461            char[] charsForCodepoint = Character.toChars(codepoint);
462            boolean isSimple = (charsForCodepoint.length == 1) && (charsForCodepoint[0] < 128);
463            if (isSimple || encoder.canEncode(new String(charsForCodepoint))) {
464                result.append(charsForCodepoint);
465            } else {
466                result.append(ENTITY_PREFIX);
467                result.append(codepoint);
468                result.append(";");
469            }
470        });
471        return result.toString();
472    }
473
474    /**
475     * Encodes all characters that are contained in the String which can not displayed
476     * in the given encodings charset with Java escaping like <code>\u20ac</code>.<p>
477     *
478     * This can be used to escape values used in Java property files.<p>
479     *
480     * @param input the input to encode for Java
481     * @param encoding the charset to encode the result with
482     *
483     * @return the input with the encoded Java entities
484     */
485    public static String encodeJavaEntities(String input, String encoding) {
486
487        StringBuffer result = new StringBuffer(input.length() * 2);
488        CharBuffer buffer = CharBuffer.wrap(input.toCharArray());
489        Charset charset = Charset.forName(encoding);
490        CharsetEncoder encoder = charset.newEncoder();
491        for (int i = 0; i < buffer.length(); i++) {
492            int c = buffer.get(i);
493            if (c < 128) {
494                // first 128 chars are contained in almost every charset
495                result.append((char)c);
496                // this is intended as performance improvement since
497                // the canEncode() operation appears quite CPU heavy
498            } else if (encoder.canEncode((char)c)) {
499                // encoder can encode this char
500                result.append((char)c);
501            } else {
502                // append Java entity reference
503                result.append("\\u");
504                String hex = Integer.toHexString(c);
505                int pad = 4 - hex.length();
506                for (int p = 0; p < pad; p++) {
507                    result.append('0');
508                }
509                result.append(hex);
510            }
511        }
512        return result.toString();
513    }
514
515    /**
516     * Encodes a string used as parameter in an uri in a way independent of other encodings/decodings applied later.<p>
517     *
518     * Used to ensure that GET parameters are not wrecked by wrong or incompatible configuration settings.
519     * In order to ensure this, the String is first encoded with html entities for any character that cannot encoded
520     * in US-ASCII; additionally, the plus sign is also encoded to avoid problems with the white-space replacer.
521     * Finally, the entity prefix is replaced with characters not used as delimiters in urls.<p>
522     *
523     * @param input the parameter string
524     *
525     * @return the encoded parameter string
526     */
527    public static String encodeParameter(String input) {
528
529        String result = CmsEncoder.encodeHtmlEntities(input, CmsEncoder.ENCODING_US_ASCII);
530        result = CmsStringUtil.substitute(result, "+", PLUS_ENTITY);
531        return CmsStringUtil.substitute(result, ENTITY_PREFIX, ENTITY_REPLACEMENT);
532    }
533
534    /**
535     * Encode a list of strings as base64 data to be used in a request parameter.<p>
536     *
537     * @param strings the strings to encode
538     * @return the resulting base64 data
539     */
540    public static String encodeStringsAsBase64Parameter(List<String> strings) {
541
542        JSONArray array = new JSONArray();
543        for (String string : strings) {
544            array.put(string);
545        }
546        byte[] bytes;
547        try {
548            // use obfuscateBytes here to to make the output look more random
549            bytes = obfuscateBytes(array.toString().getBytes("UTF-8"));
550        } catch (UnsupportedEncodingException e) {
551            // should never happen
552            e.printStackTrace();
553            throw new RuntimeException(e);
554        }
555        String result = Base64.encodeBase64String(bytes);
556        result = StringUtils.replaceChars(result, BASE64_EXTRA, BASE64_EXTRA_REPLACEMENTS);
557        return result;
558    }
559
560    /**
561     * Encodes a String in a way similar to the JavaScript "encodeURIcomponent" function,
562     * using "UTF-8" for character encoding encoding.<p>
563     *
564     * JavaScript "decodeURIcomponent" can decode Strings that have been encoded using this method.<p>
565     *
566     * <b>Directly exposed for JSP EL<b>, not through {@link org.opencms.jsp.util.CmsJspElFunctions}.<p>
567     *
568     * @param source The text to be encoded
569     *
570     * @return The encoded string
571     *
572     * @see #escape(String, String)
573     */
574    public static String escape(String source) {
575
576        return escape(source, ENCODING_UTF_8);
577    }
578
579    /**
580     * Encodes a String in a way similar to the JavaScript "encodeURIcomponent" function.<p>
581     *
582     * JavaScript "decodeURIcomponent" can decode Strings that have been encoded using this method,
583     * provided "UTF-8" has been used as encoding.<p>
584     *
585     * <b>Directly exposed for JSP EL<b>, not through {@link org.opencms.jsp.util.CmsJspElFunctions}.<p>
586     *
587     * @param source The text to be encoded
588     * @param encoding the encoding type
589     *
590     * @return The encoded string
591     */
592    public static String escape(String source, String encoding) {
593
594        // the blank is encoded into "+" not "%20" when using standard encode call
595        return CmsStringUtil.substitute(encode(source, encoding), "+", "%20");
596    }
597
598    /**
599     * Escapes special characters in a HTML-String with their number-based
600     * entity representation, for example &amp; becomes &amp;#38;.<p>
601     *
602     * A character <code>num</code> is replaced if<br>
603     * <code>((ch != 32) && ((ch > 122) || (ch < 48) || (ch == 60) || (ch == 62)))</code><p>
604     *
605     * @param source the String to escape
606     *
607     * @return String the escaped String
608     *
609     * @see #escapeXml(String)
610     */
611    public static String escapeHtml(String source) {
612
613        if (source == null) {
614            return null;
615        }
616        StringBuffer result = new StringBuffer(source.length() * 2);
617        for (int i = 0; i < source.length(); i++) {
618            int ch = source.charAt(i);
619            // avoid escaping already escaped characters
620            if (ch == 38) {
621                int terminatorIndex = source.indexOf(";", i);
622                if (terminatorIndex > 0) {
623                    if (source.substring(i + 1, terminatorIndex).matches("#[0-9]+|lt|gt|amp|quote")) {
624                        result.append(source.substring(i, terminatorIndex + 1));
625                        // Skip remaining chars up to (and including) ";"
626                        i = terminatorIndex;
627                        continue;
628                    }
629                }
630            }
631            if ((ch != 32) && ((ch > 122) || (ch < 48) || (ch == 60) || (ch == 62))) {
632                result.append(ENTITY_PREFIX);
633                result.append(ch);
634                result.append(";");
635            } else {
636                result.append((char)ch);
637            }
638        }
639        return new String(result);
640    }
641
642    /**
643     * Escapes non ASCII characters in a HTML-String with their number-based
644     * entity representation, for example &amp; becomes &amp;#38;.<p>
645     *
646     * A character <code>num</code> is replaced if<br>
647     * <code>(ch > 255)</code><p>
648     *
649     * @param source the String to escape
650     *
651     * @return String the escaped String
652     *
653     * @see #escapeXml(String)
654     */
655    public static String escapeNonAscii(String source) {
656
657        if (source == null) {
658            return null;
659        }
660        StringBuffer result = new StringBuffer(source.length() * 2);
661        for (int i = 0; i < source.length(); i++) {
662            int ch = source.charAt(i);
663            if (ch > 255) {
664                result.append(ENTITY_PREFIX);
665                result.append(ch);
666                result.append(";");
667            } else {
668                result.append((char)ch);
669            }
670        }
671        return new String(result);
672    }
673
674    /**
675     * A simple method to avoid injection.<p>
676     *
677     * Replaces all single quotes to double single quotes in the value parameter of the SQL statement.<p>
678     *
679     * @param source the String to escape SQL from
680     * @return the escaped value of the parameter source
681     */
682    public static String escapeSql(String source) {
683
684        return source.replaceAll("'", "''");
685    }
686
687    /**
688     * Escapes the wildcard characters in a string which will be used as the pattern for a SQL LIKE clause.<p>
689     *
690     * @param pattern the pattern
691     * @param escapeChar the character which should be used as the escape character
692     *
693     * @return the escaped pattern
694     */
695    public static String escapeSqlLikePattern(String pattern, char escapeChar) {
696
697        char[] special = new char[] {escapeChar, '%', '_'};
698        String result = pattern;
699        for (char charToEscape : special) {
700            result = result.replaceAll("" + charToEscape, "" + escapeChar + charToEscape);
701        }
702        return result;
703    }
704
705    /**
706     * Encodes a String in a way similar JavaScript "encodeURIcomponent" function.<p>
707     *
708     * Multiple blanks are encoded _multiply_ with <code>%20</code>.<p>
709     *
710     * @param source The text to be encoded
711     * @param encoding the encoding type
712     *
713     * @return The encoded String
714     */
715    public static String escapeWBlanks(String source, String encoding) {
716
717        if (CmsStringUtil.isEmpty(source)) {
718            return source;
719        }
720        StringBuffer ret = new StringBuffer(source.length() * 2);
721
722        // URLEncode the text string
723        // this produces a very similar encoding to JavaSscript encoding,
724        // except the blank which is not encoded into "%20" instead of "+"
725
726        String enc = encode(source, encoding);
727        for (int z = 0; z < enc.length(); z++) {
728            char c = enc.charAt(z);
729            if (c == '+') {
730                ret.append("%20");
731            } else {
732                ret.append(c);
733            }
734        }
735        return ret.toString();
736    }
737
738    /**
739     * Escapes a String so it may be printed as text content or attribute
740     * value in a HTML page or an XML file.<p>
741     *
742     * This method replaces the following characters in a String:
743     * <ul>
744     * <li><b>&lt;</b> with &amp;lt;
745     * <li><b>&gt;</b> with &amp;gt;
746     * <li><b>&amp;</b> with &amp;amp;
747     * <li><b>&quot;</b> with &amp;quot;
748     * </ul><p>
749     *
750     * @param source the string to escape
751     *
752     * @return the escaped string
753     *
754     * @see #escapeHtml(String)
755     */
756    public static String escapeXml(String source) {
757
758        return escapeXml(source, false);
759    }
760
761    /**
762     * Escapes a String so it may be printed as text content or attribute
763     * value in a HTML page or an XML file.<p>
764     *
765     * This method replaces the following characters in a String:
766     * <ul>
767     * <li><b>&lt;</b> with &amp;lt;
768     * <li><b>&gt;</b> with &amp;gt;
769     * <li><b>&amp;</b> with &amp;amp;
770     * <li><b>&quot;</b> with &amp;quot;
771     * </ul><p>
772     *
773     * @param source the string to escape
774     * @param doubleEscape if <code>false</code>, all entities that already are escaped are left untouched
775     *
776     * @return the escaped string
777     *
778     * @see #escapeHtml(String)
779     */
780    public static String escapeXml(String source, boolean doubleEscape) {
781
782        if (source == null) {
783            return null;
784        }
785        StringBuffer result = new StringBuffer(source.length() * 2);
786
787        for (int i = 0; i < source.length(); ++i) {
788            char ch = source.charAt(i);
789            switch (ch) {
790                case '<':
791                    result.append("&lt;");
792                    break;
793                case '>':
794                    result.append("&gt;");
795                    break;
796                case '&':
797                    // don't escape already escaped international and special characters
798                    if (!doubleEscape) {
799                        int terminatorIndex = source.indexOf(";", i);
800                        if (terminatorIndex > 0) {
801                            if (source.substring(i + 1, terminatorIndex).matches("#[0-9]+")) {
802                                result.append(ch);
803                                break;
804                            }
805                        }
806                    }
807                    // note that to other "break" in the above "if" block
808                    result.append("&amp;");
809                    break;
810                case '"':
811                    result.append("&quot;");
812                    break;
813                case '\'':
814                    result.append("&apos;");
815                    break;
816                default:
817                    result.append(ch);
818            }
819        }
820        return new String(result);
821    }
822
823    /**
824     * Checks if a given encoding name is actually supported, and if so
825     * resolves it to it's canonical name, if not it returns the given fallback
826     * value.<p>
827     *
828     * Charsets have a set of aliases. For example, valid aliases for "UTF-8"
829     * are "UTF8", "utf-8" or "utf8". This method resolves any given valid charset name
830     * to it's "canonical" form, so that simple String comparison can be used
831     * when checking charset names internally later.<p>
832     *
833     * Please see <a href="http://www.iana.org/assignments/character-sets">http://www.iana.org/assignments/character-sets</a>
834     * for a list of valid charset alias names.<p>
835     *
836     * @param encoding the encoding to check and resolve
837     * @param fallback the fallback encoding scheme
838     *
839     * @return the resolved encoding name, or the fallback value
840     */
841    public static String lookupEncoding(String encoding, String fallback) {
842
843        String result = m_encodingCache.get(encoding);
844        if (result != null) {
845            return result;
846        }
847
848        try {
849            result = Charset.forName(encoding).name();
850            m_encodingCache.put(encoding, result);
851            return result;
852        } catch (Throwable t) {
853            // we will use the default value as fallback
854        }
855
856        return fallback;
857    }
858
859    /**
860     * Re-decodes a String that has not been correctly decoded and thus has scrambled
861     * character bytes.<p>
862     *
863     * This is an equivalent to the JavaScript "decodeURIComponent" function.
864     * It converts from the default "UTF-8" to the currently selected system encoding.<p>
865     *
866     * @param input the String to convert
867     *
868     * @return String the converted String
869     */
870    public static String redecodeUriComponent(String input) {
871
872        if (input == null) {
873            return input;
874        }
875        return new String(
876            changeEncoding(input.getBytes(), ENCODING_UTF_8, OpenCms.getSystemInfo().getDefaultEncoding()));
877    }
878
879    /**
880     * Decodes a String in a way similar to the JavaScript "decodeURIcomponent" function,
881     * using "UTF-8" for character encoding.<p>
882     *
883     * This method can decode Strings that have been encoded in JavaScript with "encodeURIcomponent".<p>
884     *
885     * <b>Directly exposed for JSP EL<b>, not through {@link org.opencms.jsp.util.CmsJspElFunctions}.<p>
886     *
887     * @param source The String to be decoded
888     *
889     * @return The decoded String
890     */
891    public static String unescape(String source) {
892
893        return unescape(source, ENCODING_UTF_8);
894    }
895
896    /**
897     * Decodes a String in a way similar to the JavaScript "decodeURIcomponent" function.<p>
898     *
899     * This method can decode Strings that have been encoded in JavaScript with "encodeURIcomponent",
900     * provided "UTF-8" is used as encoding.<p>
901     *
902     * <b>Directly exposed for JSP EL<b>, not through {@link org.opencms.jsp.util.CmsJspElFunctions}.<p>
903     *
904     * @param source The String to be decoded
905     * @param encoding the encoding type
906     *
907     * @return The decoded String
908     */
909    public static String unescape(String source, String encoding) {
910
911        if (source == null) {
912            return null;
913        }
914        int len = source.length();
915        // to use standard decoder we need to replace '+' with "%20" (space)
916        StringBuffer preparedSource = new StringBuffer(len);
917        for (int i = 0; i < len; i++) {
918            char c = source.charAt(i);
919            if (c == '+') {
920                preparedSource.append("%20");
921            } else {
922                preparedSource.append(c);
923            }
924        }
925        return decode(preparedSource.toString(), encoding);
926    }
927
928    /**
929     * Decrypts a byte array obfuscated with 'obfuscateBytes'.<p>
930     *
931     * @param source the source
932     * @return the resuvlt
933     */
934    private static byte[] deobfuscateBytes(byte[] source) {
935
936        byte[] result = new byte[source.length - 1];
937        System.arraycopy(source, 1, result, 0, source.length - 1);
938        for (int i = 0; i < result.length; i++) {
939            result[i] = (byte)(0xFF & (result[i] ^ source[0]));
940        }
941        return result;
942    }
943
944    /**
945     * Simple "obfuscation" for byte arrays using random numbers.<p>
946     *
947     * @param source the source array
948     * @return the result
949     */
950    private static byte[] obfuscateBytes(byte[] source) {
951
952        byte[] s = new byte[1];
953        m_random.nextBytes(s);
954        byte[] result = new byte[source.length + 1];
955        System.arraycopy(source, 0, result, 1, source.length);
956        result[0] = s[0];
957        for (int i = 1; i < result.length; i++) {
958            result[i] = (byte)(0xFF & (result[i] ^ s[0]));
959        }
960        return result;
961    }
962
963}