/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH & Co. KG (https://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software GmbH & Co. KG, please see the
 * company website: https://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: https://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.opencms.i18n;

import org.opencms.json.JSONArray;
import org.opencms.json.JSONException;
import org.opencms.main.CmsLog;
import org.opencms.main.OpenCms;
import org.opencms.util.CmsStringUtil;

import java.io.UnsupportedEncodingException;
import java.net.IDN;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.http.client.utils.URIBuilder;

import com.google.common.base.Strings;
import com.google.common.collect.Lists;

/**
 * The OpenCms CmsEncoder class provides static methods to decode and encode data.<p>
 *
 * The methods in this class are substitutes for <code>java.net.URLEncoder.encode()</code> and
 * <code>java.net.URLDecoder.decode()</code>. Use the methods from this class in all OpenCms
 * core classes to ensure the encoding is always handled the same way.<p>
 *
 * The de- and encoding uses the same coding mechanism as JavaScript, special characters are
 * replaced with <code>%hex</code> where hex is a two digit hex number.<p>
 *
 * <b>Note:</b> On the client side (browser) instead of using the deprecated <code>escape</code>
 * and <code>unescape</code> JavaScript functions, always use the <code>encodeURIComponent</code> and
 * <code>decodeURIComponent</code> functions. Only these work properly with unicode characters.<p>
 *
 * @since 6.0.0
 */
public final class CmsEncoder {

    /**
     * Pattern for decomposing the authority section of an URI.<p>
     *
     * Group 1 is the optional user info including the trailing <code>@</code>, group 2 is the host
     * and group 3 is the optional port including the leading <code>:</code>. The host group is
     * matched reluctantly, otherwise it would swallow the port and group 3 could never match.<p>
     */
    public static final Pattern AUTHORITY_PATTERN = Pattern.compile("^(.*?@)?(.*?)(:[0-9]+)?$");

    /** Non-alphanumeric characters used for Base64 encoding. */
    public static final String BASE64_EXTRA = "+/=";

    /** Characters used as replacements for non-alphanumeric Base64 characters when using Base64 for request parameters. */
    public static final String BASE64_EXTRA_REPLACEMENTS = "-_.";

    /** Constant for the standard <code>ISO-8859-1</code> encoding. */
    public static final String ENCODING_ISO_8859_1 = "ISO-8859-1";

    /** Constant for the standard <code>US-ASCII</code> encoding. */
    public static final String ENCODING_US_ASCII = "US-ASCII";

    /**
     * Constant for the standard <code>UTF-8</code> encoding.<p>
     *
     * Default encoding for JavaScript decodeUriComponent methods is <code>UTF-8</code> by w3c standard.
     */
    public static final String ENCODING_UTF_8 = "UTF-8";

    /** The regex pattern to match numeric HTML entities. */
    private static final Pattern ENTITY_PATTERN = Pattern.compile("\\&#(\\d+);");

    /** The prefix for HTML entities. */
    private static final String ENTITY_PREFIX = "&#";

    /** The replacement for HTML entity prefix in parameters. */
    private static final String ENTITY_REPLACEMENT = "$$";

    /** Matches the entity names (the text between <code>&amp;</code> and <code>;</code>) that {@link #escapeHtml(String)} leaves untouched. */
    private static final Pattern ESCAPED_HTML_ENTITY_NAME_PATTERN = Pattern.compile("#[0-9]+|lt|gt|amp|quot|quote");

    /** The log object for this class. */
    private static final Log LOG = CmsLog.getLog(CmsEncoder.class);

    /** Matches the body of a numeric entity reference (the text between <code>&amp;</code> and <code>;</code>). */
    private static final Pattern NUMERIC_ENTITY_NAME_PATTERN = Pattern.compile("#[0-9]+");

    /** The plus entity. */
    private static final String PLUS_ENTITY = ENTITY_PREFIX + "043;";

    /** A cache for encoding name lookup, concurrent because it is shared by all callers of the static methods. */
    private static final Map<String, String> m_encodingCache = new ConcurrentHashMap<>(16);

    /** Source of the key byte used by {@link #obfuscateBytes(byte[])}. */
    private static final Random m_random = new Random();

    /**
     * Constructor.<p>
     */
    private CmsEncoder() {

        // empty
    }

    /**
     * Adjusts the given String by making sure all characters that can be displayed
     * in the given charset are contained as chars, whereas all other non-displayable
     * characters are converted to HTML entities.<p>
     *
     * Just calls {@link #decodeHtmlEntities(String)} first and feeds the result
     * to {@link #encodeHtmlEntities(String, String)}.<p>
     *
     * @param input the input to adjust the HTML encoding for
     * @param encoding the charset to encode the result with
     *
     * @return the input with the decoded/encoded HTML entities
     */
    public static String adjustHtmlEncoding(String input, String encoding) {

        return encodeHtmlEntities(decodeHtmlEntities(input), encoding);
    }

    /**
     * Changes the encoding of a byte array that represents a String.<p>
     *
     * The input is returned unchanged if it is <code>null</code>, if one of the encodings is
     * <code>null</code>, if both encoding names are equal (ignoring case and surrounding
     * white space) or if one of the encodings is not supported.<p>
     *
     * @param input the byte array to convert
     * @param oldEncoding the current encoding of the byte array
     * @param newEncoding the new encoding of the byte array
     *
     * @return the byte array encoded in the new encoding
     */
    public static byte[] changeEncoding(byte[] input, String oldEncoding, String newEncoding) {

        if ((input == null) || (oldEncoding == null) || (newEncoding == null)) {
            return input;
        }
        if (oldEncoding.trim().equalsIgnoreCase(newEncoding.trim())) {
            return input;
        }
        try {
            return (new String(input, oldEncoding)).getBytes(newEncoding);
        } catch (UnsupportedEncodingException e) {
            // deliberate best effort: the return value will be the input value
            return input;
        }
    }

    /**
     * Converts the host of an URI to Punycode.<p>
     *
     * This is needed when we want to do redirects to hosts with host names containing international characters like umlauts.<p>
     *
     * The input is returned unchanged if it is <code>null</code>, contains no colon, has no
     * authority part (for example <code>mailto:</code> URIs) or can not be converted; conversion
     * problems are logged.<p>
     *
     * @param uriString the URI
     * @return the converted URI
     */
    public static String convertHostToPunycode(String uriString) {

        if ((uriString == null) || (uriString.indexOf(':') < 0)) {
            return uriString;
        }
        try {
            URI uri = new URI(uriString);
            String authority = uri.getAuthority(); // getHost won't work when we have non-ASCII domain characters
            if (authority == null) {
                // opaque or authority-less URI, there is no host to convert
                return uriString;
            }
            Matcher matcher = AUTHORITY_PATTERN.matcher(authority);
            if (matcher.matches()) {
                authority = Strings.nullToEmpty(matcher.group(1))
                    + IDN.toASCII(matcher.group(2))
                    + Strings.nullToEmpty(matcher.group(3));
            }
            // re-parse the pure ASCII authority so that host, port and user info can be read separately
            URI uriWithCorrectedHost = new URI(uri.getScheme(), authority, null, null, null);
            URIBuilder builder = new URIBuilder(uri);
            builder.setHost(uriWithCorrectedHost.getHost());
            builder.setPort(uriWithCorrectedHost.getPort());
            builder.setUserInfo(uriWithCorrectedHost.getUserInfo());
            return builder.build().toASCIIString();
        } catch (URISyntaxException | IllegalArgumentException e) {
            // IllegalArgumentException is thrown by IDN.toASCII for host names that can not be converted
            LOG.error(e.getLocalizedMessage(), e);
            return uriString;
        }
    }

    /**
     * Creates a String out of a byte array with the specified encoding, falling back
     * to the system default in case the encoding name is not valid.<p>
     *
     * Use this method as a replacement for <code>new String(byte[], encoding)</code>
     * to avoid possible encoding problems.<p>
     *
     * @param bytes the bytes to decode
     * @param encoding the encoding scheme to use for decoding the bytes
     *
     * @return the bytes decoded to a String
     */
    public static String createString(byte[] bytes, String encoding) {

        String defaultEncoding = OpenCms.getSystemInfo().getDefaultEncoding();
        String enc = encoding;
        if ((enc == null) || !enc.equals(defaultEncoding)) {
            // anything but the default encoding must be validated / resolved first
            enc = lookupEncoding(enc, null);
        }
        if (enc == null) {
            if (LOG.isWarnEnabled()) {
                LOG.warn(Messages.get().getBundle().key(Messages.ERR_UNSUPPORTED_VM_ENCODING_1, encoding));
            }
            enc = defaultEncoding;
        }
        try {
            return new String(bytes, enc);
        } catch (UnsupportedEncodingException e) {
            // not expected: the encoding was either looked up first or is the system default
            LOG.error(Messages.get().getBundle().key(Messages.ERR_ENCODING_ISSUES_1, encoding), e);
            return null;
        }
    }

    /**
     * Decodes a String using UTF-8 encoding, which is the standard for http data transmission
     * with GET and POST requests.<p>
     *
     * @param source the String to decode
     *
     * @return String the decoded source String
     */
    public static String decode(String source) {

        return decode(source, ENCODING_UTF_8);
    }

    /**
     * This method is a substitute for <code>URLDecoder.decode()</code>.
     * Use this in all OpenCms core classes to ensure the encoding is
     * always handled the same way.<p>
     *
     * In case you don't know what encoding to use, set the value of
     * the <code>encoding</code> parameter to <code>null</code>.
     * This method will then default to UTF-8 encoding, which is probably the right one.<p>
     *
     * @param source The string to decode
     * @param encoding The encoding to use (if <code>null</code> or not supported, UTF-8 is used)
     *
     * @return The decoded source String, or <code>null</code> if the source was <code>null</code>
     */
    public static String decode(String source, String encoding) {

        if (source == null) {
            return null;
        }
        if (encoding != null) {
            try {
                return URLDecoder.decode(source, encoding);
            } catch (UnsupportedEncodingException e) {
                // will fallback to default
            }
        }
        // fallback to default decoding
        try {
            return URLDecoder.decode(source, ENCODING_UTF_8);
        } catch (UnsupportedEncodingException e) {
            // ignore, the source is returned unchanged
        }
        return source;
    }

    /**
     * Decodes numeric HTML entity references like {@code &#8364;}.<p>
     *
     * References whose number is not a valid Unicode code point are left untouched.<p>
     *
     * @param input the input to decode the HTML entities in
     * @return the input with the decoded HTML entities
     *
     * @see #encodeHtmlEntities(String, String)
     */
    public static String decodeHtmlEntities(String input) {

        Matcher matcher = ENTITY_PATTERN.matcher(input);
        StringBuffer result = new StringBuffer(input.length());
        while (matcher.find()) {
            String replacement = matcher.group();
            int codePoint = parseEntityCodePoint(matcher.group(1));
            if (codePoint >= 0) {
                replacement = new String(Character.toChars(codePoint));
            }
            // quote the replacement, a decoded '$' or '\' would otherwise be read as a group reference
            matcher.appendReplacement(result, Matcher.quoteReplacement(replacement));
        }
        matcher.appendTail(result);
        return result.toString();
    }

    /**
     * Decodes numeric HTML entity references like {@code &#8364;} that are contained in the
     * String to a regular character, but only if that character is contained in the given
     * encodings charset.<p>
     *
     * @param input the input to decode the HTML entities in
     * @param encoding the charset to decode the input for
     * @return the input with the decoded HTML entities
     *
     * @see #encodeHtmlEntities(String, String)
     *
     * @deprecated {@link #decodeHtmlEntities(String)} decodes all numeric entities independent of a charset
     */
    @Deprecated
    public static String decodeHtmlEntities(String input, String encoding) {

        Matcher matcher = ENTITY_PATTERN.matcher(input);
        StringBuffer result = new StringBuffer(input.length());
        CharsetEncoder encoder = Charset.forName(encoding).newEncoder();

        while (matcher.find()) {
            String replacement = matcher.group();
            int codePoint = parseEntityCodePoint(matcher.group(1));
            if (codePoint >= 0) {
                String decoded = new String(Character.toChars(codePoint));
                // first 128 chars are contained in almost every charset, checking them first
                // is intended as performance improvement since canEncode() appears quite CPU heavy
                if ((codePoint < 128) || encoder.canEncode(decoded)) {
                    replacement = decoded;
                }
            }
            // quote the replacement, a decoded '$' or '\' would otherwise be read as a group reference
            matcher.appendReplacement(result, Matcher.quoteReplacement(replacement));
        }
        matcher.appendTail(result);
        return result.toString();
    }

    /**
     * Decodes a string used as parameter in an uri in a way independent of other encodings/decodings applied before.<p>
     *
     * @param input the encoded parameter string
     *
     * @return the decoded parameter string
     *
     * @see #encodeParameter(String)
     */
    public static String decodeParameter(String input) {

        String result = CmsStringUtil.substitute(input, ENTITY_REPLACEMENT, ENTITY_PREFIX);
        return CmsEncoder.decodeHtmlEntities(result, OpenCms.getSystemInfo().getDefaultEncoding());
    }

    /**
     * Decodes a parameter which has been encoded from a string list using
     * {@link #encodeStringsAsBase64Parameter(List)}.<p>
     *
     * @param data the data to decode
     * @return the list of strings
     *
     * @throws IllegalArgumentException if the data is empty or can not be decoded
     */
    public static List<String> decodeStringsFromBase64Parameter(String data) {

        String base64 = StringUtils.replaceChars(data, BASE64_EXTRA_REPLACEMENTS, BASE64_EXTRA);
        byte[] obfuscated = Base64.decodeBase64(base64);
        if ((obfuscated == null) || (obfuscated.length == 0)) {
            // valid data always contains at least the key byte written by obfuscateBytes
            throw new IllegalArgumentException("Decoding failed: " + base64);
        }
        byte[] bytes = deobfuscateBytes(obfuscated);
        try {
            JSONArray json = new JSONArray(new String(bytes, StandardCharsets.UTF_8));
            List<String> result = Lists.newArrayList();
            for (int i = 0; i < json.length(); i++) {
                result.add(json.getString(i));
            }
            return result;
        } catch (JSONException e) {
            throw new IllegalArgumentException("Decoding failed: " + base64, e);
        }
    }

    /**
     * Encodes a String using UTF-8 encoding, which is the standard for http data transmission
     * with GET and POST requests.<p>
     *
     * @param source the String to encode
     *
     * @return String the encoded source String
     */
    public static String encode(String source) {

        return encode(source, ENCODING_UTF_8);
    }

    /**
     * This method is a substitute for <code>URLEncoder.encode()</code>.
     * Use this in all OpenCms core classes to ensure the encoding is
     * always handled the same way.<p>
     *
     * In case you don't know what encoding to use, set the value of
     * the <code>encoding</code> parameter to <code>null</code>.
     * This method will then default to UTF-8 encoding, which is probably the right one.<p>
     *
     * @param source the String to encode
     * @param encoding the encoding to use (if <code>null</code> or not supported, UTF-8 is used)
     *
     * @return the encoded source String, or <code>null</code> if the source was <code>null</code>
     */
    public static String encode(String source, String encoding) {

        if (source == null) {
            return null;
        }
        if (encoding != null) {
            try {
                return URLEncoder.encode(source, encoding);
            } catch (UnsupportedEncodingException e) {
                // will fallback to default
            }
        }
        // fallback to default encoding
        try {
            return URLEncoder.encode(source, ENCODING_UTF_8);
        } catch (UnsupportedEncodingException e) {
            // ignore, the source is returned unchanged
        }
        return source;
    }

    /**
     * Encodes all characters that are contained in the String which can not be displayed
     * in the given encodings charset with HTML entity references
     * like {@code &#8364;}.<p>
     *
     * This is required since a Java String is
     * internally always stored as Unicode, meaning it can contain almost every character, but
     * the HTML charset used might not support all such characters.<p>
     *
     * @param input the input to encode for HTML
     * @param encoding the charset to encode the result with
     *
     * @return the input with the encoded HTML entities
     *
     * @see #decodeHtmlEntities(String)
     */
    public static String encodeHtmlEntities(String input, String encoding) {

        StringBuilder result = new StringBuilder(input.length() * 2);
        CharsetEncoder encoder = Charset.forName(encoding).newEncoder();
        int index = 0;
        while (index < input.length()) {
            // work on code points so that surrogate pairs end up in one single entity
            int codePoint = input.codePointAt(index);
            index += Character.charCount(codePoint);
            char[] charsForCodePoint = Character.toChars(codePoint);
            // first 128 chars are contained in almost every charset, no need to ask the encoder
            boolean isSimple = (charsForCodePoint.length == 1) && (charsForCodePoint[0] < 128);
            if (isSimple || encoder.canEncode(new String(charsForCodePoint))) {
                result.append(charsForCodePoint);
            } else {
                result.append(ENTITY_PREFIX);
                result.append(codePoint);
                result.append(';');
            }
        }
        return result.toString();
    }

    /**
     * Encodes all characters that are contained in the String which can not be displayed
     * in the given encodings charset with Java escaping like <code>&#92;u20ac</code>.<p>
     *
     * This can be used to escape values used in Java property files.<p>
     *
     * @param input the input to encode for Java
     * @param encoding the charset to encode the result with
     *
     * @return the input with the encoded Java entities
     */
    public static String encodeJavaEntities(String input, String encoding) {

        StringBuilder result = new StringBuilder(input.length() * 2);
        CharsetEncoder encoder = Charset.forName(encoding).newEncoder();
        for (int i = 0; i < input.length(); i++) {
            char c = input.charAt(i);
            // first 128 chars are contained in almost every charset, checking them first
            // is intended as performance improvement since canEncode() appears quite CPU heavy
            if ((c < 128) || encoder.canEncode(c)) {
                result.append(c);
            } else {
                // append Java escape, left padded with zeros to 4 hex digits
                result.append("\\u");
                String hex = Integer.toHexString(c);
                for (int p = hex.length(); p < 4; p++) {
                    result.append('0');
                }
                result.append(hex);
            }
        }
        return result.toString();
    }

    /**
     * Encodes a string used as parameter in an uri in a way independent of other encodings/decodings applied later.<p>
     *
     * Used to ensure that GET parameters are not wrecked by wrong or incompatible configuration settings.
     * In order to ensure this, the String is first encoded with html entities for any character that cannot be encoded
     * in US-ASCII; additionally, the plus sign is also encoded to avoid problems with the white-space replacer.
     * Finally, the entity prefix is replaced with characters not used as delimiters in urls.<p>
     *
     * @param input the parameter string
     *
     * @return the encoded parameter string
     *
     * @see #decodeParameter(String)
     */
    public static String encodeParameter(String input) {

        String result = CmsEncoder.encodeHtmlEntities(input, CmsEncoder.ENCODING_US_ASCII);
        result = CmsStringUtil.substitute(result, "+", PLUS_ENTITY);
        return CmsStringUtil.substitute(result, ENTITY_PREFIX, ENTITY_REPLACEMENT);
    }

    /**
     * Encode a list of strings as base64 data to be used in a request parameter.<p>
     *
     * @param strings the strings to encode
     * @return the resulting base64 data
     *
     * @see #decodeStringsFromBase64Parameter(String)
     */
    public static String encodeStringsAsBase64Parameter(List<String> strings) {

        JSONArray array = new JSONArray();
        for (String string : strings) {
            array.put(string);
        }
        // use obfuscateBytes here to make the output look more random
        byte[] bytes = obfuscateBytes(array.toString().getBytes(StandardCharsets.UTF_8));
        String result = Base64.encodeBase64String(bytes);
        // replace the Base64 characters that have a special meaning in request parameters
        return StringUtils.replaceChars(result, BASE64_EXTRA, BASE64_EXTRA_REPLACEMENTS);
    }

    /**
     * Encodes a String in a way similar to the JavaScript "encodeURIcomponent" function,
     * using "UTF-8" for character encoding.<p>
     *
     * JavaScript "decodeURIcomponent" can decode Strings that have been encoded using this method.<p>
     *
     * <b>Directly exposed for JSP EL</b>, not through {@link org.opencms.jsp.util.CmsJspElFunctions}.<p>
     *
     * @param source The text to be encoded
     *
     * @return The encoded string
     *
     * @see #escape(String, String)
     */
    public static String escape(String source) {

        return escape(source, ENCODING_UTF_8);
    }

    /**
     * Encodes a String in a way similar to the JavaScript "encodeURIcomponent" function.<p>
     *
     * JavaScript "decodeURIcomponent" can decode Strings that have been encoded using this method,
     * provided "UTF-8" has been used as encoding.<p>
     *
     * <b>Directly exposed for JSP EL</b>, not through {@link org.opencms.jsp.util.CmsJspElFunctions}.<p>
     *
     * @param source The text to be encoded
     * @param encoding the encoding type
     *
     * @return The encoded string
     */
    public static String escape(String source, String encoding) {

        // the blank is encoded into "+" not "%20" when using standard encode call
        return CmsStringUtil.substitute(encode(source, encoding), "+", "%20");
    }

    /**
     * Escapes special characters in a HTML-String with their number-based
     * entity representation, for example {@code &} becomes {@code &#38;}.<p>
     *
     * A character <code>ch</code> is replaced if<br>
     * {@code ((ch != 32) && ((ch > 122) || (ch < 48) || (ch == 60) || (ch == 62)))}<p>
     *
     * Numeric entity references and the named references for {@code <}, {@code >}, {@code &}
     * and {@code "} that are already contained in the source are left untouched.
     * Characters outside the basic multilingual plane are written as one single entity.<p>
     *
     * @param source the String to escape
     *
     * @return String the escaped String
     *
     * @see #escapeXml(String)
     */
    public static String escapeHtml(String source) {

        if (source == null) {
            return null;
        }
        StringBuilder result = new StringBuilder(source.length() * 2);
        int i = 0;
        while (i < source.length()) {
            int ch = source.codePointAt(i);
            // avoid escaping already escaped characters
            if (ch == '&') {
                int terminatorIndex = source.indexOf(';', i);
                if ((terminatorIndex > i)
                    && ESCAPED_HTML_ENTITY_NAME_PATTERN.matcher(source.substring(i + 1, terminatorIndex)).matches()) {
                    // copy the complete entity and continue behind the ";"
                    result.append(source, i, terminatorIndex + 1);
                    i = terminatorIndex + 1;
                    continue;
                }
            }
            if ((ch != 32) && ((ch > 122) || (ch < 48) || (ch == 60) || (ch == 62))) {
                result.append(ENTITY_PREFIX);
                result.append(ch);
                result.append(';');
            } else {
                result.append((char)ch);
            }
            i += Character.charCount(ch);
        }
        return result.toString();
    }

    /**
     * Escapes all characters above 255 in a HTML-String with their number-based
     * entity representation, for example the Euro sign becomes {@code &#8364;}.<p>
     *
     * A character <code>ch</code> is replaced if<br>
     * {@code (ch > 255)}<p>
     *
     * Characters outside the basic multilingual plane are written as one single entity.<p>
     *
     * @param source the String to escape
     *
     * @return String the escaped String
     *
     * @see #escapeXml(String)
     */
    public static String escapeNonAscii(String source) {

        if (source == null) {
            return null;
        }
        StringBuilder result = new StringBuilder(source.length() * 2);
        int i = 0;
        while (i < source.length()) {
            int ch = source.codePointAt(i);
            i += Character.charCount(ch);
            if (ch > 255) {
                result.append(ENTITY_PREFIX);
                result.append(ch);
                result.append(';');
            } else {
                result.append((char)ch);
            }
        }
        return result.toString();
    }

    /**
     * A simple method to avoid injection.<p>
     *
     * Replaces all single quotes to double single quotes in the value parameter of the SQL statement.<p>
     *
     * NOTE(review): this is no substitute for parameterized statements, prefer those where possible.<p>
     *
     * @param source the String to escape SQL from
     * @return the escaped value of the parameter source, or <code>null</code> if the source was <code>null</code>
     */
    public static String escapeSql(String source) {

        if (source == null) {
            return null;
        }
        // literal replacement, no regular expression required
        return source.replace("'", "''");
    }

    /**
     * Escapes the wildcard characters in a string which will be used as the pattern for a SQL LIKE clause.<p>
     *
     * The escape character itself as well as <code>%</code> and <code>_</code> are prefixed with
     * the escape character. All characters are treated literally, so any character (including
     * <code>\</code> or <code>$</code>) can be used as escape character.<p>
     *
     * @param pattern the pattern
     * @param escapeChar the character which should be used as the escape character
     *
     * @return the escaped pattern
     */
    public static String escapeSqlLikePattern(String pattern, char escapeChar) {

        StringBuilder result = new StringBuilder(pattern.length() + 8);
        for (int i = 0; i < pattern.length(); i++) {
            char c = pattern.charAt(i);
            if ((c == escapeChar) || (c == '%') || (c == '_')) {
                result.append(escapeChar);
            }
            result.append(c);
        }
        return result.toString();
    }

    /**
     * Encodes a String in a way similar to the JavaScript "encodeURIcomponent" function.<p>
     *
     * Multiple blanks are encoded _multiply_ with <code>%20</code>.<p>
     *
     * @param source The text to be encoded
     * @param encoding the encoding type
     *
     * @return The encoded String
     */
    public static String escapeWBlanks(String source, String encoding) {

        if (CmsStringUtil.isEmpty(source)) {
            return source;
        }
        // URLEncode the text string, this produces a very similar encoding to JavaScript
        // encoding, except the blank which is encoded into "+" instead of "%20"
        return encode(source, encoding).replace("+", "%20");
    }

    /**
     * Escapes a String so it may be printed as text content or attribute
     * value in a HTML page or an XML file.<p>
     *
     * This method replaces the following characters in a String:
     * <ul>
     * <li>{@code <} with {@code &lt;}
     * <li>{@code >} with {@code &gt;}
     * <li>{@code &} with {@code &amp;}
     * <li>{@code "} with {@code &quot;}
     * </ul><p>
     *
     * @param source the string to escape
     *
     * @return the escaped string
     *
     * @see #escapeHtml(String)
     */
    public static String escapeXml(String source) {

        return escapeXml(source, false);
    }

    /**
     * Escapes a String so it may be printed as text content or attribute
     * value in a HTML page or an XML file.<p>
     *
     * This method replaces the following characters in a String:
     * <ul>
     * <li>{@code <} with {@code &lt;}
     * <li>{@code >} with {@code &gt;}
     * <li>{@code &} with {@code &amp;}
     * <li>{@code "} with {@code &quot;}
     * </ul><p>
     *
     * @param source the string to escape
     * @param doubleEscape if <code>false</code>, the {@code &} of numeric entity references
     *      like {@code &#8364;} that are already contained in the source is left untouched
     *
     * @return the escaped string
     *
     * @see #escapeHtml(String)
     */
    public static String escapeXml(String source, boolean doubleEscape) {

        if (source == null) {
            return null;
        }
        StringBuilder result = new StringBuilder(source.length() * 2);

        for (int i = 0; i < source.length(); ++i) {
            char ch = source.charAt(i);
            switch (ch) {
                case '<':
                    result.append("&lt;");
                    break;
                case '>':
                    result.append("&gt;");
                    break;
                case '&':
                    if (!doubleEscape && isNumericEntityAt(source, i)) {
                        // don't escape already escaped international and special characters
                        result.append(ch);
                    } else {
                        result.append("&amp;");
                    }
                    break;
                case '"':
                    result.append("&quot;");
                    break;
                case '\'':
                    // NOTE(review): the named entity for the apostrophe is assumed here, please confirm
                    result.append("&apos;");
                    break;
                default:
                    result.append(ch);
            }
        }
        return result.toString();
    }

    /**
     * Checks if a given encoding name is actually supported, and if so
     * resolves it to its canonical name, if not it returns the given fallback
     * value.<p>
     *
     * Charsets have a set of aliases. For example, valid aliases for "UTF-8"
     * are "UTF8", "utf-8" or "utf8". This method resolves any given valid charset name
     * to its "canonical" form, so that simple String comparison can be used
     * when checking charset names internally later.<p>
     *
     * Please see <a href="http://www.iana.org/assignments/character-sets">http://www.iana.org/assignments/character-sets</a>
     * for a list of valid charset alias names.<p>
     *
     * @param encoding the encoding to check and resolve
     * @param fallback the fallback encoding scheme
     *
     * @return the resolved encoding name, or the fallback value
     */
    public static String lookupEncoding(String encoding, String fallback) {

        if (encoding == null) {
            // the concurrent cache does not accept null keys, and null is never a valid charset name
            return fallback;
        }
        String result = m_encodingCache.get(encoding);
        if (result != null) {
            return result;
        }
        try {
            result = Charset.forName(encoding).name();
            m_encodingCache.put(encoding, result);
            return result;
        } catch (IllegalArgumentException e) {
            // illegal or unsupported charset name, we will use the default value as fallback
            return fallback;
        }
    }

    /**
     * Re-decodes a String that has not been correctly decoded and thus has scrambled
     * character bytes.<p>
     *
     * This is an equivalent to the JavaScript "decodeURIComponent" function.
     * It converts from the default "UTF-8" to the currently selected system encoding.<p>
     *
     * NOTE(review): the bytes are obtained and the result is created using the platform default
     * charset of the JVM, so the outcome depends on the platform configuration - confirm that
     * this is intended.<p>
     *
     * @param input the String to convert
     *
     * @return String the converted String
     */
    public static String redecodeUriComponent(String input) {

        if (input == null) {
            return input;
        }
        return new String(
            changeEncoding(input.getBytes(), ENCODING_UTF_8, OpenCms.getSystemInfo().getDefaultEncoding()));
    }

    /**
     * Decodes a String in a way similar to the JavaScript "decodeURIcomponent" function,
     * using "UTF-8" for character encoding.<p>
     *
     * This method can decode Strings that have been encoded in JavaScript with "encodeURIcomponent".<p>
     *
     * <b>Directly exposed for JSP EL</b>, not through {@link org.opencms.jsp.util.CmsJspElFunctions}.<p>
     *
     * @param source The String to be decoded
     *
     * @return The decoded String
     */
    public static String unescape(String source) {

        return unescape(source, ENCODING_UTF_8);
    }

    /**
     * Decodes a String in a way similar to the JavaScript "decodeURIcomponent" function.<p>
     *
     * This method can decode Strings that have been encoded in JavaScript with "encodeURIcomponent",
     * provided "UTF-8" is used as encoding.<p>
     *
     * <b>Directly exposed for JSP EL</b>, not through {@link org.opencms.jsp.util.CmsJspElFunctions}.<p>
     *
     * @param source The String to be decoded
     * @param encoding the encoding type
     *
     * @return The decoded String, or <code>null</code> if the source was <code>null</code>
     */
    public static String unescape(String source, String encoding) {

        if (source == null) {
            return null;
        }
        // the standard decoder turns '+' into a blank, so a literal '+' is replaced
        // with "%20" before the source is passed on
        return decode(source.replace("+", "%20"), encoding);
    }

    /**
     * Reverts the obfuscation done by {@link #obfuscateBytes(byte[])}.<p>
     *
     * The first byte of the source is the key, all following bytes are XOR-ed with it.<p>
     *
     * @param source the obfuscated bytes, must contain at least the key byte
     * @return the original bytes
     */
    private static byte[] deobfuscateBytes(byte[] source) {

        byte key = source[0];
        byte[] result = new byte[source.length - 1];
        for (int i = 0; i < result.length; i++) {
            result[i] = (byte)(source[i + 1] ^ key);
        }
        return result;
    }

    /**
     * Checks if the <code>&amp;</code> at the given position starts a numeric entity reference
     * like {@code &#8364;}.<p>
     *
     * @param source the String to check
     * @param ampIndex the index of the <code>&amp;</code> character
     * @return <code>true</code> if a numeric entity reference starts at the given index
     */
    private static boolean isNumericEntityAt(String source, int ampIndex) {

        int terminatorIndex = source.indexOf(';', ampIndex);
        return (terminatorIndex > ampIndex)
            && NUMERIC_ENTITY_NAME_PATTERN.matcher(source.substring(ampIndex + 1, terminatorIndex)).matches();
    }

    /**
     * Simple "obfuscation" for byte arrays using random numbers.<p>
     *
     * The result is one byte longer than the source: the first byte is a random key, followed
     * by the source bytes XOR-ed with that key.<p>
     *
     * @param source the source array
     * @return the result
     */
    private static byte[] obfuscateBytes(byte[] source) {

        byte[] keyHolder = new byte[1];
        m_random.nextBytes(keyHolder);
        byte key = keyHolder[0];
        byte[] result = new byte[source.length + 1];
        result[0] = key;
        for (int i = 0; i < source.length; i++) {
            result[i + 1] = (byte)(source[i] ^ key);
        }
        return result;
    }

    /**
     * Parses the decimal digits of a numeric entity reference.<p>
     *
     * @param digits the digits between {@code &#} and {@code ;}
     * @return the code point, or <code>-1</code> if the number is not a valid Unicode code point
     */
    private static int parseEntityCodePoint(String digits) {

        try {
            int codePoint = Integer.parseInt(digits);
            return Character.isValidCodePoint(codePoint) ? codePoint : -1;
        } catch (NumberFormatException e) {
            // the number does not fit into an int, so it can not be a code point
            return -1;
        }
    }

}