/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.opencms.search;

import org.opencms.ade.galleries.shared.CmsGallerySearchScope;
import org.opencms.file.CmsObject;
import org.opencms.file.CmsPropertyDefinition;
import org.opencms.i18n.CmsEncoder;
import org.opencms.main.CmsLog;
import org.opencms.main.OpenCms;
import org.opencms.search.galleries.CmsGallerySearchParameters;
import org.opencms.site.CmsSiteManagerImpl;
import org.opencms.util.CmsHtmlExtractor;
import org.opencms.util.CmsStringUtil;

import java.io.UnsupportedEncodingException;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.Instant;
import java.time.format.DateTimeParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.TimeZone;

import org.apache.commons.logging.Log;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.ContentStreamBase;

import org.htmlparser.util.ParserException;

/**
 * Provides common functions regarding searching.<p>
 *
 * @since 9.0.0
 */
public final class CmsSearchUtil {

    /**
     * Date format object that obeys ISO 8601 which is used by Solr.<p>
     *
     * {@link SimpleDateFormat} is not thread safe, so every access to this object is synchronized on
     * <code>CmsSearchUtil.class</code>. A locale neutral instance is used so that the output does not depend
     * on the default locale of the JVM. The time zone is set to UTC once in the static initializer.
     */
    private static final DateFormat DATEFORMAT_ISO_8601 = new SimpleDateFormat(
        "yyyy-MM-dd'T'HH:mm:ss'Z'",
        Locale.ROOT);

    /** The log object for this class. */
    private static final Log LOG = CmsLog.getLog(CmsSearchUtil.class);

    /** Variable to hold an UTC timezone object. */
    public static final TimeZone TIMEZONE_UTC = TimeZone.getTimeZone("UTC");

    /** Variable to hold an GMT timezone object. */
    public static final TimeZone TIMEZONE_GMT = TimeZone.getTimeZone("GMT");

    //start HttpClient
    /**
     * Date format pattern used to parse HTTP date headers in RFC 1123 format.
     */
    public static final String PATTERN_RFC1123 = "EEE, dd MMM yyyy HH:mm:ss zzz";

    /**
     * Date format pattern used to parse HTTP date headers in RFC 1036 format.
     */
    public static final String PATTERN_RFC1036 = "EEEE, dd-MMM-yy HH:mm:ss zzz";

    /**
     * Date format pattern used to parse HTTP date headers in ANSI C
     * <code>asctime()</code> format.
     */
    public static final String PATTERN_ASCTIME = "EEE MMM d HH:mm:ss yyyy";

    /** The HTTP client date patterns, these are included for backward compatibility. */
    private static final Collection<String> DEFAULT_HTTP_CLIENT_PATTERNS = Arrays.asList(
        PATTERN_ASCTIME,
        PATTERN_RFC1036,
        PATTERN_RFC1123);
    //---------------------------------------------------------------------------------------

    /**
     * A suite of default date formats that can be parsed, and thus transformed to the Solr specific format.
     */
    public static final Collection<String> DEFAULT_DATE_FORMATS = new ArrayList<>();

    /** The start of the 100 year range used to interpret two digit years if no other start date is given. */
    private static final Date DEFAULT_TWO_DIGIT_YEAR_START;

    static {
        // configure the shared formatter once, instead of lazily on first use
        DATEFORMAT_ISO_8601.setTimeZone(TIMEZONE_UTC);

        DEFAULT_DATE_FORMATS.add("yyyy-MM-dd'T'HH:mm:ss'Z'");
        DEFAULT_DATE_FORMATS.add("yyyy-MM-dd'T'HH:mm:ss");
        DEFAULT_DATE_FORMATS.add("yyyy-MM-dd");
        DEFAULT_DATE_FORMATS.add("yyyy-MM-dd hh:mm:ss");
        DEFAULT_DATE_FORMATS.add("yyyy-MM-dd HH:mm:ss");
        DEFAULT_DATE_FORMATS.add("EEE MMM d hh:mm:ss z yyyy");
        DEFAULT_DATE_FORMATS.addAll(DEFAULT_HTTP_CLIENT_PATTERNS);

        Calendar calendar = Calendar.getInstance(TIMEZONE_GMT, Locale.ROOT);
        // also reset seconds and milliseconds, otherwise the start date would carry the
        // values of the moment this class was loaded
        calendar.set(2000, Calendar.JANUARY, 1, 0, 0, 0);
        calendar.set(Calendar.MILLISECOND, 0);
        DEFAULT_TWO_DIGIT_YEAR_START = calendar.getTime();
    }

    /**
     * Private constructor of utility class.<p>
     */
    private CmsSearchUtil() {

        // noop
    }

    /**
     * Computes the search root folders for the given search parameters based on the search scope.<p>
     *
     * The sub site is determined from the reference path of the search parameters. A warning is logged
     * if no reference path is provided or if no sub site could be evaluated for it.<p>
     *
     * @param cms the current CMS context
     * @param params the current search parameters
     *
     * @return the search root folders based on the search scope
     */
    public static List<String> computeScopeFolders(CmsObject cms, CmsGallerySearchParameters params) {

        String subsite = null;
        if (params.getReferencePath() != null) {
            subsite = OpenCms.getADEManager().getSubSiteRoot(
                cms,
                cms.getRequestContext().addSiteRoot(params.getReferencePath()));
            if (subsite != null) {
                subsite = cms.getRequestContext().removeSiteRoot(subsite);
            } else if (LOG.isWarnEnabled()) {
                LOG.warn(
                    Messages.get().getBundle().key(
                        Messages.LOG_GALLERIES_COULD_NOT_EVALUATE_SUBSITE_1,
                        params.getReferencePath()));
            }
        } else if (LOG.isWarnEnabled()) {
            LOG.warn(Messages.get().getBundle().key(Messages.LOG_GALLERIES_NO_REFERENCE_PATH_PROVIDED_0));
        }

        return getSearchRootsForScope(params.getScope(), cms.getRequestContext().getSiteRoot(), subsite);
    }

    /**
     * Returns a given date object in the ISO 8601 format.<p>
     *
     * The result is always expressed in UTC.<p>
     *
     * @param date that should be converted.
     * @return string that represents the given date in the ISO 8601 format.
     */
    public static String getDateAsIso8601(Date date) {

        // the shared formatter is not thread safe
        synchronized (CmsSearchUtil.class) {
            return DATEFORMAT_ISO_8601.format(date);
        }
    }

    /**
     * Returns a given date in the ISO 8601 format.<p>
     *
     * The values {@link Long#MIN_VALUE} and {@link Long#MAX_VALUE} are treated as "date not set".<p>
     *
     * @param date that should be converted, in milliseconds.
     * @return string that represents the given date in the ISO 8601 format,
     *      or <code>null</code> if the date is not set.
     */
    public static String getDateAsIso8601(long date) {

        // Check if date is set
        if ((date > Long.MIN_VALUE) && (date < Long.MAX_VALUE)) {
            return getDateAsIso8601(new Date(date));
        }
        return null;
    }

    /**
     * Returns a time interval as Solr compatible query string.<p>
     *
     * A limit that is not set ({@link Long#MIN_VALUE} or {@link Long#MAX_VALUE}) is rendered as the
     * asterisk '*' operator.<p>
     *
     * @param searchField the field to search for.
     * @param startTime the lower limit of the interval.
     * @param endTime the upper limit of the interval.
     * @return Solr compatible query string.
     */
    public static String getDateCreatedTimeRangeFilterQuery(String searchField, long startTime, long endTime) {

        // Convert both limits to ISO 8601 format, limits that are not set become null
        String sStartTime = getDateAsIso8601(startTime);
        String sEndTime = getDateAsIso8601(endTime);

        // Build Solr range string
        String rangeString = getSolrRangeString(sStartTime, sEndTime);

        // Build Solr filter string
        return String.format("%s:%s", searchField, rangeString);
    }

    /**
     * Gets the search roots to use for the given site/subsite parameters.<p>
     *
     * @param scope the search scope
     * @param siteParam the current site
     * @param subSiteParam the current subsite
     *
     * @return the list of search roots for that option
     */
    public static List<String> getSearchRootsForScope(
        CmsGallerySearchScope scope,
        String siteParam,
        String subSiteParam) {

        List<String> result = new ArrayList<>();
        if (scope == CmsGallerySearchScope.everything) {
            result.add("/");
            return result;
        }
        if (scope.isIncludeSite()) {
            result.add(siteParam);
        }
        if (scope.isIncludeSubSite()) {
            if (subSiteParam == null) {
                // without a sub site the site itself is used
                result.add(siteParam);
            } else {
                result.add(CmsStringUtil.joinPaths(siteParam, subSiteParam));
            }
        }
        if (scope.isIncludeShared()) {
            String sharedFolder = OpenCms.getSiteManager().getSharedFolder();
            if (sharedFolder != null) {
                result.add(sharedFolder);
            }
            result.add(CmsSiteManagerImpl.PATH_SYSTEM_SHARED_FOLDER);
        }
        return result;
    }

    /**
     * Returns a string that represents a valid Solr query range.<p>
     *
     * @param from Lower bound of the query range.
     * @param to Upper bound of the query range.
     * @return String that represents a Solr query range.
     */
    public static String getSolrRangeString(String from, String to) {

        // If a parameter is not initialized, use the asterisk '*' operator
        if (CmsStringUtil.isEmptyOrWhitespaceOnly(from)) {
            from = "*";
        }

        if (CmsStringUtil.isEmptyOrWhitespaceOnly(to)) {
            to = "*";
        }

        return String.format("[%s TO %s]", from, to);
    }

    /**
     * Parses the given date String using the {@link #DEFAULT_DATE_FORMATS}.<p>
     *
     * @param d The input date to parse
     * @return The parsed {@link java.util.Date}
     * @throws java.text.ParseException If the input can't be parsed
     */
    public static Date parseDate(String d) throws ParseException {

        return parseDate(d, DEFAULT_DATE_FORMATS);
    }

    /**
     * Parses the given date String using the given date formats.<p>
     *
     * Values that end with 'Z' and are longer than 20 characters, for example
     * <code>2007-04-26T08:05:04.123Z</code>, are treated as ISO 8601 timestamps in UTC and are parsed
     * without consulting the given formats. All other values are parsed with the given formats.<p>
     *
     * @param d The input date to parse
     * @param fmts The date formats to try, if <code>null</code> the HTTP client patterns are used
     * @return The parsed {@link java.util.Date}
     * @throws java.text.ParseException If the input can't be parsed
     */
    public static Date parseDate(String d, Collection<String> fmts) throws ParseException {

        if (d == null) {
            // same contract as the three argument variant
            throw new IllegalArgumentException("dateValue is null");
        }
        if (d.endsWith("Z") && (d.length() > 20)) {
            try {
                // handles fractional seconds, which the second based formatter can not parse
                return Date.from(Instant.parse(d));
            } catch (DateTimeParseException | IllegalArgumentException ignored) {
                // not a strict ISO 8601 instant, use the formatter below
            }
            // the shared formatter is not thread safe
            synchronized (CmsSearchUtil.class) {
                return DATEFORMAT_ISO_8601.parse(d);
            }
        }
        return parseDate(d, fmts, null);
    }

    /**
     * Slightly modified from org.apache.commons.httpclient.util.DateUtil.parseDate
     * <p>
     * Parses the date value using the given date formats.
     *
     * @param dateValue the date value to parse
     * @param dateFormats the date formats to use, if <code>null</code> the HTTP client patterns are used
     * @param startDate During parsing, two digit years will be placed in the range
     * <code>startDate</code> to <code>startDate + 100 years</code>. This value may
     * be <code>null</code>. When <code>null</code> is given as a parameter, year
     * <code>2000</code> will be used.
     * @return the parsed date
     * @throws ParseException if none of the dataFormats could parse the dateValue
     */
    public static Date parseDate(String dateValue, Collection<String> dateFormats, Date startDate)
    throws ParseException {

        if (dateValue == null) {
            throw new IllegalArgumentException("dateValue is null");
        }
        if (dateFormats == null) {
            dateFormats = DEFAULT_HTTP_CLIENT_PATTERNS;
        }
        if (startDate == null) {
            startDate = DEFAULT_TWO_DIGIT_YEAR_START;
        }
        // trim single quotes around date if present
        // see issue #5279
        if ((dateValue.length() > 1) && dateValue.startsWith("'") && dateValue.endsWith("'")) {
            dateValue = dateValue.substring(1, dateValue.length() - 1);
        }

        // the parser is created for the first format and reused for all following formats
        SimpleDateFormat dateParser = null;
        for (String format : dateFormats) {
            if (dateParser == null) {
                dateParser = new SimpleDateFormat(format, Locale.ENGLISH);
                dateParser.setTimeZone(TIMEZONE_GMT);
                dateParser.set2DigitYearStart(startDate);
            } else {
                dateParser.applyPattern(format);
            }
            try {
                return dateParser.parse(dateValue);
            } catch (ParseException ignored) {
                // ignore this exception, we will try the next format
            }
        }

        // we were unable to parse the date
        throw new ParseException("Unable to parse the date " + dateValue, 0);
    }

    /**
     * Strips the HTML off the value to map, if necessary (depending on the property name).<p>
     *
     * @param propertyName name of the property.
     * @param value the properties value (possibly with HTML)
     * @return the value with HTML stripped off, or the original value, if stripping off the HTML fails.
     */
    public static String stripHtmlFromPropertyIfNecessary(String propertyName, String value) {

        // constant first, so a missing property name does not cause a NullPointerException
        if (CmsPropertyDefinition.PROPERTY_DESCRIPTION_HTML.equals(propertyName)) {
            try {
                return CmsHtmlExtractor.extractText(value, CmsEncoder.ENCODING_UTF_8);
            } catch (ParserException | UnsupportedEncodingException e) {
                LOG.warn("Could not strip HTML from property value. Returning the original value.", e);
            }
        }
        return value;
    }

    /**
     * Takes a string and makes it an iterable ContentStream.<p>
     *
     * @param str the content of the stream
     * @param contentType the content type to set on the stream
     * @return a collection with a single content stream, or <code>null</code> if the given string is
     *      <code>null</code>
     */
    public static Collection<ContentStream> toContentStreams(final String str, final String contentType) {

        if (str == null) {
            return null;
        }

        List<ContentStream> streams = new ArrayList<>(1);
        ContentStreamBase stream = new ContentStreamBase.StringStream(str);
        stream.setContentType(contentType);
        streams.add(stream);
        return streams;
    }

    /**
     * Converts a SolrInputDocument, including its child documents, to a SolrDocument.<p>
     *
     * @param d SolrInputDocument to convert
     * @return a SolrDocument with the same fields and values as the SolrInputDocument
     * @deprecated This method will be removed in Solr 6.0
     */
    @Deprecated
    public static SolrDocument toSolrDocument(SolrInputDocument d) {

        SolrDocument doc = new SolrDocument();
        for (SolrInputField field : d) {
            doc.setField(field.getName(), field.getValue());
        }
        if (d.getChildDocuments() != null) {
            for (SolrInputDocument in : d.getChildDocuments()) {
                doc.addChildDocument(toSolrDocument(in));
            }
        }
        return doc;
    }

    /**
     * Converts a SolrDocument to a SolrInputDocument.<p>
     *
     * @param d SolrDocument to convert
     * @return a SolrInputDocument with the same fields and values as the
     * SolrDocument.
     * @deprecated This method will be removed in Solr 6.0
     */
    @Deprecated
    public static SolrInputDocument toSolrInputDocument(SolrDocument d) {

        SolrInputDocument doc = new SolrInputDocument();
        d.getFieldNames().forEach(name -> doc.addField(name, d.getFieldValue(name)));
        return doc;
    }

}