001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.search;
029
030import org.opencms.ade.galleries.shared.CmsGallerySearchScope;
031import org.opencms.file.CmsObject;
032import org.opencms.file.CmsPropertyDefinition;
033import org.opencms.i18n.CmsEncoder;
034import org.opencms.main.CmsLog;
035import org.opencms.main.OpenCms;
036import org.opencms.search.galleries.CmsGallerySearchParameters;
037import org.opencms.site.CmsSiteManagerImpl;
038import org.opencms.util.CmsHtmlExtractor;
039import org.opencms.util.CmsStringUtil;
040
041import java.io.UnsupportedEncodingException;
042import java.text.DateFormat;
043import java.text.ParseException;
044import java.text.SimpleDateFormat;
045import java.util.ArrayList;
046import java.util.Arrays;
047import java.util.Calendar;
048import java.util.Collection;
049import java.util.Date;
050import java.util.Iterator;
051import java.util.List;
052import java.util.Locale;
053import java.util.TimeZone;
054
055import org.apache.commons.logging.Log;
056import org.apache.solr.common.SolrDocument;
057import org.apache.solr.common.SolrInputDocument;
058import org.apache.solr.common.SolrInputField;
059import org.apache.solr.common.util.ContentStream;
060import org.apache.solr.common.util.ContentStreamBase;
061
062import org.htmlparser.util.ParserException;
063
064/**
065 * Provides common functions regarding searching.<p>
066 *
067 * @since 9.0.0
068 */
069public final class CmsSearchUtil {
070
071    /** Date format object that obeys ISO 8601 which is used by Solr. */
072    private static final DateFormat DATEFORMAT_ISO_8601 = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
073
074    /** The log object for this class. */
075    private static final Log LOG = CmsLog.getLog(CmsSearchUtil.class);
076
077    /** Variable to hold an UTC timezone object. */
078    public static final TimeZone TIMEZONE_UTC = TimeZone.getTimeZone("UTC");
079
080    /** Variable to hold an GMT timezone object. */
081    public static final TimeZone TIMEZONE_GMT = TimeZone.getTimeZone("GMT");
082
083    //start HttpClient
084    /**
085     * Date format pattern used to parse HTTP date headers in RFC 1123 format.
086     */
087    public static final String PATTERN_RFC1123 = "EEE, dd MMM yyyy HH:mm:ss zzz";
088
089    /**
090     * Date format pattern used to parse HTTP date headers in RFC 1036 format.
091     */
092    public static final String PATTERN_RFC1036 = "EEEE, dd-MMM-yy HH:mm:ss zzz";
093
094    /**
095     * Date format pattern used to parse HTTP date headers in ANSI C
096     * <code>asctime()</code> format.
097     */
098    public static final String PATTERN_ASCTIME = "EEE MMM d HH:mm:ss yyyy";
099
100    //These are included for back compat
101    private static final Collection<String> DEFAULT_HTTP_CLIENT_PATTERNS = Arrays.asList(
102        PATTERN_ASCTIME,
103        PATTERN_RFC1036,
104        PATTERN_RFC1123);
105    //---------------------------------------------------------------------------------------
106
107    /**
108     * A suite of default date formats that can be parsed, and thus transformed to the Solr specific format
109     */
110    public static final Collection<String> DEFAULT_DATE_FORMATS = new ArrayList<>();
111
112    private static final Date DEFAULT_TWO_DIGIT_YEAR_START;
113
114    static {
115        DEFAULT_DATE_FORMATS.add("yyyy-MM-dd'T'HH:mm:ss'Z'");
116        DEFAULT_DATE_FORMATS.add("yyyy-MM-dd'T'HH:mm:ss");
117        DEFAULT_DATE_FORMATS.add("yyyy-MM-dd");
118        DEFAULT_DATE_FORMATS.add("yyyy-MM-dd hh:mm:ss");
119        DEFAULT_DATE_FORMATS.add("yyyy-MM-dd HH:mm:ss");
120        DEFAULT_DATE_FORMATS.add("EEE MMM d hh:mm:ss z yyyy");
121        DEFAULT_DATE_FORMATS.addAll(DEFAULT_HTTP_CLIENT_PATTERNS);
122
123        Calendar calendar = Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT);
124        calendar.set(2000, Calendar.JANUARY, 1, 0, 0);
125        DEFAULT_TWO_DIGIT_YEAR_START = calendar.getTime();
126    }
127
128    /**
129     * Private constructor of utitlity class.<p>
130     */
131    private CmsSearchUtil() {
132
133        // noop
134    }
135
136    /**
137     * Computes the search root folders for the given search parameters based on the search scope.<p>
138     *
139     * @param cms the current CMS context
140     * @param params the current search parameters
141     *
142     * @return the search root folders based on the search scope
143     */
144    public static List<String> computeScopeFolders(CmsObject cms, CmsGallerySearchParameters params) {
145
146        String subsite = null;
147        if (params.getReferencePath() != null) {
148            subsite = OpenCms.getADEManager().getSubSiteRoot(
149                cms,
150                cms.getRequestContext().addSiteRoot(params.getReferencePath()));
151            if (subsite != null) {
152                subsite = cms.getRequestContext().removeSiteRoot(subsite);
153            } else if (LOG.isWarnEnabled()) {
154                LOG.warn(
155                    Messages.get().getBundle().key(
156                        Messages.LOG_GALLERIES_COULD_NOT_EVALUATE_SUBSITE_1,
157                        params.getReferencePath()));
158            }
159        } else if (LOG.isWarnEnabled()) {
160            LOG.warn(Messages.get().getBundle().key(Messages.LOG_GALLERIES_NO_REFERENCE_PATH_PROVIDED_0));
161        }
162
163        List<String> scopeFolders = getSearchRootsForScope(
164            params.getScope(),
165            cms.getRequestContext().getSiteRoot(),
166            subsite);
167        return scopeFolders;
168    }
169
170    /**
171     * Returns a given date object in the ISO 8601 format.
172     *
173     * @param date that should be converted.
174     * @return string that represents the given date in the ISO 8601 format.
175     */
176    public static String getDateAsIso8601(Date date) {
177
178        synchronized (CmsSearchUtil.class) {
179            if (DATEFORMAT_ISO_8601.getTimeZone() != TIMEZONE_UTC) {
180                DATEFORMAT_ISO_8601.setTimeZone(TIMEZONE_UTC);
181            }
182            return DATEFORMAT_ISO_8601.format(date);
183        }
184    }
185
186    /**
187     * Returns a given date object in the ISO 8601 format.
188     *
189     * @param date that should be converted.
190     * @return string that represents the given date in the ISO 8601 format.
191     */
192    public static String getDateAsIso8601(long date) {
193
194        // Check if date is set
195        if ((date > Long.MIN_VALUE) && (date < Long.MAX_VALUE)) {
196            final Date d_date = new Date(date);
197            return getDateAsIso8601(d_date);
198        }
199        return null;
200    }
201
202    /**
203     * Returns a time interval as Solr compatible query string.
204     * @param searchField the field to search for.
205     * @param startTime the lower limit of the interval.
206     * @param endTime the upper limit of the interval.
207     * @return Solr compatible query string.
208     */
209    public static String getDateCreatedTimeRangeFilterQuery(String searchField, long startTime, long endTime) {
210
211        String sStartTime = null;
212        String sEndTime = null;
213
214        // Convert startTime to ISO 8601 format
215        if ((startTime > Long.MIN_VALUE) && (startTime < Long.MAX_VALUE)) {
216            sStartTime = CmsSearchUtil.getDateAsIso8601(new Date(startTime));
217        }
218
219        // Convert endTime to ISO 8601 format
220        if ((endTime > Long.MIN_VALUE) && (endTime < Long.MAX_VALUE)) {
221            sEndTime = CmsSearchUtil.getDateAsIso8601(new Date(endTime));
222        }
223
224        // Build Solr range string
225        final String rangeString = CmsSearchUtil.getSolrRangeString(sStartTime, sEndTime);
226
227        // Build Solr filter string
228        return String.format("%s:%s", searchField, rangeString);
229    }
230
231    /**
232     * Gets the search roots to use for the given site/subsite parameters.<p>
233     *
234     * @param scope the search scope
235     * @param siteParam the current site
236     * @param subSiteParam the current subsite
237     *
238     * @return the list of search roots for that option
239     */
240    public static List<String> getSearchRootsForScope(
241        CmsGallerySearchScope scope,
242        String siteParam,
243        String subSiteParam) {
244
245        List<String> result = new ArrayList<String>();
246        if (scope == CmsGallerySearchScope.everything) {
247            result.add("/");
248            return result;
249        }
250        if (scope.isIncludeSite()) {
251            result.add(siteParam);
252        }
253        if (scope.isIncludeSubSite()) {
254            if (subSiteParam == null) {
255                result.add(siteParam);
256            } else {
257                result.add(CmsStringUtil.joinPaths(siteParam, subSiteParam));
258            }
259        }
260        if (scope.isIncludeShared()) {
261            String sharedFolder = OpenCms.getSiteManager().getSharedFolder();
262            if (sharedFolder != null) {
263                result.add(sharedFolder);
264            }
265            result.add(CmsSiteManagerImpl.PATH_SYSTEM_SHARED_FOLDER);
266        }
267        return result;
268    }
269
270    /**
271     * Returns a string that represents a valid Solr query range.
272     *
273     * @param from Lower bound of the query range.
274     * @param to Upper bound of the query range.
275     * @return String that represents a Solr query range.
276     */
277    public static String getSolrRangeString(String from, String to) {
278
279        // If a parameter is not initialized, use the asterisk '*' operator
280        if (CmsStringUtil.isEmptyOrWhitespaceOnly(from)) {
281            from = "*";
282        }
283
284        if (CmsStringUtil.isEmptyOrWhitespaceOnly(to)) {
285            to = "*";
286        }
287
288        return String.format("[%s TO %s]", from, to);
289    }
290
291    /**
292     * Returns a formatter that can be use by the current thread if needed to
293     * convert Date objects to the Internal representation.
294     *
295     * @param d The input date to parse
296     * @return The parsed {@link java.util.Date}
297     * @throws java.text.ParseException If the input can't be parsed
298     */
299    public static Date parseDate(String d) throws ParseException {
300
301        return parseDate(d, DEFAULT_DATE_FORMATS);
302    }
303
304    public static Date parseDate(String d, Collection<String> fmts) throws ParseException {
305
306        // 2007-04-26T08:05:04Z
307        if (d.endsWith("Z") && (d.length() > 20)) {
308            return DATEFORMAT_ISO_8601.parse(d);
309        }
310        return parseDate(d, fmts, null);
311    }
312
313    /**
314     * Slightly modified from org.apache.commons.httpclient.util.DateUtil.parseDate
315     * <p>
316     * Parses the date value using the given date formats.
317     *
318     * @param dateValue   the date value to parse
319     * @param dateFormats the date formats to use
320     * @param startDate   During parsing, two digit years will be placed in the range
321     *                    <code>startDate</code> to <code>startDate + 100 years</code>. This value may
322     *                    be <code>null</code>. When <code>null</code> is given as a parameter, year
323     *                    <code>2000</code> will be used.
324     * @return the parsed date
325     * @throws ParseException if none of the dataFormats could parse the dateValue
326     */
327    public static Date parseDate(String dateValue, Collection<String> dateFormats, Date startDate)
328    throws ParseException {
329
330        if (dateValue == null) {
331            throw new IllegalArgumentException("dateValue is null");
332        }
333        if (dateFormats == null) {
334            dateFormats = DEFAULT_HTTP_CLIENT_PATTERNS;
335        }
336        if (startDate == null) {
337            startDate = DEFAULT_TWO_DIGIT_YEAR_START;
338        }
339        // trim single quotes around date if present
340        // see issue #5279
341        if ((dateValue.length() > 1) && dateValue.startsWith("'") && dateValue.endsWith("'")) {
342            dateValue = dateValue.substring(1, dateValue.length() - 1);
343        }
344
345        SimpleDateFormat dateParser = null;
346        Iterator formatIter = dateFormats.iterator();
347
348        while (formatIter.hasNext()) {
349            String format = (String)formatIter.next();
350            if (dateParser == null) {
351                dateParser = new SimpleDateFormat(format, Locale.ENGLISH);
352                dateParser.setTimeZone(TIMEZONE_GMT);
353                dateParser.set2DigitYearStart(startDate);
354            } else {
355                dateParser.applyPattern(format);
356            }
357            try {
358                return dateParser.parse(dateValue);
359            } catch (ParseException pe) {
360                // ignore this exception, we will try the next format
361            }
362        }
363
364        // we were unable to parse the date
365        throw new ParseException("Unable to parse the date " + dateValue, 0);
366    }
367
368    /**
369     * Strips of HTML of the value to map, if necessary (depending on the property name).
370     * @param propertyName name of the property.
371     * @param value the properties value (possibly with HTML)
372     * @return the value with HTML stripped of, or the original value, if stripping of the HTML fails.
373     */
374    public static String stripHtmlFromPropertyIfNecessary(String propertyName, String value) {
375
376        if (propertyName.equals(CmsPropertyDefinition.PROPERTY_DESCRIPTION_HTML)) {
377            try {
378                return CmsHtmlExtractor.extractText(value, CmsEncoder.ENCODING_UTF_8);
379            } catch (ParserException | UnsupportedEncodingException e) {
380                LOG.warn("Could not strip HTML from property value. Returning the original value.", e);
381            }
382        }
383        return value;
384
385    }
386
387    /**
388     * Take a string and make it an iterable ContentStream
389     */
390    public static Collection<ContentStream> toContentStreams(final String str, final String contentType) {
391
392        if (str == null) {
393            return null;
394        }
395
396        ArrayList<ContentStream> streams = new ArrayList<>(1);
397        ContentStreamBase ccc = new ContentStreamBase.StringStream(str);
398        ccc.setContentType(contentType);
399        streams.add(ccc);
400        return streams;
401    }
402
403    /**
404     * @param d SolrInputDocument to convert
405     * @return a SolrDocument with the same fields and values as the SolrInputDocument
406     * @deprecated This method will be removed in Solr 6.0
407     */
408    @Deprecated
409    public static SolrDocument toSolrDocument(SolrInputDocument d) {
410
411        SolrDocument doc = new SolrDocument();
412        for (SolrInputField field : d) {
413            doc.setField(field.getName(), field.getValue());
414        }
415        if (d.getChildDocuments() != null) {
416            for (SolrInputDocument in : d.getChildDocuments()) {
417                doc.addChildDocument(toSolrDocument(in));
418            }
419
420        }
421        return doc;
422    }
423
424    /**
425     * @param d SolrDocument to convert
426     * @return a SolrInputDocument with the same fields and values as the
427     *   SolrDocument.
428     * @deprecated This method will be removed in Solr 6.0
429     */
430    @Deprecated
431    public static SolrInputDocument toSolrInputDocument(SolrDocument d) {
432
433        SolrInputDocument doc = new SolrInputDocument();
434        d.getFieldNames().forEach(name -> doc.addField(name, d.getFieldValue(name)));
435        return doc;
436    }
437
438}