Source code

001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software GmbH & Co. KG, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.search.extractors;
029
030import org.opencms.util.CmsFileUtil;
031import org.opencms.util.CmsStringUtil;
032
033import java.io.ByteArrayInputStream;
034import java.io.InputStream;
035import java.io.StringWriter;
036import java.util.LinkedHashMap;
037import java.util.Map;
038
039import org.apache.commons.lang3.StringUtils;
040import org.apache.tika.metadata.DublinCore;
041import org.apache.tika.metadata.MSOffice;
042import org.apache.tika.metadata.Metadata;
043import org.apache.tika.metadata.OfficeOpenXMLExtended;
044import org.apache.tika.parser.ParseContext;
045import org.apache.tika.parser.Parser;
046import org.apache.tika.sax.BodyContentHandler;
047
048/**
049 * Base utility class that allows extraction of the indexable "plain" text from a given document format.<p>
050 *
051 * @since 6.0.0
052 */
053public abstract class A_CmsTextExtractor implements I_CmsTextExtractor {
054
055    /**
056     * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(byte[])
057     */
058    public I_CmsExtractionResult extractText(byte[] content) throws Exception {
059
060        // call stream based method of extraction without encoding
061        return extractText(new ByteArrayInputStream(content));
062    }
063
064    /**
065     * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(byte[], java.lang.String)
066     */
067    public I_CmsExtractionResult extractText(byte[] content, String encoding) throws Exception {
068
069        // call stream based method of extraction with encoding
070        return extractText(new ByteArrayInputStream(content), encoding);
071    }
072
073    /**
074     * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream)
075     */
076    public I_CmsExtractionResult extractText(InputStream in) throws Exception {
077
078        // encoding is null
079        // (using cast to disambiguate method)
080        return extractText(in, (String)null);
081    }
082
083    /**
084     * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream, java.lang.String)
085     */
086    public I_CmsExtractionResult extractText(InputStream in, String encoding) throws Exception {
087
088        // read the byte content
089        byte[] text = CmsFileUtil.readFully(in);
090        // call byte array based method of extraction
091        return extractText(text, encoding);
092    }
093
094    /**
095     * Combines a meta information item extracted from the document with the main content buffer and
096     * also stores the individual information as item in the Map of content items.<p>
097     *
098     * @param itemValue the value of the item to store
099     * @param itemKey the key in the Map of content items
100     * @param content a buffer where to append the content item
101     * @param contentItems the Map of individual content items
102     */
103    protected void combineContentItem(
104        String itemValue,
105        String itemKey,
106        StringBuffer content,
107        Map<String, String> contentItems) {
108
109        if (CmsStringUtil.isNotEmpty(itemValue)) {
110            contentItems.put(itemKey, itemValue);
111            content.append('\n');
112            content.append(itemValue);
113        }
114    }
115
116    /**
117     * Parses the given input stream with the provided parser and returns the result as a map of content items.<p>
118     *
119     * @param in the input stream for the content to parse
120     * @param parser the parser to use
121     *
122     * @return the result of the parsing as a map of content items
123     *
124     * @throws Exception in case something goes wrong
125     */
126    @SuppressWarnings("deprecation")
127    protected CmsExtractionResult extractText(InputStream in, Parser parser) throws Exception {
128
129        LinkedHashMap<String, String> contentItems = new LinkedHashMap<String, String>();
130
131        StringWriter writer = new StringWriter();
132        BodyContentHandler handler = new BodyContentHandler(writer);
133        Metadata meta = new Metadata();
134        ParseContext context = new ParseContext();
135
136        parser.parse(in, handler, meta, context);
137        in.close();
138
139        String result = writer.toString();
140
141        // add the main document text
142        StringBuffer content = new StringBuffer(result);
143        if (CmsStringUtil.isNotEmpty(result)) {
144            contentItems.put(I_CmsExtractionResult.ITEM_RAW, result);
145        }
146
147        // appends all known document meta data as content items
148        combineContentItem(meta.get(DublinCore.TITLE), I_CmsExtractionResult.ITEM_TITLE, content, contentItems);
149        combineContentItem(meta.get(MSOffice.KEYWORDS), I_CmsExtractionResult.ITEM_KEYWORDS, content, contentItems);
150        String subject = meta.get(I_CmsExtractionResult.ITEM_SUBJECT);
151        if (StringUtils.isBlank(subject)) {
152            subject = meta.get(DublinCore.SUBJECT);
153        }
154        combineContentItem(subject, I_CmsExtractionResult.ITEM_SUBJECT, content, contentItems);
155        combineContentItem(meta.get(MSOffice.AUTHOR), I_CmsExtractionResult.ITEM_AUTHOR, content, contentItems);
156        String creator = meta.get("xmp:CreatorTool");
157        if (StringUtils.isBlank(creator)) {
158            creator = meta.get(DublinCore.CREATOR);
159        }
160        if (StringUtils.isBlank(creator)) {
161            creator = meta.get(I_CmsExtractionResult.ITEM_CREATOR);
162        }
163        combineContentItem(creator, I_CmsExtractionResult.ITEM_CREATOR, content, contentItems);
164        //
165        combineContentItem(meta.get(MSOffice.CATEGORY), I_CmsExtractionResult.ITEM_CATEGORY, content, contentItems);
166        //
167        combineContentItem(meta.get(MSOffice.COMMENTS), I_CmsExtractionResult.ITEM_COMMENTS, content, contentItems);
168        String company = meta.get(OfficeOpenXMLExtended.COMPANY);
169        if (StringUtils.isBlank(company)) {
170            company = meta.get(MSOffice.COMPANY);
171        }
172        combineContentItem(company, I_CmsExtractionResult.ITEM_COMPANY, content, contentItems);
173        //
174        combineContentItem(meta.get(MSOffice.MANAGER), I_CmsExtractionResult.ITEM_MANAGER, content, contentItems);
175        // this constant seems to be missing from TIKA
176        combineContentItem(
177            meta.get(I_CmsExtractionResult.ITEM_PRODUCER),
178            I_CmsExtractionResult.ITEM_PRODUCER,
179            content,
180            contentItems);
181
182        // return the final result
183        return new CmsExtractionResult(content.toString(), contentItems);
184    }
185
186    /**
187     * Removes "unwanted" control chars from the given content.<p>
188     *
189     * @param content the content to remove the unwanted control chars from
190     *
191     * @return the content with the unwanted control chars removed
192     */
193    protected String removeControlChars(String content) {
194
195        if (CmsStringUtil.isEmptyOrWhitespaceOnly(content)) {
196            // to avoid later null pointer exceptions an empty String is returned
197            return "";
198        }
199
200        char[] chars = content.toCharArray();
201        StringBuffer result = new StringBuffer(chars.length);
202        boolean wasUnwanted = false;
203        for (int i = 0; i < chars.length; i++) {
204            char ch = chars[i];
205
206            int type = Character.getType(ch);
207            switch (type) {
208
209                // punctuation
210                case Character.CURRENCY_SYMBOL:
211                case Character.CONNECTOR_PUNCTUATION:
212                case Character.FINAL_QUOTE_PUNCTUATION:
213                case Character.INITIAL_QUOTE_PUNCTUATION:
214                case Character.DASH_PUNCTUATION:
215                case Character.START_PUNCTUATION:
216                case Character.END_PUNCTUATION:
217                case Character.OTHER_PUNCTUATION:
218                    // letters
219                case Character.OTHER_LETTER:
220                case Character.MODIFIER_LETTER:
221                case Character.UPPERCASE_LETTER:
222                case Character.TITLECASE_LETTER:
223                case Character.LOWERCASE_LETTER:
224                    // digits
225                case Character.DECIMAL_DIGIT_NUMBER:
226                    // spaces
227                case Character.SPACE_SEPARATOR:
228                    result.append(ch);
229                    wasUnwanted = false;
230                    break;
231
232                // line separators
233                case Character.LINE_SEPARATOR:
234                    result.append('\n');
235                    wasUnwanted = true;
236                    break;
237
238                // symbols
239                case Character.MATH_SYMBOL:
240                case Character.OTHER_SYMBOL:
241                    // other stuff:
242                case Character.CONTROL:
243                case Character.COMBINING_SPACING_MARK:
244                case Character.ENCLOSING_MARK:
245                case Character.FORMAT:
246                case Character.LETTER_NUMBER:
247                case Character.MODIFIER_SYMBOL:
248                case Character.NON_SPACING_MARK:
249                case Character.PARAGRAPH_SEPARATOR:
250                case Character.PRIVATE_USE:
251                case Character.SURROGATE:
252                case Character.UNASSIGNED:
253                case Character.OTHER_NUMBER:
254                default:
255                    if (!wasUnwanted) {
256                        result.append('\n');
257                        wasUnwanted = true;
258                    }
259            }
260        }
261
262        return result.toString();
263    }
264}