/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software GmbH & Co. KG, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.opencms.search.extractors;

import org.opencms.util.CmsFileUtil;
import org.opencms.util.CmsStringUtil;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.LinkedHashMap;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.MSOffice;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;

/**
 * Base utility class that allows extraction of the indexable "plain" text from a given document format.<p>
 *
 * The four public <code>extractText</code> variants delegate to each other: the byte array variants
 * wrap the content in a stream, and the stream variant with encoding reads the stream back into a
 * byte array. A concrete subclass therefore has to override at least one of these methods with a real
 * implementation, otherwise the calls never terminate.<p>
 *
 * @since 6.0.0
 */
public abstract class A_CmsTextExtractor implements I_CmsTextExtractor {

    /**
     * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(byte[])
     */
    public I_CmsExtractionResult extractText(byte[] content) throws Exception {

        // call stream based method of extraction without encoding
        return extractText(new ByteArrayInputStream(content));
    }

    /**
     * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(byte[], java.lang.String)
     */
    public I_CmsExtractionResult extractText(byte[] content, String encoding) throws Exception {

        // call stream based method of extraction with encoding
        return extractText(new ByteArrayInputStream(content), encoding);
    }

    /**
     * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream)
     */
    public I_CmsExtractionResult extractText(InputStream in) throws Exception {

        // encoding is null
        // (the cast selects the (InputStream, String) overload, a plain null would be
        // ambiguous with the (InputStream, Parser) overload)
        return extractText(in, (String)null);
    }

    /**
     * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream, java.lang.String)
     */
    public I_CmsExtractionResult extractText(InputStream in, String encoding) throws Exception {

        // read the byte content
        byte[] text = CmsFileUtil.readFully(in);
        // call byte array based method of extraction
        return extractText(text, encoding);
    }

    /**
     * Combines a meta information item extracted from the document with the main content buffer and
     * also stores the individual information as item in the Map of content items.<p>
     *
     * Nothing is stored or appended if <code>CmsStringUtil.isNotEmpty(itemValue)</code> is false.<p>
     *
     * @param itemValue the value of the item to store
     * @param itemKey the key in the Map of content items
     * @param content a buffer where to append the content item (a line break is appended first)
     * @param contentItems the Map of individual content items
     */
    protected void combineContentItem(
        String itemValue,
        String itemKey,
        StringBuffer content,
        Map<String, String> contentItems) {

        if (CmsStringUtil.isNotEmpty(itemValue)) {
            contentItems.put(itemKey, itemValue);
            content.append('\n');
            content.append(itemValue);
        }
    }

    /**
     * Parses the given input stream with the provided parser and returns the result as a map of content items.<p>
     *
     * The body text written by the parser is stored as the raw content item, and the known document
     * meta data values are stored as individual items and also appended to the combined content.
     * The input stream is closed by this method, also in case the parser fails.<p>
     *
     * @param in the input stream for the content to parse
     * @param parser the parser to use
     *
     * @return the result of the parsing as a map of content items
     *
     * @throws Exception in case something goes wrong
     */
    @SuppressWarnings("deprecation")
    protected CmsExtractionResult extractText(InputStream in, Parser parser) throws Exception {

        LinkedHashMap<String, String> contentItems = new LinkedHashMap<String, String>();

        StringWriter writer = new StringWriter();
        BodyContentHandler handler = new BodyContentHandler(writer);
        Metadata meta = new Metadata();
        ParseContext context = new ParseContext();

        boolean parsed = false;
        try {
            parser.parse(in, handler, meta, context);
            parsed = true;
        } finally {
            if (parsed) {
                // on the success path a failure to close is still reported to the caller
                in.close();
            } else {
                // the parser already failed: release the stream, but do not let a
                // failure to close replace the original parse exception
                try {
                    in.close();
                } catch (IOException ignored) {
                    // intentionally ignored, the parse exception is the relevant one
                }
            }
        }

        String result = writer.toString();

        // add the main document text
        StringBuffer content = new StringBuffer(result);
        if (CmsStringUtil.isNotEmpty(result)) {
            contentItems.put(I_CmsExtractionResult.ITEM_RAW, result);
        }

        // appends all known document meta data as content items
        combineContentItem(meta.get(DublinCore.TITLE), I_CmsExtractionResult.ITEM_TITLE, content, contentItems);
        combineContentItem(meta.get(MSOffice.KEYWORDS), I_CmsExtractionResult.ITEM_KEYWORDS, content, contentItems);

        // subject: the meta data entry stored under the OpenCms item name wins over the Dublin Core value
        String subject = meta.get(I_CmsExtractionResult.ITEM_SUBJECT);
        if (StringUtils.isBlank(subject)) {
            subject = meta.get(DublinCore.SUBJECT);
        }
        combineContentItem(subject, I_CmsExtractionResult.ITEM_SUBJECT, content, contentItems);

        combineContentItem(meta.get(MSOffice.AUTHOR), I_CmsExtractionResult.ITEM_AUTHOR, content, contentItems);

        // creator: first non-blank value of "xmp:CreatorTool", Dublin Core creator, OpenCms item name
        String creator = meta.get("xmp:CreatorTool");
        if (StringUtils.isBlank(creator)) {
            creator = meta.get(DublinCore.CREATOR);
        }
        if (StringUtils.isBlank(creator)) {
            creator = meta.get(I_CmsExtractionResult.ITEM_CREATOR);
        }
        combineContentItem(creator, I_CmsExtractionResult.ITEM_CREATOR, content, contentItems);

        combineContentItem(meta.get(MSOffice.CATEGORY), I_CmsExtractionResult.ITEM_CATEGORY, content, contentItems);
        combineContentItem(meta.get(MSOffice.COMMENTS), I_CmsExtractionResult.ITEM_COMMENTS, content, contentItems);

        // company: the Office Open XML property wins over the legacy MS Office property
        String company = meta.get(OfficeOpenXMLExtended.COMPANY);
        if (StringUtils.isBlank(company)) {
            company = meta.get(MSOffice.COMPANY);
        }
        combineContentItem(company, I_CmsExtractionResult.ITEM_COMPANY, content, contentItems);

        combineContentItem(meta.get(MSOffice.MANAGER), I_CmsExtractionResult.ITEM_MANAGER, content, contentItems);

        // this constant seems to be missing from TIKA
        combineContentItem(
            meta.get(I_CmsExtractionResult.ITEM_PRODUCER),
            I_CmsExtractionResult.ITEM_PRODUCER,
            content,
            contentItems);

        // return the final result
        return new CmsExtractionResult(content.toString(), contentItems);
    }

    /**
     * Removes "unwanted" control chars from the given content.<p>
     *
     * Punctuation, currency symbols, letters, decimal digits and space separators are kept unchanged.
     * A Unicode line separator is always replaced by a line break. Every other character is dropped,
     * where a run of such characters results in at most one line break in the output.<p>
     *
     * @param content the content to remove the unwanted control chars from
     *
     * @return the content with the unwanted control chars removed, or an empty String in case the
     *      content is empty or contains only whitespace
     */
    protected String removeControlChars(String content) {

        if (CmsStringUtil.isEmptyOrWhitespaceOnly(content)) {
            // to avoid later null pointer exceptions an empty String is returned
            return "";
        }

        char[] chars = content.toCharArray();
        // the buffer is local to this method, so no synchronized StringBuffer is required
        StringBuilder result = new StringBuilder(chars.length);
        // true while the previous output was a line break written for an unwanted char or line separator
        boolean wasUnwanted = false;
        for (int i = 0; i < chars.length; i++) {
            char ch = chars[i];

            int type = Character.getType(ch);
            switch (type) {

                // punctuation
                case Character.CURRENCY_SYMBOL:
                case Character.CONNECTOR_PUNCTUATION:
                case Character.FINAL_QUOTE_PUNCTUATION:
                case Character.INITIAL_QUOTE_PUNCTUATION:
                case Character.DASH_PUNCTUATION:
                case Character.START_PUNCTUATION:
                case Character.END_PUNCTUATION:
                case Character.OTHER_PUNCTUATION:
                    // letters
                case Character.OTHER_LETTER:
                case Character.MODIFIER_LETTER:
                case Character.UPPERCASE_LETTER:
                case Character.TITLECASE_LETTER:
                case Character.LOWERCASE_LETTER:
                    // digits
                case Character.DECIMAL_DIGIT_NUMBER:
                    // spaces
                case Character.SPACE_SEPARATOR:
                    result.append(ch);
                    wasUnwanted = false;
                    break;

                // line separators
                case Character.LINE_SEPARATOR:
                    // always written, even directly after an unwanted char
                    result.append('\n');
                    wasUnwanted = true;
                    break;

                // symbols
                case Character.MATH_SYMBOL:
                case Character.OTHER_SYMBOL:
                    // other stuff:
                case Character.CONTROL:
                case Character.COMBINING_SPACING_MARK:
                case Character.ENCLOSING_MARK:
                case Character.FORMAT:
                case Character.LETTER_NUMBER:
                case Character.MODIFIER_SYMBOL:
                case Character.NON_SPACING_MARK:
                case Character.PARAGRAPH_SEPARATOR:
                case Character.PRIVATE_USE:
                case Character.SURROGATE:
                case Character.UNASSIGNED:
                case Character.OTHER_NUMBER:
                default:
                    // collapse a run of unwanted chars into a single line break
                    if (!wasUnwanted) {
                        result.append('\n');
                        wasUnwanted = true;
                    }
            }
        }

        return result.toString();
    }
}