/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH & Co. KG (https://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software, please see the
 * company website: https://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: https://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.opencms.ai;

import org.opencms.ai.CmsTranslationUtil.FoundOrCreatedValue;
import org.opencms.file.CmsObject;
import org.opencms.json.JSONArray;
import org.opencms.json.JSONException;
import org.opencms.json.JSONObject;
import org.opencms.main.CmsLog;
import org.opencms.util.CmsStringUtil;
import org.opencms.xml.CmsXmlException;
import org.opencms.xml.content.CmsXmlContent;
import org.opencms.xml.types.I_CmsXmlContentValue;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Deque;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;

import dev.langchain4j.data.message.ChatMessage;
import dev.langchain4j.data.message.SystemMessage;
import dev.langchain4j.data.message.UserMessage;
import dev.langchain4j.model.chat.ChatModel;
import dev.langchain4j.model.chat.StreamingChatModel;
import dev.langchain4j.model.chat.request.ChatRequest;
import dev.langchain4j.model.chat.response.ChatResponse;
import dev.langchain4j.model.chat.response.PartialResponse;
import dev.langchain4j.model.chat.response.PartialResponseContext;
import dev.langchain4j.model.chat.response.StreamingChatResponseHandler;

/**
 * Translates OpenCms XML content using an external AI provider.<p>
 *
 * The values to translate are sent to the provider as a JSON document of id/text segments;
 * the response is parsed back into a map of xpath to translated text and written to the target locale.<p>
 *
 * @since 21.0.0
 */
public class CmsAiTranslator {

    /**
     * Holder for parsed HTML text nodes and helper operations.<p>
     *
     * The text of all non-blank text nodes is joined into one string, separated by marker tokens of the
     * form <code>⟦#n#⟧</code>, so that the markup itself never has to be sent to the translation provider.<p>
     */
    public static class HtmlParseResult {

        /** Matches the marker tokens that separate the text node contents in a translation string. */
        private static final Pattern MARKER_PATTERN = Pattern.compile("⟦#\\d+#⟧");

        /** The parsed document. */
        private final Document m_doc;

        /** Indicates if the parsed input was a full HTML document rather than a fragment. */
        private final boolean m_isFullDocument;

        /** The non-blank text nodes of the document, in document order. */
        private final List<TextNode> m_textNodes;

        /** Lazily created translation input string, see {@link #toTranslateString()}. */
        private String m_translateString;

        /**
         * Creates a new parse result.<p>
         *
         * @param doc the parsed document
         * @param textNodes the collected text nodes
         * @param isFullDocument true if input is a full HTML document
         */
        public HtmlParseResult(Document doc, List<TextNode> textNodes, boolean isFullDocument) {

            m_doc = doc;
            m_textNodes = textNodes;
            m_isFullDocument = isFullDocument;
        }

        /**
         * Splits a translation string into segments by marker tokens.<p>
         *
         * Trailing empty segments are kept, so the number of segments is always the number of markers plus one.<p>
         *
         * @param input the translation string
         *
         * @return the list of segments, empty if the input is <code>null</code>
         */
        public static List<String> splitTranslateString(String input) {

            if (input == null) {
                return new ArrayList<String>();
            }
            // limit -1 keeps trailing empty strings, which are needed to match the text node count
            return new ArrayList<String>(Arrays.asList(MARKER_PATTERN.split(input, -1)));
        }

        /**
         * Returns the parsed document.<p>
         *
         * @return the parsed document
         */
        public Document getDocument() {

            return m_doc;
        }

        /**
         * Returns the collected text nodes.<p>
         *
         * @return the collected text nodes
         */
        public List<TextNode> getTextNodes() {

            return m_textNodes;
        }

        /**
         * Returns true if the input represented a full document.<p>
         *
         * @return true if the input is a full document
         */
        public boolean isFullDocument() {

            return m_isFullDocument;
        }

        /**
         * Applies a translated string to the text nodes and returns the HTML output.<p>
         *
         * The translated string is split at the marker tokens and each segment replaces the text of the
         * text node at the same position.<p>
         *
         * @param translation the translated string
         *
         * @return the resulting HTML, or <code>null</code> if the translation is <code>null</code>
         *
         * @throws CmsAiException if the translation does not match the text node count
         */
        public String setTranslatedString(String translation) throws CmsAiException {

            if (translation == null) {
                return null;
            }
            if (m_textNodes.isEmpty() && translation.trim().isEmpty()) {
                // Markup without any translatable text (e.g. a lone image): splitting the empty string
                // would yield one segment for zero text nodes, so return the markup unchanged instead.
                return toString();
            }
            List<String> translatedSegments = splitTranslateString(translation);
            if (translatedSegments.size() != m_textNodes.size()) {
                throw new CmsAiException("Translation does not contain the same number of text nodes.");
            }
            for (int i = 0; i < m_textNodes.size(); i++) {
                m_textNodes.get(i).text(translatedSegments.get(i));
            }
            return toString();
        }

        /**
         * Returns a string representation of the HTML of the parsed document.<p>
         *
         * For a full document this is the complete document HTML, otherwise the content of the body.
         * Returns <code>null</code> if there is no document.<p>
         *
         * @see java.lang.Object#toString()
         */
        @Override
        public String toString() {

            if (m_doc == null) {
                return null;
            }
            // For fragments use the inner HTML of the body: body().children() contains elements only,
            // so text placed directly in the fragment root would be lost from the output.
            return m_isFullDocument ? m_doc.outerHtml() : m_doc.body().html();
        }

        /**
         * Returns a concatenated string with markers between text nodes.<p>
         *
         * The marker between the text nodes at positions <code>i</code> and <code>i + 1</code> carries the id <code>i</code>.
         * The result is cached after the first call.<p>
         *
         * @return the translation input string, or <code>null</code> if there is no text node list
         */
        public String toTranslateString() {

            if ((m_translateString == null) && (m_textNodes != null)) {
                StringBuilder builder = new StringBuilder();
                for (int i = 0; i < m_textNodes.size(); i++) {
                    if (i > 0) {
                        builder.append("⟦#").append(i - 1).append("#⟧");
                    }
                    builder.append(m_textNodes.get(i).text());
                }
                m_translateString = builder.toString();
            }
            return m_translateString;
        }
    }

    /** The system prompt sent to the AI provider with every translation request. */
    private static final String LLM_PROMPT = String.join(
        "\n",
        "You are a professional translation engine.",
        "You will receive a JSON object with the following structure:",
        "- source_language: the source language code",
        "- target_language: the target language code",
        "- segments: an array of objects",
        " - id: a unique identifier",
        " - text: the text to translate",
        "Your task:",
        "- Translate ONLY the value of segments[].text from source_language to target_language.",
        "- Preserve meaning, tone, and grammar.",
        "- Do NOT translate ids.",
        "- Do NOT change, remove, reorder, or add any segments.",
        "- Do NOT add explanations, comments, or extra fields.",
        "- Keep placeholders, markers, or tokens (e.g. ⟦#1#⟧, {{...}}, ${...}, [TAG1], etc.) intact.");

    /** Logger instance for this class. */
    private static final Log LOG = CmsLog.getLog(CmsAiTranslator.class);

    /** The current users OpenCms context. */
    private final CmsObject m_cms;

    /** Keeps track of which fields failed to be updated. */
    private final List<String> m_conflictFields = new ArrayList<>();

    /** Counts number of successfully translated fields. */
    private int m_numSuccessfulFieldUpdates;

    /** The AI provider configuration. */
    private final CmsAiProviderConfig m_providerConfig;

    /** The CmsXmlContent to translate. */
    private final CmsXmlContent m_xmlContent;

    /**
     * Creates a new translator for the given provider configuration.<p>
     *
     * @param cms the current users OpenCms context
     * @param config the provider configuration
     * @param xmlContent the XML content to translate
     */
    public CmsAiTranslator(CmsObject cms, CmsAiProviderConfig config, CmsXmlContent xmlContent) {

        m_cms = cms;
        m_providerConfig = config;
        m_xmlContent = xmlContent;
    }

    /**
     * Returns true if the given text contains HTML markup.<p>
     *
     * Markup is detected if a <code>&lt;</code> is directly followed by a letter or a slash
     * and a closing <code>&gt;</code> occurs somewhere after it.<p>
     *
     * @param text the text to check
     *
     * @return true if markup is detected
     */
    public static boolean hasHtmlMarkup(String text) {

        if (text == null) {
            return false;
        }

        int lt = text.indexOf('<');
        while (lt >= 0) {
            int gt = text.indexOf('>', lt + 1);
            if (gt > (lt + 1)) {
                char c = text.charAt(lt + 1);
                if ((c == '/') || Character.isLetter(c)) {
                    return true;
                }
            }
            lt = text.indexOf('<', lt + 1);
        }

        return false;
    }

    /**
     * Parses HTML text and collects its text nodes.<p>
     *
     * The input is parsed as a full document if it starts with a doctype or contains an
     * <code>html</code> or <code>body</code> tag, otherwise as a body fragment.<p>
     *
     * @param text the input text, <code>null</code> is treated like the empty string
     *
     * @return the parse result
     */
    public static HtmlParseResult parseHtmlTextNodes(String text) {

        String input = text == null ? "" : text;
        String normalized = input.trim().toLowerCase(Locale.ROOT);
        boolean isFullDocument = normalized.startsWith("<!doctype")
            || normalized.contains("<html")
            || normalized.contains("<body");
        Document doc = isFullDocument ? Jsoup.parse(input) : Jsoup.parseBodyFragment(input);
        return new HtmlParseResult(doc, collectTextNodes(doc), isFullDocument);
    }

    /**
     * Parses the translation response JSON into a map of xpath to text.<p>
     *
     * Any text before the first <code>{</code> and after the last <code>}</code> is ignored, so a response
     * that wraps the JSON in additional text can still be parsed.<p>
     *
     * @param jsonText the JSON text to parse
     *
     * @return the translated values mapped by id, or <code>null</code> if the given text is <code>null</code>
     *
     * @throws JSONException if the JSON is invalid
     */
    public static Map<String, String> parseTranslationResult(String jsonText) throws JSONException {

        if (jsonText == null) {
            return null;
        }

        String json = jsonText;
        int jsonStart = json.indexOf('{');
        int jsonEnd = json.lastIndexOf('}');
        if ((jsonStart >= 0) && (jsonEnd >= jsonStart)) {
            json = json.substring(jsonStart, jsonEnd + 1);
        }
        JSONArray segments = new JSONObject(json).getJSONArray("segments");

        Map<String, String> result = new HashMap<String, String>();
        if (segments != null) {
            for (int i = 0; i < segments.length(); i++) {
                JSONObject segment = segments.getJSONObject(i);
                result.put(segment.getString("id"), segment.getString("text"));
            }
            return result;
        }
        return null;
    }

    /**
     * Collects all TextNodes from a Jsoup HTML document.<p>
     *
     * The nodes are collected in document order; text nodes that contain only whitespace are skipped.<p>
     *
     * @param root the root node where to start collection from
     *
     * @return the collected TextNodes
     */
    private static List<TextNode> collectTextNodes(Node root) {

        List<TextNode> result = new ArrayList<>();
        if (root == null) {
            return result;
        }

        Deque<Node> stack = new ArrayDeque<>();
        stack.push(root);

        while (!stack.isEmpty()) {
            Node node = stack.pop();

            if (node instanceof TextNode) {
                TextNode textNode = (TextNode)node;
                if (!textNode.text().trim().isEmpty()) {
                    result.add(textNode);
                }
            }

            // push children in reverse so that they are popped in document order
            List<Node> children = node.childNodes();
            for (int i = children.size() - 1; i >= 0; i--) {
                stack.push(children.get(i));
            }
        }

        return result;
    }

    /**
     * Gets the fields which couldn't be updated with a translation because of conflicts.
     *
     * @return the list of fields which couldn't be updated due to conflicts
     */
    public List<String> getConflictFields() {

        return m_conflictFields;
    }

    /**
     * Gets the number of fields which were successfully updated with their translation.
     *
     * @return the number of updated fields
     */
    public int getNumSuccessfulFieldUpdates() {

        return m_numSuccessfulFieldUpdates;
    }

    /**
     * Translates XML content without streaming and applies the results to the target locale.<p>
     *
     * @param srcLocale the source locale
     * @param targetLocale the target locale
     *
     * @return the updated XML content, or <code>null</code> if no response was obtained
     *
     * @throws JSONException if the JSON request or response is invalid
     * @throws CmsXmlException in case of problems accessing the XML content
     * @throws CmsAiException if the AI translation result does not match the required structure
     */
    public CmsXmlContent translateXmlContent(Locale srcLocale, Locale targetLocale)
    throws JSONException, CmsXmlException, CmsAiException {

        return translateXmlContent(srcLocale, targetLocale, null);
    }

    /**
     * Translates XML content and applies the results to the target locale.<p>
     *
     * If the target locale does not exist yet, it is created as a copy of the source locale and all
     * translated values are written to it. If it already exists, only values that were newly created
     * or are empty are written; values that fail to update are recorded as conflict fields.<p>
     *
     * @param srcLocale the source locale
     * @param targetLocale the target locale
     * @param handler the handler that receives the streamed response, or <code>null</code> to use a blocking request
     *
     * @return the updated XML content, or <code>null</code> if no response was obtained
     *
     * @throws JSONException if the JSON request or response is invalid
     * @throws CmsXmlException in case of problems accessing the XML content
     * @throws CmsAiException if the AI translation result does not match the required structure
     */
    public CmsXmlContent translateXmlContent(
        Locale srcLocale,
        Locale targetLocale,
        StreamingChatResponseHandler handler)
    throws JSONException, CmsXmlException, CmsAiException {

        String jsonText = translateXmlContentRaw(srcLocale, targetLocale, handler);
        if (jsonText == null) {
            return null;
        }

        Map<String, String> translationResult = parseTranslationResult(jsonText);
        if (!m_xmlContent.hasLocale(targetLocale)) {
            if (!translationResult.isEmpty()) {
                m_xmlContent.copyLocale(srcLocale, targetLocale);

                for (Map.Entry<String, String> entry : translationResult.entrySet()) {
                    String xpath = entry.getKey();
                    I_CmsXmlContentValue sval = m_xmlContent.getValue(xpath, srcLocale);
                    if (sval == null) {
                        LOG.error("Could not find original value for translation at path " + xpath);
                        continue;
                    }
                    String text = applyToSourceMarkup(sval, entry.getValue());
                    I_CmsXmlContentValue tval = m_xmlContent.getValue(xpath, targetLocale);
                    if (tval == null) {
                        // guard against a missing target value instead of failing with a NullPointerException
                        LOG.error("Could not find target value for translation at path " + xpath);
                        continue;
                    }
                    tval.setStringValue(m_cms, text);
                    m_numSuccessfulFieldUpdates += 1;
                }
            }
        } else {
            for (Map.Entry<String, String> entry : translationResult.entrySet()) {
                String xpath = entry.getKey();
                try {
                    I_CmsXmlContentValue sval = m_xmlContent.getValue(xpath, srcLocale);
                    if (sval == null) {
                        LOG.error("Could not find original value for translation at path " + xpath);
                        continue;
                    }
                    String text = applyToSourceMarkup(sval, entry.getValue());
                    FoundOrCreatedValue val = CmsTranslationUtil.findOrCreateValue(
                        m_cms,
                        m_xmlContent,
                        targetLocale,
                        xpath);
                    // If the value already existed, we only want to write to it if it's empty.
                    // But if it was just created, it might have a default value, which we need to overwrite.
                    if (val.wasCreated()
                        || CmsStringUtil.isEmptyOrWhitespaceOnly(val.getValue().getStringValue(m_cms))) {
                        val.getValue().setStringValue(m_cms, text);
                        m_numSuccessfulFieldUpdates += 1;
                    }
                } catch (Exception e) {
                    LOG.debug(e.getLocalizedMessage(), e);
                    m_conflictFields.add(xpath);
                }
            }
        }

        return m_xmlContent;
    }

    /**
     * Translates XML content with the AI provider and returns the raw response.<p>
     *
     * @param srcLocale the source locale
     * @param targetLocale the target locale
     * @param handler the handler that receives the streamed response, or <code>null</code> to use a blocking request
     *
     * @return the raw AI response, or <code>null</code> if there is nothing to translate or no response was obtained
     *
     * @throws JSONException in the unlikely case of problems generating a JSON object for the translation request
     */
    protected String translateXmlContentRaw(Locale srcLocale, Locale targetLocale, StreamingChatResponseHandler handler)
    throws JSONException {

        List<I_CmsXmlContentValue> xmlValues = CmsTranslationUtil.getValuesToTranslate(
            m_cms,
            m_xmlContent,
            srcLocale,
            targetLocale);
        if (xmlValues.isEmpty()) {
            return null;
        }

        String textToTranslate = buildRequestJson(xmlValues, srcLocale, targetLocale);

        // ResponseFormat llmResponseFormat = ResponseFormat.builder().type(ResponseFormatType.JSON).jsonSchema(
        //     JsonSchema.builder().name("TranslationResult").rootElement(
        //         JsonObjectSchema.builder().addProperty(
        //             "segments",
        //             JsonArraySchema.builder().items(
        //                 JsonObjectSchema.builder().addStringProperty(
        //                     "id",
        //                     "The id for the text, must remain unchanged and kept in order").addStringProperty(
        //                         "text",
        //                         "The text to translate with optional placeholders like ⟦#1#⟧").required(
        //                             "id",
        //                             "text").build()).build()).required("segments").build()).build()).build();

        List<ChatMessage> messages = new ArrayList<ChatMessage>();
        messages.add(SystemMessage.from(LLM_PROMPT));
        messages.add(UserMessage.from(textToTranslate));

        // NOT setting response format for now, as it breaks streaming
        ChatRequest llmRequest = ChatRequest.builder().messages(messages).build();

        if (handler == null) {
            ChatModel chatModel = new CmsAiModel(m_providerConfig).getChatModel();
            ChatResponse response = chatModel.chat(llmRequest);
            return response.aiMessage().text();
        }
        return chatStreaming(llmRequest, handler);
    }

    /**
     * Returns the text to write to the target value for the given source value.<p>
     *
     * If the source value contains HTML markup, the translation is applied to the text nodes of the
     * source markup; otherwise the translation is returned unchanged.<p>
     *
     * @param sourceValue the value in the source locale
     * @param translation the translated text returned by the AI provider
     *
     * @return the text to store in the target locale
     *
     * @throws CmsAiException if the translation does not match the text node count of the source markup
     */
    private String applyToSourceMarkup(I_CmsXmlContentValue sourceValue, String translation) throws CmsAiException {

        String source = sourceValue.getStringValue(m_cms);
        if (hasHtmlMarkup(source)) {
            return parseHtmlTextNodes(source).setTranslatedString(translation);
        }
        return translation;
    }

    /**
     * Builds the JSON document that is sent to the AI provider as the user message.<p>
     *
     * @param xmlValues the values to translate
     * @param srcLocale the source locale
     * @param targetLocale the target locale
     *
     * @return the JSON request text
     *
     * @throws JSONException in the unlikely case of problems generating the JSON object
     */
    private String buildRequestJson(List<I_CmsXmlContentValue> xmlValues, Locale srcLocale, Locale targetLocale)
    throws JSONException {

        JSONObject root = new JSONObject();
        root.put("source_language", srcLocale.toString());
        root.put("target_language", targetLocale.toString());

        JSONArray segments = new JSONArray();
        for (I_CmsXmlContentValue val : xmlValues) {
            // NOTE(review): every value is routed through the HTML parser, also plain text without markup,
            // while translateXmlContent only re-applies markup when hasHtmlMarkup is true - confirm this is intended.
            String strVal = parseHtmlTextNodes(val.getStringValue(m_cms)).toTranslateString();
            segments.put(new JSONObject().put("id", val.getPath()).put("text", strVal));
        }
        root.put("segments", segments);
        return root.toString(2);
    }

    /**
     * Sends the request to the streaming chat model and blocks until the response is complete.<p>
     *
     * All events are forwarded to the given handler. If no complete response was received and the
     * stream was not cancelled, the partial text received so far is returned.<p>
     *
     * @param llmRequest the chat request
     * @param handler the handler to forward the streaming events to
     *
     * @return the response text, or <code>null</code> if nothing was received or the stream was cancelled
     */
    private String chatStreaming(ChatRequest llmRequest, final StreamingChatResponseHandler handler) {

        StreamingChatModel chatModel = new CmsAiModel(m_providerConfig).getStreamingChatModel();

        final CountDownLatch done = new CountDownLatch(1);
        final AtomicBoolean cancelled = new AtomicBoolean(false);
        final AtomicReference<String> resultRef = new AtomicReference<String>();
        final StringBuilder partialBuffer = new StringBuilder();

        chatModel.chat(llmRequest, new StreamingChatResponseHandler() {

            @Override
            public void onCompleteResponse(ChatResponse response) {

                try {
                    String text = response.aiMessage().text();
                    resultRef.set(text != null ? text : partialBuffer.toString());
                    handler.onCompleteResponse(response);
                } finally {
                    // always release the waiting thread, even if the delegate handler fails
                    done.countDown();
                }
            }

            @Override
            public void onError(Throwable error) {

                try {
                    handler.onError(error);
                } finally {
                    // always release the waiting thread, even if the delegate handler fails
                    done.countDown();
                }
            }

            @Override
            public void onPartialResponse(PartialResponse responsePart, PartialResponseContext context) {

                String responsePartText = responsePart.text();
                if (responsePartText != null) {
                    partialBuffer.append(responsePartText);
                }
                handler.onPartialResponse(responsePart, context);
                if (context.streamingHandle().isCancelled()) {
                    cancelled.set(true);
                }
            }
        });

        // Block this request thread until streaming finishes
        try {
            // safety timeout: 10 minutes
            if (!done.await(10, TimeUnit.MINUTES)) {
                LOG.warn("Timed out waiting for the streamed AI translation response.");
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }

        String result = resultRef.get();
        if ((result == null) && !cancelled.get() && (partialBuffer.length() > 0)) {
            result = partialBuffer.toString();
        }
        return result;
    }

}