001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH & Co. KG (https://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software, please see the
018 * company website: https://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: https://www.opencms.org
022 *
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.ai;
029
030import org.opencms.ai.CmsTranslationUtil.FoundOrCreatedValue;
031import org.opencms.file.CmsObject;
032import org.opencms.json.JSONArray;
033import org.opencms.json.JSONException;
034import org.opencms.json.JSONObject;
035import org.opencms.main.CmsLog;
036import org.opencms.util.CmsStringUtil;
037import org.opencms.xml.CmsXmlException;
038import org.opencms.xml.content.CmsXmlContent;
039import org.opencms.xml.types.I_CmsXmlContentValue;
040
041import java.util.ArrayDeque;
042import java.util.ArrayList;
043import java.util.Arrays;
044import java.util.Deque;
045import java.util.HashMap;
046import java.util.List;
047import java.util.Locale;
048import java.util.Map;
049import java.util.concurrent.atomic.AtomicBoolean;
050import java.util.concurrent.atomic.AtomicReference;
051
052import org.apache.commons.logging.Log;
053
054import org.jsoup.Jsoup;
055import org.jsoup.nodes.Document;
056import org.jsoup.nodes.Node;
057import org.jsoup.nodes.TextNode;
058
059import dev.langchain4j.data.message.ChatMessage;
060import dev.langchain4j.data.message.SystemMessage;
061import dev.langchain4j.data.message.UserMessage;
062import dev.langchain4j.model.chat.ChatModel;
063import dev.langchain4j.model.chat.StreamingChatModel;
064import dev.langchain4j.model.chat.request.ChatRequest;
065import dev.langchain4j.model.chat.response.ChatResponse;
066import dev.langchain4j.model.chat.response.PartialResponse;
067import dev.langchain4j.model.chat.response.PartialResponseContext;
068import dev.langchain4j.model.chat.response.StreamingChatResponseHandler;
069
070/**
071 * Translates OpenCms XML content using an external AI provider.<p>
072 *
073 * @since 21.0.0
074 */
075public class CmsAiTranslator {
076
077    /**
078     * Holder for parsed HTML text nodes and helper operations.<p>
079     */
080    public static class HtmlParseResult {
081
082        private final Document m_doc;
083        private final boolean m_isFullDocument;
084        private final List<TextNode> m_textNodes;
085        private String m_translateString;
086
087        /**
088         * Creates a new parse result.<p>
089         *
090         * @param doc the parsed document
091         * @param textNodes the collected text nodes
092         * @param isFullDocument true if input is a full HTML document
093         */
094        public HtmlParseResult(Document doc, List<TextNode> textNodes, boolean isFullDocument) {
095
096            m_doc = doc;
097            m_textNodes = textNodes;
098            m_isFullDocument = isFullDocument;
099        }
100
101        /**
102         * Splits a translation string into segments by marker tokens.<p>
103         *
104         * @param input the translation string
105         *
106         * @return the list of segments
107         */
108        public static List<String> splitTranslateString(String input) {
109
110            if (input == null) {
111                return new ArrayList<String>();
112            }
113
114            String[] parts = input.split("⟦#\\d+#⟧", -1);
115            return new ArrayList<String>(Arrays.asList(parts));
116        }
117
118        /**
119         * Returns the parsed document.<p>
120         *
121         * @return the parsed document
122         */
123        public Document getDocument() {
124
125            return m_doc;
126        }
127
128        /**
129         * Returns the collected text nodes.<p>
130         *
131         * @return the collected text nodes
132         */
133        public List<TextNode> getTextNodes() {
134
135            return m_textNodes;
136        }
137
138        /**
139         * Returns true if the input represented a full document.<p>
140         *
141         * @return true if the input is a full document
142         */
143        public boolean isFullDocument() {
144
145            return m_isFullDocument;
146        }
147
148        /**
149         * Applies a translated string to the text nodes and returns the HTML output.<p>
150         *
151         * @param translation the translated string
152         *
153         * @return the resulting HTML
154         *
155         * @throws CmsAiException if the translation does not match the text node count
156         */
157        public String setTranslatedString(String translation) throws CmsAiException {
158
159            String result = null;
160            if (translation != null) {
161                List<String> translatioStrings = splitTranslateString(translation);
162                if (translatioStrings.size() != m_textNodes.size()) {
163                    throw new CmsAiException("Translation does not contain the same number of text nodes.");
164                }
165                for (int i = 0; i < m_textNodes.size(); i++) {
166                    TextNode textNode = m_textNodes.get(i);
167                    textNode.text(translatioStrings.get(i));
168                }
169                result = toString();
170            }
171            return result;
172        }
173
174        /**
175         * Returns a string representation of the HTML of the parsed document.<p>
176         *
177         * @see java.lang.Object#toString()
178         */
179        @Override
180        public String toString() {
181
182            if (m_doc == null) {
183                return null;
184            }
185
186            return m_isFullDocument ? m_doc.outerHtml() : m_doc.body().children().outerHtml();
187        }
188
189        /**
190         * Returns a concatenated string with markers between text nodes.<p>
191         *
192         * @return the translation input string
193         */
194        public String toTranslateString() {
195
196            if (m_translateString == null) {
197                if (m_textNodes != null) {
198                    StringBuilder builder = new StringBuilder();
199                    int markerId = 0;
200                    for (int i = 0; i < m_textNodes.size(); i++) {
201                        TextNode textNode = m_textNodes.get(i);
202                        builder.append(textNode.text());
203                        if (i < (m_textNodes.size() - 1)) {
204                            builder.append("⟦#");
205                            builder.append(markerId);
206                            builder.append("#⟧");
207                        }
208                        markerId++;
209                    }
210                    m_translateString = builder.toString();
211                }
212            }
213            return m_translateString;
214        }
215    }
216
217    /** Logger instance for this class. */
218    private static final Log LOG = CmsLog.getLog(CmsAiTranslator.class);
219
220    /** The current users OpenCms context.  */
221    private CmsObject m_cms;
222
223    /** Keeps track of which fields failed to be updated. */
224    private List<String> m_conflictFields = new ArrayList<>();
225
226    /** Counts number of successfully translated fields. */
227    private int m_numSuccessfulFieldUpdates;
228
229    /** The AI provider configuration. */
230    private CmsAiProviderConfig m_providerConfig;
231
232    /** The CmsXmlContent to translate. */
233    private CmsXmlContent m_xmlContent;
234
235    /**
236     * Creates a new translator for the given provider configuration.<p>
237     *
238     * @param config the provider configuration
239     */
240    public CmsAiTranslator(CmsObject cms, CmsAiProviderConfig config, CmsXmlContent xmlContent) {
241
242        m_cms = cms;
243        m_providerConfig = config;
244        m_xmlContent = xmlContent;
245    }
246
247    /**
248     * Returns true if the given text contains HTML markup.<p>
249     *
250     * @param text the text to check
251     *
252     * @return true if markup is detected
253     */
254    public static boolean hasHtmlMarkup(String text) {
255
256        if (text == null) {
257            return false;
258        }
259
260        int lt = text.indexOf('<');
261        while (lt >= 0) {
262            int gt = text.indexOf('>', lt + 1);
263            if (gt > (lt + 1)) {
264                char c = text.charAt(lt + 1);
265                if ((c == '/') || Character.isLetter(c)) {
266                    return true;
267                }
268            }
269            lt = text.indexOf('<', lt + 1);
270        }
271
272        return false;
273    }
274
275    /**
276     * Parses HTML text and collects its text nodes.<p>
277     *
278     * @param text the input text
279     *
280     * @return the parse result
281     */
282    public static HtmlParseResult parseHtmlTextNodes(String text) {
283
284        String input = text == null ? "" : text;
285        String trimmed = input.trim().toLowerCase(Locale.ROOT);
286        boolean isFullDocument = trimmed.startsWith("<!doctype")
287            || trimmed.contains("<html")
288            || trimmed.contains("<body");
289        Document doc = isFullDocument ? Jsoup.parse(input) : Jsoup.parseBodyFragment(input);
290        List<TextNode> textNodes = collectTextNodes(doc);
291        return new HtmlParseResult(doc, textNodes, isFullDocument);
292    }
293
294    /**
295     * Parses the translation response JSON into a map of xpath to text.<p>
296     *
297     * @param jsonText the JSON text to parse
298     *
299     * @return the translated values mapped by id
300     *
301     * @throws JSONException if the JSON is invalid
302     */
303    public static Map<String, String> parseTranslationResult(String jsonText) throws JSONException {
304
305        Map<String, String> result = null;
306        JSONArray segments = null;
307
308        if (jsonText != null) {
309            int jsonStart = jsonText.indexOf('{');
310            int jsonEnd = jsonText.lastIndexOf('}');
311            if ((jsonStart >= 0) && (jsonEnd >= jsonStart)) {
312                jsonText = jsonText.substring(jsonStart, jsonEnd + 1);
313            }
314            JSONObject json = new JSONObject(jsonText);
315            segments = json.getJSONArray("segments");
316        }
317
318        if (segments != null) {
319            result = new HashMap<String, String>();
320            for (int i = 0; i < segments.length(); i++) {
321                JSONObject segment = segments.getJSONObject(i);
322                String id = segment.getString("id");
323                String text = segment.getString("text");
324                result.put(id, text);
325            }
326        }
327
328        return result;
329    }
330
331    /**
332     * Collects all TextNodes from a Jsoup HTML document.<p>
333     *
334     * @param root the root node where to start collection from
335     *
336     * @return the collected TextNodes
337     */
338    private static List<TextNode> collectTextNodes(Node root) {
339
340        List<TextNode> result = new ArrayList<>();
341        if (root == null) {
342            return result;
343        }
344
345        Deque<Node> stack = new ArrayDeque<>();
346        stack.push(root);
347
348        while (!stack.isEmpty()) {
349            Node node = stack.pop();
350
351            if (node instanceof TextNode) {
352                TextNode textNode = (TextNode)node;
353                if (!textNode.text().trim().isEmpty()) {
354                    result.add(textNode);
355                }
356            }
357
358            List<Node> children = node.childNodes();
359            for (int i = children.size() - 1; i >= 0; i--) {
360                stack.push(children.get(i));
361            }
362        }
363
364        return result;
365    }
366
367    /**
368     * Gets the fields which couldn't be updated with a translation because of conflicts.
369     *
370     * @return the list of fields which couldn't be updated due to conflicts
371     */
372    public List<String> getConflictFields() {
373
374        return m_conflictFields;
375    }
376
377    /**
378     * Gets the number of fields which were successfully updated with their translation.
379     *
380     * @return the number of updated fields
381     */
382    public int getNumSuccessfulFieldUpdates() {
383
384        return m_numSuccessfulFieldUpdates;
385    }
386
387    public CmsXmlContent translateXmlContent(Locale srcLocale, Locale targetLocale)
388    throws JSONException, CmsXmlException, CmsAiException {
389
390        return translateXmlContent(srcLocale, targetLocale, null);
391    }
392
393    /**
394     * Translates XML content and applies the results to the target locale.<p>
395     *
396     * @param srcLocale the source locale
397     * @param targetLocale the target locale
398     *
399     * @return the updated XML content
400     *
401     * @throws JSONException if the JSON request or response is invalid
402     * @throws CmsXmlException in case of problems accessing the XML content
403     * @throws CmsAiException if the AI translation result does not match the required structure
404     */
405    public CmsXmlContent translateXmlContent(
406        Locale srcLocale,
407        Locale targetLocale,
408        StreamingChatResponseHandler handler)
409    throws JSONException, CmsXmlException, CmsAiException {
410
411        String jsonText = translateXmlContentRaw(srcLocale, targetLocale, handler);
412        if (jsonText == null) {
413            return null;
414        }
415
416        Map<String, String> translationResult = parseTranslationResult(jsonText);
417        if (!m_xmlContent.hasLocale(targetLocale)) {
418            if (translationResult.size() > 0) {
419                m_xmlContent.copyLocale(srcLocale, targetLocale);
420
421                for (Map.Entry<String, String> entry : translationResult.entrySet()) {
422                    String xpath = entry.getKey();
423                    String text = entry.getValue();
424                    I_CmsXmlContentValue sval = m_xmlContent.getValue(xpath, srcLocale);
425                    if (sval == null) {
426                        LOG.error("Could not find original value for translation at path " + xpath);
427                        continue;
428                    }
429                    String source = sval.getStringValue(m_cms);
430                    if (hasHtmlMarkup(source)) {
431                        HtmlParseResult parsed = parseHtmlTextNodes(source);
432                        text = parsed.setTranslatedString(text);
433                    }
434                    I_CmsXmlContentValue tval = m_xmlContent.getValue(xpath, targetLocale);
435                    tval.setStringValue(m_cms, text);
436                    m_numSuccessfulFieldUpdates += 1;
437                }
438            }
439        } else {
440            for (Map.Entry<String, String> entry : translationResult.entrySet()) {
441                try {
442                    String xpath = entry.getKey();
443                    String text = entry.getValue();
444                    I_CmsXmlContentValue sval = m_xmlContent.getValue(xpath, srcLocale);
445                    if (sval == null) {
446                        LOG.error("Could not find original value for translation at path " + xpath);
447                        continue;
448                    }
449                    String source = sval.getStringValue(m_cms);
450                    if (hasHtmlMarkup(source)) {
451                        HtmlParseResult parsed = parseHtmlTextNodes(source);
452                        text = parsed.setTranslatedString(text);
453                    }
454                    FoundOrCreatedValue val = CmsTranslationUtil.findOrCreateValue(
455                        m_cms,
456                        m_xmlContent,
457                        targetLocale,
458                        entry.getKey());
459                    // If the value already existed, we only want to write to it if it's empty.
460                    // But if it was just created, it might have a default value, which we need to overwrite.
461                    if (val.wasCreated()
462                        || CmsStringUtil.isEmptyOrWhitespaceOnly(val.getValue().getStringValue(m_cms))) {
463                        val.getValue().setStringValue(m_cms, text);
464                        m_numSuccessfulFieldUpdates += 1;
465                    }
466
467                } catch (Exception e) {
468                    LOG.debug(e.getLocalizedMessage(), e);
469                    m_conflictFields.add(entry.getKey());
470                }
471            }
472        }
473
474        return m_xmlContent;
475    }
476
477    /**
478     * Translates XML content with the AI provider and returns the raw response.<p>
479     *
480     * @param srcLocale the source locale
481     * @param targetLocale the target locale
482     *
483     * @return the raw AI response
484     *
485     * @throws JSONException in the unlikely case of problems generating a JSON object for the translation request
486     */
487    protected String translateXmlContentRaw(Locale srcLocale, Locale targetLocale, StreamingChatResponseHandler handler)
488    throws JSONException {
489
490        String result = null;
491
492        List<I_CmsXmlContentValue> xmlValues = CmsTranslationUtil.getValuesToTranslate(
493            m_cms,
494            m_xmlContent,
495            srcLocale,
496            targetLocale);
497        AtomicBoolean cancelled = new AtomicBoolean(Boolean.FALSE);
498        if (xmlValues.size() > 0) {
499
500            JSONObject root = new JSONObject();
501            JSONArray segments = new JSONArray();
502
503            root.put("source_language", srcLocale.toString());
504            root.put("target_language", targetLocale.toString());
505
506            String textToTranslate = "";
507            for (I_CmsXmlContentValue val : xmlValues) {
508                String strVal = val.getStringValue(m_cms);
509                strVal = CmsAiTranslator.parseHtmlTextNodes(strVal).toTranslateString();
510                segments.put(new JSONObject().put("id", val.getPath()).put("text", strVal));
511            }
512
513            root.put("segments", segments);
514            textToTranslate = root.toString(2);
515
516            final String llmPrompt = String.join(
517                "\n",
518                "You are a professional translation engine.",
519                "You will receive a JSON object with the following structure:",
520                "- source_language: the source language code",
521                "- target_language: the target language code",
522                "- segments: an array of objects",
523                "  - id: a unique identifier",
524                "  - text: the text to translate",
525                "Your task:",
526                "- Translate ONLY the value of segments[].text from source_language to target_language.",
527                "- Preserve meaning, tone, and grammar.",
528                "- Do NOT translate ids.",
529                "- Do NOT change, remove, reorder, or add any segments.",
530                "- Do NOT add explanations, comments, or extra fields.",
531                "- Keep placeholders, markers, or tokens (e.g. ⟦#1#⟧, {{...}}, ${...}, [TAG1], etc.) intact.");
532
533            //            ResponseFormat llmResponseFormat = ResponseFormat.builder().type(ResponseFormatType.JSON).jsonSchema(
534            //                JsonSchema.builder().name("TranslationResult").rootElement(
535            //                    JsonObjectSchema.builder().addProperty(
536            //                        "segments",
537            //                        JsonArraySchema.builder().items(
538            //                            JsonObjectSchema.builder().addStringProperty(
539            //                                "id",
540            //                                "The id for the text, must remain unchanged and kept in order").addStringProperty(
541            //                                    "text",
542            //                                    "The text to translate with optional placeholders like ⟦#1#⟧").required(
543            //                                        "id",
544            //                                        "text").build()).build()).required("segments").build()).build()).build();
545
546            List<ChatMessage> messages = new ArrayList<ChatMessage>();
547            messages.add(SystemMessage.from(llmPrompt));
548            messages.add(UserMessage.from(textToTranslate));
549
550            // NOT setting response format for now, as it breaks streaming
551            ChatRequest llmRequest = ChatRequest.builder().messages(messages).build();
552
553            if (handler == null) {
554                ChatModel chatModel = new CmsAiModel(m_providerConfig).getChatModel();
555                ChatResponse response = chatModel.chat(llmRequest);
556                result = response.aiMessage().text();
557            } else {
558
559                StreamingChatModel chatModel = new CmsAiModel(m_providerConfig).getStreamingChatModel();
560
561                final java.util.concurrent.CountDownLatch COUNTDOWN = new java.util.concurrent.CountDownLatch(1);
562                AtomicReference<String> resultRef = new AtomicReference<String>();
563                StringBuilder partialBuffer = new StringBuilder();
564
565                chatModel.chat(llmRequest, new StreamingChatResponseHandler() {
566
567                    public void onCompleteResponse(ChatResponse response) {
568
569                        String text = response.aiMessage().text();
570                        resultRef.set(text != null ? text : partialBuffer.toString());
571                        handler.onCompleteResponse(response);
572                        COUNTDOWN.countDown();
573                    }
574
575                    public void onError(Throwable error) {
576
577                        handler.onError(error);
578                        COUNTDOWN.countDown();
579                    }
580
581                    public void onPartialResponse(PartialResponse responsePart, PartialResponseContext context) {
582
583                        String responsePartText = responsePart.text();
584                        if (responsePartText != null) {
585                            partialBuffer.append(responsePartText);
586                        }
587                        handler.onPartialResponse(responsePart, context);
588                        if (context.streamingHandle().isCancelled()) {
589                            cancelled.set(true);
590                        }
591                    }
592                });
593
594                // Block this request thread until streaming finishes
595                try {
596                    // safety timeout: 10 minutes
597                    COUNTDOWN.await(10, java.util.concurrent.TimeUnit.MINUTES);
598                } catch (InterruptedException e) {
599                    Thread.currentThread().interrupt();
600                }
601
602                result = resultRef.get();
603                if ((result == null) && !cancelled.get() && (partialBuffer.length() > 0)) {
604                    result = partialBuffer.toString();
605                }
606            }
607        }
608        return result;
609    }
610
611}