Handle multiple morpho annotations

Change-Id: I8afb7acd389639499b9da9d3ffa17ee09f4fe954

commit: 7ff770ec53764d305a3691efc87136523cbae27b [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Fri Jul 18 19:07:10 2025 +0200
committer: Marc Kupietz <kupietz@ids-mannheim.de> Mon Jul 21 11:39:30 2025 +0200
tree: 0384d51e335ae5a3410bd63c726c9b3548493c0f
parent: c643a12688aa6713772dba72e788c7cd22230240 [diff] [blame]
diff --git a/R/KorAPQuery.R b/R/KorAPQuery.R
index c672306..7d31587 100644
--- a/R/KorAPQuery.R
+++ b/R/KorAPQuery.R

@@ -913,96 +913,62 @@
 
   # Helper function to extract annotations from a span section
   extract_annotations_from_section <- function(section_content) {
+    # Handle both spaced tokens and nested single tokens
     tokens <- character(0)
     lemmas <- character(0)
     pos_tags <- character(0)
     morph_tags <- character(0)
-
-    # Split the content by </span> and process each meaningful part
-    parts <- unlist(strsplit(section_content, '</span>'))
-
-    for (part in parts) {
-      part <- trimws(part)
-      if (nchar(part) == 0) next
-
-      # Look for parts that have title attributes and end with text
-      if (grepl('<span[^>]*title=', part)) {
-        # Extract the text content (everything after the last >)
-        text_content <- gsub('.*>([^<]*)$', '\\1', part)
-        text_content <- trimws(text_content)
-
-        if (nchar(text_content) > 0 && !grepl('^<', text_content)) {
-          tokens <- c(tokens, text_content)
-
-          # Extract all title attributes from this part
-          title_pattern <- 'title="([^"]*)"'
-          title_matches <- gregexpr(title_pattern, part)
-
-          lemma <- NA
-          pos_tag <- NA
-          morph_tag <- NA
-
-          if (title_matches[[1]][1] != -1) {
-            all_titles <- regmatches(part, title_matches)[[1]]
-            for (title_match in all_titles) {
-              title_content <- gsub(title_pattern, '\\1', title_match)
-
-              # Split by spaces and process each annotation
-              annotations <- unlist(strsplit(title_content, "\\s+"))
-              for (annotation in annotations) {
-                if (grepl('^[^/]+/l:', annotation)) {
-                  lemma <- gsub('^[^/]+/l:(.*)$', '\\1', annotation)
-                } else if (grepl('^[^/]+/p:', annotation)) {
-                  pos_tag <- gsub('^[^/]+/p:(.*)$', '\\1', annotation)
-                } else if (grepl('^[^/]+/m:', annotation)) {
-                  morph_tag <- gsub('^[^/]+/m:(.*)$', '\\1', annotation)
-                }
-              }
-            }
-          }
-
-          lemmas <- c(lemmas, lemma)
-          pos_tags <- c(pos_tags, pos_tag)
-          morph_tags <- c(morph_tags, morph_tag)
-        }
-      }
+    
+    # First try to split by spaces between span groups (for multiple tokens)
+    # Look for spaces that separate token groups
+    if (grepl('</span>\\s+<span', section_content)) {
+      # Multiple tokens separated by spaces
+      token_groups <- unlist(strsplit(section_content, '(?<=</span>)\\s+(?=<span)', perl = TRUE))
+    } else {
+      # Single token (or no spaces between tokens)
+      token_groups <- c(section_content)
     }
-
-    # If no tokens found with the splitting approach, try a different method
-    if (length(tokens) == 0) {
-      # Look for the innermost spans that contain actual text
-      innermost_pattern <- '<span[^>]*title="([^"]*)"[^>]*>([^<]+)</span>'
-      innermost_matches <- gregexpr(innermost_pattern, section_content, perl = TRUE)
-
-      if (innermost_matches[[1]][1] != -1) {
-        matches <- regmatches(section_content, innermost_matches)[[1]]
-
-        for (match in matches) {
-          title <- gsub(innermost_pattern, '\\1', match, perl = TRUE)
-          text <- gsub(innermost_pattern, '\\2', match, perl = TRUE)
-          text <- trimws(text)
-
-          if (nchar(text) > 0) {
-            tokens <- c(tokens, text)
-
-            # Parse space-separated annotations in title
+    
+    for (group in token_groups) {
+      group <- trimws(group)
+      if (nchar(group) == 0) next
+      
+      # Extract the actual text content (the innermost text)
+      text_match <- regexpr('>([^<>]+)</span>', group, perl = TRUE)
+      if (text_match > 0) {
+        # Find all possible text contents and take the last one (innermost)
+        all_texts <- regmatches(group, gregexpr('>([^<>]+)</span>', group, perl = TRUE))[[1]]
+        if (length(all_texts) > 0) {
+          # Take the last match (innermost text)
+          text_content <- sub('.*>([^<>]+)</span>.*', '\\1', all_texts[length(all_texts)], perl = TRUE)
+          text_content <- trimws(text_content)
+          
+          if (nchar(text_content) > 0 && !grepl('^<', text_content)) {
+            tokens <- c(tokens, text_content)
+            
+            # Extract all title attributes from this group
+            titles <- regmatches(group, gregexpr('title="([^"]*)"', group, perl = TRUE))[[1]]
+            
+            morph_features <- character(0)
             lemma <- NA
             pos_tag <- NA
-            morph_tag <- NA
-
-            annotations <- unlist(strsplit(title, "\\s+"))
-            for (annotation in annotations) {
-              if (grepl('^[^/]+/l:', annotation)) {
-                lemma <- gsub('^[^/]+/l:(.*)$', '\\1', annotation)
-              } else if (grepl('^[^/]+/p:', annotation)) {
-                pos_tag <- gsub('^[^/]+/p:(.*)$', '\\1', annotation)
-              } else if (grepl('^[^/]+/m:', annotation)) {
-                morph_tag <- gsub('^[^/]+/m:(.*)$', '\\1', annotation)
+            
+            for (title in titles) {
+              content <- sub('title="([^"]*)"', '\\1', title, perl = TRUE)
+              
+              if (grepl('^[^/]+/l:', content)) {
+                lemma <- sub('^[^/]+/l:(.*)$', '\\1', content)
+              } else if (grepl('^[^/]+/p:', content)) {
+                pos_tag <- sub('^[^/]+/p:(.*)$', '\\1', content)
+              } else if (grepl('^[^/]+/m:', content)) {
+                morph_feature <- sub('^[^/]+/m:(.*)$', '\\1', content)
+                morph_features <- c(morph_features, morph_feature)
               }
             }
-
+            
             lemmas <- c(lemmas, lemma)
             pos_tags <- c(pos_tags, pos_tag)
+            morph_tag <- if (length(morph_features) > 0) paste(morph_features, collapse = "|") else NA
             morph_tags <- c(morph_tags, morph_tag)
           }
         }
commit	7ff770ec53764d305a3691efc87136523cbae27b	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Fri Jul 18 19:07:10 2025 +0200
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Mon Jul 21 11:39:30 2025 +0200
tree	0384d51e335ae5a3410bd63c726c9b3548493c0f
parent	c643a12688aa6713772dba72e788c7cd22230240 [diff] [blame]