From 52949e251d574bdc29dded693fe04767b4236201 Mon Sep 17 00:00:00 2001 From: Ivan Despot <66276597+g-despot@users.noreply.github.com> Date: Thu, 30 Apr 2026 15:48:26 +0200 Subject: [PATCH 1/4] Fix tokenizer --- .../integration/TextAnalyzerITest.java | 163 ++++++++++++++++++ .../v1/api/collections/TextAnalyzer.java | 6 +- 2 files changed, 166 insertions(+), 3 deletions(-) create mode 100644 src/it/java/io/weaviate/integration/TextAnalyzerITest.java diff --git a/src/it/java/io/weaviate/integration/TextAnalyzerITest.java b/src/it/java/io/weaviate/integration/TextAnalyzerITest.java new file mode 100644 index 000000000..4c6356cf3 --- /dev/null +++ b/src/it/java/io/weaviate/integration/TextAnalyzerITest.java @@ -0,0 +1,163 @@ +package io.weaviate.integration; + +import java.util.Map; + +import org.assertj.core.api.Assertions; +import org.junit.BeforeClass; +import org.junit.Test; + +import io.weaviate.ConcurrentTest; +import io.weaviate.client6.v1.api.WeaviateClient; +import io.weaviate.client6.v1.api.collections.Property; +import io.weaviate.client6.v1.api.collections.TextAnalyzer; +import io.weaviate.client6.v1.api.collections.Tokenization; +import io.weaviate.client6.v1.api.collections.query.Filter; +import io.weaviate.containers.Container; +import io.weaviate.containers.Weaviate; + +/** + * End-to-end coverage for the v1.37 per-property {@link TextAnalyzer} + * configuration: ASCII folding ({@code asciiFold}, {@code asciiFoldIgnore}) + * and per-property {@code stopwordPreset}. + * + *

This test would have caught the snake_case {@code @SerializedName} + * regression in {@link TextAnalyzer} that was silently dropping every + * analyzer setting on the wire. Two layers of assertion: + * + *

    + *
  1. Schema round-trip: read the collection back via {@code + * collection.config.get()} and verify the {@code textAnalyzer} field + * is populated with the values that were sent. If the JSON keys + * don't match Weaviate's struct tags, the server stores nothing and + * this assertion fails. + *
  2. Behavioral: insert a string and run a filter query that + * only succeeds when folding actually executes server-side. If the + * analyzer config arrived as a no-op, the filter returns zero + * results and the assertion fails. + *
+ */ +public class TextAnalyzerITest extends ConcurrentTest { + private static final WeaviateClient client = Container.WEAVIATE.getClient(); + + @BeforeClass + public static void __() { + Weaviate.Version.V137.orSkip(); + } + + @Test + public void testAsciiFoldRoundTripsThroughConfigAndAffectsFilters() throws Exception { + var nsAccent = ns("AccentFolding"); + client.collections.create(nsAccent, c -> c + .properties( + Property.text("text_default", + p -> p.tokenization(Tokenization.WORD)), + Property.text("text_folded", + p -> p.tokenization(Tokenization.WORD) + .textAnalyzer(TextAnalyzer.of(t -> t.foldAscii(true)))), + Property.text("text_folded_keep_e", + p -> p.tokenization(Tokenization.WORD) + .textAnalyzer(TextAnalyzer.of(t -> t + .foldAscii(true) + .keepAscii("é")))))); + + var products = client.collections.use(nsAccent); + + // ---- Layer 1: schema round-trip ------------------------------------- + var config = products.config.get(); + Assertions.assertThat(config).isPresent(); + var props = config.get().properties(); + + var textDefault = props.stream() + .filter(p -> p.propertyName().equals("text_default")).findFirst().orElseThrow(); + var textFolded = props.stream() + .filter(p -> p.propertyName().equals("text_folded")).findFirst().orElseThrow(); + var textFoldedKeepE = props.stream() + .filter(p -> p.propertyName().equals("text_folded_keep_e")).findFirst().orElseThrow(); + + Assertions.assertThat(textDefault.textAnalyzer()) + .as("default property has no textAnalyzer config") + .isNull(); + + Assertions.assertThat(textFolded.textAnalyzer()) + .as("text_folded persists asciiFold=true") + .isNotNull() + .satisfies(ta -> { + Assertions.assertThat(ta.foldAscii()).isTrue(); + }); + + Assertions.assertThat(textFoldedKeepE.textAnalyzer()) + .as("text_folded_keep_e persists asciiFold=true and asciiFoldIgnore=[é]") + .isNotNull() + .satisfies(ta -> { + Assertions.assertThat(ta.foldAscii()).isTrue(); + Assertions.assertThat(ta.keepAscii()).containsExactly("é"); + }); + + // ---- Layer 2: behavioral -------------------------------------------- + products.data.insert(Map.of( + "text_default", "Café Crème Bio", + "text_folded", "Café Crème Bio", + "text_folded_keep_e", "Café Crème Bio")); + + // "cafe" (lowercase, no accents) must match only the fully-folded property. + var defaultMatches = products.query.fetchObjects( + q -> q.filters(Filter.property("text_default").eq("cafe"))); + Assertions.assertThat(defaultMatches.objects()) + .as("text_default has no folding, 'cafe' should not match 'Café Crème Bio'") + .isEmpty(); + + var foldedMatches = products.query.fetchObjects( + q -> q.filters(Filter.property("text_folded").eq("cafe"))); + Assertions.assertThat(foldedMatches.objects()) + .as("text_folded has asciiFold=true, 'cafe' must match 'Café Crème Bio'") + .hasSize(1); + + var keepEMatches = products.query.fetchObjects( + q -> q.filters(Filter.property("text_folded_keep_e").eq("cafe"))); + Assertions.assertThat(keepEMatches.objects()) + .as("text_folded_keep_e preserves é, 'cafe' must NOT match 'Café Crème Bio'") + .isEmpty(); + + // The exact accented form matches everywhere. + for (String prop : new String[] {"text_default", "text_folded", "text_folded_keep_e"}) { + var hits = products.query.fetchObjects( + q -> q.filters(Filter.property(prop).eq("Café"))); + Assertions.assertThat(hits.objects()) + .as("'Café' (exact) should match on %s regardless of folding", prop) + .hasSize(1); + } + } + + @Test + public void testStopwordPresetRoundTripsThroughConfig() throws Exception { + var nsStop = ns("StopwordPreset"); + client.collections.create(nsStop, c -> c + .properties( + Property.text("name_en", + p -> p.tokenization(Tokenization.WORD) + .textAnalyzer(TextAnalyzer.of(t -> t.stopwordPreset("en")))), + Property.text("name_none", + p -> p.tokenization(Tokenization.WORD) + .textAnalyzer(TextAnalyzer.of(t -> t.stopwordPreset("none")))))); + + var products = client.collections.use(nsStop); + var config = products.config.get(); + Assertions.assertThat(config).isPresent(); + var props = config.get().properties(); + + var nameEn = props.stream() + .filter(p -> p.propertyName().equals("name_en")).findFirst().orElseThrow(); + var nameNone = props.stream() + .filter(p -> p.propertyName().equals("name_none")).findFirst().orElseThrow(); + + Assertions.assertThat(nameEn.textAnalyzer()) + .as("name_en persists stopwordPreset=en") + .isNotNull() + .satisfies(ta -> Assertions.assertThat(ta.stopwordPreset()).isEqualTo("en")); + + Assertions.assertThat(nameNone.textAnalyzer()) + .as("name_none persists stopwordPreset=none") + .isNotNull() + .satisfies(ta -> Assertions.assertThat(ta.stopwordPreset()).isEqualTo("none")); + } +} diff --git a/src/main/java/io/weaviate/client6/v1/api/collections/TextAnalyzer.java b/src/main/java/io/weaviate/client6/v1/api/collections/TextAnalyzer.java index 3265b63a5..045e85d4c 100644 --- a/src/main/java/io/weaviate/client6/v1/api/collections/TextAnalyzer.java +++ b/src/main/java/io/weaviate/client6/v1/api/collections/TextAnalyzer.java @@ -10,9 +10,9 @@ import io.weaviate.client6.v1.internal.ObjectBuilder; public record TextAnalyzer( - @SerializedName("ascii_fold") Boolean foldAscii, - @SerializedName("ascii_fold_ignore") List keepAscii, - @SerializedName("stopword_preset") String stopwordPreset) { + @SerializedName("asciiFold") Boolean foldAscii, + @SerializedName("asciiFoldIgnore") List keepAscii, + @SerializedName("stopwordPreset") String stopwordPreset) { public static TextAnalyzer of() { return null; From 04188800d8d17ab368c8c68f170836900f081ca7 Mon Sep 17 00:00:00 2001 From: Ivan Despot <66276597+g-despot@users.noreply.github.com> Date: Thu, 30 Apr 2026 15:52:31 +0200 Subject: [PATCH 2/4] Remove comment --- .../integration/TextAnalyzerITest.java | 265 ++++++++---------- 1 file changed, 122 insertions(+), 143 deletions(-) diff --git a/src/it/java/io/weaviate/integration/TextAnalyzerITest.java b/src/it/java/io/weaviate/integration/TextAnalyzerITest.java index 4c6356cf3..a6ca565e4 100644 --- a/src/it/java/io/weaviate/integration/TextAnalyzerITest.java +++ b/src/it/java/io/weaviate/integration/TextAnalyzerITest.java @@ -15,149 +15,128 @@ import io.weaviate.containers.Container; import io.weaviate.containers.Weaviate; -/** - * End-to-end coverage for the v1.37 per-property {@link TextAnalyzer} - * configuration: ASCII folding ({@code asciiFold}, {@code asciiFoldIgnore}) - * and per-property {@code stopwordPreset}. - * - *

This test would have caught the snake_case {@code @SerializedName} - * regression in {@link TextAnalyzer} that was silently dropping every - * analyzer setting on the wire. Two layers of assertion: - * - *

    - *
  1. Schema round-trip: read the collection back via {@code - * collection.config.get()} and verify the {@code textAnalyzer} field - * is populated with the values that were sent. If the JSON keys - * don't match Weaviate's struct tags, the server stores nothing and - * this assertion fails. - *
  2. Behavioral: insert a string and run a filter query that - * only succeeds when folding actually executes server-side. If the - * analyzer config arrived as a no-op, the filter returns zero - * results and the assertion fails. - *
- */ public class TextAnalyzerITest extends ConcurrentTest { - private static final WeaviateClient client = Container.WEAVIATE.getClient(); - - @BeforeClass - public static void __() { - Weaviate.Version.V137.orSkip(); - } - - @Test - public void testAsciiFoldRoundTripsThroughConfigAndAffectsFilters() throws Exception { - var nsAccent = ns("AccentFolding"); - client.collections.create(nsAccent, c -> c - .properties( - Property.text("text_default", - p -> p.tokenization(Tokenization.WORD)), - Property.text("text_folded", - p -> p.tokenization(Tokenization.WORD) - .textAnalyzer(TextAnalyzer.of(t -> t.foldAscii(true)))), - Property.text("text_folded_keep_e", - p -> p.tokenization(Tokenization.WORD) - .textAnalyzer(TextAnalyzer.of(t -> t - .foldAscii(true) - .keepAscii("é")))))); - - var products = client.collections.use(nsAccent); - - // ---- Layer 1: schema round-trip ------------------------------------- - var config = products.config.get(); - Assertions.assertThat(config).isPresent(); - var props = config.get().properties(); - - var textDefault = props.stream() - .filter(p -> p.propertyName().equals("text_default")).findFirst().orElseThrow(); - var textFolded = props.stream() - .filter(p -> p.propertyName().equals("text_folded")).findFirst().orElseThrow(); - var textFoldedKeepE = props.stream() - .filter(p -> p.propertyName().equals("text_folded_keep_e")).findFirst().orElseThrow(); - - Assertions.assertThat(textDefault.textAnalyzer()) - .as("default property has no textAnalyzer config") - .isNull(); - - Assertions.assertThat(textFolded.textAnalyzer()) - .as("text_folded persists asciiFold=true") - .isNotNull() - .satisfies(ta -> { - Assertions.assertThat(ta.foldAscii()).isTrue(); - }); - - Assertions.assertThat(textFoldedKeepE.textAnalyzer()) - .as("text_folded_keep_e persists asciiFold=true and asciiFoldIgnore=[é]") - .isNotNull() - .satisfies(ta -> { - Assertions.assertThat(ta.foldAscii()).isTrue(); - Assertions.assertThat(ta.keepAscii()).containsExactly("é"); - }); - - // ---- Layer 2: behavioral -------------------------------------------- - products.data.insert(Map.of( - "text_default", "Café Crème Bio", - "text_folded", "Café Crème Bio", - "text_folded_keep_e", "Café Crème Bio")); - - // "cafe" (lowercase, no accents) must match only the fully-folded property. - var defaultMatches = products.query.fetchObjects( - q -> q.filters(Filter.property("text_default").eq("cafe"))); - Assertions.assertThat(defaultMatches.objects()) - .as("text_default has no folding, 'cafe' should not match 'Café Crème Bio'") - .isEmpty(); - - var foldedMatches = products.query.fetchObjects( - q -> q.filters(Filter.property("text_folded").eq("cafe"))); - Assertions.assertThat(foldedMatches.objects()) - .as("text_folded has asciiFold=true, 'cafe' must match 'Café Crème Bio'") - .hasSize(1); - - var keepEMatches = products.query.fetchObjects( - q -> q.filters(Filter.property("text_folded_keep_e").eq("cafe"))); - Assertions.assertThat(keepEMatches.objects()) - .as("text_folded_keep_e preserves é, 'cafe' must NOT match 'Café Crème Bio'") - .isEmpty(); - - // The exact accented form matches everywhere. - for (String prop : new String[] {"text_default", "text_folded", "text_folded_keep_e"}) { - var hits = products.query.fetchObjects( - q -> q.filters(Filter.property(prop).eq("Café"))); - Assertions.assertThat(hits.objects()) - .as("'Café' (exact) should match on %s regardless of folding", prop) - .hasSize(1); + private static final WeaviateClient client = Container.WEAVIATE.getClient(); + + @BeforeClass + public static void __() { + Weaviate.Version.V137.orSkip(); + } + + @Test + public void testAsciiFoldRoundTripsThroughConfigAndAffectsFilters() throws Exception { + var nsAccent = ns("AccentFolding"); + client.collections.create(nsAccent, c -> c + .properties( + Property.text("text_default", + p -> p.tokenization(Tokenization.WORD)), + Property.text("text_folded", + p -> p.tokenization(Tokenization.WORD) + .textAnalyzer(TextAnalyzer.of(t -> t.foldAscii(true)))), + Property.text("text_folded_keep_e", + p -> p.tokenization(Tokenization.WORD) + .textAnalyzer(TextAnalyzer.of(t -> t + .foldAscii(true) + .keepAscii("é")))))); + + var products = client.collections.use(nsAccent); + + // ---- Layer 1: schema round-trip ------------------------------------- + var config = products.config.get(); + Assertions.assertThat(config).isPresent(); + var props = config.get().properties(); + + var textDefault = props.stream() + .filter(p -> p.propertyName().equals("text_default")).findFirst().orElseThrow(); + var textFolded = props.stream() + .filter(p -> p.propertyName().equals("text_folded")).findFirst().orElseThrow(); + var textFoldedKeepE = props.stream() + .filter(p -> p.propertyName().equals("text_folded_keep_e")).findFirst().orElseThrow(); + + Assertions.assertThat(textDefault.textAnalyzer()) + .as("default property has no textAnalyzer config") + .isNull(); + + Assertions.assertThat(textFolded.textAnalyzer()) + .as("text_folded persists asciiFold=true") + .isNotNull() + .satisfies(ta -> { + Assertions.assertThat(ta.foldAscii()).isTrue(); + }); + + Assertions.assertThat(textFoldedKeepE.textAnalyzer()) + .as("text_folded_keep_e persists asciiFold=true and asciiFoldIgnore=[é]") + .isNotNull() + .satisfies(ta -> { + Assertions.assertThat(ta.foldAscii()).isTrue(); + Assertions.assertThat(ta.keepAscii()).containsExactly("é"); + }); + + // ---- Layer 2: behavioral -------------------------------------------- + products.data.insert(Map.of( + "text_default", "Café Crème Bio", + "text_folded", "Café Crème Bio", + "text_folded_keep_e", "Café Crème Bio")); + + // "cafe" (lowercase, no accents) must match only the fully-folded property. + var defaultMatches = products.query.fetchObjects( + q -> q.filters(Filter.property("text_default").eq("cafe"))); + Assertions.assertThat(defaultMatches.objects()) + .as("text_default has no folding, 'cafe' should not match 'Café Crème Bio'") + .isEmpty(); + + var foldedMatches = products.query.fetchObjects( + q -> q.filters(Filter.property("text_folded").eq("cafe"))); + Assertions.assertThat(foldedMatches.objects()) + .as("text_folded has asciiFold=true, 'cafe' must match 'Café Crème Bio'") + .hasSize(1); + + var keepEMatches = products.query.fetchObjects( + q -> q.filters(Filter.property("text_folded_keep_e").eq("cafe"))); + Assertions.assertThat(keepEMatches.objects()) + .as("text_folded_keep_e preserves é, 'cafe' must NOT match 'Café Crème Bio'") + .isEmpty(); + + // The exact accented form matches everywhere. + for (String prop : new String[] { "text_default", "text_folded", "text_folded_keep_e" }) { + var hits = products.query.fetchObjects( + q -> q.filters(Filter.property(prop).eq("Café"))); + Assertions.assertThat(hits.objects()) + .as("'Café' (exact) should match on %s regardless of folding", prop) + .hasSize(1); + } + } + + @Test + public void testStopwordPresetRoundTripsThroughConfig() throws Exception { + var nsStop = ns("StopwordPreset"); + client.collections.create(nsStop, c -> c + .properties( + Property.text("name_en", + p -> p.tokenization(Tokenization.WORD) + .textAnalyzer(TextAnalyzer.of(t -> t.stopwordPreset("en")))), + Property.text("name_none", + p -> p.tokenization(Tokenization.WORD) + .textAnalyzer(TextAnalyzer.of(t -> t.stopwordPreset("none")))))); + + var products = client.collections.use(nsStop); + var config = products.config.get(); + Assertions.assertThat(config).isPresent(); + var props = config.get().properties(); + + var nameEn = props.stream() + .filter(p -> p.propertyName().equals("name_en")).findFirst().orElseThrow(); + var nameNone = props.stream() + .filter(p -> p.propertyName().equals("name_none")).findFirst().orElseThrow(); + + Assertions.assertThat(nameEn.textAnalyzer()) + .as("name_en persists stopwordPreset=en") + .isNotNull() + .satisfies(ta -> Assertions.assertThat(ta.stopwordPreset()).isEqualTo("en")); + + Assertions.assertThat(nameNone.textAnalyzer()) + .as("name_none persists stopwordPreset=none") + .isNotNull() + .satisfies(ta -> Assertions.assertThat(ta.stopwordPreset()).isEqualTo("none")); } - } - - @Test - public void testStopwordPresetRoundTripsThroughConfig() throws Exception { - var nsStop = ns("StopwordPreset"); - client.collections.create(nsStop, c -> c - .properties( - Property.text("name_en", - p -> p.tokenization(Tokenization.WORD) - .textAnalyzer(TextAnalyzer.of(t -> t.stopwordPreset("en")))), - Property.text("name_none", - p -> p.tokenization(Tokenization.WORD) - .textAnalyzer(TextAnalyzer.of(t -> t.stopwordPreset("none")))))); - - var products = client.collections.use(nsStop); - var config = products.config.get(); - Assertions.assertThat(config).isPresent(); - var props = config.get().properties(); - - var nameEn = props.stream() - .filter(p -> p.propertyName().equals("name_en")).findFirst().orElseThrow(); - var nameNone = props.stream() - .filter(p -> p.propertyName().equals("name_none")).findFirst().orElseThrow(); - - Assertions.assertThat(nameEn.textAnalyzer()) - .as("name_en persists stopwordPreset=en") - .isNotNull() - .satisfies(ta -> Assertions.assertThat(ta.stopwordPreset()).isEqualTo("en")); - - Assertions.assertThat(nameNone.textAnalyzer()) - .as("name_none persists stopwordPreset=none") - .isNotNull() - .satisfies(ta -> Assertions.assertThat(ta.stopwordPreset()).isEqualTo("none")); - } } From 7605016c69fd8e4538f83352ed745a1d75f9d7f8 Mon Sep 17 00:00:00 2001 From: Ivan Despot <66276597+g-despot@users.noreply.github.com> Date: Thu, 30 Apr 2026 15:54:43 +0200 Subject: [PATCH 3/4] Bump version --- .github/workflows/test.yaml | 2 +- src/it/java/io/weaviate/integration/TextAnalyzerITest.java | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 5b13f073e..c635d2be3 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -92,7 +92,7 @@ jobs: fail-fast: false matrix: WEAVIATE_VERSION: - ["1.32.24", "1.33.11", "1.34.7", "1.35.2", "1.36.9", "1.37.1"] + ["1.32.24", "1.33.11", "1.34.7", "1.35.2", "1.36.9", "1.37.2"] steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 diff --git a/src/it/java/io/weaviate/integration/TextAnalyzerITest.java b/src/it/java/io/weaviate/integration/TextAnalyzerITest.java index a6ca565e4..deaf6685f 100644 --- a/src/it/java/io/weaviate/integration/TextAnalyzerITest.java +++ b/src/it/java/io/weaviate/integration/TextAnalyzerITest.java @@ -41,7 +41,6 @@ public void testAsciiFoldRoundTripsThroughConfigAndAffectsFilters() throws Excep var products = client.collections.use(nsAccent); - // ---- Layer 1: schema round-trip ------------------------------------- var config = products.config.get(); Assertions.assertThat(config).isPresent(); var props = config.get().properties(); @@ -72,13 +71,11 @@ public void testAsciiFoldRoundTripsThroughConfigAndAffectsFilters() throws Excep Assertions.assertThat(ta.keepAscii()).containsExactly("é"); }); - // ---- Layer 2: behavioral -------------------------------------------- products.data.insert(Map.of( "text_default", "Café Crème Bio", "text_folded", "Café Crème Bio", "text_folded_keep_e", "Café Crème Bio")); - // "cafe" (lowercase, no accents) must match only the fully-folded property. var defaultMatches = products.query.fetchObjects( q -> q.filters(Filter.property("text_default").eq("cafe"))); Assertions.assertThat(defaultMatches.objects()) @@ -97,7 +94,6 @@ public void testAsciiFoldRoundTripsThroughConfigAndAffectsFilters() throws Excep .as("text_folded_keep_e preserves é, 'cafe' must NOT match 'Café Crème Bio'") .isEmpty(); - // The exact accented form matches everywhere. for (String prop : new String[] { "text_default", "text_folded", "text_folded_keep_e" }) { var hits = products.query.fetchObjects( q -> q.filters(Filter.property(prop).eq("Café"))); From 59da10c11dbf0617dda2036482f411db41fd7d5a Mon Sep 17 00:00:00 2001 From: Ivan Despot <66276597+g-despot@users.noreply.github.com> Date: Tue, 5 May 2026 09:54:32 +0200 Subject: [PATCH 4/4] Implement feedback --- .../integration/CollectionsITest.java | 63 ++++++++ .../integration/TextAnalyzerITest.java | 138 ------------------ 2 files changed, 63 insertions(+), 138 deletions(-) delete mode 100644 src/it/java/io/weaviate/integration/TextAnalyzerITest.java diff --git a/src/it/java/io/weaviate/integration/CollectionsITest.java b/src/it/java/io/weaviate/integration/CollectionsITest.java index 889041277..927b2ac05 100644 --- a/src/it/java/io/weaviate/integration/CollectionsITest.java +++ b/src/it/java/io/weaviate/integration/CollectionsITest.java @@ -18,6 +18,8 @@ import io.weaviate.client6.v1.api.collections.Quantization; import io.weaviate.client6.v1.api.collections.ReferenceProperty; import io.weaviate.client6.v1.api.collections.Replication; +import io.weaviate.client6.v1.api.collections.TextAnalyzer; +import io.weaviate.client6.v1.api.collections.Tokenization; import io.weaviate.client6.v1.api.collections.Replication.AsyncReplicationConfig; import io.weaviate.client6.v1.api.collections.VectorConfig; import io.weaviate.client6.v1.api.collections.VectorIndex; @@ -399,6 +401,67 @@ public void test_dropVectorIndex() throws IOException { .matches(VectorIndex::isNone).as("is 'none'"); } + @Test + public void testTextAnalyzer() throws Exception { + Weaviate.Version.V137.orSkip(); + + var nsTextAnalyzer = ns("TextAnalyzer"); + var textAnalyzer = client.collections.create(nsTextAnalyzer, c -> c + .properties( + Property.text("text_default", + p -> p.tokenization(Tokenization.WORD)), + Property.text("text_folded", + p -> p.tokenization(Tokenization.WORD) + .textAnalyzer(TextAnalyzer.of(t -> t.foldAscii(true)))), + Property.text("text_folded_keep_e", + p -> p.tokenization(Tokenization.WORD) + .textAnalyzer(TextAnalyzer.of(t -> t + .foldAscii(true) + .keepAscii("é")))), + Property.text("name_en", + p -> p.tokenization(Tokenization.WORD) + .textAnalyzer(TextAnalyzer.of(t -> t.stopwordPreset("en")))), + Property.text("name_none", + p -> p.tokenization(Tokenization.WORD) + .textAnalyzer(TextAnalyzer.of(t -> t.stopwordPreset("none")))))); + + Assertions.assertThat(textAnalyzer.config.get()) + .get() + .extracting(CollectionConfig::properties, InstanceOfAssertFactories.list(Property.class)) + .allSatisfy(property -> { + var analyzer = property.textAnalyzer(); + switch (property.propertyName()) { + case "text_default": + Assertions.assertThat(analyzer) + .as("default property has no textAnalyzer config") + .isNull(); + break; + case "text_folded": + Assertions.assertThat(analyzer) + .as("text_folded persists asciiFold=true") + .returns(true, TextAnalyzer::foldAscii); + break; + case "text_folded_keep_e": + Assertions.assertThat(analyzer) + .as("text_folded_keep_e persists asciiFold=true and asciiFoldIgnore=[é]") + .returns(true, TextAnalyzer::foldAscii) + .extracting(TextAnalyzer::keepAscii, InstanceOfAssertFactories.list(String.class)) + .containsExactly("é"); + break; + case "name_en": + Assertions.assertThat(analyzer) + .as("name_en persists stopwordPreset=en") + .returns("en", TextAnalyzer::stopwordPreset); + break; + case "name_none": + Assertions.assertThat(analyzer) + .as("name_none persists stopwordPreset=none") + .returns("none", TextAnalyzer::stopwordPreset); + break; + } + }); + } + @Test public void test_asyncReplicationConfig() throws IOException { Weaviate.Version.latest().orSkip(); diff --git a/src/it/java/io/weaviate/integration/TextAnalyzerITest.java b/src/it/java/io/weaviate/integration/TextAnalyzerITest.java deleted file mode 100644 index deaf6685f..000000000 --- a/src/it/java/io/weaviate/integration/TextAnalyzerITest.java +++ /dev/null @@ -1,138 +0,0 @@ -package io.weaviate.integration; - -import java.util.Map; - -import org.assertj.core.api.Assertions; -import org.junit.BeforeClass; -import org.junit.Test; - -import io.weaviate.ConcurrentTest; -import io.weaviate.client6.v1.api.WeaviateClient; -import io.weaviate.client6.v1.api.collections.Property; -import io.weaviate.client6.v1.api.collections.TextAnalyzer; -import io.weaviate.client6.v1.api.collections.Tokenization; -import io.weaviate.client6.v1.api.collections.query.Filter; -import io.weaviate.containers.Container; -import io.weaviate.containers.Weaviate; - -public class TextAnalyzerITest extends ConcurrentTest { - private static final WeaviateClient client = Container.WEAVIATE.getClient(); - - @BeforeClass - public static void __() { - Weaviate.Version.V137.orSkip(); - } - - @Test - public void testAsciiFoldRoundTripsThroughConfigAndAffectsFilters() throws Exception { - var nsAccent = ns("AccentFolding"); - client.collections.create(nsAccent, c -> c - .properties( - Property.text("text_default", - p -> p.tokenization(Tokenization.WORD)), - Property.text("text_folded", - p -> p.tokenization(Tokenization.WORD) - .textAnalyzer(TextAnalyzer.of(t -> t.foldAscii(true)))), - Property.text("text_folded_keep_e", - p -> p.tokenization(Tokenization.WORD) - .textAnalyzer(TextAnalyzer.of(t -> t - .foldAscii(true) - .keepAscii("é")))))); - - var products = client.collections.use(nsAccent); - - var config = products.config.get(); - Assertions.assertThat(config).isPresent(); - var props = config.get().properties(); - - var textDefault = props.stream() - .filter(p -> p.propertyName().equals("text_default")).findFirst().orElseThrow(); - var textFolded = props.stream() - .filter(p -> p.propertyName().equals("text_folded")).findFirst().orElseThrow(); - var textFoldedKeepE = props.stream() - .filter(p -> p.propertyName().equals("text_folded_keep_e")).findFirst().orElseThrow(); - - Assertions.assertThat(textDefault.textAnalyzer()) - .as("default property has no textAnalyzer config") - .isNull(); - - Assertions.assertThat(textFolded.textAnalyzer()) - .as("text_folded persists asciiFold=true") - .isNotNull() - .satisfies(ta -> { - Assertions.assertThat(ta.foldAscii()).isTrue(); - }); - - Assertions.assertThat(textFoldedKeepE.textAnalyzer()) - .as("text_folded_keep_e persists asciiFold=true and asciiFoldIgnore=[é]") - .isNotNull() - .satisfies(ta -> { - Assertions.assertThat(ta.foldAscii()).isTrue(); - Assertions.assertThat(ta.keepAscii()).containsExactly("é"); - }); - - products.data.insert(Map.of( - "text_default", "Café Crème Bio", - "text_folded", "Café Crème Bio", - "text_folded_keep_e", "Café Crème Bio")); - - var defaultMatches = products.query.fetchObjects( - q -> q.filters(Filter.property("text_default").eq("cafe"))); - Assertions.assertThat(defaultMatches.objects()) - .as("text_default has no folding, 'cafe' should not match 'Café Crème Bio'") - .isEmpty(); - - var foldedMatches = products.query.fetchObjects( - q -> q.filters(Filter.property("text_folded").eq("cafe"))); - Assertions.assertThat(foldedMatches.objects()) - .as("text_folded has asciiFold=true, 'cafe' must match 'Café Crème Bio'") - .hasSize(1); - - var keepEMatches = products.query.fetchObjects( - q -> q.filters(Filter.property("text_folded_keep_e").eq("cafe"))); - Assertions.assertThat(keepEMatches.objects()) - .as("text_folded_keep_e preserves é, 'cafe' must NOT match 'Café Crème Bio'") - .isEmpty(); - - for (String prop : new String[] { "text_default", "text_folded", "text_folded_keep_e" }) { - var hits = products.query.fetchObjects( - q -> q.filters(Filter.property(prop).eq("Café"))); - Assertions.assertThat(hits.objects()) - .as("'Café' (exact) should match on %s regardless of folding", prop) - .hasSize(1); - } - } - - @Test - public void testStopwordPresetRoundTripsThroughConfig() throws Exception { - var nsStop = ns("StopwordPreset"); - client.collections.create(nsStop, c -> c - .properties( - Property.text("name_en", - p -> p.tokenization(Tokenization.WORD) - .textAnalyzer(TextAnalyzer.of(t -> t.stopwordPreset("en")))), - Property.text("name_none", - p -> p.tokenization(Tokenization.WORD) - .textAnalyzer(TextAnalyzer.of(t -> t.stopwordPreset("none")))))); - - var products = client.collections.use(nsStop); - var config = products.config.get(); - Assertions.assertThat(config).isPresent(); - var props = config.get().properties(); - - var nameEn = props.stream() - .filter(p -> p.propertyName().equals("name_en")).findFirst().orElseThrow(); - var nameNone = props.stream() - .filter(p -> p.propertyName().equals("name_none")).findFirst().orElseThrow(); - - Assertions.assertThat(nameEn.textAnalyzer()) - .as("name_en persists stopwordPreset=en") - .isNotNull() - .satisfies(ta -> Assertions.assertThat(ta.stopwordPreset()).isEqualTo("en")); - - Assertions.assertThat(nameNone.textAnalyzer()) - .as("name_none persists stopwordPreset=none") - .isNotNull() - .satisfies(ta -> Assertions.assertThat(ta.stopwordPreset()).isEqualTo("none")); - } -}