Multi-lingual Search
Introduction
In this guide, we'll walk through setting up a multi-lingual search index using Marqo.
Getting Started
Before diving into the code, ensure you have the following prerequisites completed:
- Clone the Repository
  Get the example files by cloning the repository:
  git clone --branch 2.0.0 https://github.com/marqo-ai/marqo.git
  cd marqo/examples/MultiLingual
- Run Marqo
  Start the Marqo service using Docker with the following commands:
  docker rm -f marqo
  docker pull marqoai/marqo:2.0.0
  docker run --name marqo -it -p 8882:8882 --add-host host.docker.internal:host-gateway marqoai/marqo:2.0.0
- Refer to the Original Code
  The full example code is available here.
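Optionally, you can run a quick sanity check from Python to confirm that Marqo is reachable before moving on. This is not part of the original example; it assumes the container above is running on the default port 8882 and uses the client's get_indexes call:
# Optional sanity check (not in the original example): confirm Marqo is reachable.
from marqo import Client

client = Client("http://localhost:8882")
print(client.get_indexes())  # a fresh instance should report no indexes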
Building the Index
To create a multi-lingual index, we'll perform the following steps:
1. Setup and Imports
Start by setting up your environment and importing necessary libraries:
from marqo import Client
from datasets import load_dataset
import datetime
import json
import pprint
import logging
# Define the index name
INDEX_NAME = "my-multilingual-index"
# Initialize logging for HTTP request information
logging.basicConfig(level=logging.DEBUG)
# Initialize the Marqo client
mq = Client("http://localhost:8882")
2. Loading the Data
Load the MultiEURLEX dataset, focusing on English and German validation splits:
dataset_en = load_dataset("multi_eurlex", "en", split="validation")
dataset_de = load_dataset("multi_eurlex", "de", split="validation")
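If you want to see what a record looks like before indexing, you can inspect one. This is a small optional sketch; the field names ('celex_id', 'text', 'labels') are the ones referenced later in this guide, but the exact schema comes from the Hugging Face MultiEURLEX dataset:
# Optional: peek at the first English record (sketch, not part of the original code).
sample = dataset_en[0]
print(sample.keys())        # expect fields such as 'celex_id', 'text', 'labels'
print(sample["celex_id"])   # document identifier
print(len(sample["text"]))  # text length, used below to decide whether to split
print(sample["labels"])     # label list, stringified before indexing below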
3. Creating the Index
Create the index using the chosen multilingual model:
mq.create_index(index_name=INDEX_NAME, model="stsb-xlm-r-multilingual")
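Note that if you run this script more than once, the create call can fail because an index with the same name already exists. The full code at the end of this guide handles this by deleting any existing index before creating it; a minimal sketch of that pattern:
# Delete a pre-existing index with the same name before (re)creating it.
# Mirrors the try/except used in the full code below.
try:
    mq.index(INDEX_NAME).delete()
except Exception:
    pass
mq.create_index(index_name=INDEX_NAME, model="stsb-xlm-r-multilingual")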
4. Preparing the Documents and Indexing them
Split large documents into smaller parts to make them easier to search, then index the documents by posting them to the Marqo index:
MAX_TEXT_LENGTH = 100000

# record the start time:
t0 = datetime.datetime.now()

for ds, lang in [(dataset_en, "en"), (dataset_de, "de")]:
    num_docs_in_dataset = len(ds)
    for ii, doc in enumerate(ds):
        dumped = json.dumps(doc)
        # we'll set the doc ID to be the document's hash
        doc_id = str(hash(dumped))
        text_length = len(doc["text"])
        split_size = MAX_TEXT_LENGTH // 2
        # break up the text of large documents:
        if text_length > MAX_TEXT_LENGTH:
            text_splits = [
                doc["text"][i : i + split_size]
                for i in range(0, text_length, split_size)
            ]
        else:
            text_splits = [doc["text"]]
        for i, sub_doc in enumerate(text_splits):
            # if a document is broken up, add the split's index to the end of the document ID:
            qualified_id = f"{doc_id}.{i}" if len(text_splits) > 1 else doc_id
            # create a dict to be posted
            to_post = dict(
                [
                    (k, v) if k != "labels" else (k, str(v))
                    for k, v in doc.items()
                    if k != "text"
                ]
                + [("_id", qualified_id), ("language", lang), ("text", sub_doc)]
            )
            print(
                f"doc number {ii} out of {num_docs_in_dataset} docs in dataset {lang}. "
                f"_id: {qualified_id}, celex_id: {doc['celex_id']}, "
                f"json to send size: {len(json.dumps(to_post))}"
            )
            # Index the document.
            mq.index(index_name=INDEX_NAME).add_documents(
                documents=[to_post],
                tensor_fields=["text", "language"],
                client_batch_size=64,
            )

t1 = datetime.datetime.now()
print(f"finished indexing. Started at {t0}. Finished at {t1}. Took {t1 - t0}")
Searching the Index
Once indexing is complete, you can perform searches using the following function:
1. Define the Search Function
Create a function that searches the index and prints the highlights from each hit:
def search(q):
    result = mq.index(INDEX_NAME).search(q=q)
    for res in result["hits"]:
        pprint.pprint(res["_highlights"])
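A variation you may find useful: Marqo's search hits also include the document ID and a relevance score, so you can print those alongside the highlights. This is an optional sketch, not part of the original example:
# Optional variation (sketch): also show each hit's _id and _score.
def search_verbose(q):
    result = mq.index(INDEX_NAME).search(q=q)
    for res in result["hits"]:
        print(res["_id"], res["_score"])
        pprint.pprint(res["_highlights"])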
2. Execute a Search
Test the search with a query of your choice:
# Replace 'my_search_query' with your search text
my_search_query = "Laws about the fishing industry"
search(my_search_query)
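Because the index uses a multilingual model, queries in German work against the same documents. For example (the German query text below is a hypothetical translation of the English query, not from the original example):
# Query in German against the same index (hypothetical example query).
search("Gesetze über die Fischereiindustrie")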
Full Code Example
"""
This example uses the MultiEURLEX dataset.
Log from running:
Took 45 minutes on ml.g4dn.2xlarge
"""
# change this to 'cpu' if the machine you are running Marqo on doesn't have a
# Nvidia GPU
DEVICE = "cuda"
# import marqo:
from marqo import Client
# import the huggingface datasets package:
from datasets import load_dataset
# import other python packages
import datetime
import json
import pprint
import logging
# this will be the name of the index:
INDEX_NAME = "my-multilingual-index"
# this helps us see information about the HTTP requests
logging.basicConfig(level=logging.DEBUG)
# Create a new Marqo client:
mq = Client("http://localhost:8882")
def build_index():
# Load the datasets. For this example we're just using the English and
# Deutsch validation splits:
dataset_en = load_dataset('multi_eurlex', 'en', split="validation")
dataset_de = load_dataset('multi_eurlex', 'de', split="validation")
# record the start time:
t0 = datetime.datetime.now()
try:
mq.index(INDEX_NAME).delete()
except:
pass
# Create the index. The model we're using is multilingual:
mq.create_index(index_name=INDEX_NAME, model='stsb-xlm-r-multilingual')
# Let's break up large documents to make it easier to search:
MAX_TEXT_LENGTH = 100000
for ds, lang in [(dataset_en, "en"), (dataset_de, "de")]:
num_docs_in_dataset = len(ds)
for ii, doc in enumerate(ds):
dumped = json.dumps(doc)
# we'll set the doc ID to be the document's hash
doc_id = str(hash(dumped))
text_length = len(doc['text'])
split_size = MAX_TEXT_LENGTH//2
# break up the text of large documents:
if text_length > MAX_TEXT_LENGTH:
text_splits = [doc['text'][i: i + split_size] for i in range(0, text_length, split_size)]
else:
text_splits = [doc['text']]
for i, sub_doc in enumerate(text_splits):
# if a document is broken up, add the text's index to the end of the document:
qualified_id = f"{doc_id}.{i}" if len(text_splits) > 1 else doc_id
# create a dict to be posted
to_post = dict(
[(k, v) if k != "labels" else (k, str(v)) for k, v in doc. items() if k != 'text']
+ [("_id", qualified_id), ("language", lang), ('text', sub_doc)]
)
print(f"doc number {ii} out of {num_docs_in_dataset} docs in dataset {lang}. "
f"_id: {qualified_id}, celex_id: {doc['celex_id']}, "
f"json to send size: {len(json.dumps(to_post))}")
# Index the document. The device is set to 'cuda' to take
# advantage of the machine's GPU. If you don't have a GPU,
# change this argument to 'cpu'.
mq.index(index_name=INDEX_NAME).add_documents(
documents=[to_post], device=DEVICE,
tensor_fields=["language", "text", "labels"]
)
t1 = datetime.datetime.now()
print(f"finished indexing. Started at {t0}. Finished at {t1}. Took {t1 - t0}")
def search(q):
result = mq.index(INDEX_NAME).search(q=q)
# Just print out the highlights, which makes the output easier to read
for res in result["hits"]:
pprint.pprint(res["_highlights"])
# After you finishing indexing, comment out the following line to prevent going through
# the whole indexing process again.
build_index()
# Replace 'my_search_query' with whatever text you want to search. In English or Deutsch!
my_search_query = "Laws about the fishing industry"
search(my_search_query)