Simple Wikipedia Demo


Introduction

This guide will walk you through using Marqo to index and search a dataset from Simple Wikipedia. We'll break down the process step by step to make it easy to understand and follow along.


Getting Started

Before we begin, there are a few preliminary steps to ensure you have everything needed for this demo:

Step 1: Download the Dataset

First, download the Simple Wikipedia dataset. You can get it from Kaggle (https://www.kaggle.com/datasets/louisgeisler/simple-wiki?resource=download) or from Google Drive (https://drive.google.com/file/d/1OEqXeIdqaZb6BwzKIgw8G_sDi91fBawt/view?usp=sharing).

Step 2: Start Marqo

Next, we need to get Marqo up and running. You can do this by executing the following command in your terminal:

docker rm -f marqo; docker run --name marqo -it -p 8882:8882 --add-host host.docker.internal:host-gateway marqoai/marqo:2.0.0

For more detailed instructions, check the getting started guide.

Step 3: Run the Demo Script

Once Marqo is running, you can execute the simple_wiki_demo.py script:

python3 simple_wiki_demo.py

Note: Indexing can take some time depending on your computer.

Code Walkthrough

Let's dive into the code. The script is broken down into several steps to make it easier to understand and manage.

Step 0: Import and Helper Functions

Before we start indexing, we need to set up our environment with the necessary imports and helper functions.

from marqo import Client
import json
import pprint


def read_json(filename: str) -> list:
    # reads a JSON file into a list of documents
    with open(filename, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data


def clean_data(data: dict) -> dict:
    # removes the "- Wikipedia" suffix from the title for better matching
    data["title"] = data["title"].replace("- Wikipedia", "")
    # Convert docDate to string
    data["docDate"] = str(data["docDate"])
    return data
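
To see what clean_data does, here is a quick sketch on a made-up record (the values below are hypothetical; only the field names match the dataset):

# hypothetical record for illustration only
raw = {"title": "Air - Wikipedia", "content": "Air is ...", "docDate": 20200101}
print(clean_data(raw))
# {'title': 'Air ', 'content': 'Air is ...', 'docDate': '20200101'}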

Step 1: Load the Data

After setting up our imports and helper functions, the next step is to load our dataset.

dataset_file = "simplewiki.json"
# get the data
data = read_json(dataset_file)
# clean up the title
data = [clean_data(d) for d in data]
print(f"loaded data with {len(data)} entries")

Step 2: Start Marqo

This step assumes you have started the Marqo server as described in the "Getting Started" section.
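If you want to confirm the client can reach the server before indexing, a minimal check (assuming Marqo is on its default http://localhost:8882) is to list the existing indexes:

from marqo import Client

# should return without a connection error
print(Client().get_indexes())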

Step 3: Index the Data with Marqo

With our data loaded, we can now create an index in Marqo and add our documents to it.

index_name = "marqo-simplewiki-demo-all"

# setup the client
client = Client()

client.create_index(index_name, model="onnx/all_datasets_v4_MiniLM-L6")

responses = client.index(index_name).add_documents(
    data, client_batch_size=50, tensor_fields=["title", "content"]
)

# Uncomment to look at the responses
# pprint.pprint(responses)
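
As a sanity check after indexing, you can ask Marqo how many documents the index holds:

# check the index statistics, e.g. the document count
pprint.pprint(client.index(index_name).get_stats())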

Step 4: Searching with Marqo

Now that our data is indexed, we can perform searches on it.

# Create a query
query = "what is air made of?"

# Perform a search
results = client.index(index_name).search(query)

# Look at the top hit and highlights
pprint.pprint(results["hits"][0])
pprint.pprint(results["hits"][0]["_highlights"])

# Repeat the search, this time limiting the number of hits returned
results = client.index(index_name).search(query, limit=5)
pprint.pprint(results["hits"][0])

# Lexical search example
results = client.index(index_name).search(query, search_method="LEXICAL")
pprint.pprint(results["hits"][0])

# Another query example
query = "what is a cube?"
results = client.index(index_name).search(query)
pprint.pprint(results["hits"][0]["_highlights"])
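
Rather than printing entire documents, you may prefer a compact view of the best matches. A small sketch that prints the score and title of the top three hits (both fields appear in each hit returned by Marqo):

# compact summary of the top three hits
for hit in results["hits"][:3]:
    print(round(hit["_score"], 3), "-", hit["title"])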

Full Code

simple_wiki_demo.py
#####################################################
### STEP 0. Import and define any helper functions
#####################################################

from marqo import Client
import json
import math
import numpy as np
import copy
import pprint


def read_json(filename: str) -> list:
    # reads a JSON file into a list of documents
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data


def clean_data(data: dict) -> dict:
    # removes the '- Wikipedia' suffix from the title for better matching
    data['title'] = data['title'].replace('- Wikipedia', '')
    # Convert docDate to string
    data["docDate"] = str(data["docDate"])
    return data


def split_big_docs(data, field='content', char_len=5e4):
    # there are some large documents which can cause issues for some users
    new_data = []
    for dat in data:

        content = dat[field]
        N = len(content)

        if N >= char_len:
            n_chunks = math.ceil(N / char_len)
            new_content = np.array_split(list(content), n_chunks)

            for _content in new_content:
                new_dat = copy.deepcopy(dat)
                new_dat[field] = ''.join(_content)
                new_data.append(new_dat)
        else:
            new_data.append(dat)
    return new_data


#####################################################
### STEP 1. load the data
#####################################################

# download the json formatted simplewiki from here -
# https://www.kaggle.com/datasets/louisgeisler/simple-wiki?resource=download
# or from
# https://drive.google.com/file/d/1OEqXeIdqaZb6BwzKIgw8G_sDi91fBawt/view?usp=sharing
dataset_file = "simplewiki.json"

# get the data
data = read_json(dataset_file)
# clean up the title
data = [clean_data(d) for d in data]

data = split_big_docs(data)
print(f"loaded data with {len(data)} entries")

#####################################################
### STEP 2. start Marqo
#####################################################

# Follow the instructions here https://github.com/marqo-ai/marqo

#####################################################
### STEP 3. index some data with marqo
#####################################################

# we use an index name; the index name needs to be lowercase
index_name = 'marqo-simplewiki-demo-all'

# setup the client
client = Client()

# delete the index if it already exists; an existing index cannot be
# overwritten, so create_index would raise an error otherwise
try:
    client.delete_index(index_name)
except Exception:
    pass

# we create the index and can set the model we want to use
# the onnx models are typically faster on both CPU and GPU
# to use non-onnx just use the name 'all_datasets_v4_MiniLM-L6'
client.create_index(index_name, model='onnx/all_datasets_v4_MiniLM-L6')

responses = client.index(index_name).add_documents(
    data, client_batch_size=50,
    tensor_fields=["title", "content"]
)

# optionally take a look at the responses
# pprint.pprint(responses)

#######################################
### STEP 4. Searching with marqo ######
#######################################


# after indexing we can search using both keyword (lexical) and neural search
# this will perform neural search across all indexed fields

# let's create a query
query = 'what is air made of?'

results = client.index(index_name).search(query)

# we can check the results - let's look at the top hit
pprint.pprint(results['hits'][0])

# we also get highlighting which tells us why this article was returned
pprint.pprint(results['hits'][0]['_highlights'])

# repeat the search, this time limiting the number of hits returned
results = client.index(index_name).search(query, limit=5)

# we can check the results - let's look at the top hit
pprint.pprint(results['hits'][0])

# we use lexical search instead of tensor search
results = client.index(index_name).search(query, search_method='LEXICAL')

# we can check the results - let's look at the top hit
pprint.pprint(results['hits'][0])

# let's create another query
query = 'what is a cube?'

results = client.index(index_name).search(query)

# we can check the results - let's look at the top hit
pprint.pprint(results['hits'][0])

# we also get highlighting which tells us why this article was returned
pprint.pprint(results['hits'][0]['_highlights'])