Simple Podcast Search Demo

Getting Started

Step 1: Set Up Your Environment

Clone the Marqo repository (version 2.0.0), which contains the podcast search demo, and change into the demo directory:

git clone --branch 2.0.0 https://github.com/marqo-ai/marqo.git
cd marqo/examples/podcast-search

Start the Marqo Docker container:

docker rm -f marqo;docker run --name marqo -it -p 8882:8882 --add-host host.docker.internal:host-gateway marqoai/marqo:2.0.0

For more detailed instructions, refer to the Marqo getting started guide (https://github.com/marqo-ai/marqo/tree/2.0.0).

If you are not already in the podcast-search directory, navigate to it:
```
cd podcast-search
```
Execute the demo script:
```
python3 podcast_search_demo.py
```

Walkthrough Steps

Data Preparation

Before we can start searching, we need to prepare our data.

import pandas as pd

def load_data(file: str, number_data: int) -> list:
    """Load podcast metadata from a CSV file and attach each transcript.

    The dataset came from https://www.vox.com/today-explained. The .csv file
    has the headers ``name`` (name of podcast) and ``description`` (short
    description); only these two columns are kept.

    Args:
        file: Path of the CSV file to read.
        number_data: Number of rows to keep, counted from the top of the CSV.

    Returns:
        A list with one dict per podcast, holding the keys ``name``,
        ``description``, ``transcript`` and ``_id``.

    Raises:
        OSError: If a transcript file cannot be opened (raised by ``open``).
    """
    podcast_data = pd.read_csv(file).head(number_data)[['name', 'description']].to_dict('records')

    # create a 'transcript' key and add the transcript text as values to each record
    for doc_id, data in enumerate(podcast_data, start=1):
        # transcript files are resolved relative to the current working directory
        path = "data/transcripts/" + data['name'] + ".txt"
        # explicit encoding so the text is decoded the same way on every platform
        with open(path, 'r', encoding='utf-8') as f:
            data['transcript'] = f.read()
        data['_id'] = str(doc_id)  # _id is a special key which is unique to every document

    return podcast_data

Indexing with Marqo

Once our data is ready, we need to create an index and add our documents to it.

import marqo

index_name = "marqo-podcast-search-demo"

# Client pointed at the Marqo instance listening on local port 8882
mq = marqo.Client(url='http://localhost:8882')
mq.create_index(index_name)

# Add every podcast record, passing all three text fields as tensor fields
mq.index(index_name).add_documents(
    podcast_data,
    tensor_fields=['name', 'description', 'transcript'],
    client_batch_size=64,
)

Searching

With our data indexed, we can perform different types of searches.

# First query: default search, show the highlights of the two best hits
query = 'what is long covid?'
results = mq.index(index_name).search(query)
# ['_highlights'] will return only the relevant portion rather than the whole transcript
for rank, label in enumerate(("Result 1 -", "Result 2 -")):
    # rank 0 is the top hit, rank 1 the second hit
    print(label, end=" ")
    pprint.pprint(results['hits'][rank]['_highlights'])

# Second query: default search again, top hit only
query = 'water issues in US'
results = mq.index(index_name).search(query)
print("Result 3 -", end=" ")
pprint.pprint(results['hits'][0]['_highlights'])

# Third query: lexical search, show the top hit's name and its highlights
query = 'water crisis'
results = mq.index(index_name).search(query, search_method='LEXICAL')
for label, field in (("Result 4 -", 'name'), ("Result 5 -", '_highlights')):
    # [_highlights] will return an empty list if using lexical search
    print(label, end=" ")
    pprint.pprint(results['hits'][0][field])

Full Code

podcast_search_demo.py

import marqo
import pprint
import pandas as pd


####################################################
### STEP 1: Load Data
####################################################

def load_data(file: str, number_data: int) -> list:
    """Load podcast metadata from a CSV file and attach each transcript.

    The dataset came from https://www.vox.com/today-explained. The .csv file
    has the headers ``name`` (name of podcast) and ``description`` (short
    description); only these two columns are kept.

    Args:
        file: Path of the CSV file to read.
        number_data: Number of rows to keep, counted from the top of the CSV.

    Returns:
        A list with one dict per podcast, holding the keys ``name``,
        ``description``, ``transcript`` and ``_id``.

    Raises:
        OSError: If a transcript file cannot be opened (raised by ``open``).
    """
    podcast_data = pd.read_csv(file).head(number_data)[['name', 'description']].to_dict('records')

    # create a 'transcript' key and add the transcript text as values to each record
    for doc_id, data in enumerate(podcast_data, start=1):
        # transcript files are resolved relative to the current working directory
        path = "data/transcripts/" + data['name'] + ".txt"
        # explicit encoding so the text is decoded the same way on every platform
        with open(path, 'r', encoding='utf-8') as f:
            data['transcript'] = f.read()
        data['_id'] = str(doc_id)  # _id is a special key which is unique to every document

    return podcast_data


dataset_file = "data/podcast_data.csv"
# keep only the first 2 rows of the CSV
podcast_data = load_data(dataset_file, 2)
# format of podcast_data -
# [{'name': '....', 'description': '....', 'transcript': '....', '_id': '1'},
#  {'name': '....', 'description': '....', 'transcript': '....', '_id': '2'}]


#####################################################
### STEP 2. Start Marqo
#####################################################

# Follow the instructions here https://github.com/marqo-ai/marqo/tree/2.0.0


####################################################
### STEP 3: Index Data
####################################################

index_name = "marqo-podcast-search-demo"

# Connection to Marqo Docker Container (local port 8882)
mq = marqo.Client(url='http://localhost:8882')
mq.create_index(index_name)

# Add every podcast record, passing all three text fields as tensor fields
mq.index(index_name).add_documents(
    podcast_data,
    tensor_fields=['name', 'description', 'transcript'],
    client_batch_size=64,
)

# Fetch the index stats and report the document count they contain
stats = mq.index(index_name).get_stats()
print(f"{stats['numberOfDocuments']} documents added to index: {index_name}")


####################################################
### STEP 4: Search using Marqo
####################################################

# First query: default search, show the highlights of the two best hits
query = 'what is long covid?'
results = mq.index(index_name).search(query)

# ['_highlights'] will return only the relevant portion rather than the whole transcript
for rank, label in enumerate(("Result 1 -", "Result 2 -")):
    # rank 0 is the top hit, rank 1 the second hit
    print(label, end=" ")
    pprint.pprint(results['hits'][rank]['_highlights'])


# Second query: default search again (no field restriction is passed), top hit only
query = 'water issues in US'
results = mq.index(index_name).search(query)

print("Result 3 -", end=" ")
pprint.pprint(results['hits'][0]['_highlights'])


# Third query: lexical search (no field restriction is passed),
# show the top hit's name and its highlights
query = 'water crisis'
results = mq.index(index_name).search(query, search_method='LEXICAL')

for label, field in (("Result 4 -", 'name'), ("Result 5 -", '_highlights')):
    # [_highlights] will return an empty list if using lexical search
    print(label, end=" ")
    pprint.pprint(results['hits'][0][field])