Simple Podcast Search Demo
Getting Started
Step 1: Set Up Your Environment
-
Clone the Demos repository:
git clone --branch 2.0.0 https://github.com/marqo-ai/marqo.git
cd marqo/examples/podcast-search
-
Start the Marqo Docker container:
For more detailed instructions, refer to the getting started guide.
docker rm -f marqo
docker run --name marqo -it -p 8882:8882 --add-host host.docker.internal:host-gateway marqoai/marqo:2.0.0
-
Navigate to the `podcast-search` directory (skip this if you are already inside it):
cd podcast-search
-
Execute the demo script:
python3 podcast_search_demo.py
Walkthrough Steps
Data Preparation
Before we can start searching, we need to prepare our data.
import pandas as pd
def load_data(file: str, number_data: int) -> list:
    """Load podcast metadata and attach each episode's transcript.

    Reads the first ``number_data`` rows of the CSV at ``file``, keeps only
    the ``name`` and ``description`` columns, and for every row reads
    ``data/transcripts/<name>.txt`` (relative to the current working
    directory) into a ``transcript`` key.

    Args:
        file: Path to the CSV file; it needs ``name`` and ``description``
            columns.
        number_data: Maximum number of rows to load from the top of the CSV.

    Returns:
        A list with one dict per podcast, each holding the keys ``name``,
        ``description``, ``transcript`` and ``_id``.

    Raises:
        FileNotFoundError: If the CSV or a transcript file does not exist.
    """
    # dataset came from this link: https://www.vox.com/today-explained
    # the .csv file has the following headers:
    # name, description
    # (name of podcast, short description)
    columns = ['name', 'description']
    podcast_data = pd.read_csv(file).head(number_data)[columns].to_dict('records')

    # create a 'transcript' key and add the transcript text as values to each record
    for doc_id, record in enumerate(podcast_data, start=1):
        path = f"data/transcripts/{record['name']}.txt"
        # Explicit encoding so transcripts decode the same way on every
        # platform instead of depending on the locale default.
        with open(path, 'r', encoding='utf-8') as f:
            record['transcript'] = f.read()
        # _id is a special key which is unique to every document
        record['_id'] = str(doc_id)
    return podcast_data
Indexing with Marqo
Once our data is ready, we need to create an index and add our documents to it.
import marqo
# Name of the index that will hold the podcast documents.
index_name = "marqo-podcast-search-demo"

# Client for the Marqo instance listening on localhost:8882.
mq = marqo.Client(url='http://localhost:8882')
mq.create_index(index_name)

# podcast_data is the list of records returned by load_data().
podcast_index = mq.index(index_name)
podcast_index.add_documents(
    podcast_data,
    tensor_fields=['name', 'description', 'transcript'],
    client_batch_size=64,
)
Searching
With our data indexed, we can perform different types of searches.
import pprint  # used below to pretty-print the search hits

# First query: search() is called without a search_method.
query = 'what is long covid?'
results = mq.index(index_name).search(query)

# ['_highlights'] will return only the relevant portion rather than the whole transcript
print("Result 1 -", end=" ")
pprint.pprint(results['hits'][0]['_highlights'])  # [0] returns the top hit
print("Result 2 -", end=" ")
pprint.pprint(results['hits'][1]['_highlights'])  # [1] returns the second hit

# Second query, same kind of search.
query = 'water issues in US'
results = mq.index(index_name).search(query)
print("Result 3 -", end=" ")
pprint.pprint(results['hits'][0]['_highlights'])

# Third query: lexical search, selected with search_method='LEXICAL'.
query = 'water crisis'
results = mq.index(index_name).search(query, search_method='LEXICAL')
print("Result 4 -", end=" ")
pprint.pprint(results['hits'][0]['name'])
print("Result 5 -", end=" ")
pprint.pprint(results['hits'][0]['_highlights'])  # [_highlights] will return an empty list if using lexical search
Full Code
podcast_search_demo.py
import marqo
import pprint
import pandas as pd
####################################################
### STEP 1: Load Data
####################################################
def load_data(file: str, number_data: int) -> list:
    """Load podcast metadata and attach each episode's transcript.

    Reads the first ``number_data`` rows of the CSV at ``file``, keeps only
    the ``name`` and ``description`` columns, and for every row reads
    ``data/transcripts/<name>.txt`` (relative to the current working
    directory) into a ``transcript`` key.

    Args:
        file: Path to the CSV file; it needs ``name`` and ``description``
            columns.
        number_data: Maximum number of rows to load from the top of the CSV.

    Returns:
        A list with one dict per podcast, each holding the keys ``name``,
        ``description``, ``transcript`` and ``_id``.

    Raises:
        FileNotFoundError: If the CSV or a transcript file does not exist.
    """
    # dataset came from this link: https://www.vox.com/today-explained
    # the .csv file has the following headers:
    # name, description
    # (name of podcast, short description)
    columns = ['name', 'description']
    podcast_data = pd.read_csv(file).head(number_data)[columns].to_dict('records')

    # create a 'transcript' key and add the transcript text as values to each record
    for doc_id, record in enumerate(podcast_data, start=1):
        path = f"data/transcripts/{record['name']}.txt"
        # Explicit encoding so transcripts decode the same way on every
        # platform instead of depending on the locale default.
        with open(path, 'r', encoding='utf-8') as f:
            record['transcript'] = f.read()
        # _id is a special key which is unique to every document
        record['_id'] = str(doc_id)
    return podcast_data
# CSV with the podcast names and descriptions.
dataset_file = "data/podcast_data.csv"

# Load only the first 2 rows; load_data() also gives every record an '_id'.
podcast_data = load_data(file=dataset_file, number_data=2)
'''
format of podcast_data -
[{'name': '....', 'description': '....', 'transcript': '....'},
{'name': '....', 'description': '....', 'transcript': '....'}]
'''
#####################################################
### STEP 2. Start Marqo
#####################################################
# Follow the instructions here https://github.com/marqo-ai/marqo/tree/2.0.0
####################################################
### STEP 3: Index Data
####################################################
# Name of the index that will hold the podcast documents.
index_name = "marqo-podcast-search-demo"

# Connection to Marqo Docker Container
mq = marqo.Client(url='http://localhost:8882')
mq.create_index(index_name)

# Bind the index handle once and reuse it for indexing and for the stats call.
podcast_index = mq.index(index_name)
podcast_index.add_documents(
    podcast_data,
    tensor_fields=['name', 'description', 'transcript'],
    client_batch_size=64,
)

# get the stats for the index and report how many documents it holds
stats = podcast_index.get_stats()
print(f"{stats['numberOfDocuments']} documents added to index: {index_name}")
####################################################
### STEP 4: Search using Marqo
####################################################
# let's create a query and perform tensor search
# Handle to the index, reused by every search below.
podcast_index = mq.index(index_name)

# First query: search() is called without a search_method.
query = 'what is long covid?'
results = podcast_index.search(query)

# ['_highlights'] will return only the relevant portion rather than the whole transcript.
# Rank 0 is the top hit, rank 1 the second hit.
for label, rank in (("Result 1 -", 0), ("Result 2 -", 1)):
    print(label, end=" ")
    pprint.pprint(results['hits'][rank]['_highlights'])

# Second query, same kind of search; no field restriction is passed.
query = 'water issues in US'
results = podcast_index.search(query)
print("Result 3 -", end=" ")
pprint.pprint(results['hits'][0]['_highlights'])

# Third query: lexical search, selected with search_method='LEXICAL'.
query = 'water crisis'
results = podcast_index.search(query, search_method='LEXICAL')

# Show the name and the highlights of the top lexical hit.
# [_highlights] will return an empty list if using lexical search
for label, field in (("Result 4 -", 'name'), ("Result 5 -", '_highlights')):
    print(label, end=" ")
    pprint.pprint(results['hits'][0][field])