Skip to content

Searching Stable Diffusion Generated Images

This guide will take you step-by-step through the process of using Marqo to index and search through a large dataset of images.

Getting Started

Before diving into the code, let's set up our environment to ensure you can smoothly run through the examples:

  1. Clone the examples repository.
  2. Start Marqo using Docker with the following commands:
    docker rm -f marqo
    docker pull marqoai/marqo:2.0.0
    docker run --name marqo -it -p 8882:8882 --add-host host.docker.internal:host-gateway marqoai/marqo:2.0.0
    
    For more detailed instructions, check the getting started guide.
  3. Follow this walkthrough, or refer to the original code repository and the accompanying article for more detail.

Breaking Down the Code

The process is broken down into seven key steps (Steps 0 through 6), each of which is essential for the proper indexing and retrieval of images.

STEP 0. Import and define any helper functions

First, we'll import the necessary libraries and define any helper functions we might need.

# Import necessary libraries
from marqo import Client
import glob
import os
import pandas as pd
import subprocess

STEP 1. Setup and Create Documents for Indexing

We'll set up the path for our image directory and start an image server to make the images accessible to Docker. Then, we'll create a list of documents for indexing.

# Define the image directory and start an image server

# this should be where the images are unzipped
# get the dataset from here https://drive.google.com/file/d/16_1MlX9GH-6v060jYA23eTJwH74fSU4L/view?usp=sharing
images_directory = "hot-dog-100k/"

# the images are accessed via docker from here - you will be able
# to access them at something like http://[::]:8222/ or http://localhost:8222/
docker_path = "http://host.docker.internal:8222/"

# serve the local images over HTTP so the Marqo container can fetch them
pid = subprocess.Popen(
    ["python3", "-m", "http.server", "8222", "--directory", images_directory],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.STDOUT,
)

# now find all the jpg files in the dataset directory
files = glob.glob(images_directory + "/*.jpg")

# map each bare filename back to its local path (used again in Step 5)
files_map = {os.path.basename(f): f for f in files}

# rewrite the local paths to URLs reachable from inside the Marqo container
files_docker = [f.replace(images_directory, docker_path) for f in files]

# one document per image; the filename doubles as the document _id
# (the previous zip over local paths was dropped - file_local was never used)
documents = [
    {"image_docker": file_docker, "_id": os.path.basename(file_docker)}
    for file_docker in files_docker
]

STEP 2. Initial Indexing

Create an index with Marqo and set the model to use for indexing. Then, we'll add our documents to this index.

# Initialize the Marqo client and create an index

# connect to the locally running Marqo instance
client = Client()

# index names must be lowercase
index_name = "hot-dogs-100k"

# treat URL fields as images and embed them with an OpenCLIP model
settings = {
    "treatUrlsAndPointersAsImages": True,
    "model": "open_clip/ViT-B-32/laion2b_s34b_b79k",
}
client.create_index(index_name, settings_dict=settings)

# index the documents in batches of 50, embedding only the image field
responses = client.index(index_name).add_documents(
    documents,
    tensor_fields=["image_docker"],
    client_batch_size=50,
)

Step 3. Add Labels via Zero-Shot Learning

Use zero-shot learning to add labels to the images, which will help in categorizing and retrieving them later on.

# Define labels and index them
index_name = "one_dog_two"

# the documents here are actually serving as labels. the document (i.e. label)
# that most closely matches is returned first and can be used as a label
# each one gets scored and those scores can also be kept
labels = [
    {"label": "one hot dog"},
    {"label": "two hot dogs"},
    {"label": "a hamburger"},
    {"label": "a face"},
]

# the raw label strings, read directly by key (clearer and safer than
# list(a.values())[0], which assumed a single-key dict)
label_strings = [label_doc["label"] for label_doc in labels]

# we create a new index for the labels
settings = {
    "model": "open_clip/ViT-B-32/laion2b_s34b_b79k",
    "treatUrlsAndPointersAsImages": True,
}
client.create_index(index_name, settings_dict=settings)

# add our labels to the index
responses = client.index(index_name).add_documents(
    labels, tensor_fields=["label"], client_batch_size=64
)

# score every image against the labels: searching the labels index with an
# image URL returns each label ranked by similarity
for doc in documents:
    # the url for the image is what is used as the search - an image
    responses = client.index(index_name).search(doc["image_docker"])

    # store each label's score on the document (spaces -> underscores so the
    # fields are filterable later); next() avoids building a throwaway list
    # and fails fast if a label is ever missing from the hits
    for lab in label_strings:
        doc[lab.replace(" ", "_")] = next(
            r["_score"] for r in responses["hits"] if r["label"] == lab
        )

# strip the image field so the re-add only updates the new score fields
documents_image_docker = [doc.pop("image_docker") for doc in documents]
# NOTE(review): image_docker was just popped from every document, so
# tensor_fields=["image_docker"] has nothing left to embed here - confirm the
# re-add is meant to update only the label-score fields
responses = client.index("hot-dogs-100k").add_documents(
    documents, client_batch_size=50, tensor_fields=["image_docker"]
)

Step 4. Remove Black Images

Identify and remove any black images from the index to improve the search results.

# Query to find black images and remove them

# NOTE(review): index_name still holds "one_dog_two" (the labels index) here;
# this cleanup presumably targets the "hot-dogs-100k" image index - verify
query = "a black image"

# find the best match for the query, i.e. the most "black" image
results = client.index(index_name).search(query)

# search with that image itself to surface its near-duplicates
results = client.index(index_name).search(results["hits"][0]["image_docker"], limit=100)

# scores very close to 1 indicate duplicates of the black image
# (this threshold can change depending on the task)
documents_delete = [r["_id"] for r in results["hits"] if r["_score"] > 0.99999]

# drop all of the near-duplicate black images from the index
client.index(index_name).delete_documents(documents_delete)

Step 5. Order Images Based on Similarity

Organize the images by their similarity to each other to create a more intuitive search experience.

# Reorder images based on similarity

# pick a starting image: the best match for a smiling face, restricted by
# the zero-shot label scores computed in Step 3
# NOTE(review): this first search targets "hot-dogs-100k" but the loop below
# uses index_name (still "one_dog_two" from Step 3) - confirm both are meant
# to hit the same index
results = client.index("hot-dogs-100k").search(
    "a photo of a smiling face",
    filter_string="a_face:[0.58 TO 0.99] AND a_hamburger:[0.60 TO 0.99]",
)

# find the document that matches closest with the query
index = [
    ind for ind, doc in enumerate(documents) if doc["_id"] == results["hits"][0]["_id"]
][0]
current_document = documents[index]
# create a list to store the "sorted" documents (by _id)
ordered_documents = [current_document["_id"]]

# greedy nearest-neighbour walk: repeatedly delete the current image from the
# index, then search with it to find the next most similar remaining image
for i in range(len(documents)):
    # remove current document so it cannot match itself
    client.index(index_name).delete_documents([current_document["_id"]])

    # now search with it to get next best
    # NOTE(review): Step 3 popped "image_docker" off every document, so the
    # first current_document may not have this key - verify against a run
    results = client.index(index_name).search(
        current_document["image_docker"],
        filter_string="a_face:[0.58 TO 0.99] AND a_hamburger:[0.60 TO 0.99]",
    )

    next_document = results["hits"][0]

    # record it and continue the walk from there
    ordered_documents.append(next_document["_id"])

    current_document = next_document

# map the ordered _ids (basenames) back to local file paths
ordered_images = [files_map[f] for f in ordered_documents]
# documents removed from the index during the walk
deleted_documents = [d for d in documents if d["_id"] in ordered_documents]

Step 6. Animate the Ordered Images

Lastly, we'll animate the ordered images to visualize the similarity between them.

# Animate and save the ordered images
import sys
import subprocess
from pathlib import Path


def prepend_number(filename, number, new_dir):
    """Return *filename* relocated into *new_dir* (under its current
    directory) with *number* prefixed to the basename."""
    directory, base = os.path.split(filename)
    return f"{directory}/{new_dir}/{number}_{base}"


def copyWithSubprocess(cmd):
    """Run *cmd* (an argv list, e.g. ["cp", src, dst]) and wait for it to finish.

    The previous version spawned a Popen and never waited, so copies could
    still be in flight (or pipe buffers could fill) when the script moved on;
    subprocess.run blocks until the command completes.
    """
    subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)


# destination directory for the renamed frames
save_base_dir = "outputs/"
Path(save_base_dir).mkdir(parents=True, exist_ok=True)

# prefix each filename with a zero-padded index so the frames sort in order
new_images = [
    f"{save_base_dir}{idx:05d}{os.path.basename(src)}"
    for idx, src in enumerate(ordered_images)
]

# copy every image to its ordered location
for src, dst in zip(ordered_images, new_images):
    copyWithSubprocess(["cp", src, dst])

Full Code

hot-dog-100k.py
#####################################################
### STEP 0. Import and define any helper functions
#####################################################

from marqo import Client
from marqo.errors import MarqoApiError
import torch
import json
import pprint
import glob
import os
import pandas as pd
import subprocess

#####################################################
### STEP 1. Setup some things and create the documents for indexing
#####################################################

# this should be where the images are unzipped
# get the dataset from here https://drive.google.com/file/d/16_1MlX9GH-6v060jYA23eTJwH74fSU4L/view?usp=sharing
images_directory = 'hot-dog-100k/'

# the images are accessed via docker from here - you will be able
# to access them at something like http://[::]:8222/ or http://localhost:8222/
docker_path = 'http://host.docker.internal:8222/'

# serve the local images over HTTP so the Marqo container can fetch them
pid = subprocess.Popen(
    ['python3', '-m', 'http.server', '8222', '--directory', images_directory],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.STDOUT,
)

# now find all the jpg files in the dataset directory
files = glob.glob(images_directory + "/*.jpg")

# map each bare filename back to its local path (used again in Step 5)
files_map = {os.path.basename(f): f for f in files}

# rewrite the local paths to URLs reachable from inside the Marqo container
files_docker = [f.replace(images_directory, docker_path) for f in files]

# one document per image; the filename doubles as the document _id
# (the previous zip over local paths was dropped - file_local was never used)
documents = [{'image_docker': f, '_id': os.path.basename(f)} for f in files_docker]

#####################################################
### STEP 2. Initial indexing
#####################################################

# create the index, choosing the model used to embed the images

# connect to the locally running Marqo instance
client = Client()

# index names must be lowercase
index_name = 'hot-dogs-100k'

# treat URL fields as images and embed them with an OpenCLIP model
settings = {
    "treatUrlsAndPointersAsImages": True,
    "model": 'open_clip/ViT-B-32/laion2b_s34b_b79k',
}
client.create_index(index_name, settings_dict=settings)

# index the documents in batches of 50, embedding only the image field
responses = client.index(index_name).add_documents(
    documents,
    tensor_fields=['image_docker'],
    client_batch_size=50,
)


#####################################################
### Step 3. Add some labels via zero-shot learning
#####################################################


# the labels live in their own index
index_name = 'one_dog_two'

# the documents here are actually serving as labels. the document (i.e. label)
# that most closely matches is returned first and can be used as a label
# each one gets scored and those scores can also be kept
labels = [
    {"label": "one hot dog"},
    {"label": "two hot dogs"},
    {"label": "a hamburger"},
    {"label": "a face"},
]

# the raw label strings, read directly by key (clearer and safer than
# list(a.values())[0], which assumed a single-key dict)
label_strings = [label_doc['label'] for label_doc in labels]

# we create a new index for the labels
settings = {
    "model": 'open_clip/ViT-B-32/laion2b_s34b_b79k',
    "treatUrlsAndPointersAsImages": True,
}
client.create_index(index_name, settings_dict=settings)

# add our labels to the index
responses = client.index(index_name).add_documents(
    labels, tensor_fields=['label'], client_batch_size=64
)

# score every image against the labels: searching the labels index with an
# image URL returns each label ranked by similarity
for doc in documents:
    # the url for the image is what is used as the search - an image
    responses = client.index(index_name).search(doc['image_docker'])

    # store each label's score on the document (spaces -> underscores so the
    # fields are filterable later); next() avoids building a throwaway list
    # and fails fast if a label is ever missing from the hits
    for lab in label_strings:
        doc[lab.replace(' ', '_')] = next(
            r['_score'] for r in responses['hits'] if r['label'] == lab
        )

# strip the image field so the re-add only updates the new score fields
documents_image_docker = [doc.pop('image_docker') for doc in documents]
# NOTE(review): image_docker was just popped from every document, so
# tensor_fields=['image_docker'] has nothing left to embed here - confirm the
# re-add is meant to update only the label-score fields
responses = client.index("hot-dogs-100k").add_documents(
    documents, client_batch_size=50,
    tensor_fields=['image_docker']
)

#####################################################
### Step 4. Remove the black images
#####################################################

# NOTE(review): index_name still holds 'one_dog_two' (the labels index) here;
# this cleanup presumably targets the 'hot-dogs-100k' image index - verify
query = 'a black image'

# find the best match for the query, i.e. the most "black" image
results = client.index(index_name).search(query)

# search with that image itself to surface its near-duplicates
results = client.index(index_name).search(results['hits'][0]['image_docker'], limit=100)

# scores very close to 1 indicate duplicates of the black image
# (this threshold can change depending on the task)
documents_delete = [r['_id'] for r in results['hits'] if r['_score'] > 0.99999]

# drop all of the near-duplicate black images from the index
client.index(index_name).delete_documents(documents_delete)


#####################################################
### Step 5. order the images based on their similarity with each other
#####################################################

# pick a starting image: the best match for a smiling face, restricted by
# the zero-shot label scores computed in Step 3
# NOTE(review): this first search targets "hot-dogs-100k" but the loop below
# uses index_name (still 'one_dog_two' from Step 3) - confirm both are meant
# to hit the same index
results = client.index("hot-dogs-100k").search('a photo of a smiling face',
                        filter_string="a_face:[0.58 TO 0.99] AND a_hamburger:[0.60 TO 0.99]")

# find the document that matches closest with the query
index = [ind for ind,doc in enumerate(documents) if doc['_id'] == results['hits'][0]['_id'] ][0]
current_document = documents[index]
# create a list to store the "sorted" documents (by _id)
ordered_documents = [current_document['_id']]

# greedy nearest-neighbour walk: repeatedly delete the current image from the
# index, then search with it to find the next most similar remaining image
for i in range(len(documents)):

    # remove current document so it cannot match itself
    client.index(index_name).delete_documents([current_document['_id']])

    # now search with it to get next best
    # NOTE(review): Step 3 popped 'image_docker' off every document, so the
    # first current_document may not have this key - verify against a run
    results = client.index(index_name).search(current_document['image_docker'],
                            filter_string="a_face:[0.58 TO 0.99] AND a_hamburger:[0.60 TO 0.99]",
                            )

    next_document = results['hits'][0]

    # record it and continue the walk from there
    ordered_documents.append(next_document['_id'])

    current_document = next_document

# map the ordered _ids (basenames) back to local file paths
ordered_images = [files_map[f] for f in ordered_documents]
# documents removed from the index during the walk
deleted_documents = [d for d in documents if d['_id'] in ordered_documents]

#####################################################
### Step 6. Animate them
#####################################################
import sys
import subprocess
from pathlib import Path

def prepend_number(filename, number, new_dir):
    """Return *filename* relocated into *new_dir* (under its current
    directory) with *number* prefixed to the basename."""
    directory, base = os.path.split(filename)
    return f'{directory}/{new_dir}/{number}_{base}'

def copyWithSubprocess(cmd):
    """Run *cmd* (an argv list, e.g. ['cp', src, dst]) and wait for it to finish.

    The previous version spawned a Popen and never waited, so copies could
    still be in flight (or pipe buffers could fill) when the script moved on;
    subprocess.run blocks until the command completes.
    """
    subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# destination directory for the renamed frames
save_base_dir = 'outputs/'
Path(save_base_dir).mkdir(parents=True, exist_ok=True)

# prefix each filename with a zero-padded index so the frames sort in order
new_images = [
    f'{save_base_dir}{idx:05d}{os.path.basename(src)}'
    for idx, src in enumerate(ordered_images)
]

# copy every image to its ordered location
for src, dst in zip(ordered_images, new_images):
    copyWithSubprocess(['cp', src, dst])