This repository was archived by the owner on Oct 9, 2024. It is now read-only.

Inference (chatbot) does not work as expected on 2 GPUs with bigscience/bloom-7b1 model #90

Description

@dantalyon

I am trying to create a simple chatbot using the bloom-7b1 model (I may use bigger models later), based on bloom-ds-zero-inference.py.
Here is my code:

import json
import os
from pathlib import Path
import deepspeed
import torch.distributed as dist
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
import torch
from transformers.utils import is_offline_mode
from huggingface_hub import snapshot_download
from transformers import StoppingCriteria, StoppingCriteriaList


local_rank = int(os.getenv("LOCAL_RANK", "0"))
world_size = int(os.getenv("WORLD_SIZE", "1"))

deepspeed.init_distributed("nccl")
rank = dist.get_rank()

def print_rank0(*msg):
    if rank != 0:
        return
    print(*msg)

def get_repo_root(model_name_or_path):
    # checks if online or not
    if is_offline_mode():
        print_rank0("Offline mode: forcing local_files_only=True")

    # download only on first process
    if rank == 0:
        snapshot_download(
            model_name_or_path,
            local_files_only=is_offline_mode(),
            cache_dir=os.getenv("TRANSFORMERS_CACHE", None),
            ignore_patterns=["*.safetensors"],
        )

    dist.barrier()

    return snapshot_download(
        model_name_or_path,
        local_files_only=is_offline_mode(),
        cache_dir=os.getenv("TRANSFORMERS_CACHE", None),
        ignore_patterns=["*.safetensors"],
    )

def get_checkpoint_files(model_name_or_path):
    cached_repo_dir = get_repo_root(model_name_or_path)

    # extensions: .bin | .pt
    # creates a list of paths from all downloaded files in cache dir
    file_list = [str(entry) for entry in Path(cached_repo_dir).rglob("*.[bp][it][n]") if entry.is_file()]
    return file_list

def write_checkpoints_json():
    checkpoint_files = get_checkpoint_files(model_name)
    if rank == 0:
        data = {"type": "BLOOM", "checkpoints": checkpoint_files, "version": 1.0}
        json.dump(data, open(checkpoints_json, "w"))

checkpoints_json = "checkpoints.json"

class StoppingCriteriaSub(StoppingCriteria):
    def __init__(self, stops=[], encounters=1):
        super().__init__()
        self.stops = stops
        self.ENCOUNTERS = encounters

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> bool:
        # count occurrences of the stop token(s) in the generated ids; each
        # entry in self.stops is assumed to encode to a single token here
        tokens = input_ids[0]
        stop_count = 0
        for stop in self.stops:
            stop_count += (stop == tokens).sum().item()
        return stop_count >= self.ENCOUNTERS

model_name = "bigscience/bloom-7b1"
infer_dtype = "float16"

tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)

dtype = torch.float16

stop_words_ids = [
    tokenizer(stop_word, return_tensors='pt')['input_ids'].squeeze() for stop_word in ["Question:"]]

stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids, encounters=2)])

# build the model on the meta device so no real weights are allocated here;
# DeepSpeed loads the actual checkpoint shards in init_inference below
with deepspeed.OnDevice(dtype=dtype, device="meta"):
    model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16)

model = model.eval()
repo_root = get_repo_root(model_name)
write_checkpoints_json()
dist.barrier()
kwargs = dict(replace_with_kernel_inject=True)  # use DeepSpeed's optimized inference kernels
model = deepspeed.init_inference(
    model,
    mp_size=world_size,
    base_dir=repo_root,
    dtype=getattr(torch, infer_dtype),
    checkpoint=checkpoints_json,
    **kwargs,
)
model = model.module  # unwrap the underlying HF model from the DeepSpeed engine

def chatbot(question):
    prompt = "You are  an AI chatbot named Bobby. Your job is to answer questions related to cartoon characters. Respond 'not sure' if unsure about answer.\n"
    prompt += "Question:" + " " + question + "\n" + "Answer:" + " "
    num_tokens = 100
    inputs = [prompt]
    generate_kwargs = dict(max_new_tokens=num_tokens, do_sample=False, stopping_criteria=stopping_criteria)
    input_tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True)
    # move the tokenized inputs to this rank's GPU
    for t in input_tokens:
        if torch.is_tensor(input_tokens[t]):
            input_tokens[t] = input_tokens[t].to(torch.cuda.current_device())
    outputs = model.generate(**input_tokens, **generate_kwargs)
    outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    print(outputs)

if __name__ == "__main__":
    while True:
        question = input("You: ")
        if question == "q":
            break
        chatbot(question)

I have not yet applied any post-processing to the output. This works fine if I run it with

deepspeed --num_gpus 1 inference.py

but when I run it with

deepspeed --num_gpus 2 inference.py
nothing seems to happen, and on pressing Enter only memory stats come up. I have to exit with Ctrl+C.
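To see where each rank is actually stuck, attaching py-spy to the two worker processes should dump the blocking stack frames (the PIDs below are placeholders):

py-spy dump --pid <rank0-pid>
py-spy dump --pid <rank1-pid>

My suspicion (unconfirmed) is that under the deepspeed launcher only one rank's stdin works, so the ranks never reach model.generate() together and the NCCL collectives inside it block indefinitely.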

I am using two Tesla V100 GPUs.
deepspeed==0.9.2, torch==1.14.0a0+410ce96, and Python 3.8.10.
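In case the interactive input() is the problem, one workaround I am considering is to read the prompt only on rank 0 and broadcast it, so that every rank calls generate() with the same input. This is an untested sketch; torch.distributed.broadcast_object_list is a real API, but whether this fixes the hang is an assumption on my part:

if __name__ == "__main__":
    # with the NCCL backend this assumes torch.cuda.set_device(local_rank)
    # has been called so the broadcast has a valid device on every rank
    while True:
        payload = [None]
        if rank == 0:
            payload[0] = input("You: ")
        # all ranks block here until rank 0 has read a prompt, then every
        # rank receives the same string and enters generate() together
        dist.broadcast_object_list(payload, src=0)
        question = payload[0]
        if question == "q":
            break
        chatbot(question)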
