#! /bin/bash
#
# Environment setup: resolves data/tokenizer paths, builds the dataset helper
# library if needed, links the preprocessed dataset and creates a logs dir.
#
# Usage: <this script> [MODE] [DATASET_TYPE]
#   MODE          ONLINE (default) or OFFLINE are the values checked below.
#   DATASET_TYPE  LLAMA2-OPENWEBTEXT (default) or LLAMA3-OPENWEBTEXT.
set -e

# Absolute directory of $0. Quoted and guarded with `--` so paths containing
# whitespace or a leading dash are handled; `&&` makes a failed `cd` abort
# (via set -e) instead of silently yielding the caller's working directory.
CUR_DIR=$(cd -- "$(dirname -- "$0")" && pwd)


# `${N-default}` applies the default only when the argument is unset; an
# explicitly empty argument is kept as-is.
MODE=${1-"ONLINE"}
DATASET_TYPE=${2-"LLAMA2-OPENWEBTEXT"}

#######################################
# Build the dataset helper library with `make` unless helpers.so is already
# present in the megatron/core/datasets directory two levels above CUR_DIR.
# Globals:
#   CUR_DIR      (read)
#   HELPERS_DIR  (written; intentionally left global, as in the original)
# Outputs:
#   pushd/popd directory stack and make output, or a notice when the
#   library already exists.
# Returns:
#   0 on success; non-zero if the directory cannot be entered or make fails.
#######################################
make_helpers() {
    HELPERS_DIR=$CUR_DIR/../../megatron/core/datasets

    if [ -f "$HELPERS_DIR/helpers.so" ]; then
        echo "helpers.so exists"
        return 0
    fi

    # Never run make in the wrong directory: bail out if pushd fails.
    pushd "$HELPERS_DIR" || return 1
    local build_status=0
    make || build_status=$?
    # Always restore the caller's working directory before reporting status.
    popd || return 1
    return "$build_status"
}

#######################################
# Symlink the preprocessed OpenWebText .bin/.idx pair selected by
# DATASET_TYPE into $CUR_DIR/dataset/<dataset-name>/, creating that
# directory if needed. Existing links are replaced (ln -sf).
# Globals:
#   DATASET_TYPE   (read) LLAMA2-OPENWEBTEXT or LLAMA3-OPENWEBTEXT
#   CUR_DIR        (read) base directory for the dataset/ tree
#   PLATFORM_PATH  (read) root holding datasets/preprocessed/...
# Outputs:
#   "Invalid dataset type" on stdout for any other DATASET_TYPE.
# Returns:
#   0 on success and also for an unrecognized type (non-fatal, as before);
#   non-zero if mkdir or ln fails.
#######################################
make_dataset() {
    local link_dir source_prefix

    # `case` on the quoted value is safe for empty or multi-word input,
    # which made the unquoted `[ $DATASET_TYPE = ... ]` tests error out.
    case "$DATASET_TYPE" in
        LLAMA2-OPENWEBTEXT)
            link_dir=$CUR_DIR/dataset/llama2-openwebtext
            source_prefix=$PLATFORM_PATH/datasets/preprocessed/openwebtext-llama/openwebtext-llama_text_document
            ;;
        LLAMA3-OPENWEBTEXT)
            link_dir=$CUR_DIR/dataset/llama3-openwebtext
            # NOTE(review): "openwebtxt" (no 'e') is kept exactly as in the
            # original; presumably it matches the on-disk file name - confirm.
            source_prefix=$PLATFORM_PATH/datasets/preprocessed/openwebtext-llama3/openwebtxt-llama3_text_document
            ;;
        *)
            echo "Invalid dataset type"
            return 0
            ;;
    esac

    # mkdir -p is already a no-op for an existing directory.
    mkdir -p -- "$link_dir"
    # Link both files straight into the target directory; no cwd change needed.
    ln -sf -- "$source_prefix.bin" "$source_prefix.idx" "$link_dir/"
}

#######################################
# Ensure the logs directory under CUR_DIR exists.
# Globals:
#   CUR_DIR (read)
# Returns:
#   0 if the directory exists or was created; mkdir's status otherwise.
#######################################
make_logs_dir() {
    # mkdir -p succeeds when the directory is already there, so no -d guard
    # is needed; quoting keeps paths with whitespace intact.
    mkdir -p -- "$CUR_DIR/logs"
}

# Data roots for the selected MODE. Both stay empty for any other value,
# which roots every derived path below at "/"; that fallback is preserved
# but is now reported on stderr instead of passing silently.
PLATFORM_PATH=""
AE_PATH=""
case "$MODE" in
    ONLINE)
        PLATFORM_PATH=/workspace/dataset/favorite/soft-data-platform/v1
        AE_PATH=/workspace/dataset/favorite/soft-data-ae/v1
        ;;
    OFFLINE)
        PLATFORM_PATH=/data/platform
        AE_PATH=/data/AE
        ;;
    *)
        echo "Warning: unrecognized MODE '$MODE' (expected ONLINE or OFFLINE); PLATFORM_PATH and AE_PATH are empty" >&2
        ;;
esac

# Tokenizer, dataset and checkpoint locations derived from the roots above.
# None of these are read in this file; presumably they are consumed by
# scripts that source it - confirm.
GPT2_VOCAB_PATH="$PLATFORM_PATH/tokenizer/gpt2/gpt2-vocab.json"
GPT2_MERGES_PATH="$PLATFORM_PATH/tokenizer/gpt2/gpt2-merges.txt"
LLAMA2_TOKENIZER_MODEL="$PLATFORM_PATH/tokenizer/llama2/tokenizer.model"
LLAMA3_TOKENIZER_PATH="$AE_PATH/llm/models/Meta-Llama-3-8B"
# Dataset prefixes (no .bin/.idx extension) under the local dataset/ tree.
LLAMA2_OPENWEBTEXT_DATASET_PATH="$CUR_DIR/dataset/llama2-openwebtext/openwebtext-llama_text_document"
LLAMA3_OPENWEBTEXT_DATASET_PATH="$CUR_DIR/dataset/llama3-openwebtext/openwebtxt-llama3_text_document"
CHECKPOINT_PATH="$PLATFORM_PATH/models/demo_tests"

# Run the setup steps: build the helper library if it is missing, link the
# selected dataset, then make sure the logs directory exists. Under `set -e`
# a failing step aborts the script.
make_helpers
make_dataset
make_logs_dir
