#! /bin/bash
#
# Environment setup steps performed below:
#   * resolve dataset, tokenizer and checkpoint paths for the selected mode
#   * build helpers.so under megatron/core/datasets when it is missing
#   * symlink the preprocessed openwebtext-llama dataset into ./dataset
#   * create the ./logs directory
#
# Arguments:
#   $1 - optional mode, ONLINE or OFFLINE; ONLINE is used when omitted.

# Abort as soon as any command exits with a non-zero status.
set -e

#######################################
# Print the value part of matching lines from a key=value file.
# Every line of the file that matches the pattern is selected, and the text
# following the first '=' on that line is written to stdout (one value per
# matching line).
# Arguments:
#   $1 - grep pattern (basic regular expression) selecting the line(s)
#   $2 - path to the properties file
# Outputs:
#   The selected value(s) on stdout; nothing when no line matches.
# Returns:
#   The pipeline's exit status, i.e. that of cut unless pipefail is enabled.
#######################################
get_prop() {
    local pattern=$1
    local props_file=$2

    # Both arguments are quoted so a file path containing whitespace stays a
    # single word; "--" keeps a pattern starting with '-' from being read as
    # an option. "-f2-" keeps values that themselves contain '=' intact.
    grep -- "${pattern}" "${props_file}" | cut -d'=' -f2-
}

# Absolute directory containing this script; local paths are anchored to it.
# "&&" stops pwd from running (and reporting the wrong directory) when cd
# fails, and the quoting keeps script paths with spaces in one piece.
CUR_DIR=$(cd -- "$(dirname -- "$0")" && pwd)

# Run mode: first positional argument, ONLINE when it is not supplied.
MODE=${1-"ONLINE"}

# Root of the shared dataset tree for the selected mode. An unrecognised mode
# is rejected here, because an empty root would silently turn every path
# below into one rooted at "/".
case "$MODE" in
    ONLINE)
        DATASET_PATH=/workspace/dataset/favorite/soft-data-platform/v1
        ;;
    OFFLINE)
        DATASET_PATH=/data/platform
        ;;
    *)
        echo "unsupported MODE '$MODE' (expected ONLINE or OFFLINE)" >&2
        exit 1
        ;;
esac

# Tokenizer assets, the dataset prefix and the checkpoint location.
GPT2_VOCAB_PATH="$DATASET_PATH/datasets/glm/GPT/vocab.json"
GPT2_MERGES_PATH="$DATASET_PATH/datasets/glm/GPT/merges.txt"
LLAMA_TOKENIZER_MODEL="$DATASET_PATH/tokenizer/llama2/tokenizer.model"
# Prefix of the locally linked dataset files (without the .bin/.idx suffix).
DATA_PATH="$CUR_DIR/dataset/openwebtxt/openwebtext-llama_text_document"
CHECKPOINT_PATH="$DATASET_PATH/models/demo_tests"

# Build the dataset helpers on first use: when no file named helpers.so is
# present in the megatron datasets directory, run make inside that directory.
HELPERS_DIR="$CUR_DIR/../../megatron/core/datasets"
if [ ! -f "$HELPERS_DIR/helpers.so" ]; then
    # Quoted so a checkout path containing spaces is passed as one argument.
    pushd "$HELPERS_DIR"
        make
    popd
else
    # Message names the file that was actually tested for.
    echo "helpers.so exists"
fi


# Link the preprocessed openwebtext-llama dataset into ./dataset on first run.
# Only the existence of the dataset directory is tested, so an already
# existing directory is left untouched.
if [ ! -d "$CUR_DIR/dataset" ]; then
    mkdir -p "$CUR_DIR/dataset/openwebtxt"
    # All paths are quoted so directories containing spaces are not split
    # into several arguments.
    pushd "$CUR_DIR/dataset/openwebtxt"
    # Create (or replace, -f) links to the .bin/.idx pair in the current
    # directory.
    ln -sf \
        "$DATASET_PATH/datasets/preprocessed/openwebtext-llama/openwebtext-llama_text_document.bin" \
        "$DATASET_PATH/datasets/preprocessed/openwebtext-llama/openwebtext-llama_text_document.idx" \
        .
    popd
else
    echo "dataset exists"
fi

# Make sure the log directory exists. mkdir -p succeeds without changes when
# the directory is already there, so no separate existence test is needed;
# the path is quoted so a script location with spaces stays one argument.
mkdir -p "$CUR_DIR/logs"
