"""Hyperparameters for LoRA fine-tuning of StarCoderBase-1B on a code dataset.

All values are module-level constants consumed by the training script.
"""

MODEL = "bigcode/starcoderbase-1b"
# Model checkpoint on the Hugging Face Hub.

DATASET = "smangrul/hf-stack-v1"
# Dataset on the Hugging Face Hub.

DATA_COLUMN = "content"
# Column name containing the code content.

SEQ_LENGTH = 2048
# Maximum sequence length for input data.

## Training arguments
MAX_STEPS = 2000
# Maximum number of training steps.

BATCH_SIZE = 16
# Number of samples per batch.

GR_ACC_STEPS = 1
# Steps to accumulate gradients before updating.

LR = 5e-4
# Learning rate for training.

LR_SCHEDULER_TYPE = "cosine"
# Learning-rate scheduler. "cosine" decays the LR along a cosine curve:
# large updates early, progressively smaller updates as training proceeds.
# Alternatives include "linear" and "exponential" decay.

WEIGHT_DECAY = 0.01
# L2-style regularization to prevent overfitting.

NUM_WARMUP_STEPS = 30
# Steps for learning rate warmup.

EVAL_FREQ = 100
# Frequency (in steps) of evaluation during training.

SAVE_FREQ = 100
# Frequency (in steps) of saving model checkpoints.

LOG_FREQ = 25
# Frequency (in steps) of logging training metrics.

OUTPUT_DIR = "peft-starcoder-lora-a100"
# Directory for output files.

BF16 = True
# Train in bfloat16: same exponent range as float32 with reduced mantissa
# precision, cutting memory use and speeding up training on hardware that
# supports it (e.g. A100 GPUs, TPUs) with little accuracy cost.

FP16 = False
# Do not use float16 precision (mutually exclusive with BF16).

# FIM (Fill-in-the-Middle) transformation arguments
FIM_RATE = 0.5
# Probability of applying a Fill-in-the-Middle transformation to a sample.

FIM_SPM_RATE = 0.5
# Given a FIM transformation is applied, probability of using SPM
# (Suffix-Prefix-Middle) token ordering instead of PSM
# (Prefix-Suffix-Middle). NOTE: SPM here is the FIM ordering mode,
# not SentencePiece.

# LoRA
LORA_R = 8
# Rank of the LoRA update matrices. Higher rank = more expressive
# adaptation but more trainable parameters; lower rank = cheaper but
# potentially less expressive. A capacity/efficiency trade-off.

LORA_ALPHA = 32
# LoRA scaling factor: controls the magnitude of the low-rank updates
# relative to the frozen base weights (effective scale is alpha / r).
# Higher values amplify the adaptation; lower values favor stability.

LORA_DROPOUT = 0.0
# Dropout rate applied to the LoRA layers during training.
# 0.0 disables dropout; a nonzero rate regularizes the adaptation.

LORA_TARGET_MODULES = "c_proj,c_attn,q_attn,c_fc,c_proj"
# Comma-separated module names to which LoRA is applied. PEFT matches
# these by name suffix, so a single "c_proj" entry covers both the
# attention output projection and the MLP projection layers:
# - `c_attn`: fused QKV projection in the attention block.
# - `q_attn`: query projection of the attention mechanism.
# - `c_proj`: output projection (attention block and MLP block).
# - `c_fc`:   feed-forward expansion layer in the transformer block.

# bitsandbytes config
USE_NESTED_QUANT = True
# Enable nested (double) quantization: the quantization constants are
# themselves quantized, further reducing the 4-bit model's memory
# footprint so larger models fit on constrained hardware.

BNB_4BIT_COMPUTE_DTYPE = "bfloat16"
# Data type used for computation when applying 4-bit quantization.
# Weights are stored in 4 bits to shrink memory, while matmuls are
# performed in bfloat16 to balance precision and efficiency.

SEED = 0
# Random seed for reproducibility.