Parameters
This commit is contained in:
parent
3880c8a0b4
commit
7f6ed97c8e
125
ModelParameter.py
Normal file
125
ModelParameter.py
Normal file
"""Hyperparameter configuration for LoRA fine-tuning of StarCoder.

All values are module-level constants consumed by the training script:
model/dataset selection, training-loop arguments, Fill-in-the-Middle (FIM)
data-augmentation rates, LoRA adapter settings, and bitsandbytes
quantization options.
"""

# --- Model and data ---------------------------------------------------------

MODEL = "bigcode/starcoderbase-1b"  # Model checkpoint on the Hugging Face Hub.
DATASET = "smangrul/hf-stack-v1"    # Dataset on the Hugging Face Hub.
DATA_COLUMN = "content"             # Column name containing the code content.
SEQ_LENGTH = 2048                   # Maximum sequence length for input data.

# --- Training arguments -----------------------------------------------------

MAX_STEPS = 2000      # Maximum number of training steps.
BATCH_SIZE = 16       # Number of samples per batch.
GR_ACC_STEPS = 1      # Steps to accumulate gradients before updating.
LR = 5e-4             # Learning rate for training.

# Type of learning rate scheduler used during training. The "cosine"
# scheduler decays the learning rate along a cosine curve: large updates
# early in training, progressively smaller updates as training converges.
# Other common choices include "linear" and "exponential" decay.
LR_SCHEDULER_TYPE = "cosine"

WEIGHT_DECAY = 0.01      # Regularization to prevent overfitting.
NUM_WARMUP_STEPS = 30    # Steps for learning rate warmup.
EVAL_FREQ = 100          # Frequency of evaluation during training.
SAVE_FREQ = 100          # Frequency of saving model checkpoints.
LOG_FREQ = 25            # Frequency of logging training metrics.
OUTPUT_DIR = "peft-starcoder-lora-a100"  # Directory for output files.

# Mixed-precision settings. bfloat16 keeps float32's exponent range at
# half the storage, trading mantissa precision for speed and memory on
# hardware that supports it (recent GPUs/TPUs). Exactly one of BF16/FP16
# should be enabled.
BF16 = True    # Use bfloat16 (Brain Floating Point) precision.
FP16 = False   # Do not use float16 precision.

# --- FIM (Fill-in-the-Middle) transformation arguments ----------------------

FIM_RATE = 0.5      # Fraction of samples that receive a FIM transformation.
# Among FIM-transformed samples, fraction that use SPM (Suffix-Prefix-Middle)
# token ordering instead of PSM (Prefix-Suffix-Middle).
FIM_SPM_RATE = 0.5

# --- LoRA -------------------------------------------------------------------

# Rank of the Low-Rank Adaptation (LoRA) matrices. A higher rank gives the
# adapter more expressive power at the cost of more trainable parameters;
# a lower rank is cheaper but less expressive.
LORA_R = 8

# Scaling factor applied to the LoRA update (effective scale is
# LORA_ALPHA / LORA_R). A higher value amplifies the adapter's effect on
# the frozen base weights; a lower value favors fine-tuning stability.
LORA_ALPHA = 32

# Dropout rate applied to the LoRA matrices during training. 0.0 disables
# dropout; a positive rate regularizes the adapter and can improve
# generalization.
LORA_DROPOUT = 0.0

# Comma-separated module names to which LoRA is applied. For the StarCoder
# (GPT-2-style) architecture:
# - `c_attn`: fused QKV projection in the attention block.
# - `q_attn`: query projection (used with multi-query attention).
# - `c_proj`: output projection (present in both attention and MLP blocks;
#   one entry matches every module with this name).
# - `c_fc`: first feed-forward layer in the transformer block.
# NOTE: the original value listed `c_proj` twice; matching is by module
# name, so the duplicate was redundant and has been removed.
LORA_TARGET_MODULES = "c_proj,c_attn,q_attn,c_fc"

# --- bitsandbytes config ----------------------------------------------------

# Enable nested (double) quantization: the quantization constants are
# themselves quantized, further reducing the memory footprint of 4-bit
# weights with negligible quality impact.
USE_NESTED_QUANT = True

# Data type used for computation when applying 4-bit quantization: weights
# are stored in 4 bits but dequantized to this dtype for matmuls,
# balancing precision and efficiency.
BNB_4BIT_COMPUTE_DTYPE = "bfloat16"

SEED = 0  # Random seed for reproducibility.
Loading…
x
Reference in New Issue
Block a user