Parameters
This commit is contained in:
parent
3880c8a0b4
commit
7f6ed97c8e
125
ModelParameter.py
Normal file
125
ModelParameter.py
Normal file
"""Hyperparameter configuration for LoRA fine-tuning of StarCoder.

All values are module-level constants consumed by the training script:
model/dataset selection, training-loop arguments, Fill-in-the-Middle (FIM)
data-augmentation rates, LoRA adapter settings, and bitsandbytes
quantization options.
"""

# --- Model and data ---------------------------------------------------------

MODEL = "bigcode/starcoderbase-1b"  # Model checkpoint on the Hugging Face Hub.
DATASET = "smangrul/hf-stack-v1"    # Dataset on the Hugging Face Hub.
DATA_COLUMN = "content"             # Column name containing the code content.
SEQ_LENGTH = 2048                   # Maximum sequence length for input data.

# --- Training arguments -----------------------------------------------------

MAX_STEPS = 2000      # Maximum number of training steps.
BATCH_SIZE = 16       # Number of samples per batch.
GR_ACC_STEPS = 1      # Steps to accumulate gradients before updating.
LR = 5e-4             # Learning rate for training.

# Type of learning rate scheduler used during training. The "cosine"
# scheduler decays the learning rate along a cosine curve: large updates
# early in training, progressively smaller updates as training converges.
# Other common choices include "linear" and "exponential" decay.
LR_SCHEDULER_TYPE = "cosine"

WEIGHT_DECAY = 0.01      # Regularization to prevent overfitting.
NUM_WARMUP_STEPS = 30    # Steps for learning rate warmup.
EVAL_FREQ = 100          # Frequency of evaluation during training.
SAVE_FREQ = 100          # Frequency of saving model checkpoints.
LOG_FREQ = 25            # Frequency of logging training metrics.
OUTPUT_DIR = "peft-starcoder-lora-a100"  # Directory for output files.

# Mixed-precision settings. bfloat16 keeps float32's exponent range at
# half the storage, trading mantissa precision for speed and memory on
# hardware that supports it (recent GPUs/TPUs). Exactly one of BF16/FP16
# should be enabled.
BF16 = True    # Use bfloat16 (Brain Floating Point) precision.
FP16 = False   # Do not use float16 precision.

# --- FIM (Fill-in-the-Middle) transformation arguments ----------------------

FIM_RATE = 0.5      # Fraction of samples that receive a FIM transformation.
# Among FIM-transformed samples, fraction that use SPM (Suffix-Prefix-Middle)
# token ordering instead of PSM (Prefix-Suffix-Middle).
FIM_SPM_RATE = 0.5

# --- LoRA -------------------------------------------------------------------

# Rank of the Low-Rank Adaptation (LoRA) matrices. A higher rank gives the
# adapter more expressive power at the cost of more trainable parameters;
# a lower rank is cheaper but less expressive.
LORA_R = 8

# Scaling factor applied to the LoRA update (effective scale is
# LORA_ALPHA / LORA_R). A higher value amplifies the adapter's effect on
# the frozen base weights; a lower value favors fine-tuning stability.
LORA_ALPHA = 32

# Dropout rate applied to the LoRA matrices during training. 0.0 disables
# dropout; a positive rate regularizes the adapter and can improve
# generalization.
LORA_DROPOUT = 0.0

# Comma-separated module names to which LoRA is applied. For the StarCoder
# (GPT-2-style) architecture:
# - `c_attn`: fused QKV projection in the attention block.
# - `q_attn`: query projection (used with multi-query attention).
# - `c_proj`: output projection (present in both attention and MLP blocks;
#   one entry matches every module with this name).
# - `c_fc`: first feed-forward layer in the transformer block.
# NOTE: the original value listed `c_proj` twice; matching is by module
# name, so the duplicate was redundant and has been removed.
LORA_TARGET_MODULES = "c_proj,c_attn,q_attn,c_fc"

# --- bitsandbytes config ----------------------------------------------------

# Enable nested (double) quantization: the quantization constants are
# themselves quantized, further reducing the memory footprint of 4-bit
# weights with negligible quality impact.
USE_NESTED_QUANT = True

# Data type used for computation when applying 4-bit quantization: weights
# are stored in 4 bits but dequantized to this dtype for matmuls,
# balancing precision and efficiency.
BNB_4BIT_COMPUTE_DTYPE = "bfloat16"

SEED = 0  # Random seed for reproducibility.
Loading…
x
Reference in New Issue
Block a user