Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

use localNode decorator for DCGM and Neuron metrics for host metadata #244

Merged
merged 2 commits into from
Oct 24, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,8 @@ func getMetricRelabelConfig(hostInfoProvider hostInfoProvider) []*relabel.Config
Replacement: "${1}",
Action: relabel.Replace,
},
// hacky way to inject static values (clusterName & instanceId) to label set without additional processor
// relabel looks up an existing label then creates another label with given key (TargetLabel) and value (static)
// hacky way to inject static values (clusterName/instanceId/instanceType)
jefchien marked this conversation as resolved.
Show resolved Hide resolved
// could be removed since these labels are now set by localNode decorator
{
SourceLabels: model.LabelNames{"namespace"},
TargetLabel: ci.ClusterNameKey,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,24 +68,17 @@ func GetNeuronMetricRelabelConfigs(hostinfo prometheusscraper.HostInfoProvider)
Action: relabel.Keep,
},
{
SourceLabels: model.LabelNames{"instance_id"},
TargetLabel: ci.InstanceID,
Regex: relabel.MustNewRegexp("(.*)"),
Replacement: hostinfo.GetInstanceID(),
Action: relabel.Replace,
},
{
SourceLabels: model.LabelNames{"instance_type"},
TargetLabel: ci.InstanceType,
SourceLabels: model.LabelNames{"neuroncore"},
TargetLabel: "NeuronCore",
Regex: relabel.MustNewRegexp("(.*)"),
Replacement: hostinfo.GetInstanceType(),
Replacement: "${1}",
Action: relabel.Replace,
},
{
SourceLabels: model.LabelNames{"neuroncore"},
TargetLabel: "NeuronCore",
SourceLabels: model.LabelNames{"instance_id"},
TargetLabel: ci.NodeNameKey,
Regex: relabel.MustNewRegexp("(.*)"),
Replacement: "${1}",
Replacement: os.Getenv("HOST_NAME"),
Action: relabel.Replace,
},
{
Expand All @@ -96,7 +89,7 @@ func GetNeuronMetricRelabelConfigs(hostinfo prometheusscraper.HostInfoProvider)
Action: relabel.Replace,
},
// hacky way to inject static values (clusterName) to label set without additional processor
// relabel looks up an existing label then creates another label with given key (TargetLabel) and value (static)
// could be removed since these labels are now set by localNode decorator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we just remove them? They get overwritten in the localnode decorator like you said, so there isn't much use to them.

{
SourceLabels: model.LabelNames{"instance_id"},
TargetLabel: ci.ClusterNameKey,
Expand All @@ -106,9 +99,16 @@ func GetNeuronMetricRelabelConfigs(hostinfo prometheusscraper.HostInfoProvider)
},
{
SourceLabels: model.LabelNames{"instance_id"},
TargetLabel: ci.NodeNameKey,
TargetLabel: ci.InstanceID,
Regex: relabel.MustNewRegexp("(.*)"),
Replacement: os.Getenv("HOST_NAME"),
Replacement: hostinfo.GetInstanceID(),
Action: relabel.Replace,
},
{
SourceLabels: model.LabelNames{"instance_type"},
TargetLabel: ci.InstanceType,
Regex: relabel.MustNewRegexp("(.*)"),
Replacement: hostinfo.GetInstanceType(),
Action: relabel.Replace,
},
}
Expand Down
16 changes: 8 additions & 8 deletions receiver/awscontainerinsightreceiver/receiver.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,15 +190,15 @@ func (acir *awsContainerInsightReceiver) initEKS(ctx context.Context, host compo
}
}

err = acir.initDcgmScraper(ctx, host, hostInfo, k8sDecorator)
err = acir.initDcgmScraper(ctx, host, hostInfo, localNodeDecorator)
if err != nil {
acir.settings.Logger.Debug("Unable to start dcgm scraper", zap.Error(err))
}
err = acir.initPodResourcesStore()
if err != nil {
acir.settings.Logger.Debug("Unable to start pod resources store", zap.Error(err))
}
err = acir.initNeuronScraper(ctx, host, hostInfo, k8sDecorator)
err = acir.initNeuronScraper(ctx, host, hostInfo, localNodeDecorator)
if err != nil {
acir.settings.Logger.Debug("Unable to start neuron scraper", zap.Error(err))
}
Expand Down Expand Up @@ -286,7 +286,7 @@ func (acir *awsContainerInsightReceiver) initPrometheusScraper(ctx context.Conte
})
return err
}
func (acir *awsContainerInsightReceiver) initDcgmScraper(ctx context.Context, host component.Host, hostInfo *hostinfo.Info, decorator *stores.K8sDecorator) error {
func (acir *awsContainerInsightReceiver) initDcgmScraper(ctx context.Context, host component.Host, hostInfo *hostinfo.Info, localNodeDecorator stores.Decorator) error {
if !acir.config.EnableAcceleratedComputeMetrics {
return nil
}
Expand All @@ -296,7 +296,7 @@ func (acir *awsContainerInsightReceiver) initDcgmScraper(ctx context.Context, ho
NextConsumer: acir.nextConsumer,
MetricType: ci.TypeContainerGPU,
MetricToUnitMap: gpu.MetricToUnit,
K8sDecorator: decorator,
K8sDecorator: localNodeDecorator,
Logger: acir.settings.Logger,
}

Expand All @@ -321,7 +321,7 @@ func (acir *awsContainerInsightReceiver) initPodResourcesStore() error {
return err
}

func (acir *awsContainerInsightReceiver) initNeuronScraper(ctx context.Context, host component.Host, hostInfo *hostinfo.Info, decorator *stores.K8sDecorator) error {
func (acir *awsContainerInsightReceiver) initNeuronScraper(ctx context.Context, host component.Host, hostInfo *hostinfo.Info, localNodeDecorator stores.Decorator) error {
if !acir.config.EnableAcceleratedComputeMetrics {
return nil
}
Expand All @@ -331,7 +331,7 @@ func (acir *awsContainerInsightReceiver) initNeuronScraper(ctx context.Context,
ContainerOrchestrator: ci.EKS,
NextConsumer: acir.nextConsumer,
MetricType: ci.TypeContainerNeuron,
K8sDecorator: decorator,
K8sDecorator: localNodeDecorator,
Logger: acir.settings.Logger,
}

Expand Down Expand Up @@ -368,15 +368,15 @@ func (acir *awsContainerInsightReceiver) initNeuronScraper(ctx context.Context,
return err
}

func (acir *awsContainerInsightReceiver) initEfaSysfsScraper(localnodeDecorator stores.Decorator) error {
func (acir *awsContainerInsightReceiver) initEfaSysfsScraper(localNodeDecorator stores.Decorator) error {
if !acir.config.EnableAcceleratedComputeMetrics {
return nil
}

if acir.podResourcesStore == nil {
return errors.New("pod resources store was not initialized")
}
acir.efaSysfsScraper = efa.NewEfaSyfsScraper(acir.settings.Logger, localnodeDecorator, acir.podResourcesStore)
acir.efaSysfsScraper = efa.NewEfaSyfsScraper(acir.settings.Logger, localNodeDecorator, acir.podResourcesStore)
return nil
}

Expand Down
Loading