Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix creation of AWs/Jobs with same name in different namespaces #652

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/usage/tutorial.md
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ Delete the first `AppWrapper` job.

```bash
$ kubectl delete -f aw-01.yaml
appwrapper.workload.codeflare.dev "0001-aw-generic-deployment-1" deleted
workload.codeflare.dev/appwrapper "0001-aw-generic-deployment-1" deleted
```

Check the pods status of the `AppWrapper` jobs. The new pods from the second `AppWrapper` job: `0002-aw-generic-deployment-2` job should now be deployed and running.
Expand Down
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ require (
k8s.io/client-go v0.26.2
k8s.io/klog/v2 v2.90.1
k8s.io/metrics v0.26.2
k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5
sigs.k8s.io/custom-metrics-apiserver v0.0.0
sigs.k8s.io/structured-merge-diff/v4 v4.2.3
)

replace sigs.k8s.io/custom-metrics-apiserver => sigs.k8s.io/custom-metrics-apiserver v1.25.1-0.20230306170449-63d8c93851f3
Expand Down Expand Up @@ -107,9 +109,7 @@ require (
k8s.io/component-base v0.26.2 // indirect
k8s.io/kms v0.26.2 // indirect
k8s.io/kube-openapi v0.0.0-20230303024457-afdc3dddf62d // indirect
k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5 // indirect
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.0.35 // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect
sigs.k8s.io/yaml v1.3.0 // indirect
)
2 changes: 1 addition & 1 deletion pkg/apis/controller/v1beta1/appwrapper.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ const AppWrapperPlural string = "appwrappers"

// AppWrapperAnnotationKey is the annotation key of Pod to identify
// which AppWrapper it belongs to.
const AppWrapperAnnotationKey = "appwrapper.mcad.ibm.com/appwrapper-name"
const AppWrapperAnnotationKey = "workload.codeflare.dev/appwrapper-name"

// +genclient
// +kubebuilder:object:root=true
Expand Down
4 changes: 2 additions & 2 deletions pkg/controller/queuejob/queuejob_controller_ex.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ func GetQueueJobKey(obj interface{}) (string, error) {
// UpdateQueueJobStatus was part of pod informer, this is now a method of queuejob_controller file.
// This change is done in an effort to simplify the controller and enable to move to controller runtime.
func (qjm *XController) UpdateQueueJobStatus(queuejob *arbv1.AppWrapper) error {
labelSelector := fmt.Sprintf("%s=%s", "appwrapper.mcad.ibm.com", queuejob.Name)
labelSelector := fmt.Sprintf("%s=%s", "workload.codeflare.dev/appwrapper", queuejob.Name)
pods, errt := qjm.clients.CoreV1().Pods("").List(context.TODO(), metav1.ListOptions{LabelSelector: labelSelector})
if errt != nil {
return errt
Expand Down Expand Up @@ -208,7 +208,7 @@ func (qjm *XController) allocatableCapacity() *clusterstateapi.Resource {
klog.Errorf("[allocatableCapacity] Error listing pods %v", err)
}
for _, pod := range podList.Items {
if _, ok := pod.GetLabels()["appwrappers.mcad.ibm.com"]; !ok && pod.Status.Phase != v1.PodFailed && pod.Status.Phase != v1.PodSucceeded {
if _, ok := pod.GetLabels()["workload.codeflare.dev/appwrappers"]; !ok && pod.Status.Phase != v1.PodFailed && pod.Status.Phase != v1.PodSucceeded {
for _, container := range pod.Spec.Containers {
usedResource := clusterstateapi.NewResource(container.Resources.Requests)
capacity.Sub(usedResource)
Expand Down
5 changes: 2 additions & 3 deletions pkg/controller/queuejob/queuejob_controller_ex_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@ import (
"github.com/stretchr/testify/assert"
)

// TestIsJsonSyntaxError function validates that the error is related to JSON parsing of
// generic items
// TestIsJsonSyntaxError function validates that the error is related to JSON parsing of generic items
func TestIsJsonSyntaxError(t *testing.T) {
// Define the test table
var tests = []struct {
Expand Down Expand Up @@ -77,7 +76,7 @@ func TestCanIgnoreAPIError(t *testing.T) {
}
// Execute tests in parallel
for _, tc := range tests {
tc := tc // capture range variable
tc := tc // Capture range variable
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
assert.Equal(t, tc.expectedValue, queuejob.CanIgnoreAPIError(tc.inputErr))
Expand Down
55 changes: 36 additions & 19 deletions pkg/controller/queuejobresources/genericresource/genericresource.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"encoding/json"
"fmt"
"math"
"math/rand"
"runtime/debug"
"strings"
"time"
Expand All @@ -43,8 +44,9 @@ import (
"k8s.io/client-go/restmapper"
)

var appwrapperJobName = "appwrapper.mcad.ibm.com"
var resourceName = "resourceName"
var appwrapperJobLabelName = "workload.codeflare.dev/appwrapper"
var appwrapperJobLabelNamespace = "workload.codeflare.dev/appwrapper-namespace"
var resourceName = "workload.codeflare.dev/resourceName"
var appWrapperKind = arbv1.SchemeGroupVersion.WithKind("AppWrapper")

type GenericResources struct {
Expand Down Expand Up @@ -72,6 +74,18 @@ func join(strs ...string) string {
return result
}


func GetRandomString(n int) string {
var letters = []rune("abcdefghijklmnopqrstuvwxyz0123456789")

rand.Seed(time.Now().UnixNano())
b := make([]rune, n)
for i := range b {
b[i] = letters[rand.Intn(len(letters))]
}
return string(b)
}

func (gr *GenericResources) Cleanup(aw *arbv1.AppWrapper, awr *arbv1.AppWrapperGenericResource) (genericResourceName string, groupversionkind *schema.GroupVersionKind, erro error) {
var err error
err = nil
Expand Down Expand Up @@ -166,7 +180,7 @@ func (gr *GenericResources) Cleanup(aw *arbv1.AppWrapper, awr *arbv1.AppWrapperG
}

// Get the resource to see if it exists in the AppWrapper namespace
labelSelector := fmt.Sprintf("%s=%s, %s=%s", appwrapperJobName, aw.Name, resourceName, unstruct.GetName())
labelSelector := fmt.Sprintf("%s=%s, %s=%s, %s=%s", appwrapperJobLabelName, aw.Name, appwrapperJobLabelNamespace, aw.Namespace, resourceName, unstruct.GetName())
inEtcd, err := dclient.Resource(rsrc).Namespace(aw.Namespace).List(context.Background(), metav1.ListOptions{LabelSelector: labelSelector})
if err != nil {
return name, gvk, err
Expand All @@ -175,8 +189,9 @@ func (gr *GenericResources) Cleanup(aw *arbv1.AppWrapper, awr *arbv1.AppWrapperG
// Check to see if object already exists in etcd, if not, create the object.
if inEtcd != nil || len(inEtcd.Items) > 0 {
newName := name
if len(newName) > 63 {
newName = newName[:63]
if len(newName) > 60 {
newName = newName[:60]
newName += GetRandomString(3)
}

err = deleteObject(namespaced, namespace, newName, rsrc, dclient)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this will almost always fail for length >60 because the string you create is not deterministic. You should use a hash for getting the last 3 digits

Expand All @@ -187,7 +202,7 @@ func (gr *GenericResources) Cleanup(aw *arbv1.AppWrapper, awr *arbv1.AppWrapperG
return name, gvk, err
}
} else {
klog.Warningf("[Cleanup] %s/%s not found using label selector: %s.\n", name, namespace, labelSelector)
klog.Warningf("[Cleanup] %s/%s not found using label selector: %s.\n", namespace, name, labelSelector)
}

return name, gvk, err
Expand Down Expand Up @@ -297,18 +312,19 @@ func (gr *GenericResources) SyncQueueJob(aw *arbv1.AppWrapper, awr *arbv1.AppWra
} else {
labels = unstruct.GetLabels()
}
labels[appwrapperJobName] = aw.Name
labels[appwrapperJobLabelName] = aw.Name
labels[appwrapperJobLabelNamespace] = aw.Namespace
labels[resourceName] = unstruct.GetName()
unstruct.SetLabels(labels)

// Add labels to pod template if one exists.
podTemplateFound := addLabelsToPodTemplateField(&unstruct, labels)
if !podTemplateFound {
klog.V(4).Infof("[SyncQueueJob] No pod template spec exists for resource: %s to add labels.", name)
klog.V(4).Infof("[SyncQueueJob] No pod template spec exists for resource: %s/%s to add labels.", namespace, name)
}

// Get the resource to see if it exists
labelSelector := fmt.Sprintf("%s=%s, %s=%s", appwrapperJobName, aw.Name, resourceName, unstruct.GetName())
// Get the resource to see if it exists
labelSelector := fmt.Sprintf("%s=%s, %s=%s, %s=%s", appwrapperJobLabelName, aw.Name, appwrapperJobLabelNamespace, aw.Namespace, resourceName, unstruct.GetName())
inEtcd, err := dclient.Resource(rsrc).List(context.Background(), metav1.ListOptions{LabelSelector: labelSelector})
if err != nil {
return []*v1.Pod{}, err
Expand All @@ -317,8 +333,9 @@ func (gr *GenericResources) SyncQueueJob(aw *arbv1.AppWrapper, awr *arbv1.AppWra
// Check to see if object already exists in etcd, if not, create the object.
if inEtcd == nil || len(inEtcd.Items) < 1 {
newName := name
if len(newName) > 63 {
newName = newName[:63]
if len(newName) > 60 {
newName = newName[:60]
newName += GetRandomString(3)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here too

}
unstruct.SetName(newName)
//Asumption object is always namespaced
Expand All @@ -329,7 +346,7 @@ func (gr *GenericResources) SyncQueueJob(aw *arbv1.AppWrapper, awr *arbv1.AppWra
if errors.IsAlreadyExists(err) {
klog.V(4).Infof("%v\n", err.Error())
} else {
klog.Errorf("Error creating the object `%v`, the error is `%v`", newName, errors.ReasonForError(err))
klog.Errorf("Error creating the object `%s/%s`, the error is `%v`", namespace, newName, errors.ReasonForError(err))
return []*v1.Pod{}, err
}
}
Expand Down Expand Up @@ -499,7 +516,7 @@ func deleteObject(namespaced bool, namespace string, name string, rsrc schema.Gr
}

if err != nil && !errors.IsNotFound(err) {
klog.Errorf("[deleteObject] Error deleting the object `%v`, the error is `%v`.", name, errors.ReasonForError(err))
klog.Errorf("[deleteObject] Error deleting the object `%v`, in namespace %v, the error is `%v`.", name, namespace, errors.ReasonForError(err))
return err
} else {
klog.V(4).Infof("[deleteObject] Resource `%v` deleted.\n", name)
Expand Down Expand Up @@ -531,7 +548,7 @@ func GetListOfPodResourcesFromOneGenericItem(awr *arbv1.AppWrapperGenericResourc
klog.V(8).Infof("[GetListOfPodResourcesFromOneGenericItem] Requested total allocation resource from 1 pod `%v`.\n", podTotalresource)
}

// Addd individual pods to results
// Add individual pods to results
var replicaCount int = int(replicas)
for i := 0; i < replicaCount; i++ {
podResourcesList = append(podResourcesList, podTotalresource)
Expand Down Expand Up @@ -623,7 +640,7 @@ func getContainerResources(container v1.Container, replicas float64) *clustersta
}

// returns status of an item present in etcd
func (gr *GenericResources) IsItemCompleted(awgr *arbv1.AppWrapperGenericResource, namespace string, appwrapperName string, genericItemName string) (completed bool) {
func (gr *GenericResources) IsItemCompleted(awgr *arbv1.AppWrapperGenericResource, appwrapperNamespace string, appwrapperName string, genericItemName string) (completed bool) {
dd := gr.clients.Discovery()
apigroups, err := restmapper.GetAPIGroupResources(dd)
if err != nil {
Expand Down Expand Up @@ -654,8 +671,8 @@ func (gr *GenericResources) IsItemCompleted(awgr *arbv1.AppWrapperGenericResourc
return false
}

labelSelector := fmt.Sprintf("%s=%s", appwrapperJobName, appwrapperName)
inEtcd, err := dclient.Resource(rsrc).Namespace(namespace).List(context.Background(), metav1.ListOptions{LabelSelector: labelSelector})
labelSelector := fmt.Sprintf("%s=%s, %s=%s", appwrapperJobLabelName, appwrapperName, appwrapperJobLabelNamespace, appwrapperNamespace)
inEtcd, err := dclient.Resource(rsrc).Namespace(appwrapperNamespace).List(context.Background(), metav1.ListOptions{LabelSelector: labelSelector})
if err != nil {
klog.Errorf("[IsItemCompleted] Error listing object: %v", err)
return false
Expand All @@ -675,7 +692,7 @@ func (gr *GenericResources) IsItemCompleted(awgr *arbv1.AppWrapperGenericResourc
}
}
if !validAwOwnerRef {
klog.Warningf("[IsItemCompleted] Item owner name %v does match appwrappper name %v in namespace %v", unstructuredObjectName, appwrapperName, namespace)
klog.Warningf("[IsItemCompleted] Item owner name %v does match appwrappper name %v in namespace %v", unstructuredObjectName, appwrapperName, appwrapperNamespace)
continue
}

Expand Down
4 changes: 2 additions & 2 deletions test/e2e-kuttl-borrowing/steps/03-install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ spec:
name: my-job-1
namespace: test
labels:
appwrapper.mcad.ibm.com: my-job-1
workload.codeflare.dev/appwrapper: my-job-1
spec:
parallelism: 1
completions: 1
Expand All @@ -40,7 +40,7 @@ spec:
name: my-job-1
namespace: test
labels:
appwrapper.mcad.ibm.com: my-job-1
workload.codeflare.dev/appwrapper: my-job-1
spec:
terminationGracePeriodSeconds: 1
restartPolicy: Never
Expand Down
4 changes: 2 additions & 2 deletions test/e2e-kuttl-borrowing/steps/04-install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ spec:
name: my-job-2
namespace: test
labels:
appwrapper.mcad.ibm.com: my-job-2
workload.codeflare.dev/appwrapper: my-job-2
spec:
parallelism: 1
completions: 1
Expand All @@ -40,7 +40,7 @@ spec:
name: my-job-2
namespace: test
labels:
appwrapper.mcad.ibm.com: my-job-2
workload.codeflare.dev/appwrapper: my-job-2
spec:
terminationGracePeriodSeconds: 1
restartPolicy: Never
Expand Down
5 changes: 3 additions & 2 deletions test/e2e-kuttl-deployment-01/steps/01-assert.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@ metadata:
namespace: start-up
labels:
app: no-quota-deployment-01
appwrapper.mcad.ibm.com: no-quota-deployment-01
resourceName: no-quota-deployment-01
workload.codeflare.dev/appwrapper: no-quota-deployment-01
workload.codeflare.dev/appwrapper-namespace: start-up
workload.codeflare.dev/resourceName: no-quota-deployment-01
status:
availableReplicas: 1
observedGeneration: 1
Expand Down
4 changes: 2 additions & 2 deletions test/e2e-kuttl-deployment-01/steps/02-install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ spec:
name: no-quota-job-02
namespace: start-up
labels:
appwrapper.mcad.ibm.com: no-quota-job-02
workload.codeflare.dev/appwrapper: no-quota-job-02
spec:
parallelism: 1
completions: 1
Expand All @@ -36,7 +36,7 @@ spec:
name: no-quota-job-1
namespace: start-up
labels:
appwrapper.mcad.ibm.com: no-quota-job-02
workload.codeflare.dev/appwrapper: no-quota-job-02
spec:
terminationGracePeriodSeconds: 1
restartPolicy: Never
Expand Down
35 changes: 19 additions & 16 deletions test/e2e-kuttl-deployment-01/steps/03-assert.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,28 +12,31 @@ metadata:
name: hold-completion-job-03-01
namespace: start-up
labels:
appwrapper.mcad.ibm.com: hold-completion-job-03
resourceName: hold-completion-job-03-01
workload.codeflare.dev/appwrapper: hold-completion-job-03
workload.codeflare.dev/appwrapper-namespace: start-up
workload.codeflare.dev/resourceName: hold-completion-job-03-01
status:
conditions:
- status: "True"
type: Complete
succeeded: 1
---
apiVersion: v1
kind: Pod
metadata:
apiVersion: v1
kind: Pod
metadata:
namespace: start-up
labels:
appwrapper.mcad.ibm.com: hold-completion-job-03
job-name: hold-completion-job-03-01
resourceName: hold-completion-job-03-01
labels:
workload.codeflare.dev/appwrapper: hold-completion-job-03
workload.codeflare.dev/appwrapper-namespace: start-up
job-name: hold-completion-job-03-01
workload.codeflare.dev/resourceName: hold-completion-job-03-01
---
apiVersion: v1
kind: Pod
metadata:
apiVersion: v1
kind: Pod
metadata:
namespace: start-up
labels:
appwrapper.mcad.ibm.com: hold-completion-job-03
job-name: hold-completion-job-03-02
resourceName: hold-completion-job-03-02
labels:
workload.codeflare.dev/appwrapper: hold-completion-job-03
workload.codeflare.dev/appwrapper-namespace: start-up
job-name: hold-completion-job-03-02
workload.codeflare.dev/resourceName: hold-completion-job-03-02
8 changes: 4 additions & 4 deletions test/e2e-kuttl-deployment-01/steps/03-install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ spec:
name: hold-completion-job-03-01
namespace: start-up
labels:
appwrapper.mcad.ibm.com: hold-completion-job-03
workload.codeflare.dev/appwrapper: hold-completion-job-03
spec:
parallelism: 1
completions: 1
Expand All @@ -40,7 +40,7 @@ spec:
name: hold-completion-job-03-01
namespace: start-up
labels:
appwrapper.mcad.ibm.com: hold-completion-job-03
workload.codeflare.dev/appwrapper: hold-completion-job-03
spec:
terminationGracePeriodSeconds: 1
restartPolicy: Never
Expand Down Expand Up @@ -80,7 +80,7 @@ spec:
name: hold-completion-job-03-02
namespace: start-up
labels:
appwrapper.mcad.ibm.com: hold-completion-job-03
workload.codeflare.dev/appwrapper: hold-completion-job-03
spec:
parallelism: 1
completions: 1
Expand All @@ -89,7 +89,7 @@ spec:
name: hold-completion-job-03-02
namespace: start-up
labels:
appwrapper.mcad.ibm.com: hold-completion-job-03
workload.codeflare.dev/appwrapper: hold-completion-job-03
spec:
terminationGracePeriodSeconds: 1
restartPolicy: Never
Expand Down
Loading
Loading