From 3eeadecc8f91b6565bdd97db619a866f52d891be Mon Sep 17 00:00:00 2001 From: Hollie Hutchinson Date: Mon, 3 Nov 2025 11:53:59 +0000 Subject: [PATCH 1/5] Initial skc baremetal commit --- .../stackhpc-baremetal/inventory/groups | 5 + .../stackhpc-baremetal/inventory/hosts | 0 .../stackhpc-baremetal/ironic.yml | 135 ++++++++++++++++++ .../kolla/ironic/ironic.conf | 74 ++++++++++ 4 files changed, 214 insertions(+) create mode 100644 etc/kayobe/environments/stackhpc-baremetal/inventory/groups create mode 100644 etc/kayobe/environments/stackhpc-baremetal/inventory/hosts create mode 100644 etc/kayobe/environments/stackhpc-baremetal/ironic.yml create mode 100644 etc/kayobe/environments/stackhpc-baremetal/kolla/ironic/ironic.conf diff --git a/etc/kayobe/environments/stackhpc-baremetal/inventory/groups b/etc/kayobe/environments/stackhpc-baremetal/inventory/groups new file mode 100644 index 0000000000..6235ad2ced --- /dev/null +++ b/etc/kayobe/environments/stackhpc-baremetal/inventory/groups @@ -0,0 +1,5 @@ +[baremetal-overcloud] + +[baremetal:children] +baremetal-compute +baremetal-overcloud diff --git a/etc/kayobe/environments/stackhpc-baremetal/inventory/hosts b/etc/kayobe/environments/stackhpc-baremetal/inventory/hosts new file mode 100644 index 0000000000..e69de29bb2 diff --git a/etc/kayobe/environments/stackhpc-baremetal/ironic.yml b/etc/kayobe/environments/stackhpc-baremetal/ironic.yml new file mode 100644 index 0000000000..60d830bb1f --- /dev/null +++ b/etc/kayobe/environments/stackhpc-baremetal/ironic.yml @@ -0,0 +1,135 @@ +--- +############################################################################### +# Ironic configuration. + +# Specify the list of hardware types to load during service initialization. +kolla_ironic_enabled_hardware_types: + - redfish + +# Specify the list of bios interfaces to load during service initialization. 
+#kolla_ironic_enabled_bios_interfaces: + +# Default bios interface to be used for nodes that do not have bios_interface +# field set. +#kolla_ironic_default_bios_interface: + +# Specify the list of boot interfaces to load during service initialization. +kolla_ironic_enabled_boot_interfaces: + - redfish-virtual-media + - redfish-https + +# Default boot interface to be used for nodes that do not have boot_interface +# field set. +kolla_ironic_default_boot_interface: redfish-virtual-media + +# Specify the list of console interfaces to load during service initialization. +#kolla_ironic_enabled_console_interfaces: + +# Default console interface to be used for nodes that do not have +# console_interface field set. +#kolla_ironic_default_console_interface: + +# Specify the list of deploy interfaces to load during service initialization. +#kolla_ironic_enabled_deploy_interfaces: + +# Default deploy interface to be used for nodes that do not have +# deploy_interface field set. +#kolla_ironic_default_deploy_interface: + +# Specify the list of inspect interfaces to load during service initialization. +kolla_ironic_enabled_inspect_interfaces: + - redfish + - agent + +# Default inspect interface to be used for nodes that do not have +# inspect_interface field set. +kolla_ironic_default_inspect_interface: redfish + +# Specify the list of management interfaces to load during service +# initialization. +kolla_ironic_enabled_management_interfaces: + - redfish + +# Default management interface to be used for nodes that do not have +# management_interface field set. +#kolla_ironic_default_management_interface: + +# Specify the list of network interfaces to load during service initialization. +kolla_ironic_enabled_network_interfaces: + - neutron + - flat + - noop + +# Default network interface to be used for nodes that do not have +# network_interface field set. 
+kolla_ironic_default_network_interface: neutron + +# Specify the list of power interfaces to load during service initialization. +kolla_ironic_enabled_power_interfaces: + - redfish + +# Default power interface to be used for nodes that do not have power_interface +# field set. +#kolla_ironic_default_power_interface: + +# Specify the list of raid interfaces to load during service initialization. +#kolla_ironic_enabled_raid_interfaces: + +# Default raid interface to be used for nodes that do not have +# raid_interface field set. +kolla_ironic_default_raid_interface: agent + +# Specify the list of rescue interfaces to load during service initialization. +#kolla_ironic_enabled_rescue_interfaces: + +# Default rescue interface to be used for nodes that do not have +# rescue_interface field set. +#kolla_ironic_default_rescue_interface: + +# Specify the list of storage interfaces to load during +# service initialization. +#kolla_ironic_enabled_storage_interfaces: + +# Default storage interface to be used for nodes that do not +# have storage_interface field set. +#kolla_ironic_default_storage_interface: + +# Specify the list of vendor interfaces to load during service initialization. +#kolla_ironic_enabled_vendor_interfaces: + +# Default vendor interface to be used for nodes that do not have +# vendor_interface field set. +#kolla_ironic_default_vendor_interface: + +# Name of the Neutron network to use for cleaning. +#kolla_ironic_cleaning_network: + +# Name of the Neutron network to use for provisioning. +#kolla_ironic_provisioning_network: + +# List of default kernel parameters to append for baremetal PXE boot. +#kolla_ironic_pxe_append_params_default: + +# List of additional kernel parameters to append for baremetal PXE boot. +#kolla_ironic_pxe_append_params_extra: + +# List of kernel parameters to append for baremetal PXE boot. 
+#kolla_ironic_pxe_append_params: + +############################################################################### +# Ironic Node Configuration + +# Whether or not to enable the serial consoles on post configure +#ironic_serial_console_autoenable: + +# This defines the start of the range of TCP ports to be used for the IPMI socat +# serial consoles +#ironic_serial_console_tcp_pool_start: + +# This defines the end of the range of TCP ports to be used for the IPMI socat +# serial consoles +#ironic_serial_console_tcp_pool_end: + +############################################################################### +# Dummy variable to allow Ansible to accept this file. +workaround_ansible_issue_8743: yes diff --git a/etc/kayobe/environments/stackhpc-baremetal/kolla/ironic/ironic.conf b/etc/kayobe/environments/stackhpc-baremetal/kolla/ironic/ironic.conf new file mode 100644 index 0000000000..538c103518 --- /dev/null +++ b/etc/kayobe/environments/stackhpc-baremetal/kolla/ironic/ironic.conf @@ -0,0 +1,74 @@ +{% raw %} +{% set internal_net_ip = (internal_net_ips).get(inventory_hostname) %} + +[DEFAULT] +# avoid checksum issues, do convert on deploy node +force_raw_images = false +# Avoid some timeouts of heartbeats and vif deletes +rpc_response_timeout = 360 + +[conductor] +automated_clean = true +bootloader = file:///var/lib/ironic/httpboot/esp.img +deploy_kernel = file:///var/lib/ironic/httpboot/ironic-agent.kernel +deploy_ramdisk = file:///var/lib/ironic/httpboot/ironic-agent.initramfs + +# We have busy conductors failing to heartbeat +# Default is 10 secs +heartbeat_interval = 30 +# Default is 60 seconds +heartbeat_timeout = 360 +sync_local_state_interval = 360 + +# Normally this is 100. 
We see eventlet threads +# not making much progress, so for safety reduce +# this by half, should leave work on rabbit queue +workers_pool_size = 50 +# Normally this is 8, keep it same +period_max_workers = 8 + +# Increase power sync interval to reduce load +sync_power_state_interval = 120 +power_failure_recovery_interval = 120 +# Stop checking for orphan allocations for now +check_allocations_interval = 120 + +# Wait much longer before provision timeout check, to reduce background load +# The default is 60 seconds +check_provision_state_interval = 120 +check_rescue_state_interval = 120 + +[database] +# Usually this is 50, reduce to stop DB connection timeouts +# and instead just make eventlet threads wait a bit longer +max_overflow = 5 +# By default this is 30 seconds, but as we reduce +# the pool overflow, some people will need to wait longer +pool_timeout = 60 + +[neutron] +# Increase the neutron client timeout to allow for the slow management +# switches. +timeout = 300 +request_timeout = 300 + +[glance] +# Retry image download at least once if failure +num_retries = 1 + +[neutron] +inspection_network = "{{ inspection_net_name | default('inspect-net') }}" + +[redfish] +kernel_append_params = nofb nomodeset vga=normal console=tty0 console=ttyS0,115200n8 ipa-insecure=1 {% if internal_net_ip %}ipa-ntp-server={{ internal_net_ip }}{% endif %} + +[inspector] +extra_kernel_params = ipa-collect-lldp=1 ipa-inspection-collectors=default,logs,pci-devices ipa-insecure=1 +hooks = ramdisk-error,validate-interfaces,ports,local-link-connection,parse-lldp,root-device,cpu-capabilities,architecture +add_ports = all + +[pxe] +# 100GB size 4 weeks ttl +image_cache_size = 95367 +image_cache_ttl = 40320 +{% endraw %} From f19083095944f5f6ee8330eab6cb5bc56b11e8f7 Mon Sep 17 00:00:00 2001 From: Claudia Watson Date: Mon, 3 Nov 2025 17:04:50 +0000 Subject: [PATCH 2/5] ansible directory and playbooks added to skc-baremetal-environment --- .../ansible/baremetal-0-enroll-overcloud.yml | 83 
++++++++ .../ansible/baremetal-1-check-bmc-up.yml | 124 +++++++++++ .../baremetal-2-ensure-redfish-inspect.yml | 86 ++++++++ .../baremetal-3-ensure-agent-inspect.yml | 194 ++++++++++++++++++ .../ansible/baremetal-4-clean.yml | 112 ++++++++++ .../ansible/baremetal-all.yml | 11 + .../ansible/diagnose-baremetal.yml | 94 +++++++++ .../ansible/download-host-image.yml | 86 ++++++++ .../ansible/provision-overcloud-nova.yml | 84 ++++++++ .../ansible/provision-overcloud.yml | 83 ++++++++ .../ansible/recover-baremetal.yml | 61 ++++++ 11 files changed, 1018 insertions(+) create mode 100644 etc/kayobe/environments/stackhpc-baremetal/ansible/baremetal-0-enroll-overcloud.yml create mode 100644 etc/kayobe/environments/stackhpc-baremetal/ansible/baremetal-1-check-bmc-up.yml create mode 100644 etc/kayobe/environments/stackhpc-baremetal/ansible/baremetal-2-ensure-redfish-inspect.yml create mode 100644 etc/kayobe/environments/stackhpc-baremetal/ansible/baremetal-3-ensure-agent-inspect.yml create mode 100644 etc/kayobe/environments/stackhpc-baremetal/ansible/baremetal-4-clean.yml create mode 100644 etc/kayobe/environments/stackhpc-baremetal/ansible/baremetal-all.yml create mode 100644 etc/kayobe/environments/stackhpc-baremetal/ansible/diagnose-baremetal.yml create mode 100644 etc/kayobe/environments/stackhpc-baremetal/ansible/download-host-image.yml create mode 100644 etc/kayobe/environments/stackhpc-baremetal/ansible/provision-overcloud-nova.yml create mode 100644 etc/kayobe/environments/stackhpc-baremetal/ansible/provision-overcloud.yml create mode 100644 etc/kayobe/environments/stackhpc-baremetal/ansible/recover-baremetal.yml diff --git a/etc/kayobe/environments/stackhpc-baremetal/ansible/baremetal-0-enroll-overcloud.yml b/etc/kayobe/environments/stackhpc-baremetal/ansible/baremetal-0-enroll-overcloud.yml new file mode 100644 index 0000000000..51b2fd2f7e --- /dev/null +++ b/etc/kayobe/environments/stackhpc-baremetal/ansible/baremetal-0-enroll-overcloud.yml @@ -0,0 +1,83 @@ +--- + 
+- name: Register baremetal compute nodes + hosts: localhost + vars: + venv: "{{ virtualenv_path }}/openstack-cli" + tasks: + - name: Set up openstack cli virtualenv + pip: + virtualenv: "{{ venv }}" + name: + - python-openstackclient + - python-ironicclient + state: latest + virtualenv_command: "python3.{{ ansible_facts.python.version.minor }} -m venv" + extra_args: "{% if pip_upper_constraints_file %}-c {{ pip_upper_constraints_file }}{% endif %}" + +- name: Ensure overcloud baremetal nodes are registered in ironic + hosts: overcloud + gather_facts: false + max_fail_percentage: >- + {{ baremetal_compute_register_max_fail_percentage | + default(baremetal_compute_max_fail_percentage) | + default(kayobe_max_fail_percentage) | + default(100) }} + tags: + - baremetal + vars: + venv: "{{ virtualenv_path }}/openstack-cli" + controller_host: localhost + tasks: + - name: Check Ironic variables are defined + ansible.builtin.assert: + that: + - ironic_driver is defined + - ironic_driver_info is defined + - ironic_properties is defined + - ironic_resource_class is defined + fail_msg: One or more Ironic variables are undefined. + + - block: + - name: Show baremetal node + ansible.builtin.command: + cmd: "{{ venv }}/bin/openstack baremetal node show {{ inventory_hostname }}" + register: node_show + failed_when: + - '"HTTP 404" not in node_show.stderr' + - node_show.rc != 0 + changed_when: false + + # NOTE: The openstack.cloud.baremetal_node module cannot be used in this + # script due to requiring a MAC address pre-defined, instead, this should + # be discovered by inpsection following this script. + # + # NOTE: IPMI address must be passed with Redfish address to ensure existing + # Ironic nodes match with new nodes during inspection. 
+ - name: Create baremetal nodes + ansible.builtin.shell: + cmd: | + {{ venv }}/bin/openstack baremetal node create \ + --name {{ inventory_hostname }} \ + --driver {{ ironic_driver }} \ + {% for key, value in ironic_driver_info.items() %} + --driver-info {{ key }}={{ value }} \ + {% endfor %} + {% for key, value in ironic_properties.items() %} + --property {{ key }}={{ value }} \ + {% endfor %} + --resource-class {{ ironic_resource_class }} + when: + - node_show.rc != 0 + + - name: Manage baremetal nodes + ansible.builtin.command: + cmd: "{{ venv }}/bin/openstack baremetal node manage {{ inventory_hostname }} --wait" + when: + - node_show.rc != 0 + delegate_to: "{{ controller_host }}" + vars: + # NOTE: Without this, the controller's ansible_host variable will not + # be respected when using delegate_to. + ansible_host: "{{ hostvars[controller_host].ansible_host | default(controller_host) }}" + environment: "{{ openstack_auth_env }}" diff --git a/etc/kayobe/environments/stackhpc-baremetal/ansible/baremetal-1-check-bmc-up.yml b/etc/kayobe/environments/stackhpc-baremetal/ansible/baremetal-1-check-bmc-up.yml new file mode 100644 index 0000000000..b6ecb2d540 --- /dev/null +++ b/etc/kayobe/environments/stackhpc-baremetal/ansible/baremetal-1-check-bmc-up.yml @@ -0,0 +1,124 @@ +--- +- name: Check baremetal compute node bmc is up + hosts: baremetal + gather_facts: false + max_fail_percentage: >- + {{ baremetal_compute_register_max_fail_percentage | + default(baremetal_compute_max_fail_percentage) | + default(kayobe_max_fail_percentage) | + default(100) }} + tags: + - baremetal + vars: + venv: "{{ virtualenv_path }}/openstack-cli" + controller_host: localhost + + tasks: + - name: Check Ironic variables are defined + ansible.builtin.assert: + that: + - ironic_driver is defined + - ironic_driver_info is defined + - ironic_properties is defined + - ironic_resource_class is defined + fail_msg: One or more Ironic variables are undefined. 
+ + - name: Show and check baremetal node + delegate_to: "{{ controller_host }}" + vars: + # NOTE: Without this, the controller's ansible_host variable will not + # be respected when using delegate_to. + ansible_host: "{{ hostvars[controller_host].ansible_host | default(controller_host) }}" + environment: "{{ openstack_auth_env }}" + block: + + - name: Show baremetal node + ansible.builtin.command: + cmd: "{{ venv }}/bin/openstack baremetal node show {{ inventory_hostname }} -f json" + register: node_show + failed_when: + - node_show.rc != 0 + changed_when: false + + - name: Check if bmc is up + ansible.builtin.set_fact: + kayobe_bmc_up: "{{ (node_show.stdout | from_json)['extra'].get('kayobe_bmc_up') }}" + provision_state: "{{ (node_show.stdout | from_json)['provision_state'] }}" + + - name: Output when bmc last up run + ansible.builtin.debug: + msg: "BMC for node {{ inventory_hostname }} was up at {{ kayobe_bmc_up }}." + when: kayobe_bmc_up != "" + + - name: Check BMC is up + ansible.builtin.uri: + url: "https://{{ ironic_driver_info['redfish_address'] }}" + method: GET + status_code: 200 + validate_certs: false + timeout: 10 + + - name: Get firmware inventory (to check redfish auth) + community.general.redfish_info: + category: Update + command: GetFirmwareInventory + baseuri: "{{ ironic_redfish_address }}" + username: "{{ ironic_redfish_username }}" + password: "{{ ironic_redfish_password }}" + register: firmware_inventory + failed_when: not firmware_inventory.redfish_facts.firmware.ret + + # - name: Print fetched information + # ansible.builtin.debug: + # msg: "{{ firmware_inventory.redfish_facts.firmware | to_nice_json }}" + + - name: Reboot BMC + community.general.redfish_command: + category: Manager + command: PowerReboot + resource_id: 1 + baseuri: "{{ ironic_redfish_address }}" + username: "{{ ironic_redfish_username }}" + password: "{{ ironic_redfish_password }}" + when: kayobe_bmc_up == "" + + - name: Wait 300 seconds for port 443 to become open + 
ansible.builtin.wait_for: + port: 443 + host: "{{ ironic_redfish_address }}" + delay: 20 + timeout: 300 + when: kayobe_bmc_up == "" + + - name: Check BMC back up again + ansible.builtin.uri: + url: "https://{{ ironic_driver_info['redfish_address'] }}" + method: GET + status_code: 200 + validate_certs: false + timeout: 10 + register: uri_output + until: uri_output.status == 200 + delay: 5 + retries: 24 # Retries for 24 * 5 seconds = 120 seconds = 2 minutes + + - name: Note when we are able to reach the bmc, the first time + ansible.builtin.command: + cmd: | + {{ venv }}/bin/openstack baremetal node set {{ inventory_hostname }} --extra kayobe_bmc_up={{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }} + register: node_set + failed_when: + - node_set.rc != 0 + changed_when: true + when: kayobe_bmc_up == "" + + - name: Try move from enroll to manageable + ansible.builtin.command: + cmd: | + {{ venv }}/bin/openstack baremetal node manage {{ inventory_hostname }} --wait 300 + register: node_set + failed_when: + - node_set.rc != 0 + changed_when: true + when: + - provision_state == "enroll" diff --git a/etc/kayobe/environments/stackhpc-baremetal/ansible/baremetal-2-ensure-redfish-inspect.yml b/etc/kayobe/environments/stackhpc-baremetal/ansible/baremetal-2-ensure-redfish-inspect.yml new file mode 100644 index 0000000000..61d2908e54 --- /dev/null +++ b/etc/kayobe/environments/stackhpc-baremetal/ansible/baremetal-2-ensure-redfish-inspect.yml @@ -0,0 +1,86 @@ +--- +- name: Check baremetal compute node bmc is up + hosts: baremetal + gather_facts: false + max_fail_percentage: >- + {{ baremetal_compute_register_max_fail_percentage | + default(baremetal_compute_max_fail_percentage) | + default(kayobe_max_fail_percentage) | + default(100) }} + tags: + - baremetal + vars: + venv: "{{ virtualenv_path }}/openstack-cli" + controller_host: localhost + + tasks: + - name: Show and check baremetal node + delegate_to: "{{ controller_host }}" + vars: + # NOTE: Without this, the controller's 
ansible_host variable will not + # be respected when using delegate_to. + ansible_host: "{{ hostvars[controller_host].ansible_host | default(controller_host) }}" + redfish_inspect_timeout: 120 + environment: "{{ openstack_auth_env }}" + block: + + - name: Show baremetal node + ansible.builtin.command: + cmd: "{{ venv }}/bin/openstack baremetal node show {{ inventory_hostname }} -f json" + register: node_show + failed_when: + - node_show.rc != 0 + changed_when: false + + - name: Check BMC is up + ansible.builtin.uri: + url: "https://{{ ironic_driver_info['redfish_address'] }}" + method: GET + status_code: 200 + validate_certs: false + timeout: 10 + + - name: Check for redfish inspection details + ansible.builtin.set_fact: + kayobe_redfish_inspect_done: "{{ (node_show.stdout | from_json)['extra'].get('kayobe_redfish_inspect_done') }}" + inspect_interface: "{{ (node_show.stdout | from_json)['inspect_interface'] }}" + provision_state: "{{ (node_show.stdout | from_json)['provision_state'] }}" + + - name: Output when redfish inspection was done + ansible.builtin.debug: + msg: "{{ inventory_hostname }} inspected at {{ kayobe_redfish_inspect_done }}." 
+ when: kayobe_redfish_inspect_done != "" + + - name: Fail if not redfish inspection + ansible.builtin.fail: + msg: "{{ inventory_hostname }} has the wrong inspect_interface: {{ inspect_interface }}" + when: + - inspect_interface != "redfish" + - kayobe_redfish_inspect_done == "" + + - name: Fail if not in manageable state + ansible.builtin.fail: + msg: "{{ inventory_hostname }} has the wrong provision_state: {{ provision_state }}" + when: + - provision_state != "manageable" + - kayobe_redfish_inspect_done == "" + + - name: Wait for inspection + ansible.builtin.command: + cmd: | + {{ venv }}/bin/openstack baremetal node inspect {{ inventory_hostname }} --wait {{ redfish_inspect_timeout }} + register: node_inspect + failed_when: + - node_inspect.rc != 0 + changed_when: true + when: kayobe_redfish_inspect_done == "" + + - name: Note when redfish inspection is done + ansible.builtin.command: + cmd: | + {{ venv }}/bin/openstack baremetal node set {{ inventory_hostname }} --extra kayobe_redfish_inspect_done={{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }} + register: node_set + failed_when: + - node_set.rc != 0 + changed_when: true + when: kayobe_redfish_inspect_done == "" diff --git a/etc/kayobe/environments/stackhpc-baremetal/ansible/baremetal-3-ensure-agent-inspect.yml b/etc/kayobe/environments/stackhpc-baremetal/ansible/baremetal-3-ensure-agent-inspect.yml new file mode 100644 index 0000000000..74252dd5af --- /dev/null +++ b/etc/kayobe/environments/stackhpc-baremetal/ansible/baremetal-3-ensure-agent-inspect.yml @@ -0,0 +1,194 @@ +--- +- name: Check baremetal node bmc is up + hosts: baremetal + gather_facts: false + max_fail_percentage: >- + {{ baremetal_compute_register_max_fail_percentage | + default(baremetal_compute_max_fail_percentage) | + default(kayobe_max_fail_percentage) | + default(100) }} + tags: + - baremetal + vars: + venv: "{{ virtualenv_path }}/openstack-cli" + controller_host: localhost + + tasks: + - name: Show and check baremetal node + delegate_to: 
"{{ controller_host }}" + vars: + # NOTE: Without this, the controller's ansible_host variable will not + # be respected when using delegate_to. + ansible_host: "{{ hostvars[controller_host].ansible_host | default(controller_host) }}" + agent_inspect_timeout: "{{ 60 * 20 }}" # 20 minutes + environment: "{{ openstack_auth_env }}" + block: + + - name: Show baremetal node + ansible.builtin.command: + cmd: "{{ venv }}/bin/openstack baremetal node show {{ inventory_hostname }} -f json" + register: node_show + failed_when: + - node_show.rc != 0 + changed_when: false + + - name: Check BMC is up + ansible.builtin.uri: + url: "https://{{ ironic_driver_info['redfish_address'] }}" + method: GET + status_code: 200 + validate_certs: false + timeout: 10 + + - name: Check for agent inspection details + ansible.builtin.set_fact: + kayobe_agent_inspect_done: "{{ (node_show.stdout | from_json)['extra'].get('kayobe_agent_inspect_done') }}" + inspect_interface: "{{ (node_show.stdout | from_json)['inspect_interface'] }}" + network_interface: "{{ (node_show.stdout | from_json)['network_interface'] }}" + provision_state: "{{ (node_show.stdout | from_json)['provision_state'] }}" + + - name: Output when agent inspection was done + ansible.builtin.debug: + msg: "{{ inventory_hostname }} inspected at {{ kayobe_agent_inspect_done }}." 
+ when: kayobe_agent_inspect_done != "" + + - name: Fail if not in manageable state + ansible.builtin.fail: + msg: "{{ inventory_hostname }} has the wrong provision_state: {{ provision_state }}" + when: + - provision_state not in ["manageable", "inspect failed"] + - kayobe_agent_inspect_done == "" + + - name: If we failed inspect, move back to manageable + ansible.builtin.command: + cmd: | + {{ venv }}/bin/openstack baremetal node manage {{ inventory_hostname }} --wait 300 + register: node_set + failed_when: + - node_set.rc != 0 + changed_when: true + # do it slowly due to ironic api issues + throttle: 3 + when: + - provision_state == "inspect failed" + - kayobe_agent_inspect_done == "" + + - name: Move to agent inspect interface + ansible.builtin.command: + cmd: | + {{ venv }}/bin/openstack baremetal node set {{ inventory_hostname }} --inspect-interface agent + register: node_set + failed_when: + - node_set.rc != 0 + changed_when: true + # do it slowly due to ironic api issues + throttle: 3 + when: + - kayobe_agent_inspect_done == "" + - inspect_interface == "redfish" + + - name: Ensure we are using the flat network interface and correct network + ansible.builtin.command: + cmd: | + {{ venv }}/bin/openstack baremetal node set {{ inventory_hostname }} --network-interface flat --driver-info inspection_network={{ ironic_flat_provisioning_network }} --driver-info cleaning_network={{ ironic_flat_provisioning_network }} --driver-info provision_network={{ ironic_flat_provisioning_network }} + register: node_set + failed_when: + - node_set.rc != 0 + changed_when: true + # do it slowly due to ironic api issues + throttle: 3 + when: + - kayobe_agent_inspect_done == "" + + - name: Gather information about baremetal ports + openstack.cloud.baremetal_port_info: + node: "{{ inventory_hostname }}" + auth_type: "{{ openstack_auth_type }}" + auth: "{{ openstack_auth }}" + cacert: "{{ openstack_cacert | default(omit, true) }}" + interface: "{{ openstack_interface | default(omit, true) 
}}" + register: bmport + delegate_to: localhost + when: kayobe_agent_inspect_done == "" + + - name: Disable PXE on all baremetal ports + openstack.cloud.baremetal_port: + address: "{{ item.address }}" + auth_type: "{{ openstack_auth_type }}" + auth: "{{ openstack_auth }}" + cacert: "{{ openstack_cacert | default(omit, true) }}" + interface: "{{ openstack_interface | default(omit, true) }}" + node: "{{ inventory_hostname }}" + is_pxe_enabled: false + loop: "{{ bmport.baremetal_ports }}" + when: kayobe_agent_inspect_done == "" + + - name: Re-enable PXE on the first Mellanox ethernet NIC + openstack.cloud.baremetal_port: + address: "{{ bmport.baremetal_ports | selectattr('address', 'search', item) | map(attribute='address') | list | first }}" + auth_type: "{{ openstack_auth_type }}" + auth: "{{ openstack_auth }}" + cacert: "{{ openstack_cacert | default(omit, true) }}" + interface: "{{ openstack_interface | default(omit, true) }}" + node: "{{ inventory_hostname }}" + is_pxe_enabled: true + when: + - kayobe_agent_inspect_done == "" + - bmport.baremetal_ports | selectattr('address', 'search', item) | list | length > 0 + # known mellanox ethernet NICs + loop: + - "^58:a2:e1" + - "^a0:88:c2" + - "^7c:8c:09" + - "^94:6d:ae" + - "^50:00:e6" + - "^b8:3f:d2" + - "^c4:70:bd" + + - name: Wait for inspection + ansible.builtin.command: + cmd: | + {{ venv }}/bin/openstack baremetal node inspect {{ inventory_hostname }} --wait {{ agent_inspect_timeout }} + register: node_inspect + failed_when: + - node_inspect.rc != 0 + changed_when: true + when: kayobe_agent_inspect_done == "" + + - name: Move to neutron interface, assuming its a multi-tenant node + ansible.builtin.command: + cmd: | + {{ venv }}/bin/openstack baremetal node set {{ inventory_hostname }} --network-interface neutron + register: node_set + failed_when: + - node_set.rc != 0 + changed_when: true + # do it slowly due to ironic api issues + throttle: 3 + when: + - kayobe_agent_inspect_done == "" + - 
ironic_network_interface == "neutron" + + - name: Remove network overrides for multi-tenant nodes + ansible.builtin.command: + cmd: | + {{ venv }}/bin/openstack baremetal node unset {{ inventory_hostname }} --driver-info inspection_network --driver-info cleaning_network --driver-info provision_network + register: node_set + failed_when: + - node_set.rc != 0 + changed_when: true + # do it slowly due to ironic api issues + throttle: 3 + when: + - kayobe_agent_inspect_done == "" + - ironic_network_interface == "neutron" + + - name: Note when agent inspection is done + ansible.builtin.command: + cmd: | + {{ venv }}/bin/openstack baremetal node set {{ inventory_hostname }} --extra kayobe_agent_inspect_done={{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }} + register: node_set + failed_when: + - node_set.rc != 0 + changed_when: true + when: kayobe_agent_inspect_done == "" diff --git a/etc/kayobe/environments/stackhpc-baremetal/ansible/baremetal-4-clean.yml b/etc/kayobe/environments/stackhpc-baremetal/ansible/baremetal-4-clean.yml new file mode 100644 index 0000000000..99d0609738 --- /dev/null +++ b/etc/kayobe/environments/stackhpc-baremetal/ansible/baremetal-4-clean.yml @@ -0,0 +1,112 @@ +--- +- name: Check baremetal node bmc is up + hosts: baremetal + gather_facts: false + max_fail_percentage: >- + {{ baremetal_compute_register_max_fail_percentage | + default(baremetal_compute_max_fail_percentage) | + default(kayobe_max_fail_percentage) | + default(100) }} + tags: + - baremetal + vars: + venv: "{{ virtualenv_path }}/openstack-cli" + controller_host: localhost + cleaning_timeout: "{{ 60 * 20 }}" # 20 minutes + + tasks: + - name: Show and check baremetal node + delegate_to: "{{ controller_host }}" + vars: + # NOTE: Without this, the controller's ansible_host variable will not + # be respected when using delegate_to. 
+ ansible_host: "{{ hostvars[controller_host].ansible_host | default(controller_host) }}" + agent_inspect_timeout: "{{ 60 * 20 }}" # 20 minutes + environment: "{{ openstack_auth_env }}" + block: + + - name: Show baremetal node + ansible.builtin.command: + cmd: "{{ venv }}/bin/openstack baremetal node show {{ inventory_hostname }} -f json" + register: node_show + failed_when: + - node_show.rc != 0 + changed_when: false + + - name: Check BMC is up + ansible.builtin.uri: + url: "https://{{ ironic_driver_info['redfish_address'] }}" + method: GET + status_code: 200 + validate_certs: false + timeout: 10 + + - name: Check for agent inspection details + ansible.builtin.set_fact: + kayobe_agent_inspect_done: "{{ (node_show.stdout | from_json)['extra'].get('kayobe_agent_inspect_done') }}" + kayobe_clean_done: "{{ (node_show.stdout | from_json)['extra'].get('kayobe_clean_done') }}" + network_interface: "{{ (node_show.stdout | from_json)['network_interface'] }}" + provision_state: "{{ (node_show.stdout | from_json)['provision_state'] }}" + node_maintenance: "{{ (node_show.stdout | from_json)['maintenance'] }}" + + - name: Fail if agent inspection not done + ansible.builtin.fail: + msg: "{{ inventory_hostname }} has not been inspected" + when: + - kayobe_agent_inspect_done == "" + + - name: Fail if not in manageable or clean failed state + ansible.builtin.fail: + msg: "{{ inventory_hostname }} has the wrong provision_state: {{ provision_state }}" + when: + - provision_state not in ["manageable", "clean failed"] + - kayobe_clean_done == "" + + - name: If in clean failed, move back to manageable + ansible.builtin.command: + cmd: | + {{ venv }}/bin/openstack baremetal node manage {{ inventory_hostname }} --wait 300 + register: node_manage + failed_when: + - node_manage.rc != 0 + when: + - provision_state == "clean failed" + + - name: Remove node from maintenance + ansible.builtin.command: + cmd: | + {{ venv }}/bin/openstack baremetal node maintenance unset {{ inventory_hostname }} 
+ register: node_set + failed_when: + - node_set.rc != 0 + when: + - node_maintenance + - provision_state in ["manageable", "clean failed"] + + - name: Ensure hosts set to use software RAID + ansible.builtin.shell: | + openstack baremetal node set \ + --target-raid-config {{ ironic_target_raid_config }} \ + --raid-interface agent {{ inventory_hostname }} + when: + - kayobe_clean_done == "" + + - name: Clean node and make it available + ansible.builtin.command: + cmd: | + {{ venv }}/bin/openstack baremetal node provide {{ inventory_hostname }} --wait {{ cleaning_timeout }} + register: node_set + failed_when: + - node_set.rc != 0 + when: + - kayobe_clean_done == "" or provision_state in ["clean failed"] + + - name: Note when cleaning has completed + ansible.builtin.command: + cmd: | + {{ venv }}/bin/openstack baremetal node set {{ inventory_hostname }} --extra kayobe_clean_done={{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }} + register: node_set + failed_when: + - node_set.rc != 0 + changed_when: true + when: kayobe_clean_done == "" diff --git a/etc/kayobe/environments/stackhpc-baremetal/ansible/baremetal-all.yml b/etc/kayobe/environments/stackhpc-baremetal/ansible/baremetal-all.yml new file mode 100644 index 0000000000..0d1e17e4dd --- /dev/null +++ b/etc/kayobe/environments/stackhpc-baremetal/ansible/baremetal-all.yml @@ -0,0 +1,11 @@ +--- +#- name: Ensure overcloud baremetal is enrolled +# ansible.builtin.import_playbook: ./baremetal-0-enroll-overcloud.yml +- name: Check BMC is up + ansible.builtin.import_playbook: ./baremetal-1-check-bmc-up.yml +- name: Do redfish inspection + ansible.builtin.import_playbook: ./baremetal-2-ensure-redfish-inspect.yml +- name: Do agent inspection + ansible.builtin.import_playbook: ./baremetal-3-ensure-agent-inspect.yml +- name: Make baremetal node available + ansible.builtin.import_playbook: ./baremetal-4-clean.yml diff --git a/etc/kayobe/environments/stackhpc-baremetal/ansible/diagnose-baremetal.yml 
b/etc/kayobe/environments/stackhpc-baremetal/ansible/diagnose-baremetal.yml new file mode 100644 index 0000000000..ccc113fa21 --- /dev/null +++ b/etc/kayobe/environments/stackhpc-baremetal/ansible/diagnose-baremetal.yml @@ -0,0 +1,94 @@ +--- +- name: Node enrolment health check + hosts: baremetal-compute + gather_facts: false + connection: local + vars: + ping_cmd: "ping -c1 -W1" + venv: "{{ virtualenv_path }}/openstack-cli" + purge_faulty_baremetal: false + tasks: + - name: Ping BMC + ansible.builtin.command: "{{ ping_cmd }} {{ redfish_address }}" + register: bmc_ping + failed_when: redfish_address is not defined + changed_when: false + + - name: Fetch bmnode info + openstack.cloud.baremetal_node_info: + name: "{{ inventory_hostname }}" + register: bmnode + failed_when: false + changed_when: false + + - name: Evaluate node details + ansible.builtin.set_fact: + bmnode_details: + bmc_reachable: "{{ bmc_ping.rc == 0 | default(false) }}" + kayobe_agent_inspect_done_ts: "{{ bmnode.nodes[0].extra.get('kayobe_agent_inspect_done') }}" + kayobe_first_provision_ts: "{{ bmnode.nodes[0].extra.get('kayobe_first_provision') }}" + ironic_state: "{{ bmnode.nodes[0].provision_state | default('unknown') }}" + ironic_power: "{{ bmnode.nodes[0].power_state | default('unknown') }}" + ironic_maintenance: "{{ bmnode.nodes[0].maintenance | default('unknown') }}" + ironic_last_error: "{{ bmnode.nodes[0].last_error | default('') }}" + + - name: Evaluate enrolment status and hints + ansible.builtin.set_fact: + enrolment_status: "{{ bmnode_status }}" + enrolment_report_entry: + node: "{{ inventory_hostname }}" + status: "{{ bmnode_status }}" + details: "{{ bmnode_details }}" + vars: + bmnode_status: >- + {% if not bmnode_details.bmc_reachable %} + Can't ping node BMC + {% elif bmnode_details.kayobe_first_provision_ts %} + Node has completed Kayobe node prep for prod pipeline — check Ironic for details + {% elif bmnode_details.ironic_state == 'enroll' %} + Node stuck in Ironic node enroll 
state + {% elif not bmnode_details.kayobe_agent_inspect_done_ts %} + Node has not yet passed agent inspection + {% else %} + Node not yet completed node prep for prod pipeline + {% endif %} + changed_when: false + + - name: Show diagnosis statement + debug: + msg: "Status: {{ enrolment_report_entry.status }}" + + - name: Print diagnosis + ansible.builtin.debug: + msg: + - "Node: {{ enrolment_report_entry.node }}" + - "Status: {{ enrolment_report_entry.status }}" + - "Details: {{ enrolment_report_entry.details }}" + + - name: Test faulty baremetal + block: + - name: Undeploy baremetal node (test BMC connection) + ansible.builtin.command: "{{ venv }}/bin/openstack baremetal node undeploy {{ inventory_hostname }} --wait" + failed_when: false + + - name: Manage baremetal node (test BMC connection) + ansible.builtin.command: "{{ venv }}/bin/openstack baremetal node manage {{ inventory_hostname }} --wait" + failed_when: false + + - name: Bye bye baremetal + ansible.builtin.command: "{{ venv }}/bin/openstack baremetal node delete {{ inventory_hostname }}" + when: + - "{{ purge_faulty_baremetal }}" + - "{{ bmnode_details.ironic_state not in ['available', 'active'] }}" + - "{{ not bmnode_details.kayobe_first_provision_ts }}" + + #- name: Build summary list + # ansible.builtin.set_fact: + # enrolment_report_all: "{{ (enrolment_report_all | default([])) + [ hostvars[item].enrolment_report_entry ] }}" + # run_once: true + # loop: "{{ groups['all'] | sort }}" + + #- name: Pretty-print all nodes + # ansible.builtin.debug: + # msg: "{{ enrolment_report_all | to_nice_json }}" + # run_once: true diff --git a/etc/kayobe/environments/stackhpc-baremetal/ansible/download-host-image.yml b/etc/kayobe/environments/stackhpc-baremetal/ansible/download-host-image.yml new file mode 100644 index 0000000000..2dde717b35 --- /dev/null +++ b/etc/kayobe/environments/stackhpc-baremetal/ansible/download-host-image.yml @@ -0,0 +1,86 @@ +--- +- name: Download an overcloud host image from Ark + hosts: 
controllers + vars: + # This var is an edited version of stackhpc_overcloud_host_image_url + # without the auth credentials in it. Auth is handled by username and + # password in the get_url task of this playbook + stackhpc_overcloud_host_image_url_no_auth: "{{ stackhpc_release_pulp_content_url }}/kayobe-images/\ + {{ openstack_release }}/{{ os_distribution }}/{{ os_release }}/\ + {{ stackhpc_overcloud_host_image_version }}/\ + overcloud-{{ os_distribution }}-{{ os_release }}.qcow2" + overcloud_host_image_name: "overcloud-{{ os_distribution }}-{{ os_release }}-{{ stackhpc_rocky_9_overcloud_host_image_version }}" + overcloud_host_image_ironic: false + overcloud_host_image_glance: true + tasks: + - name: Print image information + ansible.builtin.debug: + msg: | + OS Distribution: {{ os_distribution }} + OS Release: {{ os_release }} + Image tag: {{ stackhpc_overcloud_host_image_version }} + + - name: Install dependencies + ansible.builtin.pip: + name: openstacksdk + state: latest + + - name: Download image artifact + ansible.builtin.get_url: + url: "{{ stackhpc_overcloud_host_image_url_no_auth }}" + username: "{{ stackhpc_release_pulp_username }}" + password: "{{ stackhpc_release_pulp_password }}" + force_basic_auth: true + unredirected_headers: + - Authorization + dest: /tmp/{{ overcloud_host_image_name }}.qcow2 + mode: "0644" + register: image_download_result + until: image_download_result.status_code in [200, 304] + retries: 3 + delay: 60 + + #NOTE(jake): It would be nice to get the *real* checksum from Ark eventually. + - name: Get checksum of file + ansible.builtin.stat: + path: /tmp/{{ overcloud_host_image_name }}.qcow2 + checksum_algorithm: sha256 + register: host_image + + - name: Write checksum to vars file + ansible.builtin.copy: + content: | + --- + # This file is autogenerated by Ansible; DO NOT EDIT! 
+ + stackhpc_overcloud_host_image_name: "{{ overcloud_host_image_name }}" + stackhpc_overcloud_host_image_checksum: "{{ host_image.stat.checksum }}" + dest: "{{ kayobe_env_config_path }}/stackhpc-overcloud-host-image.yml" + delegate_to: localhost + run_once: true + + - block: + - name: Copy the image to the Ironic volume + ansible.builtin.copy: + src: /tmp/{{ overcloud_host_image_name }}.qcow2 + dest: /var/lib/docker/volumes/ironic/_data/{{ overcloud_host_image_name }}.qcow2 + remote_src: true + become: true + + - name: Make the image available to the Ironic HTTP container + community.docker.docker_container_exec: + container: ironic_http + command: "mv /var/lib/ironic/{{ overcloud_host_image_name }}.qcow2 /var/lib/ironic/httpboot" + become: true + when: overcloud_host_image_ironic | bool + + - name: Upload an image to Glance + openstack.cloud.image: + name: "{{ overcloud_host_image_name }}" + container_format: bare + disk_format: qcow2 + state: present + filename: /tmp/{{ overcloud_host_image_name }}.qcow2 + run_once: true + environment: "{{ openstack_auth_env }}" + when: overcloud_host_image_glance | bool diff --git a/etc/kayobe/environments/stackhpc-baremetal/ansible/provision-overcloud-nova.yml b/etc/kayobe/environments/stackhpc-baremetal/ansible/provision-overcloud-nova.yml new file mode 100644 index 0000000000..7a0407a503 --- /dev/null +++ b/etc/kayobe/environments/stackhpc-baremetal/ansible/provision-overcloud-nova.yml @@ -0,0 +1,84 @@ +--- +- name: Provision baremetal instances + hosts: baremetal + gather_facts: false + vars: + network: "{{ ironic_provision_network }}" + flavor: "{{ ironic_provision_flavor }}" + image: "{{ ironic_provision_image }}" + key_name: "{{ ironic_provision_key_name }}" + force_server_delete: false + controller_host: localhost + venv: "{{ virtualenv_path }}/openstack-cli" + tasks: + - name: Show and check baremetal node + delegate_to: "{{ controller_host }}" + vars: + # NOTE: Without this, the controller's ansible_host variable will 
not + # be respected when using delegate_to. + ansible_host: "{{ hostvars[controller_host].ansible_host | default(controller_host) }}" + environment: "{{ openstack_auth_env }}" + block: + - name: Gather information about baremetal nodes + openstack.cloud.baremetal_node_info: + name: "{{ inventory_hostname }}" + register: bmnode_raw + + - name: Set fact for baremetal node information + ansible.builtin.set_fact: + bmnode: "{{ bmnode_raw.nodes | first }}" + + - name: Check if for first provision done + ansible.builtin.set_fact: + kayobe_clean_done: "{{ bmnode.extra.get('kayobe_clean_done') }}" + kayobe_agent_inspect_done: "{{ bmnode.extra.get('kayobe_agent_inspect_done') }}" + kayobe_first_provision_done: "{{ bmnode.extra.get('kayobe_first_provision') }}" + + # TODO: we should really check for cleaned + - name: Fail if kayobe bootstrap not done + ansible.builtin.fail: + msg: "{{ inventory_hostname }} has not been bootstrapped yet." + when: + - kayobe_agent_inspect_done == "" + + - name: Fail if not in available or active + ansible.builtin.fail: + msg: "{{ inventory_hostname }} has the wrong provision_state: {{ bmnode.provision_state }}" + when: + - bmnode.provision_state not in ["active", "available"] + + - name: Create port + openstack.cloud.port: + state: "{{ 'absent' if force_server_delete else 'present' }}" + name: "{{ inventory_hostname }}" + network: "{{ network }}" + fixed_ips: "{{ ironic_provision_fixed_ips | default(omit) }}" + vnic_type: baremetal + delegate_to: localhost + register: bmport + + # TODO: we should wait till we can reach ssh, via jump host + - name: Deploy Server + openstack.cloud.server: + state: "{{ 'absent' if force_server_delete else 'present' }}" + name: "{{ inventory_hostname }}" + nics: + - port-id: "{{ bmport.port.id }}" + image: "{{ image }}" + flavor: "{{ flavor }}" + key_name: "{{ key_name }}" + availability_zone: "::{{ bmnode.id }}" + timeout: 1800 # wait 30 mins for build + config_drive: yes + delegate_to: localhost + register: 
server + + - name: Note when provision has first worked + ansible.builtin.command: + cmd: | + {{ venv }}/bin/openstack baremetal node set {{ inventory_hostname }} --extra kayobe_first_provision={{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }} + register: node_set + failed_when: + - node_set.rc != 0 + changed_when: true + when: kayobe_first_provision_done == "" diff --git a/etc/kayobe/environments/stackhpc-baremetal/ansible/provision-overcloud.yml b/etc/kayobe/environments/stackhpc-baremetal/ansible/provision-overcloud.yml new file mode 100644 index 0000000000..a542fd9b59 --- /dev/null +++ b/etc/kayobe/environments/stackhpc-baremetal/ansible/provision-overcloud.yml @@ -0,0 +1,83 @@ +--- +- name: Provision baremetal instances + hosts: overcloud + gather_facts: false + tasks: + - name: Ensure instance_info is referencing the node conductor + block: + - name: Gather information about baremetal nodes + openstack.cloud.baremetal_node_info: + name: "{{ inventory_hostname }}" + register: bmnode_raw + + - name: Set fact for baremetal node information + ansible.builtin.set_fact: + bmnode: "{{ bmnode_raw.nodes | first }}" + + - name: Set fact for conductor + ansible.builtin.set_fact: + ironic_instance_info_address: "{{ internal_net_ips.get(bmnode.conductor) }}" + failed_when: "{{ bmnode is none }}" + delegate_to: localhost + + - name: Configure Neutron for deployment + block: + - name: Gather information about baremetal ports + openstack.cloud.baremetal_port_info: + node: "{{ inventory_hostname }}" + register: bmport + + - name: Set fact for baremetal port information + ansible.builtin.set_fact: + bmport_pxe: "{{ bmport.baremetal_ports | selectattr('is_pxe_enabled', 'equalto', true) | first }}" + bmport_admin: "{{ bmport.baremetal_ports | selectattr('address', 'search', '^7c') | selectattr('local_link_connection', 'defined') | first }}" + + - name: Set fact for Neutron port variables + ansible.builtin.set_fact: + ironic_port_uuid: "{{ bmport_pxe.id }}" + ironic_hs_mac_address: 
"{{ bmport_pxe.address }}" + ironic_oc_mac_address: "{{ bmport_admin.address }}" + failed_when: + - "{{ bmport_pxe is none }}" + - "{{ bmport_admin is none }}" + + - name: Create Neutron port + openstack.cloud.port: + state: present + name: "{{ inventory_hostname }}" + network: "{{ baremetal_network | default('provision-net') }}" + vnic_type: baremetal + register: osport + + - name: Check if VIF attach exists + ansible.builtin.shell: | + openstack baremetal node vif list \ + {{ inventory_hostname }} + failed_when: false + register: vif + + - name: Attach VIF to baremetal node + ansible.builtin.shell: | + openstack baremetal node vif attach \ + {{ inventory_hostname }} \ + {{ osport.port.id }} + when: '{{ vif.stdout == "" }}' + delegate_to: localhost + when: '"neutron" or "flat" in (ironic_network_interface | default("flat"))' + + - name: Deploy instances + openstack.cloud.baremetal_node_action: + name: "{{ inventory_hostname }}" + config_drive: "{{ ironic_config_drive }}" + instance_info: "{{ ironic_instance_info }}" + delegate_to: localhost + + - name: Wait for sshd to be listening on the node + ansible.builtin.wait_for: + port: 22 + host: "{{ admin_oc_net_ips[inventory_hostname] }}" + search_regex: OpenSSH + timeout: 600 + vars: + ansible_user: "{{ overcloud_bootstrap_user }}" + delegate_to: localhost diff --git a/etc/kayobe/environments/stackhpc-baremetal/ansible/recover-baremetal.yml b/etc/kayobe/environments/stackhpc-baremetal/ansible/recover-baremetal.yml new file mode 100644 index 0000000000..609a413c82 --- /dev/null +++ b/etc/kayobe/environments/stackhpc-baremetal/ansible/recover-baremetal.yml @@ -0,0 +1,61 @@ +--- +- name: Recover baremetal machines + hosts: baremetal-compute + gather_facts: false + connection: local + vars: + venv: "{{ virtualenv_path }}/openstack-cli" + controller_host: localhost + tasks: + - name: Gather information about baremetal nodes + openstack.cloud.baremetal_node_info: + name: "{{ inventory_hostname }}" + register: bmnode + + - 
name: Set facts for provision state + ansible.builtin.set_fact: + bmnode_prov: "{{ bmnode.nodes[0].provision_state }}" + + - name: Recover BMC + block: + - name: Reboot BMC + community.general.redfish_command: + category: Manager + command: PowerReboot + resource_id: 1 + baseuri: "{{ ironic_redfish_address }}" + username: "{{ ironic_redfish_username }}" + password: "{{ ironic_redfish_password }}" + + - name: Wait 300 seconds for port 443 to become open + ansible.builtin.wait_for: + port: 443 + host: "{{ ironic_redfish_address }}" + delay: 20 + timeout: 300 + + - name: Check BMC back up again + ansible.builtin.uri: + url: "https://{{ ironic_driver_info['redfish_address'] }}" + method: GET + status_code: 200 + validate_certs: false + timeout: 10 + register: uri_output + until: uri_output.status == 200 + delay: 5 + retries: 24 # Retries for 24 * 5 seconds = 120 seconds = 2 minutes + when: bmnode_prov in ['deploy failed', 'error', 'clean failed'] + + - name: Manage baremetals in 'clean failed' state + ansible.builtin.command: + cmd: "{{ venv }}/bin/openstack baremetal node manage {{ inventory_hostname }}" + when: bmnode_prov in 'clean failed' + + - name: Undeploy baremetals in 'deploy failed' or 'error' state + ansible.builtin.command: + cmd: "{{ venv }}/bin/openstack baremetal node undeploy {{ inventory_hostname }}" + when: bmnode_prov in ['deploy failed', 'error'] + +- name: Make baremetal nodes available + ansible.builtin.import_playbook: ./baremetal-4-clean.yml From 4725d97d8c23ca71c2a4a35d489e1fd2daa0442b Mon Sep 17 00:00:00 2001 From: Claudia Watson Date: Tue, 4 Nov 2025 11:33:38 +0000 Subject: [PATCH 3/5] created stackhpc-sushy-baremetal environment --- .../ansible/baremetal-0-enroll-overcloud.yml | 2 +- .../ansible/provision-overcloud.yml | 83 ------ .../.kayobe-environment | 5 + .../ansible/create-virtual-baremetal.yml | 52 ++++ .../ansible/generate-mac-addresses.yml | 21 ++ .../ansible/setup-local-link-information.yml | 27 ++ .../ansible/sushy-emulator.yml | 
66 +++++ .../ansible/sushy.conf.j2 | 59 +++++ .../ansible/sushyemud.service.j2 | 8 + .../ansible/vbmc-net.xml.j2 | 6 + .../ansible/vbmc-node.xml.j2 | 236 ++++++++++++++++++ .../ansible/vbmc-pool.xml.j2 | 18 ++ .../stackhpc-sushy-baremetal/controllers.yml | 14 ++ .../stackhpc-sushy-baremetal/inventory/groups | 11 + .../stackhpc-sushy-baremetal/stackhpc.yml | 3 + 15 files changed, 527 insertions(+), 84 deletions(-) delete mode 100644 etc/kayobe/environments/stackhpc-baremetal/ansible/provision-overcloud.yml create mode 100644 etc/kayobe/environments/stackhpc-sushy-baremetal/.kayobe-environment create mode 100644 etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/create-virtual-baremetal.yml create mode 100644 etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/generate-mac-addresses.yml create mode 100644 etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/setup-local-link-information.yml create mode 100644 etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/sushy-emulator.yml create mode 100644 etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/sushy.conf.j2 create mode 100644 etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/sushyemud.service.j2 create mode 100644 etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/vbmc-net.xml.j2 create mode 100644 etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/vbmc-node.xml.j2 create mode 100644 etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/vbmc-pool.xml.j2 create mode 100644 etc/kayobe/environments/stackhpc-sushy-baremetal/controllers.yml create mode 100644 etc/kayobe/environments/stackhpc-sushy-baremetal/inventory/groups create mode 100644 etc/kayobe/environments/stackhpc-sushy-baremetal/stackhpc.yml diff --git a/etc/kayobe/environments/stackhpc-baremetal/ansible/baremetal-0-enroll-overcloud.yml b/etc/kayobe/environments/stackhpc-baremetal/ansible/baremetal-0-enroll-overcloud.yml index 51b2fd2f7e..d2cbf0c05c 100644 --- 
a/etc/kayobe/environments/stackhpc-baremetal/ansible/baremetal-0-enroll-overcloud.yml +++ b/etc/kayobe/environments/stackhpc-baremetal/ansible/baremetal-0-enroll-overcloud.yml @@ -16,7 +16,7 @@ extra_args: "{% if pip_upper_constraints_file %}-c {{ pip_upper_constraints_file }}{% endif %}" - name: Ensure overcloud baremetal nodes are registered in ironic - hosts: overcloud + hosts: baremetal-overcloud gather_facts: false max_fail_percentage: >- {{ baremetal_compute_register_max_fail_percentage | diff --git a/etc/kayobe/environments/stackhpc-baremetal/ansible/provision-overcloud.yml b/etc/kayobe/environments/stackhpc-baremetal/ansible/provision-overcloud.yml deleted file mode 100644 index a542fd9b59..0000000000 --- a/etc/kayobe/environments/stackhpc-baremetal/ansible/provision-overcloud.yml +++ /dev/null @@ -1,83 +0,0 @@ ---- -- name: Provision baremetal instances - hosts: overcloud - gather_facts: false - tasks: - - name: Ensure instance_info is referencing the node conductor - block: - - name: Gather information about baremetal nodes - openstack.cloud.baremetal_node_info: - name: "{{ inventory_hostname }}" - register: bmnode_raw - - - name: Set fact for baremetal node information - ansible.builtin.set_fact: - bmnode: "{{ bmnode_raw.nodes | first }}" - - - name: Set fact for conductor - ansible.builtin.set_fact: - ironic_instance_info_address: "{{ internal_net_ips.get(bmnode.conductor) }}" - failed_when: "{{ bmnode is none }}" - delegate_to: localhost - - - name: Configure Neutron for deployment - block: - - name: Gather information about baremetal ports - openstack.cloud.baremetal_port_info: - node: "{{ inventory_hostname }}" - register: bmport - - - name: Set fact for baremetal port information - ansible.builtin.set_fact: - bmport_pxe: "{{ bmport.baremetal_ports | selectattr('is_pxe_enabled', 'equalto', true) | first }}" - bmport_admin: "{{ bmport.baremetal_ports | selectattr('address', 'search', '^7c') | selectattr('local_link_connection', 'defined') | first }}" 
- - - name: Set fact for Neutron port variables - ansible.builtin.set_fact: - ironic_port_uuid: "{{ bmport_pxe.id }}" - ironic_hs_mac_address: "{{ bmport_pxe.address }}" - ironic_oc_mac_address: "{{ bmport_admin.address }}" - failed_when: - - "{{ bmport_pxe is none }}" - - "{{ bmport_admin is none }}" - - - name: Create Neutron port - openstack.cloud.port: - state: present - name: "{{ inventory_hostname }}" - network: "{{ baremetal_network | default('provision-net') }}" - vnic_type: baremetal - register: osport - - - name: Check if VIF attach exists - ansible.builtin.shell: | - openstack baremetal node vif list \ - {{ inventory_hostname }} - failed_when: false - register: vif - - - name: Attach VIF to baremetal node - ansible.builtin.shell: | - openstack baremetal node vif attach \ - {{ inventory_hostname }} \ - {{ osport.port.id }} - when: '{{ vif.stdout == "" }}' - delegate_to: localhost - when: '"neutron" or "flat" in (ironic_network_interface | default("flat"))' - - - name: Deploy instances - openstack.cloud.baremetal_node_action: - name: "{{ inventory_hostname }}" - config_drive: "{{ ironic_config_drive }}" - instance_info: "{{ ironic_instance_info }}" - delegate_to: localhost - - - name: Wait for sshd to be listening on the node - ansible.builtin.wait_for: - port: 22 - host: "{{ admin_oc_net_ips[inventory_hostname] }}" - search_regex: OpenSSH - timeout: 600 - vars: - ansible_user: "{{ overcloud_bootstrap_user }}" - delegate_to: localhost diff --git a/etc/kayobe/environments/stackhpc-sushy-baremetal/.kayobe-environment b/etc/kayobe/environments/stackhpc-sushy-baremetal/.kayobe-environment new file mode 100644 index 0000000000..0a2a5f6995 --- /dev/null +++ b/etc/kayobe/environments/stackhpc-sushy-baremetal/.kayobe-environment @@ -0,0 +1,5 @@ +--- + +dependencies: + - ci-aio + - stackhpc-baremetal diff --git a/etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/create-virtual-baremetal.yml 
b/etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/create-virtual-baremetal.yml new file mode 100644 index 0000000000..60c5d11679 --- /dev/null +++ b/etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/create-virtual-baremetal.yml @@ -0,0 +1,52 @@ +--- +- name: Create Libvirt vBMC nodes + gather_facts: false + become: true + hosts: sushy-libvirt + tasks: + - name: Gather facts + setup: + delegate_to: localhost + + - name: Install Python requirements + ansible.builtin.pip: + name: + - libvirt-python==11.3.0 + - lxml==5.4.0 + delegate_to: localhost + + - name: Define vBMC storage pool + community.libvirt.virt_pool: + command: define + name: default + xml: "{{ lookup('template', sushy_directory + '/vbmc-pool.xml.j2') }}" + delegate_to: localhost + run_once: true + + - name: Start vBMC storage pool + community.libvirt.virt_pool: + state: active + name: default + delegate_to: localhost + run_once: true + + - import_role: + name: stackhpc.libvirt-vm + vars: + libvirt_vm_arch: x86_64 + libvirt_vms: + - state: present + name: "{{ inventory_hostname }}" + xml_file: "{{ sushy_directory }}/vbmc-node.xml.j2" + volumes: + - name: '{{ inventory_hostname }}.qcow2' + device: 'disk' + format: 'qcow2' + capacity: '20GB' + pool: 'default' + interfaces: + - network: 'breth1' + start: false + autostart: false + boot_firmware: uefi + delegate_to: localhost diff --git a/etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/generate-mac-addresses.yml b/etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/generate-mac-addresses.yml new file mode 100644 index 0000000000..24d62cdde7 --- /dev/null +++ b/etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/generate-mac-addresses.yml @@ -0,0 +1,21 @@ +--- +- name: Generate virtual baremetal MAC addresses + hosts: sushy-libvirt + gather_facts: false + tasks: + - name: Write hosts + block: + - name: Create address dictionary + set_fact: + bikolla_mac_addresses: "{{ bikolla_mac_addresses | combine({item: '52:54:00' | 
community.general.random_mac}) }}" + vars: + bikolla_mac_addresses: {} + delegate_to: "{{ sushy_host | default('localhost') }}" + with_items: "{{ play_hosts }}" + + - name: Write mac addresses file + copy: + content: '{{ {"bikolla_mac_addresses": bikolla_mac_addresses} | to_nice_yaml }}' + dest: "{{ kayobe_env_config_path }}/bikolla-mac-addresses.yml" + delegate_to: "{{ sushy_host | default('localhost') }}" + run_once: true diff --git a/etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/setup-local-link-information.yml b/etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/setup-local-link-information.yml new file mode 100644 index 0000000000..b6d512db1f --- /dev/null +++ b/etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/setup-local-link-information.yml @@ -0,0 +1,27 @@ +--- +- name: Setup fake local_link_information + hosts: sushy-libvirt-ipxe + gather_facts: false + tasks: + - name: Get baremetal port metadata + openstack.cloud.baremetal_port_info: + node: "{{ inventory_hostname }}" + delegate_to: localhost + register: baremetal + + - name: Setup local_link_information + debug: + msg: "{{ baremetal.ports[0] }}" + + - name: Update baremetal port + openstack.cloud.baremetal_port: + state: present + id: "{{ baremetal.ports[0].id }}" + node: "{{ inventory_hostname }}" + address: "{{ bikolla_mac_addresses[inventory_hostname] }}" + is_pxe_enabled: true + local_link_connection: + switch_id: "{{ bikolla_mac_addresses[inventory_hostname] }}" + port_id: "{{ baremetal.ports[0].id }}" + switch_info: "{{ inventory_hostname }}" + delegate_to: "{{ sushy_host | default('localhost') }}" diff --git a/etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/sushy-emulator.yml b/etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/sushy-emulator.yml new file mode 100644 index 0000000000..faabeb4d3e --- /dev/null +++ b/etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/sushy-emulator.yml @@ -0,0 +1,66 @@ +--- +- name: Ensure Sushy Emulator is deployed + 
hosts: "{{ sushy_host | default('localhost') }}" + become: true + gather_facts: false + tasks: + - name: Create Sushy configuration directory + ansible.builtin.file: + path: /etc/sushy + state: directory + + - name: Template Sushy configuration + ansible.builtin.template: + src: "{{ sushy_directory }}/sushy.conf.j2" + dest: "/etc/sushy/sushy.conf" + + - name: Enable Rocky devel repository + community.general.dnf_config_manager: + name: devel + state: enabled + + - name: Install package dependencies + ansible.builtin.dnf: + name: + - qemu-kvm + - libvirt + - libvirt-devel + - python3-devel + state: present + + - name: Start and enable the QEMU service + ansible.builtin.systemd_service: + name: virtqemud + state: started + enabled: true + + - name: Start and enable the virtual stroage service + ansible.builtin.systemd_service: + name: virtstoraged + state: started + enabled: true + + - name: Start and enable the virtual network service + ansible.builtin.systemd_service: + name: virtnetworkd + state: started + enabled: true + + - name: Create Sushy virtualenv + ansible.builtin.pip: + name: + - libvirt-python + - sushy-tools + virtualenv: /opt/kayobe/venvs/sushy + virtualenv_command: python3 -m venv + + - name: Template Sushy service unit file + ansible.builtin.template: + src: "{{ sushy_directory }}/sushyemud.service.j2" + dest: "/etc/systemd/system/sushyemud.service" + + - name: Start and enable the Sushy Emulator service + ansible.builtin.systemd_service: + name: sushyemud + state: started + enabled: true diff --git a/etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/sushy.conf.j2 b/etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/sushy.conf.j2 new file mode 100644 index 0000000000..445b9871a5 --- /dev/null +++ b/etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/sushy.conf.j2 @@ -0,0 +1,59 @@ +{% macro storage(uuid, enum, drive_id, end) -%} + "{{ uuid }}": [ + { + "Id": "{{ enum }}", + "Name": "Local Storage Controller", + "StorageControllers": 
[ + { + "MemberId": "0", + "Name": "Contoso Integrated RAID", + "SpeedGbps": 12 + } + ], + "Drives": [ + "{{ drive_id }}" + ] + } + ]{% if not end %},{% endif %} +{%- endmacro %} + +{% macro drive(uuid, enum, drive_id, end) -%} + ("{{ uuid }}", "{{ enum }}"): [ + { + "Id": "{{ drive_id }}", + "Name": "Drive Sample", + "CapacityBytes": 899527000000, + "Protocol": "SAS" + } + ]{% if not end %},{% endif %} +{%- endmacro %} + +{% macro volume(uuid, enum, hostname, end) -%} + ("{{ uuid }}", "{{ enum }}"): [ { + "libvirtPoolName": "default", + "libvirtVolName": "{{ hostname }}.qcow2", + "Id": "{{ enum }}", + "Name": "{{ hostname }}-volume", + "VolumeType": "File", + "CapacityBytes": 1073741824 + } + ]{% if not end %},{% endif %} +{%- endmacro %} + +SUSHY_EMULATOR_STORAGE = { +{% for host in groups["sushy-libvirt"] %} + {{ storage(host | to_uuid, 1, ('drive-'+host) | to_uuid, (host in groups["sushy-libvirt"] | last ) | bool) }} +{% endfor %} +} + +SUSHY_EMULATOR_DRIVES = { +{% for host in groups["sushy-libvirt"] %} + {{ drive(host | to_uuid, 1, ('drive-'+host) | to_uuid, (host in groups["sushy-libvirt"] | last ) | bool) }} +{% endfor %} +} + +SUSHY_EMULATOR_VOLUMES = { +{% for host in groups["sushy-libvirt"] %} + {{ volume(host | to_uuid, 1, host, (host in groups["sushy-libvirt"] | last ) | bool) }} +{% endfor %} +} diff --git a/etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/sushyemud.service.j2 b/etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/sushyemud.service.j2 new file mode 100644 index 0000000000..248de07e09 --- /dev/null +++ b/etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/sushyemud.service.j2 @@ -0,0 +1,8 @@ +[Unit] +Description=Virtual Redfish BMC service + +[Service] +ExecStart=/opt/kayobe/venvs/sushy/bin/sushy-emulator -i 192.168.33.3 -p 34343 --config /etc/sushy/sushy.conf + +[Install] +WantedBy=multi-user.target default.target diff --git a/etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/vbmc-net.xml.j2 
b/etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/vbmc-net.xml.j2 new file mode 100644 index 0000000000..ff3bcae5c7 --- /dev/null +++ b/etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/vbmc-net.xml.j2 @@ -0,0 +1,6 @@ + + vbmc-net + {{ 'vbmc-net' | to_uuid }} + + + diff --git a/etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/vbmc-node.xml.j2 b/etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/vbmc-node.xml.j2 new file mode 100644 index 0000000000..b8f9aa4b30 --- /dev/null +++ b/etc/kayobe/environments/stackhpc-sushy-baremetal/ansible/vbmc-node.xml.j2 @@ -0,0 +1,236 @@ + + {{ inventory_hostname }} + {{ inventory_hostname | to_uuid }} + + + + + + + + + + + + + + + + + + + + + + + 4882812 + 4882812 + 2 + + /machine + + + hvm + +{% if bikolla_enable_secureboot %} + + +{% else %} + +{% endif %} + + + + + + + + + + + + + destroy + restart + destroy + + + + + + /usr/libexec/qemu-kvm + + + + + + + +
+ + + +
+ + + + + + + + +
+ + + + + +
+ + + + + +
+ + + + + +
+ + + + + +
+ + + + + +
+ + + + + +
+ + + + + +
+ + + + + +
+ + + + + +
+ + + + + +
+ + + + + +
+ + + + + +
+ + + + + +
+ + + +
+ + + +
+ + + + + + + +
+ + + + + + + + + + + + + + + + + +
+ + + +
+ + + + + + + + + + +