Add km module kafka

leewei
2023-02-14 16:27:47 +08:00
parent 229140f067
commit 0b8160a714
4039 changed files with 718112 additions and 46204 deletions

View File

@@ -0,0 +1,14 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,519 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os.path
import random
import signal
import time
import requests
from ducktape.errors import DucktapeError
from ducktape.services.service import Service
from ducktape.utils.util import wait_until
from kafkatest.directory_layout.kafka_path import KafkaPathResolverMixin
class ConnectServiceBase(KafkaPathResolverMixin, Service):
"""Base class for Kafka Connect services providing some common settings and functionality"""
PERSISTENT_ROOT = "/mnt/connect"
CONFIG_FILE = os.path.join(PERSISTENT_ROOT, "connect.properties")
# The log file contains normal log4j logs written using a file appender. stdout and stderr are handled separately
# so they can be used for other output, e.g. verifiable source & sink.
LOG_FILE = os.path.join(PERSISTENT_ROOT, "connect.log")
STDOUT_FILE = os.path.join(PERSISTENT_ROOT, "connect.stdout")
STDERR_FILE = os.path.join(PERSISTENT_ROOT, "connect.stderr")
LOG4J_CONFIG_FILE = os.path.join(PERSISTENT_ROOT, "connect-log4j.properties")
PID_FILE = os.path.join(PERSISTENT_ROOT, "connect.pid")
EXTERNAL_CONFIGS_FILE = os.path.join(PERSISTENT_ROOT, "connect-external-configs.properties")
CONNECT_REST_PORT = 8083
HEAP_DUMP_FILE = os.path.join(PERSISTENT_ROOT, "connect_heap_dump.bin")
# Currently the Connect worker supports four startup wait modes:
STARTUP_MODE_INSTANT = 'INSTANT'
"""STARTUP_MODE_INSTANT: Start Connect worker and return immediately"""
STARTUP_MODE_LOAD = 'LOAD'
"""STARTUP_MODE_LOAD: Start Connect worker and return after discovering and loading plugins"""
STARTUP_MODE_LISTEN = 'LISTEN'
"""STARTUP_MODE_LISTEN: Start Connect worker and return after opening the REST port."""
STARTUP_MODE_JOIN = 'JOIN'
"""STARTUP_MODE_JOIN: Start Connect worker and return after joining the group."""
logs = {
"connect_log": {
"path": LOG_FILE,
"collect_default": True},
"connect_stdout": {
"path": STDOUT_FILE,
"collect_default": False},
"connect_stderr": {
"path": STDERR_FILE,
"collect_default": True},
"connect_heap_dump_file": {
"path": HEAP_DUMP_FILE,
"collect_default": True}
}
def __init__(self, context, num_nodes, kafka, files, startup_timeout_sec = 60):
super(ConnectServiceBase, self).__init__(context, num_nodes)
self.kafka = kafka
self.security_config = kafka.security_config.client_config()
self.files = files
self.startup_mode = self.STARTUP_MODE_LISTEN
self.startup_timeout_sec = startup_timeout_sec
self.environment = {}
self.external_config_template_func = None
def pids(self, node):
"""Return process ids for Kafka Connect processes."""
try:
return [pid for pid in node.account.ssh_capture("cat " + self.PID_FILE, callback=int)]
except Exception:
return []
def set_configs(self, config_template_func, connector_config_templates=None):
"""
Set configurations for the worker and the connector to run on
it. These are not provided in the constructor because the worker
config generally needs access to ZK/Kafka services to
create the configuration.
"""
self.config_template_func = config_template_func
self.connector_config_templates = connector_config_templates
def set_external_configs(self, external_config_template_func):
"""
Set the properties that will be written in the external file properties
as used by the org.apache.kafka.common.config.provider.FileConfigProvider.
When this is used, the worker configuration must also enable the FileConfigProvider.
This is not provided in the constructor in case the worker
config generally needs access to ZK/Kafka services to
create the configuration.
"""
self.external_config_template_func = external_config_template_func
def listening(self, node):
try:
self.list_connectors(node)
self.logger.debug("Connect worker started serving REST at: '%s:%s')", node.account.hostname,
self.CONNECT_REST_PORT)
return True
except requests.exceptions.ConnectionError:
self.logger.debug("REST resources are not loaded yet")
return False
def start(self, mode=None):
if mode:
self.startup_mode = mode
super(ConnectServiceBase, self).start()
def start_and_return_immediately(self, node, worker_type, remote_connector_configs):
cmd = self.start_cmd(node, remote_connector_configs)
self.logger.debug("Connect %s command: %s", worker_type, cmd)
node.account.ssh(cmd)
def start_and_wait_to_load_plugins(self, node, worker_type, remote_connector_configs):
with node.account.monitor_log(self.LOG_FILE) as monitor:
self.start_and_return_immediately(node, worker_type, remote_connector_configs)
monitor.wait_until('Kafka version', timeout_sec=self.startup_timeout_sec,
err_msg="Never saw message indicating Kafka Connect finished startup on node: " +
"%s in condition mode: %s" % (str(node.account), self.startup_mode))
def start_and_wait_to_start_listening(self, node, worker_type, remote_connector_configs):
self.start_and_return_immediately(node, worker_type, remote_connector_configs)
wait_until(lambda: self.listening(node), timeout_sec=self.startup_timeout_sec,
err_msg="Kafka Connect failed to start on node: %s in condition mode: %s" %
(str(node.account), self.startup_mode))
def start_and_wait_to_join_group(self, node, worker_type, remote_connector_configs):
if worker_type != 'distributed':
raise RuntimeError("Cannot wait for joined group message for %s" % worker_type)
with node.account.monitor_log(self.LOG_FILE) as monitor:
self.start_and_return_immediately(node, worker_type, remote_connector_configs)
monitor.wait_until('Joined group', timeout_sec=self.startup_timeout_sec,
err_msg="Never saw message indicating Kafka Connect joined group on node: " +
"%s in condition mode: %s" % (str(node.account), self.startup_mode))
def stop_node(self, node, clean_shutdown=True):
self.logger.info((clean_shutdown and "Cleanly" or "Forcibly") + " stopping Kafka Connect on " + str(node.account))
pids = self.pids(node)
sig = signal.SIGTERM if clean_shutdown else signal.SIGKILL
for pid in pids:
node.account.signal(pid, sig, allow_fail=True)
if clean_shutdown:
for pid in pids:
wait_until(lambda: not node.account.alive(pid), timeout_sec=self.startup_timeout_sec, err_msg="Kafka Connect process on " + str(
node.account) + " took too long to exit")
node.account.ssh("rm -f " + self.PID_FILE, allow_fail=False)
def restart(self, clean_shutdown=True):
# We don't want to do any clean up here, just restart the process.
for node in self.nodes:
self.logger.info("Restarting Kafka Connect on " + str(node.account))
self.restart_node(node, clean_shutdown)
def restart_node(self, node, clean_shutdown=True):
self.stop_node(node, clean_shutdown)
self.start_node(node)
def clean_node(self, node):
node.account.kill_process("connect", clean_shutdown=False, allow_fail=True)
self.security_config.clean_node(node)
other_files = " ".join(self.config_filenames() + self.files)
node.account.ssh("rm -rf -- %s %s" % (ConnectServiceBase.PERSISTENT_ROOT, other_files), allow_fail=False)
def config_filenames(self):
return [os.path.join(self.PERSISTENT_ROOT, "connect-connector-" + str(idx) + ".properties") for idx, template in enumerate(self.connector_config_templates or [])]
def list_connectors(self, node=None, **kwargs):
return self._rest_with_retry('/connectors', node=node, **kwargs)
def create_connector(self, config, node=None, **kwargs):
create_request = {
'name': config['name'],
'config': config
}
return self._rest_with_retry('/connectors', create_request, node=node, method="POST", **kwargs)
def get_connector(self, name, node=None, **kwargs):
return self._rest_with_retry('/connectors/' + name, node=node, **kwargs)
def get_connector_config(self, name, node=None, **kwargs):
return self._rest_with_retry('/connectors/' + name + '/config', node=node, **kwargs)
def set_connector_config(self, name, config, node=None, **kwargs):
# Unlike many other calls, a 409 when setting a connector config is expected if the connector already exists.
# However, we also might see 409s for other reasons (e.g. rebalancing). So we still perform retries at the cost
# of tests possibly taking longer to ultimately fail. Tests that care about this can explicitly override the
# number of retries.
return self._rest_with_retry('/connectors/' + name + '/config', config, node=node, method="PUT", **kwargs)
def get_connector_tasks(self, name, node=None, **kwargs):
return self._rest_with_retry('/connectors/' + name + '/tasks', node=node, **kwargs)
def delete_connector(self, name, node=None, **kwargs):
return self._rest_with_retry('/connectors/' + name, node=node, method="DELETE", **kwargs)
def get_connector_status(self, name, node=None):
return self._rest('/connectors/' + name + '/status', node=node)
def restart_connector(self, name, node=None, **kwargs):
return self._rest_with_retry('/connectors/' + name + '/restart', node=node, method="POST", **kwargs)
def restart_task(self, connector_name, task_id, node=None):
return self._rest('/connectors/' + connector_name + '/tasks/' + str(task_id) + '/restart', node=node, method="POST")
def pause_connector(self, name, node=None):
return self._rest('/connectors/' + name + '/pause', node=node, method="PUT")
def resume_connector(self, name, node=None):
return self._rest('/connectors/' + name + '/resume', node=node, method="PUT")
def list_connector_plugins(self, node=None):
return self._rest('/connector-plugins/', node=node)
def validate_config(self, connector_type, validate_request, node=None):
return self._rest('/connector-plugins/' + connector_type + '/config/validate', validate_request, node=node, method="PUT")
def _rest(self, path, body=None, node=None, method="GET"):
if node is None:
node = random.choice(self.nodes)
meth = getattr(requests, method.lower())
url = self._base_url(node) + path
self.logger.debug("Kafka Connect REST request: %s %s %s %s", node.account.hostname, url, method, body)
resp = meth(url, json=body)
self.logger.debug("%s %s response: %d", url, method, resp.status_code)
if resp.status_code > 400:
self.logger.debug("Connect REST API error for %s: %d %s", resp.url, resp.status_code, resp.text)
raise ConnectRestError(resp.status_code, resp.text, resp.url)
if resp.status_code == 204 or resp.status_code == 202:
return None
else:
return resp.json()
def _rest_with_retry(self, path, body=None, node=None, method="GET", retries=40, retry_backoff=.25):
"""
Invokes a REST API with retries for errors that may occur during normal operation (notably 409 CONFLICT
responses that can occur due to rebalancing or 404 when the connect resources are not initialized yet).
"""
exception_to_throw = None
for i in range(0, retries + 1):
try:
return self._rest(path, body, node, method)
except ConnectRestError as e:
exception_to_throw = e
if e.status != 409 and e.status != 404:
break
time.sleep(retry_backoff)
raise exception_to_throw
def _base_url(self, node):
return 'http://' + node.account.externally_routable_ip + ':' + str(self.CONNECT_REST_PORT)
def append_to_environment_variable(self, envvar, value):
env_opts = self.environment[envvar]
if env_opts is None:
env_opts = "\"%s\"" % value
else:
env_opts = "\"%s %s\"" % (env_opts.strip('\"'), value)
self.environment[envvar] = env_opts
class ConnectStandaloneService(ConnectServiceBase):
"""Runs Kafka Connect in standalone mode."""
def __init__(self, context, kafka, files, startup_timeout_sec = 60):
super(ConnectStandaloneService, self).__init__(context, 1, kafka, files, startup_timeout_sec)
# For convenience since this service only makes sense with a single node
@property
def node(self):
return self.nodes[0]
def start_cmd(self, node, connector_configs):
cmd = "( export KAFKA_LOG4J_OPTS=\"-Dlog4j.configuration=file:%s\"; " % self.LOG4J_CONFIG_FILE
heap_kafka_opts = "-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=%s" % \
self.logs["connect_heap_dump_file"]["path"]
other_kafka_opts = self.security_config.kafka_opts.strip('\"')
cmd += "export KAFKA_OPTS=\"%s %s\"; " % (heap_kafka_opts, other_kafka_opts)
for envvar in self.environment:
cmd += "export %s=%s; " % (envvar, str(self.environment[envvar]))
cmd += "%s %s " % (self.path.script("connect-standalone.sh", node), self.CONFIG_FILE)
cmd += " ".join(connector_configs)
cmd += " & echo $! >&3 ) 1>> %s 2>> %s 3> %s" % (self.STDOUT_FILE, self.STDERR_FILE, self.PID_FILE)
return cmd
def start_node(self, node):
node.account.ssh("mkdir -p %s" % self.PERSISTENT_ROOT, allow_fail=False)
self.security_config.setup_node(node)
if self.external_config_template_func:
node.account.create_file(self.EXTERNAL_CONFIGS_FILE, self.external_config_template_func(node))
node.account.create_file(self.CONFIG_FILE, self.config_template_func(node))
node.account.create_file(self.LOG4J_CONFIG_FILE, self.render('connect_log4j.properties', log_file=self.LOG_FILE))
remote_connector_configs = []
for idx, template in enumerate(self.connector_config_templates):
target_file = os.path.join(self.PERSISTENT_ROOT, "connect-connector-" + str(idx) + ".properties")
node.account.create_file(target_file, template)
remote_connector_configs.append(target_file)
self.logger.info("Starting Kafka Connect standalone process on " + str(node.account))
if self.startup_mode == self.STARTUP_MODE_LOAD:
self.start_and_wait_to_load_plugins(node, 'standalone', remote_connector_configs)
elif self.startup_mode == self.STARTUP_MODE_INSTANT:
self.start_and_return_immediately(node, 'standalone', remote_connector_configs)
elif self.startup_mode == self.STARTUP_MODE_JOIN:
self.start_and_wait_to_join_group(node, 'standalone', remote_connector_configs)
else:
# The default mode is to wait until the complete startup of the worker
self.start_and_wait_to_start_listening(node, 'standalone', remote_connector_configs)
if len(self.pids(node)) == 0:
raise RuntimeError("No process ids recorded")
class ConnectDistributedService(ConnectServiceBase):
"""Runs Kafka Connect in distributed mode."""
def __init__(self, context, num_nodes, kafka, files, offsets_topic="connect-offsets",
configs_topic="connect-configs", status_topic="connect-status", startup_timeout_sec = 60):
super(ConnectDistributedService, self).__init__(context, num_nodes, kafka, files, startup_timeout_sec)
self.startup_mode = self.STARTUP_MODE_JOIN
self.offsets_topic = offsets_topic
self.configs_topic = configs_topic
self.status_topic = status_topic
# connector_configs argument is intentionally ignored in distributed service.
def start_cmd(self, node, connector_configs):
cmd = "( export KAFKA_LOG4J_OPTS=\"-Dlog4j.configuration=file:%s\"; " % self.LOG4J_CONFIG_FILE
heap_kafka_opts = "-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=%s" % \
self.logs["connect_heap_dump_file"]["path"]
other_kafka_opts = self.security_config.kafka_opts.strip('\"')
cmd += "export KAFKA_OPTS=\"%s %s\"; " % (heap_kafka_opts, other_kafka_opts)
for envvar in self.environment:
cmd += "export %s=%s; " % (envvar, str(self.environment[envvar]))
cmd += "%s %s " % (self.path.script("connect-distributed.sh", node), self.CONFIG_FILE)
cmd += " & echo $! >&3 ) 1>> %s 2>> %s 3> %s" % (self.STDOUT_FILE, self.STDERR_FILE, self.PID_FILE)
return cmd
def start_node(self, node):
node.account.ssh("mkdir -p %s" % self.PERSISTENT_ROOT, allow_fail=False)
self.security_config.setup_node(node)
if self.external_config_template_func:
node.account.create_file(self.EXTERNAL_CONFIGS_FILE, self.external_config_template_func(node))
node.account.create_file(self.CONFIG_FILE, self.config_template_func(node))
node.account.create_file(self.LOG4J_CONFIG_FILE, self.render('connect_log4j.properties', log_file=self.LOG_FILE))
if self.connector_config_templates:
raise DucktapeError("Config files are not valid in distributed mode, submit connectors via the REST API")
self.logger.info("Starting Kafka Connect distributed process on " + str(node.account))
if self.startup_mode == self.STARTUP_MODE_LOAD:
self.start_and_wait_to_load_plugins(node, 'distributed', '')
elif self.startup_mode == self.STARTUP_MODE_INSTANT:
self.start_and_return_immediately(node, 'distributed', '')
elif self.startup_mode == self.STARTUP_MODE_LISTEN:
self.start_and_wait_to_start_listening(node, 'distributed', '')
else:
# The default mode is to wait until the complete startup of the worker
self.start_and_wait_to_join_group(node, 'distributed', '')
if len(self.pids(node)) == 0:
raise RuntimeError("No process ids recorded")
class ErrorTolerance(object):
ALL = "all"
NONE = "none"
class ConnectRestError(RuntimeError):
def __init__(self, status, msg, url):
self.status = status
self.message = msg
self.url = url
def __unicode__(self):
return "Kafka Connect REST call failed: returned " + self.status + " for " + self.url + ". Response: " + self.message
class VerifiableConnector(object):
def messages(self):
"""
Collect and parse the logs from Kafka Connect nodes. Return a list containing all parsed JSON messages generated by
this source.
"""
self.logger.info("Collecting messages from log of %s %s", type(self).__name__, self.name)
records = []
for node in self.cc.nodes:
for line in node.account.ssh_capture('cat ' + self.cc.STDOUT_FILE):
try:
data = json.loads(line)
except ValueError:
self.logger.debug("Ignoring unparseable line: %s", line)
continue
# Filter to only ones matching our name to support multiple verifiable producers
if data['name'] != self.name:
continue
data['node'] = node
records.append(data)
return records
def stop(self):
self.logger.info("Destroying connector %s %s", type(self).__name__, self.name)
self.cc.delete_connector(self.name)
class VerifiableSource(VerifiableConnector):
"""
Helper class for running a verifiable source connector on a Kafka Connect cluster and analyzing the output.
"""
def __init__(self, cc, name="verifiable-source", tasks=1, topic="verifiable", throughput=1000):
self.cc = cc
self.logger = self.cc.logger
self.name = name
self.tasks = tasks
self.topic = topic
self.throughput = throughput
def committed_messages(self):
return filter(lambda m: 'committed' in m and m['committed'], self.messages())
def sent_messages(self):
return filter(lambda m: 'committed' not in m or not m['committed'], self.messages())
def start(self):
self.logger.info("Creating connector VerifiableSourceConnector %s", self.name)
self.cc.create_connector({
'name': self.name,
'connector.class': 'org.apache.kafka.connect.tools.VerifiableSourceConnector',
'tasks.max': self.tasks,
'topic': self.topic,
'throughput': self.throughput
})
class VerifiableSink(VerifiableConnector):
"""
Helper class for running a verifiable sink connector on a Kafka Connect cluster and analyzing the output.
"""
def __init__(self, cc, name="verifiable-sink", tasks=1, topics=["verifiable"]):
self.cc = cc
self.logger = self.cc.logger
self.name = name
self.tasks = tasks
self.topics = topics
def flushed_messages(self):
return filter(lambda m: 'flushed' in m and m['flushed'], self.messages())
def received_messages(self):
return filter(lambda m: 'flushed' not in m or not m['flushed'], self.messages())
def start(self):
self.logger.info("Creating connector VerifiableSinkConnector %s", self.name)
self.cc.create_connector({
'name': self.name,
'connector.class': 'org.apache.kafka.connect.tools.VerifiableSinkConnector',
'tasks.max': self.tasks,
'topics': ",".join(self.topics)
})
class MockSink(object):
def __init__(self, cc, topics, mode=None, delay_sec=10, name="mock-sink"):
self.cc = cc
self.logger = self.cc.logger
self.name = name
self.mode = mode
self.delay_sec = delay_sec
self.topics = topics
def start(self):
self.logger.info("Creating connector MockSinkConnector %s", self.name)
self.cc.create_connector({
'name': self.name,
'connector.class': 'org.apache.kafka.connect.tools.MockSinkConnector',
'tasks.max': 1,
'topics': ",".join(self.topics),
'mock_mode': self.mode,
'delay_ms': self.delay_sec * 1000
})
class MockSource(object):
def __init__(self, cc, mode=None, delay_sec=10, name="mock-source"):
self.cc = cc
self.logger = self.cc.logger
self.name = name
self.mode = mode
self.delay_sec = delay_sec
def start(self):
self.logger.info("Creating connector MockSourceConnector %s", self.name)
self.cc.create_connector({
'name': self.name,
'connector.class': 'org.apache.kafka.connect.tools.MockSourceConnector',
'tasks.max': 1,
'mock_mode': self.mode,
'delay_ms': self.delay_sec * 1000
})

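The classes above are normally driven from a ducktape test rather than run directly. The sketch below is illustrative only and not part of this commit: it assumes `self` is a kafkatest Test with a `self.test_context`, a running KafkaService in `self.kafka`, and a `connect-distributed.properties` template available to `self.render`.

# Hedged usage sketch for ConnectDistributedService with a VerifiableSource.
cc = ConnectDistributedService(self.test_context, num_nodes=3, kafka=self.kafka, files=[])
cc.set_configs(lambda node: self.render("connect-distributed.properties", node=node))
cc.start(mode=ConnectServiceBase.STARTUP_MODE_JOIN)  # block until every worker has joined the group

source = VerifiableSource(cc, topic="test-topic", tasks=2, throughput=1000)
source.start()  # submits the connector through the worker REST API
wait_until(lambda: len(list(source.sent_messages())) > 0, timeout_sec=60,
           err_msg="VerifiableSource never reported any sent records")
source.stop()   # deletes the connector via DELETE /connectors/<name>
cc.stop()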
View File

@@ -0,0 +1,315 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import os
from ducktape.cluster.remoteaccount import RemoteCommandError
from ducktape.services.background_thread import BackgroundThreadService
from ducktape.utils.util import wait_until
from kafkatest.directory_layout.kafka_path import KafkaPathResolverMixin
from kafkatest.services.monitor.jmx import JmxMixin
from kafkatest.version import DEV_BRANCH, LATEST_0_8_2, LATEST_0_9, LATEST_0_10_0, V_0_9_0_0, V_0_10_0_0, V_0_11_0_0, V_2_0_0
"""
The console consumer is a tool that reads data from Kafka and outputs it to standard output.
"""
class ConsoleConsumer(KafkaPathResolverMixin, JmxMixin, BackgroundThreadService):
# Root directory for persistent output
PERSISTENT_ROOT = "/mnt/console_consumer"
STDOUT_CAPTURE = os.path.join(PERSISTENT_ROOT, "console_consumer.stdout")
STDERR_CAPTURE = os.path.join(PERSISTENT_ROOT, "console_consumer.stderr")
LOG_DIR = os.path.join(PERSISTENT_ROOT, "logs")
LOG_FILE = os.path.join(LOG_DIR, "console_consumer.log")
LOG4J_CONFIG = os.path.join(PERSISTENT_ROOT, "tools-log4j.properties")
CONFIG_FILE = os.path.join(PERSISTENT_ROOT, "console_consumer.properties")
JMX_TOOL_LOG = os.path.join(PERSISTENT_ROOT, "jmx_tool.log")
JMX_TOOL_ERROR_LOG = os.path.join(PERSISTENT_ROOT, "jmx_tool.err.log")
logs = {
"consumer_stdout": {
"path": STDOUT_CAPTURE,
"collect_default": False},
"consumer_stderr": {
"path": STDERR_CAPTURE,
"collect_default": False},
"consumer_log": {
"path": LOG_FILE,
"collect_default": True},
"jmx_log": {
"path" : JMX_TOOL_LOG,
"collect_default": False},
"jmx_err_log": {
"path": JMX_TOOL_ERROR_LOG,
"collect_default": False}
}
def __init__(self, context, num_nodes, kafka, topic, group_id="test-consumer-group", new_consumer=True,
message_validator=None, from_beginning=True, consumer_timeout_ms=None, version=DEV_BRANCH,
client_id="console-consumer", print_key=False, jmx_object_names=None, jmx_attributes=None,
enable_systest_events=False, stop_timeout_sec=35, print_timestamp=False, print_partition=False,
isolation_level="read_uncommitted", jaas_override_variables=None,
kafka_opts_override="", client_prop_file_override="", consumer_properties={}):
"""
Args:
context: standard context
num_nodes: number of nodes to use (this should be 1)
kafka: kafka service
topic: consume from this topic
new_consumer: use new Kafka consumer if True
message_validator: function which returns message or None
from_beginning: consume from beginning if True, else from the end
consumer_timeout_ms: corresponds to consumer.timeout.ms. consumer process ends if time between
successively consumed messages exceeds this timeout. Setting this and
waiting for the consumer to stop is a pretty good way to consume all messages
in a topic.
print_timestamp: if True, print each message's timestamp as well
print_key: if True, print each message's key as well
print_partition: if True, print each message's partition as well
enable_systest_events: if True, console consumer will print additional lifecycle-related information
only available in 0.10.0 and later.
stop_timeout_sec: After stopping a node, wait up to stop_timeout_sec for the node to stop,
and the corresponding background thread to finish successfully.
isolation_level: How to handle transactional messages.
jaas_override_variables: A dict of variables to be used in the jaas.conf template file
kafka_opts_override: Override parameters of the KAFKA_OPTS environment variable
client_prop_file_override: Override client.properties file used by the consumer
consumer_properties: A dict of values to pass in as --consumer-property key=value
"""
JmxMixin.__init__(self, num_nodes=num_nodes, jmx_object_names=jmx_object_names, jmx_attributes=(jmx_attributes or []),
root=ConsoleConsumer.PERSISTENT_ROOT)
BackgroundThreadService.__init__(self, context, num_nodes)
self.kafka = kafka
self.new_consumer = new_consumer
self.group_id = group_id
self.args = {
'topic': topic,
}
self.consumer_timeout_ms = consumer_timeout_ms
for node in self.nodes:
node.version = version
self.from_beginning = from_beginning
self.message_validator = message_validator
self.messages_consumed = {idx: [] for idx in range(1, num_nodes + 1)}
self.clean_shutdown_nodes = set()
self.client_id = client_id
self.print_key = print_key
self.print_partition = print_partition
self.log_level = "TRACE"
self.stop_timeout_sec = stop_timeout_sec
self.isolation_level = isolation_level
self.enable_systest_events = enable_systest_events
if self.enable_systest_events:
# Only available in 0.10.0 and up
assert version >= V_0_10_0_0
self.print_timestamp = print_timestamp
self.jaas_override_variables = jaas_override_variables or {}
self.kafka_opts_override = kafka_opts_override
self.client_prop_file_override = client_prop_file_override
self.consumer_properties = consumer_properties
def prop_file(self, node):
"""Return a string which can be used to create a configuration file appropriate for the given node."""
# Process client configuration
prop_file = self.render('console_consumer.properties')
if hasattr(node, "version") and node.version <= LATEST_0_8_2:
# in 0.8.2.X and earlier, console consumer does not have --timeout-ms option
# instead, we have to pass it through the config file
prop_file += "\nconsumer.timeout.ms=%s\n" % str(self.consumer_timeout_ms)
# Add security properties to the config. If security protocol is not specified,
# use the default in the template properties.
self.security_config = self.kafka.security_config.client_config(prop_file, node, self.jaas_override_variables)
self.security_config.setup_node(node)
prop_file += str(self.security_config)
return prop_file
def start_cmd(self, node):
"""Return the start command appropriate for the given node."""
args = self.args.copy()
args['zk_connect'] = self.kafka.zk_connect_setting()
args['stdout'] = ConsoleConsumer.STDOUT_CAPTURE
args['stderr'] = ConsoleConsumer.STDERR_CAPTURE
args['log_dir'] = ConsoleConsumer.LOG_DIR
args['log4j_config'] = ConsoleConsumer.LOG4J_CONFIG
args['config_file'] = ConsoleConsumer.CONFIG_FILE
args['stdout'] = ConsoleConsumer.STDOUT_CAPTURE
args['jmx_port'] = self.jmx_port
args['console_consumer'] = self.path.script("kafka-console-consumer.sh", node)
args['broker_list'] = self.kafka.bootstrap_servers(self.security_config.security_protocol)
if self.kafka_opts_override:
args['kafka_opts'] = "\"%s\"" % self.kafka_opts_override
else:
args['kafka_opts'] = self.security_config.kafka_opts
cmd = "export JMX_PORT=%(jmx_port)s; " \
"export LOG_DIR=%(log_dir)s; " \
"export KAFKA_LOG4J_OPTS=\"-Dlog4j.configuration=file:%(log4j_config)s\"; " \
"export KAFKA_OPTS=%(kafka_opts)s; " \
"%(console_consumer)s " \
"--topic %(topic)s " \
"--consumer.config %(config_file)s " % args
if self.new_consumer:
assert node.version >= V_0_9_0_0, \
"new_consumer is only supported if version >= 0.9.0.0, version %s" % str(node.version)
if node.version <= LATEST_0_10_0:
cmd += " --new-consumer"
cmd += " --bootstrap-server %(broker_list)s" % args
if node.version >= V_0_11_0_0:
cmd += " --isolation-level %s" % self.isolation_level
else:
assert node.version < V_2_0_0, \
"new_consumer==false is only supported if version < 2.0.0, version %s" % str(node.version)
cmd += " --zookeeper %(zk_connect)s" % args
if self.from_beginning:
cmd += " --from-beginning"
if self.consumer_timeout_ms is not None:
# version 0.8.X and below do not support --timeout-ms option
# This will be added in the properties file instead
if node.version > LATEST_0_8_2:
cmd += " --timeout-ms %s" % self.consumer_timeout_ms
if self.print_timestamp:
cmd += " --property print.timestamp=true"
if self.print_key:
cmd += " --property print.key=true"
if self.print_partition:
cmd += " --property print.partition=true"
# LoggingMessageFormatter was introduced after 0.9
if node.version > LATEST_0_9:
cmd += " --formatter kafka.tools.LoggingMessageFormatter"
if self.enable_systest_events:
# enable systest events is only available in 0.10.0 and later
# check the assertion here as well, in case node.version has been modified
assert node.version >= V_0_10_0_0
cmd += " --enable-systest-events"
if self.consumer_properties is not None:
for k, v in self.consumer_properties.items():
cmd += " --consumer-property %s=%s" % (k, v)
cmd += " 2>> %(stderr)s | tee -a %(stdout)s &" % args
return cmd
def pids(self, node):
return node.account.java_pids(self.java_class_name())
def alive(self, node):
return len(self.pids(node)) > 0
def _worker(self, idx, node):
node.account.ssh("mkdir -p %s" % ConsoleConsumer.PERSISTENT_ROOT, allow_fail=False)
# Create and upload config file
self.logger.info("console_consumer.properties:")
self.security_config = self.kafka.security_config.client_config(node=node,
jaas_override_variables=self.jaas_override_variables)
self.security_config.setup_node(node)
if self.client_prop_file_override:
prop_file = self.client_prop_file_override
else:
prop_file = self.prop_file(node)
self.logger.info(prop_file)
node.account.create_file(ConsoleConsumer.CONFIG_FILE, prop_file)
# Create and upload log properties
log_config = self.render('tools_log4j.properties', log_file=ConsoleConsumer.LOG_FILE)
node.account.create_file(ConsoleConsumer.LOG4J_CONFIG, log_config)
# Run and capture output
cmd = self.start_cmd(node)
self.logger.debug("Console consumer %d command: %s", idx, cmd)
consumer_output = node.account.ssh_capture(cmd, allow_fail=False)
with self.lock:
self.logger.debug("collecting following jmx objects: %s", self.jmx_object_names)
self.start_jmx_tool(idx, node)
for line in consumer_output:
msg = line.strip()
if msg == "shutdown_complete":
# Note that we can only rely on shutdown_complete message if running 0.10.0 or greater
if node in self.clean_shutdown_nodes:
raise Exception("Unexpected shutdown event from consumer, already shutdown. Consumer index: %d" % idx)
self.clean_shutdown_nodes.add(node)
else:
if self.message_validator is not None:
msg = self.message_validator(msg)
if msg is not None:
self.messages_consumed[idx].append(msg)
with self.lock:
self.read_jmx_output(idx, node)
def start_node(self, node):
BackgroundThreadService.start_node(self, node)
def stop_node(self, node):
self.logger.info("%s Stopping node %s" % (self.__class__.__name__, str(node.account)))
node.account.kill_java_processes(self.java_class_name(),
clean_shutdown=True, allow_fail=True)
stopped = self.wait_node(node, timeout_sec=self.stop_timeout_sec)
assert stopped, "Node %s: did not stop within the specified timeout of %s seconds" % \
(str(node.account), str(self.stop_timeout_sec))
def clean_node(self, node):
if self.alive(node):
self.logger.warn("%s %s was still alive at cleanup time. Killing forcefully..." %
(self.__class__.__name__, node.account))
JmxMixin.clean_node(self, node)
node.account.kill_java_processes(self.java_class_name(), clean_shutdown=False, allow_fail=True)
node.account.ssh("rm -rf %s" % ConsoleConsumer.PERSISTENT_ROOT, allow_fail=False)
self.security_config.clean_node(node)
def java_class_name(self):
return "ConsoleConsumer"
def has_log_message(self, node, message):
try:
node.account.ssh("grep '%s' %s" % (message, ConsoleConsumer.LOG_FILE))
except RemoteCommandError:
return False
return True
def wait_for_offset_reset(self, node, topic, num_partitions):
for partition in range(num_partitions):
message = "Resetting offset for partition %s-%d" % (topic, partition)
wait_until(lambda: self.has_log_message(node, message),
timeout_sec=60,
err_msg="Offset not reset for partition %s-%d" % (topic, partition))

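A hedged sketch of how a test typically consumes with this service; it assumes `self` is a kafkatest Test with `self.test_context` and a running `self.kafka`, and the topic name is illustrative.

# Start a single console consumer node and wait until it has read at least one message.
consumer = ConsoleConsumer(self.test_context, num_nodes=1, kafka=self.kafka,
                           topic="test-topic", consumer_timeout_ms=60000,
                           message_validator=lambda msg: msg)  # keep every consumed line as-is
consumer.start()
wait_until(lambda: len(consumer.messages_consumed[1]) > 0, timeout_sec=60,
           err_msg="ConsoleConsumer did not consume any messages in time")
consumer.stop()  # clean shutdown; on 0.10.0+ the node is recorded in consumer.clean_shutdown_nodes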
View File

@@ -0,0 +1,21 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Define Consumer configuration property names here.
"""
GROUP_INSTANCE_ID = "group.instance.id"
SESSION_TIMEOUT_MS = "session.timeout.ms"

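These constants are meant to be passed through a consumer's `consumer_properties` argument, where each entry is rendered as a `--consumer-property key=value` flag. A minimal hedged sketch (the ConsoleConsumer usage is an example, not part of this file):

props = {GROUP_INSTANCE_ID: "consumer-instance-1",  # static group membership id
         SESSION_TIMEOUT_MS: 60000}
# ConsoleConsumer(..., consumer_properties=props) would then append
# --consumer-property group.instance.id=consumer-instance-1
# --consumer-property session.timeout.ms=60000
# to the kafka-console-consumer.sh command line.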
View File

@@ -0,0 +1,102 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path
from kafkatest.directory_layout.kafka_path import KafkaPathResolverMixin
"""
DelegationTokens is a wrapper around the kafka-delegation-tokens.sh tool for managing the lifecycle of delegation tokens.
All commands are executed on a secured Kafka node reusing its generated jaas.conf and krb5.conf.
"""
class DelegationTokens(KafkaPathResolverMixin):
def __init__(self, kafka, context):
self.client_properties_content = """
security.protocol=SASL_PLAINTEXT
sasl.kerberos.service.name=kafka
"""
self.context = context
self.command_path = self.path.script("kafka-delegation-tokens.sh")
self.kafka_opts = "KAFKA_OPTS=\"-Djava.security.auth.login.config=/mnt/security/jaas.conf " \
"-Djava.security.krb5.conf=/mnt/security/krb5.conf\" "
self.kafka = kafka
self.bootstrap_server = " --bootstrap-server " + self.kafka.bootstrap_servers('SASL_PLAINTEXT')
self.base_cmd = self.kafka_opts + self.command_path + self.bootstrap_server
self.client_prop_path = os.path.join(self.kafka.PERSISTENT_ROOT, "client.properties")
self.jaas_deleg_conf_path = os.path.join(self.kafka.PERSISTENT_ROOT, "jaas_deleg.conf")
self.token_hmac_path = os.path.join(self.kafka.PERSISTENT_ROOT, "deleg_token_hmac.out")
self.delegation_token_out = os.path.join(self.kafka.PERSISTENT_ROOT, "delegation_token.out")
self.expire_delegation_token_out = os.path.join(self.kafka.PERSISTENT_ROOT, "expire_delegation_token.out")
self.renew_delegation_token_out = os.path.join(self.kafka.PERSISTENT_ROOT, "renew_delegation_token.out")
self.node = self.kafka.nodes[0]
def generate_delegation_token(self, maxlifetimeperiod=-1):
self.node.account.create_file(self.client_prop_path, self.client_properties_content)
cmd = self.base_cmd + " --create" \
" --max-life-time-period %s" \
" --command-config %s > %s" % (maxlifetimeperiod, self.client_prop_path, self.delegation_token_out)
self.node.account.ssh(cmd, allow_fail=False)
def expire_delegation_token(self, hmac):
cmd = self.base_cmd + " --expire" \
" --expiry-time-period -1" \
" --hmac %s" \
" --command-config %s > %s" % (hmac, self.client_prop_path, self.expire_delegation_token_out)
self.node.account.ssh(cmd, allow_fail=False)
def renew_delegation_token(self, hmac, renew_time_period=-1):
cmd = self.base_cmd + " --renew" \
" --renew-time-period %s" \
" --hmac %s" \
" --command-config %s > %s" \
% (renew_time_period, hmac, self.client_prop_path, self.renew_delegation_token_out)
return self.node.account.ssh_capture(cmd, allow_fail=False)
def create_jaas_conf_with_delegation_token(self):
dt = self.parse_delegation_token_out()
jaas_deleg_content = """
KafkaClient {
org.apache.kafka.common.security.scram.ScramLoginModule required
username="%s"
password="%s"
tokenauth=true;
};
""" % (dt["tokenid"], dt["hmac"])
self.node.account.create_file(self.jaas_deleg_conf_path, jaas_deleg_content)
return jaas_deleg_content
def token_hmac(self):
dt = self.parse_delegation_token_out()
return dt["hmac"]
def parse_delegation_token_out(self):
cmd = "tail -1 %s" % self.delegation_token_out
output_iter = self.node.account.ssh_capture(cmd, allow_fail=False)
output = ""
for line in output_iter:
output += line
tokenid, hmac, owner, renewers, issuedate, expirydate, maxdate = output.split()
return {"tokenid" : tokenid,
"hmac" : hmac,
"owner" : owner,
"renewers" : renewers,
"issuedate" : issuedate,
"expirydate" :expirydate,
"maxdate" : maxdate}

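A hedged sketch of the full token lifecycle the helper above wraps (create, read back, renew, expire); it assumes a SASL-enabled KafkaService in `self.kafka` and a ducktape `self.test_context`, and the values are illustrative.

tokens = DelegationTokens(self.kafka, self.test_context)
tokens.generate_delegation_token(maxlifetimeperiod=-1)          # writes delegation_token.out on the first broker
jaas_snippet = tokens.create_jaas_conf_with_delegation_token()  # KafkaClient section using tokenauth=true
hmac = tokens.token_hmac()
tokens.renew_delegation_token(hmac, renew_time_period=-1)
tokens.expire_delegation_token(hmac)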
View File

@@ -0,0 +1,18 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from kafka import KafkaService
from util import TopicPartition
from config import KafkaConfig

View File

@@ -0,0 +1,48 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import config_property
class KafkaConfig(dict):
"""A dictionary-like container class which allows for definition of overridable default values,
which is also capable of "rendering" itself as a useable server.properties file.
"""
DEFAULTS = {
config_property.PORT: 9092,
config_property.SOCKET_RECEIVE_BUFFER_BYTES: 65536,
config_property.LOG_DIRS: "/mnt/kafka/kafka-data-logs-1,/mnt/kafka/kafka-data-logs-2",
config_property.ZOOKEEPER_CONNECTION_TIMEOUT_MS: 2000
}
def __init__(self, **kwargs):
super(KafkaConfig, self).__init__(**kwargs)
# Set defaults
for key, val in self.DEFAULTS.items():
if key not in self:
self[key] = val
def render(self):
"""Render self as a series of lines key=val\n, and do so in a consistent order. """
keys = [k for k in self.keys()]
keys.sort()
s = ""
for k in keys:
s += "%s=%s\n" % (k, str(self[k]))
return s

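For illustration, the rendered output for a config that overrides one default (the lines below follow directly from the DEFAULTS dict above; the broker id and port value are arbitrary):

cfg = KafkaConfig(**{config_property.BROKER_ID: 1, config_property.PORT: 9192})
print(cfg.render())
# broker.id=1
# log.dirs=/mnt/kafka/kafka-data-logs-1,/mnt/kafka/kafka-data-logs-2
# port=9192
# socket.receive.buffer.bytes=65536
# zookeeper.connection.timeout.ms=2000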
View File

@@ -0,0 +1,192 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Define Kafka configuration property names here.
"""
BROKER_ID = "broker.id"
PORT = "port"
ADVERTISED_HOSTNAME = "advertised.host.name"
NUM_NETWORK_THREADS = "num.network.threads"
NUM_IO_THREADS = "num.io.threads"
SOCKET_SEND_BUFFER_BYTES = "socket.send.buffer.bytes"
SOCKET_RECEIVE_BUFFER_BYTES = "socket.receive.buffer.bytes"
SOCKET_REQUEST_MAX_BYTES = "socket.request.max.bytes"
LOG_DIRS = "log.dirs"
NUM_PARTITIONS = "num.partitions"
NUM_RECOVERY_THREADS_PER_DATA_DIR = "num.recovery.threads.per.data.dir"
LOG_RETENTION_HOURS = "log.retention.hours"
LOG_SEGMENT_BYTES = "log.segment.bytes"
LOG_RETENTION_CHECK_INTERVAL_MS = "log.retention.check.interval.ms"
LOG_RETENTION_MS = "log.retention.ms"
LOG_CLEANER_ENABLE = "log.cleaner.enable"
AUTO_CREATE_TOPICS_ENABLE = "auto.create.topics.enable"
ZOOKEEPER_CONNECT = "zookeeper.connect"
ZOOKEEPER_SSL_CLIENT_ENABLE = "zookeeper.ssl.client.enable"
ZOOKEEPER_CLIENT_CNXN_SOCKET = "zookeeper.clientCnxnSocket"
ZOOKEEPER_CONNECTION_TIMEOUT_MS = "zookeeper.connection.timeout.ms"
INTER_BROKER_PROTOCOL_VERSION = "inter.broker.protocol.version"
MESSAGE_FORMAT_VERSION = "log.message.format.version"
MESSAGE_TIMESTAMP_TYPE = "message.timestamp.type"
THROTTLING_REPLICATION_RATE_LIMIT = "replication.quota.throttled.rate"
LOG_FLUSH_INTERVAL_MESSAGE = "log.flush.interval.messages"
REPLICA_HIGHWATERMARK_CHECKPOINT_INTERVAL_MS = "replica.high.watermark.checkpoint.interval.ms"
LOG_ROLL_TIME_MS = "log.roll.ms"
OFFSETS_TOPIC_NUM_PARTITIONS = "offsets.topic.num.partitions"
DELEGATION_TOKEN_MAX_LIFETIME_MS="delegation.token.max.lifetime.ms"
DELEGATION_TOKEN_EXPIRY_TIME_MS="delegation.token.expiry.time.ms"
DELEGATION_TOKEN_MASTER_KEY="delegation.token.master.key"
SASL_ENABLED_MECHANISMS="sasl.enabled.mechanisms"
"""
From KafkaConfig.scala
/** ********* General Configuration ***********/
val MaxReservedBrokerIdProp = "reserved.broker.max.id"
val MessageMaxBytesProp = "message.max.bytes"
val NumIoThreadsProp = "num.io.threads"
val BackgroundThreadsProp = "background.threads"
val QueuedMaxRequestsProp = "queued.max.requests"
/** ********* Socket Server Configuration ***********/
val PortProp = "port"
val HostNameProp = "host.name"
val ListenersProp = "listeners"
val AdvertisedPortProp = "advertised.port"
val AdvertisedListenersProp = "advertised.listeners"
val SocketSendBufferBytesProp = "socket.send.buffer.bytes"
val SocketReceiveBufferBytesProp = "socket.receive.buffer.bytes"
val SocketRequestMaxBytesProp = "socket.request.max.bytes"
val MaxConnectionsPerIpProp = "max.connections.per.ip"
val MaxConnectionsPerIpOverridesProp = "max.connections.per.ip.overrides"
val ConnectionsMaxIdleMsProp = "connections.max.idle.ms"
/** ********* Log Configuration ***********/
val NumPartitionsProp = "num.partitions"
val LogDirsProp = "log.dirs"
val LogDirProp = "log.dir"
val LogSegmentBytesProp = "log.segment.bytes"
val LogRollTimeMillisProp = "log.roll.ms"
val LogRollTimeHoursProp = "log.roll.hours"
val LogRollTimeJitterMillisProp = "log.roll.jitter.ms"
val LogRollTimeJitterHoursProp = "log.roll.jitter.hours"
val LogRetentionTimeMillisProp = "log.retention.ms"
val LogRetentionTimeMinutesProp = "log.retention.minutes"
val LogRetentionTimeHoursProp = "log.retention.hours"
val LogRetentionBytesProp = "log.retention.bytes"
val LogCleanupIntervalMsProp = "log.retention.check.interval.ms"
val LogCleanupPolicyProp = "log.cleanup.policy"
val LogCleanerThreadsProp = "log.cleaner.threads"
val LogCleanerIoMaxBytesPerSecondProp = "log.cleaner.io.max.bytes.per.second"
val LogCleanerDedupeBufferSizeProp = "log.cleaner.dedupe.buffer.size"
val LogCleanerIoBufferSizeProp = "log.cleaner.io.buffer.size"
val LogCleanerDedupeBufferLoadFactorProp = "log.cleaner.io.buffer.load.factor"
val LogCleanerBackoffMsProp = "log.cleaner.backoff.ms"
val LogCleanerMinCleanRatioProp = "log.cleaner.min.cleanable.ratio"
val LogCleanerEnableProp = "log.cleaner.enable"
val LogCleanerDeleteRetentionMsProp = "log.cleaner.delete.retention.ms"
val LogIndexSizeMaxBytesProp = "log.index.size.max.bytes"
val LogIndexIntervalBytesProp = "log.index.interval.bytes"
val LogFlushIntervalMessagesProp = "log.flush.interval.messages"
val LogDeleteDelayMsProp = "log.segment.delete.delay.ms"
val LogFlushSchedulerIntervalMsProp = "log.flush.scheduler.interval.ms"
val LogFlushIntervalMsProp = "log.flush.interval.ms"
val LogFlushOffsetCheckpointIntervalMsProp = "log.flush.offset.checkpoint.interval.ms"
val LogPreAllocateProp = "log.preallocate"
val NumRecoveryThreadsPerDataDirProp = "num.recovery.threads.per.data.dir"
val MinInSyncReplicasProp = "min.insync.replicas"
/** ********* Replication configuration ***********/
val ControllerSocketTimeoutMsProp = "controller.socket.timeout.ms"
val DefaultReplicationFactorProp = "default.replication.factor"
val ReplicaLagTimeMaxMsProp = "replica.lag.time.max.ms"
val ReplicaSocketTimeoutMsProp = "replica.socket.timeout.ms"
val ReplicaSocketReceiveBufferBytesProp = "replica.socket.receive.buffer.bytes"
val ReplicaFetchMaxBytesProp = "replica.fetch.max.bytes"
val ReplicaFetchWaitMaxMsProp = "replica.fetch.wait.max.ms"
val ReplicaFetchMinBytesProp = "replica.fetch.min.bytes"
val ReplicaFetchBackoffMsProp = "replica.fetch.backoff.ms"
val NumReplicaFetchersProp = "num.replica.fetchers"
val ReplicaHighWatermarkCheckpointIntervalMsProp = "replica.high.watermark.checkpoint.interval.ms"
val FetchPurgatoryPurgeIntervalRequestsProp = "fetch.purgatory.purge.interval.requests"
val ProducerPurgatoryPurgeIntervalRequestsProp = "producer.purgatory.purge.interval.requests"
val AutoLeaderRebalanceEnableProp = "auto.leader.rebalance.enable"
val LeaderImbalancePerBrokerPercentageProp = "leader.imbalance.per.broker.percentage"
val LeaderImbalanceCheckIntervalSecondsProp = "leader.imbalance.check.interval.seconds"
val UncleanLeaderElectionEnableProp = "unclean.leader.election.enable"
val InterBrokerSecurityProtocolProp = "security.inter.broker.protocol"
val InterBrokerProtocolVersionProp = "inter.broker.protocol.version"
/** ********* Controlled shutdown configuration ***********/
val ControlledShutdownMaxRetriesProp = "controlled.shutdown.max.retries"
val ControlledShutdownRetryBackoffMsProp = "controlled.shutdown.retry.backoff.ms"
val ControlledShutdownEnableProp = "controlled.shutdown.enable"
/** ********* Consumer coordinator configuration ***********/
val ConsumerMinSessionTimeoutMsProp = "consumer.min.session.timeout.ms"
val ConsumerMaxSessionTimeoutMsProp = "consumer.max.session.timeout.ms"
/** ********* Offset management configuration ***********/
val OffsetMetadataMaxSizeProp = "offset.metadata.max.bytes"
val OffsetsLoadBufferSizeProp = "offsets.load.buffer.size"
val OffsetsTopicReplicationFactorProp = "offsets.topic.replication.factor"
val OffsetsTopicPartitionsProp = "offsets.topic.num.partitions"
val OffsetsTopicSegmentBytesProp = "offsets.topic.segment.bytes"
val OffsetsTopicCompressionCodecProp = "offsets.topic.compression.codec"
val OffsetsRetentionMinutesProp = "offsets.retention.minutes"
val OffsetsRetentionCheckIntervalMsProp = "offsets.retention.check.interval.ms"
val OffsetCommitTimeoutMsProp = "offsets.commit.timeout.ms"
val OffsetCommitRequiredAcksProp = "offsets.commit.required.acks"
/** ********* Quota Configuration ***********/
val ProducerQuotaBytesPerSecondDefaultProp = "quota.producer.default"
val ConsumerQuotaBytesPerSecondDefaultProp = "quota.consumer.default"
val NumQuotaSamplesProp = "quota.window.num"
val QuotaWindowSizeSecondsProp = "quota.window.size.seconds"
val DeleteTopicEnableProp = "delete.topic.enable"
val CompressionTypeProp = "compression.type"
/** ********* Kafka Metrics Configuration ***********/
val MetricSampleWindowMsProp = CommonClientConfigs.METRICS_SAMPLE_WINDOW_MS_CONFIG
val MetricNumSamplesProp: String = CommonClientConfigs.METRICS_NUM_SAMPLES_CONFIG
val MetricReporterClassesProp: String = CommonClientConfigs.METRIC_REPORTER_CLASSES_CONFIG
/** ********* SSL Configuration ****************/
val PrincipalBuilderClassProp = SSLConfigs.PRINCIPAL_BUILDER_CLASS_CONFIG
val SSLProtocolProp = SSLConfigs.SSL_PROTOCOL_CONFIG
val SSLProviderProp = SSLConfigs.SSL_PROVIDER_CONFIG
val SSLCipherSuitesProp = SSLConfigs.SSL_CIPHER_SUITES_CONFIG
val SSLEnabledProtocolsProp = SSLConfigs.SSL_ENABLED_PROTOCOLS_CONFIG
val SSLKeystoreTypeProp = SSLConfigs.SSL_KEYSTORE_TYPE_CONFIG
val SSLKeystoreLocationProp = SSLConfigs.SSL_KEYSTORE_LOCATION_CONFIG
val SSLKeystorePasswordProp = SSLConfigs.SSL_KEYSTORE_PASSWORD_CONFIG
val SSLKeyPasswordProp = SSLConfigs.SSL_KEY_PASSWORD_CONFIG
val SSLTruststoreTypeProp = SSLConfigs.SSL_TRUSTSTORE_TYPE_CONFIG
val SSLTruststoreLocationProp = SSLConfigs.SSL_TRUSTSTORE_LOCATION_CONFIG
val SSLTruststorePasswordProp = SSLConfigs.SSL_TRUSTSTORE_PASSWORD_CONFIG
val SSLKeyManagerAlgorithmProp = SSLConfigs.SSL_KEYMANAGER_ALGORITHM_CONFIG
val SSLTrustManagerAlgorithmProp = SSLConfigs.SSL_TRUSTMANAGER_ALGORITHM_CONFIG
val SSLEndpointIdentificationAlgorithmProp = SSLConfigs.SSL_ENDPOINT_IDENTIFICATION_ALGORITHM_CONFIG
val SSLSecureRandomImplementationProp = SSLConfigs.SSL_SECURE_RANDOM_IMPLEMENTATION_CONFIG
val SSLClientAuthProp = SSLConfigs.SSL_CLIENT_AUTH_CONFIG
"""

View File

@@ -0,0 +1,897 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import json
import os.path
import re
import signal
import time
from ducktape.services.service import Service
from ducktape.utils.util import wait_until
from ducktape.cluster.remoteaccount import RemoteCommandError
from config import KafkaConfig
from kafkatest.directory_layout.kafka_path import KafkaPathResolverMixin
from kafkatest.services.kafka import config_property
from kafkatest.services.monitor.jmx import JmxMixin
from kafkatest.services.security.minikdc import MiniKdc
from kafkatest.services.security.listener_security_config import ListenerSecurityConfig
from kafkatest.services.security.security_config import SecurityConfig
from kafkatest.version import DEV_BRANCH, LATEST_0_10_0
class KafkaListener:
def __init__(self, name, port_number, security_protocol, open=False):
self.name = name
self.port_number = port_number
self.security_protocol = security_protocol
self.open = open
def listener(self):
return "%s://:%s" % (self.name, str(self.port_number))
def advertised_listener(self, node):
return "%s://%s:%s" % (self.name, node.account.hostname, str(self.port_number))
def listener_security_protocol(self):
return "%s:%s" % (self.name, self.security_protocol)
class KafkaService(KafkaPathResolverMixin, JmxMixin, Service):
PERSISTENT_ROOT = "/mnt/kafka"
STDOUT_STDERR_CAPTURE = os.path.join(PERSISTENT_ROOT, "server-start-stdout-stderr.log")
LOG4J_CONFIG = os.path.join(PERSISTENT_ROOT, "kafka-log4j.properties")
# Logs such as controller.log, server.log, etc all go here
OPERATIONAL_LOG_DIR = os.path.join(PERSISTENT_ROOT, "kafka-operational-logs")
OPERATIONAL_LOG_INFO_DIR = os.path.join(OPERATIONAL_LOG_DIR, "info")
OPERATIONAL_LOG_DEBUG_DIR = os.path.join(OPERATIONAL_LOG_DIR, "debug")
# Kafka log segments etc go here
DATA_LOG_DIR_PREFIX = os.path.join(PERSISTENT_ROOT, "kafka-data-logs")
DATA_LOG_DIR_1 = "%s-1" % (DATA_LOG_DIR_PREFIX)
DATA_LOG_DIR_2 = "%s-2" % (DATA_LOG_DIR_PREFIX)
CONFIG_FILE = os.path.join(PERSISTENT_ROOT, "kafka.properties")
# Kafka Authorizer
ACL_AUTHORIZER = "kafka.security.authorizer.AclAuthorizer"
# Old Kafka Authorizer. This is deprecated but still supported.
SIMPLE_AUTHORIZER = "kafka.security.auth.SimpleAclAuthorizer"
HEAP_DUMP_FILE = os.path.join(PERSISTENT_ROOT, "kafka_heap_dump.bin")
INTERBROKER_LISTENER_NAME = 'INTERNAL'
JAAS_CONF_PROPERTY = "java.security.auth.login.config=/mnt/security/jaas.conf"
KRB5_CONF = "java.security.krb5.conf=/mnt/security/krb5.conf"
logs = {
"kafka_server_start_stdout_stderr": {
"path": STDOUT_STDERR_CAPTURE,
"collect_default": True},
"kafka_operational_logs_info": {
"path": OPERATIONAL_LOG_INFO_DIR,
"collect_default": True},
"kafka_operational_logs_debug": {
"path": OPERATIONAL_LOG_DEBUG_DIR,
"collect_default": False},
"kafka_data_1": {
"path": DATA_LOG_DIR_1,
"collect_default": False},
"kafka_data_2": {
"path": DATA_LOG_DIR_2,
"collect_default": False},
"kafka_heap_dump_file": {
"path": HEAP_DUMP_FILE,
"collect_default": True}
}
def __init__(self, context, num_nodes, zk, security_protocol=SecurityConfig.PLAINTEXT, interbroker_security_protocol=SecurityConfig.PLAINTEXT,
client_sasl_mechanism=SecurityConfig.SASL_MECHANISM_GSSAPI, interbroker_sasl_mechanism=SecurityConfig.SASL_MECHANISM_GSSAPI,
authorizer_class_name=None, topics=None, version=DEV_BRANCH, jmx_object_names=None,
jmx_attributes=None, zk_connect_timeout=5000, zk_session_timeout=6000, server_prop_overides=None, zk_chroot=None,
zk_client_secure=False,
listener_security_config=ListenerSecurityConfig(), per_node_server_prop_overrides=None, extra_kafka_opts=""):
"""
:param context: test context
:param ZookeeperService zk:
:param dict topics: which topics to create automatically
:param str security_protocol: security protocol for clients to use
:param str interbroker_security_protocol: security protocol to use for broker-to-broker communication
:param str client_sasl_mechanism: sasl mechanism for clients to use
:param str interbroker_sasl_mechanism: sasl mechanism to use for broker-to-broker communication
:param str authorizer_class_name: which authorizer class to use
:param str version: which kafka version to use. Defaults to "dev" branch
:param jmx_object_names:
:param jmx_attributes:
:param int zk_connect_timeout:
:param int zk_session_timeout:
:param dict server_prop_overides: overrides for kafka.properties file
:param zk_chroot:
:param bool zk_client_secure: connect to Zookeeper over secure client port (TLS) when True
:param ListenerSecurityConfig listener_security_config: listener config to use
:param dict per_node_server_prop_overrides:
:param str extra_kafka_opts: jvm args to add to KAFKA_OPTS variable
"""
Service.__init__(self, context, num_nodes)
JmxMixin.__init__(self, num_nodes=num_nodes, jmx_object_names=jmx_object_names, jmx_attributes=(jmx_attributes or []),
root=KafkaService.PERSISTENT_ROOT)
self.zk = zk
self.security_protocol = security_protocol
self.client_sasl_mechanism = client_sasl_mechanism
self.topics = topics
self.minikdc = None
self.authorizer_class_name = authorizer_class_name
self.zk_set_acl = False
if server_prop_overides is None:
self.server_prop_overides = []
else:
self.server_prop_overides = server_prop_overides
if per_node_server_prop_overrides is None:
self.per_node_server_prop_overrides = {}
else:
self.per_node_server_prop_overrides = per_node_server_prop_overrides
self.log_level = "DEBUG"
self.zk_chroot = zk_chroot
self.zk_client_secure = zk_client_secure
self.listener_security_config = listener_security_config
self.extra_kafka_opts = extra_kafka_opts
#
# In a heavily loaded and not very fast machine, it is
# sometimes necessary to give more time for the zk client
# to have its session established, especially if the client
# is authenticating and waiting for the SaslAuthenticated
# in addition to the SyncConnected event.
#
# The default value for zookeeper.connect.timeout.ms is
# 2 seconds and here we increase it to 5 seconds, but
# it can be overridden by setting the corresponding parameter
# for this constructor.
self.zk_connect_timeout = zk_connect_timeout
# Also allow the session timeout to be provided explicitly,
# primarily so that test cases can depend on it when waiting
# e.g. brokers to deregister after a hard kill.
self.zk_session_timeout = zk_session_timeout
self.port_mappings = {
'PLAINTEXT': KafkaListener('PLAINTEXT', 9092, 'PLAINTEXT', False),
'SSL': KafkaListener('SSL', 9093, 'SSL', False),
'SASL_PLAINTEXT': KafkaListener('SASL_PLAINTEXT', 9094, 'SASL_PLAINTEXT', False),
'SASL_SSL': KafkaListener('SASL_SSL', 9095, 'SASL_SSL', False),
KafkaService.INTERBROKER_LISTENER_NAME:
KafkaListener(KafkaService.INTERBROKER_LISTENER_NAME, 9099, None, False)
}
self.interbroker_listener = None
self.setup_interbroker_listener(interbroker_security_protocol, self.listener_security_config.use_separate_interbroker_listener)
self.interbroker_sasl_mechanism = interbroker_sasl_mechanism
for node in self.nodes:
node.version = version
node.config = KafkaConfig(**{config_property.BROKER_ID: self.idx(node)})
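# Illustrative sketch (not part of the service): how a ducktape test might typically construct and start
# this service. The ZookeeperService import path and the test_context fixture are assumptions for the
# example only.
#
#   from kafkatest.services.zookeeper import ZookeeperService
#
#   zk = ZookeeperService(test_context, num_nodes=1)
#   zk.start()
#   kafka = KafkaService(test_context, num_nodes=3, zk=zk,
#                        topics={"test_topic": {"partitions": 2, "replication-factor": 2}})
#   kafka.start()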
def set_version(self, version):
for node in self.nodes:
node.version = version
@property
def interbroker_security_protocol(self):
return self.interbroker_listener.security_protocol
# this is required for backwards compatibility - there are a lot of tests that set this property explicitly
# meaning 'use one of the existing listeners that match given security protocol, do not use custom listener'
@interbroker_security_protocol.setter
def interbroker_security_protocol(self, security_protocol):
self.setup_interbroker_listener(security_protocol, use_separate_listener=False)
def setup_interbroker_listener(self, security_protocol, use_separate_listener=False):
self.listener_security_config.use_separate_interbroker_listener = use_separate_listener
if self.listener_security_config.use_separate_interbroker_listener:
# do not close existing port here since it is not used exclusively for interbroker communication
self.interbroker_listener = self.port_mappings[KafkaService.INTERBROKER_LISTENER_NAME]
self.interbroker_listener.security_protocol = security_protocol
else:
# close dedicated interbroker port, so it's not dangling in 'listeners' and 'advertised.listeners'
self.close_port(KafkaService.INTERBROKER_LISTENER_NAME)
self.interbroker_listener = self.port_mappings[security_protocol]
@property
def security_config(self):
config = SecurityConfig(self.context, self.security_protocol, self.interbroker_listener.security_protocol,
zk_sasl=self.zk.zk_sasl, zk_tls=self.zk_client_secure,
client_sasl_mechanism=self.client_sasl_mechanism,
interbroker_sasl_mechanism=self.interbroker_sasl_mechanism,
listener_security_config=self.listener_security_config)
for port in self.port_mappings.values():
if port.open:
config.enable_security_protocol(port.security_protocol)
return config
def open_port(self, listener_name):
self.port_mappings[listener_name].open = True
def close_port(self, listener_name):
self.port_mappings[listener_name].open = False
def start_minikdc_if_necessary(self, add_principals=""):
if self.security_config.has_sasl:
if self.minikdc is None:
self.minikdc = MiniKdc(self.context, self.nodes, extra_principals = add_principals)
self.minikdc.start()
else:
self.minikdc = None
def alive(self, node):
return len(self.pids(node)) > 0
def start(self, add_principals="", use_zk_to_create_topic=True):
if self.zk_client_secure and not self.zk.zk_client_secure_port:
raise Exception("Unable to start Kafka: TLS to Zookeeper requested but Zookeeper secure port not enabled")
self.open_port(self.security_protocol)
self.interbroker_listener.open = True
self.start_minikdc_if_necessary(add_principals)
self._ensure_zk_chroot()
Service.start(self)
self.logger.info("Waiting for brokers to register at ZK")
retries = 30
expected_broker_ids = set(self.nodes)
wait_until(lambda: {node for node in self.nodes if self.is_registered(node)} == expected_broker_ids, 30, 1)
if retries == 0:
raise RuntimeError("Kafka servers didn't register at ZK within 30 seconds")
# Create topics if necessary
if self.topics is not None:
for topic, topic_cfg in self.topics.items():
if topic_cfg is None:
topic_cfg = {}
topic_cfg["topic"] = topic
self.create_topic(topic_cfg, use_zk_to_create_topic=use_zk_to_create_topic)
def _ensure_zk_chroot(self):
self.logger.info("Ensuring zk_chroot %s exists", self.zk_chroot)
if self.zk_chroot:
if not self.zk_chroot.startswith('/'):
raise Exception("Zookeeper chroot must start with '/' but found " + self.zk_chroot)
parts = self.zk_chroot.split('/')[1:]
for i in range(len(parts)):
self.zk.create('/' + '/'.join(parts[:i+1]))
def set_protocol_and_port(self, node):
listeners = []
advertised_listeners = []
protocol_map = []
for port in self.port_mappings.values():
if port.open:
listeners.append(port.listener())
advertised_listeners.append(port.advertised_listener(node))
protocol_map.append(port.listener_security_protocol())
self.listeners = ','.join(listeners)
self.advertised_listeners = ','.join(advertised_listeners)
self.listener_security_protocol_map = ','.join(protocol_map)
self.interbroker_bootstrap_servers = self.__bootstrap_servers(self.interbroker_listener, True)
def prop_file(self, node):
self.set_protocol_and_port(node)
#load template configs as dictionary
config_template = self.render('kafka.properties', node=node, broker_id=self.idx(node),
security_config=self.security_config, num_nodes=self.num_nodes,
listener_security_config=self.listener_security_config)
configs = dict( l.rstrip().split('=', 1) for l in config_template.split('\n')
if not l.startswith("#") and "=" in l )
#load specific test override configs
override_configs = KafkaConfig(**node.config)
override_configs[config_property.ADVERTISED_HOSTNAME] = node.account.hostname
override_configs[config_property.ZOOKEEPER_CONNECT] = self.zk_connect_setting()
if self.zk_client_secure:
override_configs[config_property.ZOOKEEPER_SSL_CLIENT_ENABLE] = 'true'
override_configs[config_property.ZOOKEEPER_CLIENT_CNXN_SOCKET] = 'org.apache.zookeeper.ClientCnxnSocketNetty'
else:
override_configs[config_property.ZOOKEEPER_SSL_CLIENT_ENABLE] = 'false'
for prop in self.server_prop_overides:
override_configs[prop[0]] = prop[1]
for prop in self.per_node_server_prop_overrides.get(self.idx(node), []):
override_configs[prop[0]] = prop[1]
#update template configs with test override configs
configs.update(override_configs)
prop_file = self.render_configs(configs)
return prop_file
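# Illustrative sketch: server_prop_overides and per_node_server_prop_overrides are lists of [key, value]
# pairs that become extra lines in the rendered kafka.properties (per_node keys are 1-based node indices).
# The property names below are standard broker configs, but the values are example assumptions only.
#
#   kafka.server_prop_overides = [["log.retention.ms", "30000"], ["num.io.threads", "4"]]
#   kafka.per_node_server_prop_overrides = {1: [["broker.rack", "rack-a"]]}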
def render_configs(self, configs):
"""Render self as a series of lines key=val\n, and do so in a consistent order. """
keys = [k for k in configs.keys()]
keys.sort()
s = ""
for k in keys:
s += "%s=%s\n" % (k, str(configs[k]))
return s
def start_cmd(self, node):
cmd = "export JMX_PORT=%d; " % self.jmx_port
cmd += "export KAFKA_LOG4J_OPTS=\"-Dlog4j.configuration=file:%s\"; " % self.LOG4J_CONFIG
heap_kafka_opts = "-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=%s" % \
self.logs["kafka_heap_dump_file"]["path"]
security_kafka_opts = self.security_config.kafka_opts.strip('\"')
cmd += "export KAFKA_OPTS=\"%s %s %s\"; " % (heap_kafka_opts, security_kafka_opts, self.extra_kafka_opts)
cmd += "%s %s 1>> %s 2>> %s &" % \
(self.path.script("kafka-server-start.sh", node),
KafkaService.CONFIG_FILE,
KafkaService.STDOUT_STDERR_CAPTURE,
KafkaService.STDOUT_STDERR_CAPTURE)
return cmd
def start_node(self, node, timeout_sec=60):
node.account.mkdirs(KafkaService.PERSISTENT_ROOT)
prop_file = self.prop_file(node)
self.logger.info("kafka.properties:")
self.logger.info(prop_file)
node.account.create_file(KafkaService.CONFIG_FILE, prop_file)
node.account.create_file(self.LOG4J_CONFIG, self.render('log4j.properties', log_dir=KafkaService.OPERATIONAL_LOG_DIR))
self.security_config.setup_node(node)
self.security_config.setup_credentials(node, self.path, self.zk_connect_setting(), broker=True)
cmd = self.start_cmd(node)
self.logger.debug("Attempting to start KafkaService on %s with command: %s" % (str(node.account), cmd))
with node.account.monitor_log(KafkaService.STDOUT_STDERR_CAPTURE) as monitor:
node.account.ssh(cmd)
# Kafka 1.0.0 and higher don't have a space between "Kafka" and "Server"
monitor.wait_until("Kafka\s*Server.*started", timeout_sec=timeout_sec, backoff_sec=.25,
err_msg="Kafka server didn't finish startup in %d seconds" % timeout_sec)
# Credentials for inter-broker communication are created before starting Kafka.
# Client credentials are created after starting Kafka so that both loading of
# existing credentials from ZK and dynamic update of credentials in Kafka are tested.
self.security_config.setup_credentials(node, self.path, self.zk_connect_setting(), broker=False)
self.start_jmx_tool(self.idx(node), node)
if len(self.pids(node)) == 0:
raise Exception("No process ids recorded on node %s" % node.account.hostname)
def pids(self, node):
"""Return process ids associated with running processes on the given node."""
try:
cmd = "jcmd | grep -e %s | awk '{print $1}'" % self.java_class_name()
pid_arr = [pid for pid in node.account.ssh_capture(cmd, allow_fail=True, callback=int)]
return pid_arr
except (RemoteCommandError, ValueError) as e:
return []
def signal_node(self, node, sig=signal.SIGTERM):
pids = self.pids(node)
for pid in pids:
node.account.signal(pid, sig)
def signal_leader(self, topic, partition=0, sig=signal.SIGTERM):
leader = self.leader(topic, partition)
self.signal_node(leader, sig)
def stop_node(self, node, clean_shutdown=True, timeout_sec=60):
pids = self.pids(node)
sig = signal.SIGTERM if clean_shutdown else signal.SIGKILL
for pid in pids:
node.account.signal(pid, sig, allow_fail=False)
try:
wait_until(lambda: len(self.pids(node)) == 0, timeout_sec=timeout_sec,
err_msg="Kafka node failed to stop in %d seconds" % timeout_sec)
except Exception:
self.thread_dump(node)
raise
def thread_dump(self, node):
for pid in self.pids(node):
try:
node.account.signal(pid, signal.SIGQUIT, allow_fail=True)
except Exception:
self.logger.warn("Could not dump threads on node")
def clean_node(self, node):
JmxMixin.clean_node(self, node)
self.security_config.clean_node(node)
node.account.kill_java_processes(self.java_class_name(),
clean_shutdown=False, allow_fail=True)
node.account.ssh("sudo rm -rf -- %s" % KafkaService.PERSISTENT_ROOT, allow_fail=False)
def _kafka_topics_cmd(self, node, use_zk_connection=True):
"""
Returns the kafka-topics.sh command path with the JAAS configuration and krb5 settings in the
KAFKA_OPTS environment variable. If the Admin client is not going to be used, KAFKA_OPTS is not set.
"""
kafka_topic_script = self.path.script("kafka-topics.sh", node)
skip_security_settings = use_zk_connection or not node.version.topic_command_supports_bootstrap_server()
return kafka_topic_script if skip_security_settings else \
"KAFKA_OPTS='-D%s -D%s' %s" % (KafkaService.JAAS_CONF_PROPERTY, KafkaService.KRB5_CONF, kafka_topic_script)
def _kafka_topics_cmd_config(self, node, use_zk_connection=True):
"""
Return --command-config parameter to the kafka-topics.sh command. The config parameter specifies
the security settings that AdminClient uses to connect to a secure kafka server.
"""
skip_command_config = use_zk_connection or not node.version.topic_command_supports_bootstrap_server()
return "" if skip_command_config else " --command-config <(echo '%s')" % (self.security_config.client_config())
def create_topic(self, topic_cfg, node=None, use_zk_to_create_topic=True):
"""Run the admin tool create topic command.
Specifying node is optional, and may be done if different kafka nodes have different versions,
and we care where command gets run.
If the node is not specified, run the command from self.nodes[0]
"""
if node is None:
node = self.nodes[0]
self.logger.info("Creating topic %s with settings %s",
topic_cfg["topic"], topic_cfg)
use_zk_connection = topic_cfg.get('if-not-exists', False) or use_zk_to_create_topic
cmd = "%(kafka_topics_cmd)s %(connection_string)s --create --topic %(topic)s " % {
'kafka_topics_cmd': self._kafka_topics_cmd(node, use_zk_connection),
'connection_string': self._connect_setting(node, use_zk_connection),
'topic': topic_cfg.get("topic"),
}
if 'replica-assignment' in topic_cfg:
cmd += " --replica-assignment %(replica-assignment)s" % {
'replica-assignment': topic_cfg.get('replica-assignment')
}
else:
cmd += " --partitions %(partitions)d --replication-factor %(replication-factor)d" % {
'partitions': topic_cfg.get('partitions', 1),
'replication-factor': topic_cfg.get('replication-factor', 1)
}
if topic_cfg.get('if-not-exists', False):
cmd += ' --if-not-exists'
if "configs" in topic_cfg.keys() and topic_cfg["configs"] is not None:
for config_name, config_value in topic_cfg["configs"].items():
cmd += " --config %s=%s" % (config_name, str(config_value))
cmd += self._kafka_topics_cmd_config(node, use_zk_connection)
self.logger.info("Running topic creation command...\n%s" % cmd)
node.account.ssh(cmd)
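# Illustrative sketch of a topic_cfg dict accepted by create_topic(); the topic name and config values
# are example assumptions only:
#
#   kafka.create_topic({
#       "topic": "test_topic",
#       "partitions": 2,
#       "replication-factor": 2,
#       "if-not-exists": True,
#       "configs": {"min.insync.replicas": 2}
#   })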
def delete_topic(self, topic, node=None):
"""
Delete a topic with the topics command
:param topic:
:param node:
:return:
"""
if node is None:
node = self.nodes[0]
self.logger.info("Deleting topic %s" % topic)
kafka_topic_script = self.path.script("kafka-topics.sh", node)
cmd = kafka_topic_script + " "
cmd += "--bootstrap-server %(bootstrap_servers)s --delete --topic %(topic)s " % {
'bootstrap_servers': self.bootstrap_servers(self.security_protocol),
'topic': topic
}
self.logger.info("Running topic delete command...\n%s" % cmd)
node.account.ssh(cmd)
def describe_topic(self, topic, node=None, use_zk_to_describe_topic=True):
if node is None:
node = self.nodes[0]
cmd = "%s %s --topic %s --describe %s" % \
(self._kafka_topics_cmd(node=node, use_zk_connection=use_zk_to_describe_topic),
self._connect_setting(node=node, use_zk_connection=use_zk_to_describe_topic),
topic, self._kafka_topics_cmd_config(node=node, use_zk_connection=use_zk_to_describe_topic))
self.logger.info("Running topic describe command...\n%s" % cmd)
output = ""
for line in node.account.ssh_capture(cmd):
output += line
return output
def list_topics(self, node=None, use_zk_to_list_topic=True):
if node is None:
node = self.nodes[0]
cmd = "%s %s --list %s" % (self._kafka_topics_cmd(node, use_zk_to_list_topic),
self._connect_setting(node, use_zk_to_list_topic),
self._kafka_topics_cmd_config(node, use_zk_to_list_topic))
for line in node.account.ssh_capture(cmd):
if not line.startswith("SLF4J"):
yield line.rstrip()
def alter_message_format(self, topic, msg_format_version, node=None):
if node is None:
node = self.nodes[0]
self.logger.info("Altering message format version for topic %s with format %s", topic, msg_format_version)
cmd = "%s --zookeeper %s %s --entity-name %s --entity-type topics --alter --add-config message.format.version=%s" % \
(self.path.script("kafka-configs.sh", node), self.zk_connect_setting(), self.zk.zkTlsConfigFileOption(), topic, msg_format_version)
self.logger.info("Running alter message format command...\n%s" % cmd)
node.account.ssh(cmd)
def set_unclean_leader_election(self, topic, value=True, node=None):
if node is None:
node = self.nodes[0]
if value is True:
self.logger.info("Enabling unclean leader election for topic %s", topic)
else:
self.logger.info("Disabling unclean leader election for topic %s", topic)
cmd = "%s --zookeeper %s %s --entity-name %s --entity-type topics --alter --add-config unclean.leader.election.enable=%s" % \
(self.path.script("kafka-configs.sh", node), self.zk_connect_setting(), self.zk.zkTlsConfigFileOption(), topic, str(value).lower())
self.logger.info("Running alter unclean leader command...\n%s" % cmd)
node.account.ssh(cmd)
def parse_describe_topic(self, topic_description):
"""Parse output of kafka-topics.sh --describe (or describe_topic() method above), which is a string of form
PartitionCount:2\tReplicationFactor:2\tConfigs:
Topic: test_topic\tPartition: 0\tLeader: 3\tReplicas: 3,1\tIsr: 3,1
Topic: test_topic\tPartition: 1\tLeader: 1\tReplicas: 1,2\tIsr: 1,2
into a dictionary structure appropriate for use with reassign-partitions tool:
{
"partitions": [
{"topic": "test_topic", "partition": 0, "replicas": [3, 1]},
{"topic": "test_topic", "partition": 1, "replicas": [1, 2]}
]
}
"""
lines = map(lambda x: x.strip(), topic_description.split("\n"))
partitions = []
for line in lines:
m = re.match(".*Leader:.*", line)
if m is None:
continue
fields = line.split("\t")
# ["Partition: 4", "Leader: 0"] -> ["4", "0"]
fields = map(lambda x: x.split(" ")[1], fields)
partitions.append(
{"topic": fields[0],
"partition": int(fields[1]),
"replicas": map(int, fields[3].split(','))})
return {"partitions": partitions}
def verify_reassign_partitions(self, reassignment, node=None):
"""Run the reassign partitions admin tool in "verify" mode
"""
if node is None:
node = self.nodes[0]
json_file = "/tmp/%s_reassign.json" % str(time.time())
# reassignment to json
json_str = json.dumps(reassignment)
json_str = json.dumps(json_str)
# create command
cmd = "echo %s > %s && " % (json_str, json_file)
cmd += "%s " % self.path.script("kafka-reassign-partitions.sh", node)
cmd += "--zookeeper %s " % self.zk_connect_setting()
cmd += "--reassignment-json-file %s " % json_file
cmd += "--verify "
cmd += "&& sleep 1 && rm -f %s" % json_file
# send command
self.logger.info("Verifying partition reassignment...")
self.logger.debug(cmd)
output = ""
for line in node.account.ssh_capture(cmd):
output += line
self.logger.debug(output)
if re.match(".*Reassignment of partition.*failed.*",
output.replace('\n', '')) is not None:
return False
if re.match(".*is still in progress.*",
output.replace('\n', '')) is not None:
return False
return True
def execute_reassign_partitions(self, reassignment, node=None,
throttle=None):
"""Run the reassign partitions admin tool in "verify" mode
"""
if node is None:
node = self.nodes[0]
json_file = "/tmp/%s_reassign.json" % str(time.time())
# reassignment to json
json_str = json.dumps(reassignment)
json_str = json.dumps(json_str)
# create command
cmd = "echo %s > %s && " % (json_str, json_file)
cmd += "%s " % self.path.script( "kafka-reassign-partitions.sh", node)
cmd += "--zookeeper %s " % self.zk_connect_setting()
cmd += "--reassignment-json-file %s " % json_file
cmd += "--execute"
if throttle is not None:
cmd += " --throttle %d" % throttle
cmd += " && sleep 1 && rm -f %s" % json_file
# send command
self.logger.info("Executing parition reassignment...")
self.logger.debug(cmd)
output = ""
for line in node.account.ssh_capture(cmd):
output += line
self.logger.debug("Verify partition reassignment:")
self.logger.debug(output)
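# Illustrative sketch of the reassignment structure passed to verify_reassign_partitions() and
# execute_reassign_partitions(); it has the same shape as the value returned by parse_describe_topic().
# The topic name and broker ids are example values only.
#
#   reassignment = {
#       "partitions": [
#           {"topic": "test_topic", "partition": 0, "replicas": [1, 2]},
#           {"topic": "test_topic", "partition": 1, "replicas": [2, 3]}
#       ]
#   }
#   kafka.execute_reassign_partitions(reassignment, throttle=1000000)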
def search_data_files(self, topic, messages):
"""Check if a set of messages made it into the Kakfa data files. Note that
this method takes no account of replication. It simply looks for the
payload in all the partition files of the specified topic. 'messages' should be
an array of numbers. The list of missing messages is returned.
"""
payload_match = "payload: " + "$|payload: ".join(str(x) for x in messages) + "$"
found = set([])
self.logger.debug("number of unique missing messages we will search for: %d",
len(messages))
for node in self.nodes:
# Grab all .log files in directories prefixed with this topic
files = node.account.ssh_capture("find %s* -regex '.*/%s-.*/[^/]*.log'" % (KafkaService.DATA_LOG_DIR_PREFIX, topic))
# Check each data file to see if it contains the messages we want
for log in files:
cmd = "%s kafka.tools.DumpLogSegments --print-data-log --files %s | grep -E \"%s\"" % \
(self.path.script("kafka-run-class.sh", node), log.strip(), payload_match)
for line in node.account.ssh_capture(cmd, allow_fail=True):
for val in messages:
if line.strip().endswith("payload: "+str(val)):
self.logger.debug("Found %s in data-file [%s] in line: [%s]" % (val, log.strip(), line.strip()))
found.add(val)
self.logger.debug("Number of unique messages found in the log: %d",
len(found))
missing = list(set(messages) - found)
if len(missing) > 0:
self.logger.warn("The following values were not found in the data files: " + str(missing))
return missing
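# Illustrative sketch (message values are examples): verify that a set of acked integer payloads reached
# the on-disk data files of a topic.
#
#   missing = kafka.search_data_files("test_topic", [0, 1, 2, 3])
#   assert len(missing) == 0, "some payloads never reached the data files: %s" % missing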
def restart_cluster(self, clean_shutdown=True, timeout_sec=60, after_each_broker_restart=None, *args):
for node in self.nodes:
self.restart_node(node, clean_shutdown=clean_shutdown, timeout_sec=timeout_sec)
if after_each_broker_restart is not None:
after_each_broker_restart(*args)
def restart_node(self, node, clean_shutdown=True, timeout_sec=60):
"""Restart the given node."""
self.stop_node(node, clean_shutdown, timeout_sec)
self.start_node(node, timeout_sec)
def isr_idx_list(self, topic, partition=0):
""" Get in-sync replica list the given topic and partition.
"""
self.logger.debug("Querying zookeeper to find in-sync replicas for topic %s and partition %d" % (topic, partition))
zk_path = "/brokers/topics/%s/partitions/%d/state" % (topic, partition)
partition_state = self.zk.query(zk_path, chroot=self.zk_chroot)
if partition_state is None:
raise Exception("Error finding partition state for topic %s and partition %d." % (topic, partition))
partition_state = json.loads(partition_state)
self.logger.info(partition_state)
isr_idx_list = partition_state["isr"]
self.logger.info("Isr for topic %s and partition %d is now: %s" % (topic, partition, isr_idx_list))
return isr_idx_list
def replicas(self, topic, partition=0):
""" Get the assigned replicas for the given topic and partition.
"""
self.logger.debug("Querying zookeeper to find assigned replicas for topic %s and partition %d" % (topic, partition))
zk_path = "/brokers/topics/%s" % (topic)
assignment = self.zk.query(zk_path, chroot=self.zk_chroot)
if assignment is None:
raise Exception("Error finding partition state for topic %s and partition %d." % (topic, partition))
assignment = json.loads(assignment)
self.logger.info(assignment)
replicas = assignment["partitions"][str(partition)]
self.logger.info("Assigned replicas for topic %s and partition %d is now: %s" % (topic, partition, replicas))
return [self.get_node(replica) for replica in replicas]
def leader(self, topic, partition=0):
""" Get the leader replica for the given topic and partition.
"""
self.logger.debug("Querying zookeeper to find leader replica for topic %s and partition %d" % (topic, partition))
zk_path = "/brokers/topics/%s/partitions/%d/state" % (topic, partition)
partition_state = self.zk.query(zk_path, chroot=self.zk_chroot)
if partition_state is None:
raise Exception("Error finding partition state for topic %s and partition %d." % (topic, partition))
partition_state = json.loads(partition_state)
self.logger.info(partition_state)
leader_idx = int(partition_state["leader"])
self.logger.info("Leader for topic %s and partition %d is now: %d" % (topic, partition, leader_idx))
return self.get_node(leader_idx)
def cluster_id(self):
""" Get the current cluster id
"""
self.logger.debug("Querying ZooKeeper to retrieve cluster id")
cluster = self.zk.query("/cluster/id", chroot=self.zk_chroot)
try:
return json.loads(cluster)['id'] if cluster else None
except:
self.logger.debug("Data in /cluster/id znode could not be parsed. Data = %s" % cluster)
raise
def check_protocol_errors(self, node):
""" Checks for common protocol exceptions due to invalid inter broker protocol handling.
While such errors can and should be checked in other ways, checking the logs is a worthwhile failsafe.
"""
for node in self.nodes:
exit_code = node.account.ssh("grep -e 'java.lang.IllegalArgumentException: Invalid version' -e SchemaException %s/*"
% KafkaService.OPERATIONAL_LOG_DEBUG_DIR, allow_fail=True)
if exit_code != 1:
return False
return True
def list_consumer_groups(self, node=None, command_config=None):
""" Get list of consumer groups.
"""
if node is None:
node = self.nodes[0]
consumer_group_script = self.path.script("kafka-consumer-groups.sh", node)
if command_config is None:
command_config = ""
else:
command_config = "--command-config " + command_config
cmd = "%s --bootstrap-server %s %s --list" % \
(consumer_group_script,
self.bootstrap_servers(self.security_protocol),
command_config)
output = ""
self.logger.debug(cmd)
for line in node.account.ssh_capture(cmd):
if not line.startswith("SLF4J"):
output += line
self.logger.debug(output)
return output
def describe_consumer_group(self, group, node=None, command_config=None):
""" Describe a consumer group.
"""
if node is None:
node = self.nodes[0]
consumer_group_script = self.path.script("kafka-consumer-groups.sh", node)
if command_config is None:
command_config = ""
else:
command_config = "--command-config " + command_config
cmd = "%s --bootstrap-server %s %s --group %s --describe" % \
(consumer_group_script,
self.bootstrap_servers(self.security_protocol),
command_config, group)
output = ""
self.logger.debug(cmd)
for line in node.account.ssh_capture(cmd):
if not (line.startswith("SLF4J") or line.startswith("TOPIC") or line.startswith("Could not fetch offset")):
output += line
self.logger.debug(output)
return output
def zk_connect_setting(self):
return self.zk.connect_setting(self.zk_chroot, self.zk_client_secure)
def _connect_setting(self, node, use_zk_connection=True):
"""
Checks if --bootstrap-server config is supported, if yes then returns a string with
bootstrap server, otherwise returns zookeeper connection string.
"""
if node.version.topic_command_supports_bootstrap_server() and not use_zk_connection:
connection_setting = "--bootstrap-server %s" % (self.bootstrap_servers(self.security_protocol))
else:
connection_setting = "--zookeeper %s" % (self.zk_connect_setting())
return connection_setting
def __bootstrap_servers(self, port, validate=True, offline_nodes=[]):
if validate and not port.open:
raise ValueError("We are retrieving bootstrap servers for the port: %s which is not currently open. - " %
str(port.port_number))
return ','.join([node.account.hostname + ":" + str(port.port_number)
for node in self.nodes
if node not in offline_nodes])
def bootstrap_servers(self, protocol='PLAINTEXT', validate=True, offline_nodes=[]):
"""Return comma-delimited list of brokers in this cluster formatted as HOSTNAME1:PORT1,HOSTNAME:PORT2,...
This is the format expected by many config files.
"""
port_mapping = self.port_mappings[protocol]
self.logger.info("Bootstrap client port is: " + str(port_mapping.port_number))
return self.__bootstrap_servers(port_mapping, validate, offline_nodes)
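# Illustrative example of the returned string for a three-node cluster on the default PLAINTEXT
# listener (hostnames are example values):
#
#   "worker1:9092,worker2:9092,worker3:9092"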
def controller(self):
""" Get the controller node
"""
self.logger.debug("Querying zookeeper to find controller broker")
controller_info = self.zk.query("/controller", chroot=self.zk_chroot)
if controller_info is None:
raise Exception("Error finding controller info")
controller_info = json.loads(controller_info)
self.logger.debug(controller_info)
controller_idx = int(controller_info["brokerid"])
self.logger.info("Controller's ID: %d" % (controller_idx))
return self.get_node(controller_idx)
def is_registered(self, node):
"""
Check whether a broker is registered in Zookeeper
"""
self.logger.debug("Querying zookeeper to see if broker %s is registered", str(node))
broker_info = self.zk.query("/brokers/ids/%s" % self.idx(node), chroot=self.zk_chroot)
self.logger.debug("Broker info: %s", broker_info)
return broker_info is not None
def get_offset_shell(self, topic, partitions, max_wait_ms, offsets, time):
node = self.nodes[0]
cmd = self.path.script("kafka-run-class.sh", node)
cmd += " kafka.tools.GetOffsetShell"
cmd += " --topic %s --broker-list %s --max-wait-ms %s --offsets %s --time %s" % (topic, self.bootstrap_servers(self.security_protocol), max_wait_ms, offsets, time)
if partitions:
cmd += ' --partitions %s' % partitions
cmd += " 2>> %s/get_offset_shell.log" % KafkaService.PERSISTENT_ROOT
cmd += " | tee -a %s/get_offset_shell.log &" % KafkaService.PERSISTENT_ROOT
output = ""
self.logger.debug(cmd)
for line in node.account.ssh_capture(cmd):
output += line
self.logger.debug(output)
return output
def java_class_name(self):
return "kafka.Kafka"

View File

@@ -0,0 +1,91 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# see kafka.server.KafkaConfig for additional details and defaults
advertised.host.name={{ node.account.hostname }}
listeners={{ listeners }}
advertised.listeners={{ advertised_listeners }}
listener.security.protocol.map={{ listener_security_protocol_map }}
{% if node.version.supports_named_listeners() %}
inter.broker.listener.name={{ interbroker_listener.name }}
{% else %}
security.inter.broker.protocol={{ interbroker_listener.security_protocol }}
{% endif %}
{% for k, v in listener_security_config.client_listener_overrides.iteritems() %}
{% if listener_security_config.requires_sasl_mechanism_prefix(k) %}
listener.name.{{ security_protocol.lower() }}.{{ security_config.client_sasl_mechanism.lower() }}.{{ k }}={{ v }}
{% else %}
listener.name.{{ security_protocol.lower() }}.{{ k }}={{ v }}
{% endif %}
{% endfor %}
{% if interbroker_listener.name != security_protocol %}
{% for k, v in listener_security_config.interbroker_listener_overrides.iteritems() %}
{% if listener_security_config.requires_sasl_mechanism_prefix(k) %}
listener.name.{{ interbroker_listener.name.lower() }}.{{ security_config.interbroker_sasl_mechanism.lower() }}.{{ k }}={{ v }}
{% else %}
listener.name.{{ interbroker_listener.name.lower() }}.{{ k }}={{ v }}
{% endif %}
{% endfor %}
{% endif %}
ssl.keystore.location=/mnt/security/test.keystore.jks
ssl.keystore.password=test-ks-passwd
ssl.key.password=test-ks-passwd
ssl.keystore.type=JKS
ssl.truststore.location=/mnt/security/test.truststore.jks
ssl.truststore.password=test-ts-passwd
ssl.truststore.type=JKS
ssl.endpoint.identification.algorithm=HTTPS
# Zookeeper TLS settings
#
# Note that zookeeper.ssl.client.enable will be set to true or false elsewhere, as appropriate.
# If it is false then these ZK keystore/truststore settings will have no effect. If it is true then
# zookeeper.clientCnxnSocket will also be set elsewhere (to org.apache.zookeeper.ClientCnxnSocketNetty)
{% if not zk.zk_tls_encrypt_only %}
zookeeper.ssl.keystore.location=/mnt/security/test.keystore.jks
zookeeper.ssl.keystore.password=test-ks-passwd
{% endif %}
zookeeper.ssl.truststore.location=/mnt/security/test.truststore.jks
zookeeper.ssl.truststore.password=test-ts-passwd
#
sasl.mechanism.inter.broker.protocol={{ security_config.interbroker_sasl_mechanism }}
sasl.enabled.mechanisms={{ ",".join(security_config.enabled_sasl_mechanisms) }}
sasl.kerberos.service.name=kafka
{% if authorizer_class_name is not none %}
ssl.client.auth=required
authorizer.class.name={{ authorizer_class_name }}
{% endif %}
zookeeper.set.acl={{"true" if zk_set_acl else "false"}}
zookeeper.connection.timeout.ms={{ zk_connect_timeout }}
zookeeper.session.timeout.ms={{ zk_session_timeout }}
{% if replica_lag is defined %}
replica.lag.time.max.ms={{replica_lag}}
{% endif %}
{% if auto_create_topics_enable is defined and auto_create_topics_enable is not none %}
auto.create.topics.enable={{ auto_create_topics_enable }}
{% endif %}
offsets.topic.num.partitions={{ num_nodes }}
offsets.topic.replication.factor={{ 3 if num_nodes > 3 else num_nodes }}
# Set to a low, but non-zero value to exercise this path without making tests much slower
group.initial.rebalance.delay.ms=100

View File

@@ -0,0 +1,136 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
log4j.rootLogger={{ log_level|default("DEBUG") }}, stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=[%d] %p %m (%c)%n
# INFO level appenders
log4j.appender.kafkaInfoAppender=org.apache.log4j.DailyRollingFileAppender
log4j.appender.kafkaInfoAppender.DatePattern='.'yyyy-MM-dd-HH
log4j.appender.kafkaInfoAppender.File={{ log_dir }}/info/server.log
log4j.appender.kafkaInfoAppender.layout=org.apache.log4j.PatternLayout
log4j.appender.kafkaInfoAppender.layout.ConversionPattern=[%d] %p %m (%c)%n
log4j.appender.kafkaInfoAppender.Threshold=INFO
log4j.appender.stateChangeInfoAppender=org.apache.log4j.DailyRollingFileAppender
log4j.appender.stateChangeInfoAppender.DatePattern='.'yyyy-MM-dd-HH
log4j.appender.stateChangeInfoAppender.File={{ log_dir }}/info/state-change.log
log4j.appender.stateChangeInfoAppender.layout=org.apache.log4j.PatternLayout
log4j.appender.stateChangeInfoAppender.layout.ConversionPattern=[%d] %p %m (%c)%n
log4j.appender.stateChangeInfoAppender.Threshold=INFO
log4j.appender.requestInfoAppender=org.apache.log4j.DailyRollingFileAppender
log4j.appender.requestInfoAppender.DatePattern='.'yyyy-MM-dd-HH
log4j.appender.requestInfoAppender.File={{ log_dir }}/info/kafka-request.log
log4j.appender.requestInfoAppender.layout=org.apache.log4j.PatternLayout
log4j.appender.requestInfoAppender.layout.ConversionPattern=[%d] %p %m (%c)%n
log4j.appender.requestInfoAppender.Threshold=INFO
log4j.appender.cleanerInfoAppender=org.apache.log4j.DailyRollingFileAppender
log4j.appender.cleanerInfoAppender.DatePattern='.'yyyy-MM-dd-HH
log4j.appender.cleanerInfoAppender.File={{ log_dir }}/info/log-cleaner.log
log4j.appender.cleanerInfoAppender.layout=org.apache.log4j.PatternLayout
log4j.appender.cleanerInfoAppender.layout.ConversionPattern=[%d] %p %m (%c)%n
log4j.appender.cleanerInfoAppender.Threshold=INFO
log4j.appender.controllerInfoAppender=org.apache.log4j.DailyRollingFileAppender
log4j.appender.controllerInfoAppender.DatePattern='.'yyyy-MM-dd-HH
log4j.appender.controllerInfoAppender.File={{ log_dir }}/info/controller.log
log4j.appender.controllerInfoAppender.layout=org.apache.log4j.PatternLayout
log4j.appender.controllerInfoAppender.layout.ConversionPattern=[%d] %p %m (%c)%n
log4j.appender.controllerInfoAppender.Threshold=INFO
log4j.appender.authorizerInfoAppender=org.apache.log4j.DailyRollingFileAppender
log4j.appender.authorizerInfoAppender.DatePattern='.'yyyy-MM-dd-HH
log4j.appender.authorizerInfoAppender.File={{ log_dir }}/info/kafka-authorizer.log
log4j.appender.authorizerInfoAppender.layout=org.apache.log4j.PatternLayout
log4j.appender.authorizerInfoAppender.layout.ConversionPattern=[%d] %p %m (%c)%n
log4j.appender.authorizerInfoAppender.Threshold=INFO
# DEBUG level appenders
log4j.appender.kafkaDebugAppender=org.apache.log4j.DailyRollingFileAppender
log4j.appender.kafkaDebugAppender.DatePattern='.'yyyy-MM-dd-HH
log4j.appender.kafkaDebugAppender.File={{ log_dir }}/debug/server.log
log4j.appender.kafkaDebugAppender.layout=org.apache.log4j.PatternLayout
log4j.appender.kafkaDebugAppender.layout.ConversionPattern=[%d] %p %m (%c)%n
log4j.appender.kafkaDebugAppender.Threshold=DEBUG
log4j.appender.stateChangeDebugAppender=org.apache.log4j.DailyRollingFileAppender
log4j.appender.stateChangeDebugAppender.DatePattern='.'yyyy-MM-dd-HH
log4j.appender.stateChangeDebugAppender.File={{ log_dir }}/debug/state-change.log
log4j.appender.stateChangeDebugAppender.layout=org.apache.log4j.PatternLayout
log4j.appender.stateChangeDebugAppender.layout.ConversionPattern=[%d] %p %m (%c)%n
log4j.appender.stateChangeDebugAppender.Threshold=DEBUG
log4j.appender.requestDebugAppender=org.apache.log4j.DailyRollingFileAppender
log4j.appender.requestDebugAppender.DatePattern='.'yyyy-MM-dd-HH
log4j.appender.requestDebugAppender.File={{ log_dir }}/debug/kafka-request.log
log4j.appender.requestDebugAppender.layout=org.apache.log4j.PatternLayout
log4j.appender.requestDebugAppender.layout.ConversionPattern=[%d] %p %m (%c)%n
log4j.appender.requestDebugAppender.Threshold=DEBUG
log4j.appender.cleanerDebugAppender=org.apache.log4j.DailyRollingFileAppender
log4j.appender.cleanerDebugAppender.DatePattern='.'yyyy-MM-dd-HH
log4j.appender.cleanerDebugAppender.File={{ log_dir }}/debug/log-cleaner.log
log4j.appender.cleanerDebugAppender.layout=org.apache.log4j.PatternLayout
log4j.appender.cleanerDebugAppender.layout.ConversionPattern=[%d] %p %m (%c)%n
log4j.appender.cleanerDebugAppender.Threshold=DEBUG
log4j.appender.controllerDebugAppender=org.apache.log4j.DailyRollingFileAppender
log4j.appender.controllerDebugAppender.DatePattern='.'yyyy-MM-dd-HH
log4j.appender.controllerDebugAppender.File={{ log_dir }}/debug/controller.log
log4j.appender.controllerDebugAppender.layout=org.apache.log4j.PatternLayout
log4j.appender.controllerDebugAppender.layout.ConversionPattern=[%d] %p %m (%c)%n
log4j.appender.controllerDebugAppender.Threshold=DEBUG
log4j.appender.authorizerDebugAppender=org.apache.log4j.DailyRollingFileAppender
log4j.appender.authorizerDebugAppender.DatePattern='.'yyyy-MM-dd-HH
log4j.appender.authorizerDebugAppender.File={{ log_dir }}/debug/kafka-authorizer.log
log4j.appender.authorizerDebugAppender.layout=org.apache.log4j.PatternLayout
log4j.appender.authorizerDebugAppender.layout.ConversionPattern=[%d] %p %m (%c)%n
log4j.appender.authorizerDebugAppender.Threshold=DEBUG
# Turn on all our debugging info
log4j.logger.kafka.producer.async.DefaultEventHandler={{ log_level|default("DEBUG") }}, kafkaInfoAppender, kafkaDebugAppender
log4j.logger.kafka.client.ClientUtils={{ log_level|default("DEBUG") }}, kafkaInfoAppender, kafkaDebugAppender
log4j.logger.kafka.perf={{ log_level|default("DEBUG") }}, kafkaInfoAppender, kafkaDebugAppender
log4j.logger.kafka.perf.ProducerPerformance$ProducerThread={{ log_level|default("DEBUG") }}, kafkaInfoAppender, kafkaDebugAppender
log4j.logger.kafka={{ log_level|default("DEBUG") }}, kafkaInfoAppender, kafkaDebugAppender
log4j.logger.kafka.network.RequestChannel$={{ log_level|default("DEBUG") }}, requestInfoAppender, requestDebugAppender
log4j.additivity.kafka.network.RequestChannel$=false
log4j.logger.kafka.network.Processor={{ log_level|default("DEBUG") }}, requestInfoAppender, requestDebugAppender
log4j.logger.kafka.server.KafkaApis={{ log_level|default("DEBUG") }}, requestInfoAppender, requestDebugAppender
log4j.additivity.kafka.server.KafkaApis=false
log4j.logger.kafka.request.logger={{ log_level|default("DEBUG") }}, requestInfoAppender, requestDebugAppender
log4j.additivity.kafka.request.logger=false
log4j.logger.kafka.controller={{ log_level|default("DEBUG") }}, controllerInfoAppender, controllerDebugAppender
log4j.additivity.kafka.controller=false
log4j.logger.kafka.log.LogCleaner={{ log_level|default("DEBUG") }}, cleanerInfoAppender, cleanerDebugAppender
log4j.additivity.kafka.log.LogCleaner=false
log4j.logger.state.change.logger={{ log_level|default("DEBUG") }}, stateChangeInfoAppender, stateChangeDebugAppender
log4j.additivity.state.change.logger=false
#Change this to debug to get the actual audit log for authorizer.
log4j.logger.kafka.authorizer.logger={{ log_level|default("DEBUG") }}, authorizerInfoAppender, authorizerDebugAppender
log4j.additivity.kafka.authorizer.logger=false

View File

@@ -0,0 +1,18 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import namedtuple
TopicPartition = namedtuple('TopicPartition', ['topic', 'partition'])
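# Illustrative usage (example values only):
#
#   tp = TopicPartition(topic="test_topic", partition=0)
#   tp.topic      # "test_topic"
#   tp.partition  # 0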

View File

@@ -0,0 +1,83 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ducktape.services.background_thread import BackgroundThreadService
from kafkatest.directory_layout.kafka_path import KafkaPathResolverMixin
from kafkatest.services.security.security_config import SecurityConfig
class KafkaLog4jAppender(KafkaPathResolverMixin, BackgroundThreadService):
logs = {
"producer_log": {
"path": "/mnt/kafka_log4j_appender.log",
"collect_default": False}
}
def __init__(self, context, num_nodes, kafka, topic, max_messages=-1, security_protocol="PLAINTEXT"):
super(KafkaLog4jAppender, self).__init__(context, num_nodes)
self.kafka = kafka
self.topic = topic
self.max_messages = max_messages
self.security_protocol = security_protocol
self.security_config = SecurityConfig(self.context, security_protocol)
self.stop_timeout_sec = 30
def _worker(self, idx, node):
cmd = self.start_cmd(node)
self.logger.debug("VerifiableLog4jAppender %d command: %s" % (idx, cmd))
self.security_config.setup_node(node)
node.account.ssh(cmd)
def start_cmd(self, node):
cmd = self.path.script("kafka-run-class.sh", node)
cmd += " "
cmd += self.java_class_name()
cmd += " --topic %s --broker-list %s" % (self.topic, self.kafka.bootstrap_servers(self.security_protocol))
if self.max_messages > 0:
cmd += " --max-messages %s" % str(self.max_messages)
if self.security_protocol != SecurityConfig.PLAINTEXT:
cmd += " --security-protocol %s" % str(self.security_protocol)
if self.security_protocol == SecurityConfig.SSL or self.security_protocol == SecurityConfig.SASL_SSL:
cmd += " --ssl-truststore-location %s" % str(SecurityConfig.TRUSTSTORE_PATH)
cmd += " --ssl-truststore-password %s" % str(SecurityConfig.ssl_stores.truststore_passwd)
if self.security_protocol == SecurityConfig.SASL_PLAINTEXT or \
self.security_protocol == SecurityConfig.SASL_SSL or \
self.security_protocol == SecurityConfig.SASL_MECHANISM_GSSAPI or \
self.security_protocol == SecurityConfig.SASL_MECHANISM_PLAIN:
cmd += " --sasl-kerberos-service-name %s" % str('kafka')
cmd += " --client-jaas-conf-path %s" % str(SecurityConfig.JAAS_CONF_PATH)
cmd += " --kerb5-conf-path %s" % str(SecurityConfig.KRB5CONF_PATH)
cmd += " 2>> /mnt/kafka_log4j_appender.log | tee -a /mnt/kafka_log4j_appender.log &"
return cmd
def stop_node(self, node):
node.account.kill_java_processes(self.java_class_name(), allow_fail=False)
stopped = self.wait_node(node, timeout_sec=self.stop_timeout_sec)
assert stopped, "Node %s: did not stop within the specified timeout of %s seconds" % \
(str(node.account), str(self.stop_timeout_sec))
def clean_node(self, node):
node.account.kill_java_processes(self.java_class_name(), clean_shutdown=False,
allow_fail=False)
node.account.ssh("rm -rf /mnt/kafka_log4j_appender.log", allow_fail=False)
def java_class_name(self):
return "org.apache.kafka.tools.VerifiableLog4jAppender"

View File

@@ -0,0 +1,88 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from ducktape.services.background_thread import BackgroundThreadService
from kafkatest.directory_layout.kafka_path import KafkaPathResolverMixin, CORE_LIBS_JAR_NAME, CORE_DEPENDANT_TEST_LIBS_JAR_NAME
from kafkatest.services.security.security_config import SecurityConfig
from kafkatest.version import DEV_BRANCH
class LogCompactionTester(KafkaPathResolverMixin, BackgroundThreadService):
OUTPUT_DIR = "/mnt/logcompaction_tester"
LOG_PATH = os.path.join(OUTPUT_DIR, "logcompaction_tester_stdout.log")
VERIFICATION_STRING = "Data verification is completed"
logs = {
"tool_logs": {
"path": LOG_PATH,
"collect_default": True}
}
def __init__(self, context, kafka, security_protocol="PLAINTEXT", stop_timeout_sec=30):
super(LogCompactionTester, self).__init__(context, 1)
self.kafka = kafka
self.security_protocol = security_protocol
self.security_config = SecurityConfig(self.context, security_protocol)
self.stop_timeout_sec = stop_timeout_sec
self.log_compaction_completed = False
def _worker(self, idx, node):
node.account.ssh("mkdir -p %s" % LogCompactionTester.OUTPUT_DIR)
cmd = self.start_cmd(node)
self.logger.info("LogCompactionTester %d command: %s" % (idx, cmd))
self.security_config.setup_node(node)
for line in node.account.ssh_capture(cmd):
self.logger.debug("Checking line:{}".format(line))
if line.startswith(LogCompactionTester.VERIFICATION_STRING):
self.log_compaction_completed = True
def start_cmd(self, node):
core_libs_jar = self.path.jar(CORE_LIBS_JAR_NAME, DEV_BRANCH)
core_dependant_test_libs_jar = self.path.jar(CORE_DEPENDANT_TEST_LIBS_JAR_NAME, DEV_BRANCH)
cmd = "for file in %s; do CLASSPATH=$CLASSPATH:$file; done;" % core_libs_jar
cmd += " for file in %s; do CLASSPATH=$CLASSPATH:$file; done;" % core_dependant_test_libs_jar
cmd += " export CLASSPATH;"
cmd += self.path.script("kafka-run-class.sh", node)
cmd += " %s" % self.java_class_name()
cmd += " --bootstrap-server %s --messages 1000000 --sleep 20 --duplicates 10 --percent-deletes 10" % (self.kafka.bootstrap_servers(self.security_protocol))
cmd += " 2>> %s | tee -a %s &" % (self.logs["tool_logs"]["path"], self.logs["tool_logs"]["path"])
return cmd
def stop_node(self, node):
node.account.kill_java_processes(self.java_class_name(), clean_shutdown=True,
allow_fail=True)
stopped = self.wait_node(node, timeout_sec=self.stop_timeout_sec)
assert stopped, "Node %s: did not stop within the specified timeout of %s seconds" % \
(str(node.account), str(self.stop_timeout_sec))
def clean_node(self, node):
node.account.kill_java_processes(self.java_class_name(), clean_shutdown=False,
allow_fail=True)
node.account.ssh("rm -rf %s" % LogCompactionTester.OUTPUT_DIR, allow_fail=False)
def java_class_name(self):
return "kafka.tools.LogCompactionTester"
@property
def is_done(self):
return self.log_compaction_completed

View File

@@ -0,0 +1,164 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from ducktape.services.service import Service
from ducktape.utils.util import wait_until
from kafkatest.directory_layout.kafka_path import KafkaPathResolverMixin
"""
MirrorMaker is a tool for mirroring data between two Kafka clusters.
"""
class MirrorMaker(KafkaPathResolverMixin, Service):
# Root directory for persistent output
PERSISTENT_ROOT = "/mnt/mirror_maker"
LOG_DIR = os.path.join(PERSISTENT_ROOT, "logs")
LOG_FILE = os.path.join(LOG_DIR, "mirror_maker.log")
LOG4J_CONFIG = os.path.join(PERSISTENT_ROOT, "tools-log4j.properties")
PRODUCER_CONFIG = os.path.join(PERSISTENT_ROOT, "producer.properties")
CONSUMER_CONFIG = os.path.join(PERSISTENT_ROOT, "consumer.properties")
logs = {
"mirror_maker_log": {
"path": LOG_FILE,
"collect_default": True}
}
def __init__(self, context, num_nodes, source, target, whitelist=None, num_streams=1,
consumer_timeout_ms=None, offsets_storage="kafka",
offset_commit_interval_ms=60000, log_level="DEBUG", producer_interceptor_classes=None):
"""
MirrorMaker mirrors messages from one or more source clusters to a single destination cluster.
Args:
context: standard context
source: source Kafka cluster
target: target Kafka cluster to which data will be mirrored
whitelist: whitelist regex for topics to mirror
num_streams: number of consumer threads to create; can be a single int, or a list with
one value per node, allowing num_streams to be the same for each node,
or configured independently per-node
consumer_timeout_ms: consumer stops if t > consumer_timeout_ms elapses between consecutive messages
offsets_storage: used for consumer offsets.storage property
offset_commit_interval_ms: how frequently the mirror maker consumer commits offsets
"""
super(MirrorMaker, self).__init__(context, num_nodes=num_nodes)
self.log_level = log_level
self.consumer_timeout_ms = consumer_timeout_ms
self.num_streams = num_streams
if not isinstance(num_streams, int):
# if not an integer, num_streams should be configured per-node
assert len(num_streams) == num_nodes
self.whitelist = whitelist
self.source = source
self.target = target
self.offsets_storage = offsets_storage.lower()
if not (self.offsets_storage in ["kafka", "zookeeper"]):
raise Exception("offsets_storage should be 'kafka' or 'zookeeper'. Instead found %s" % self.offsets_storage)
self.offset_commit_interval_ms = offset_commit_interval_ms
self.producer_interceptor_classes = producer_interceptor_classes
self.external_jars = None
# These properties are potentially used by third-party tests.
self.source_auto_offset_reset = None
self.partition_assignment_strategy = None
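# Illustrative sketch (example values only): mirror every topic matching the whitelist from one running
# KafkaService ("source_kafka") to another ("target_kafka"); both names are assumptions for the example.
#
#   mm = MirrorMaker(test_context, num_nodes=2, source=source_kafka, target=target_kafka,
#                    whitelist=".*", num_streams=1, offsets_storage="kafka")
#   mm.start()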
def start_cmd(self, node):
cmd = "export LOG_DIR=%s;" % MirrorMaker.LOG_DIR
cmd += " export KAFKA_LOG4J_OPTS=\"-Dlog4j.configuration=file:%s\";" % MirrorMaker.LOG4J_CONFIG
cmd += " export KAFKA_OPTS=%s;" % self.security_config.kafka_opts
# add external dependencies, for instance for interceptors
if self.external_jars is not None:
cmd += "for file in %s; do CLASSPATH=$CLASSPATH:$file; done; " % self.external_jars
cmd += "export CLASSPATH; "
cmd += " %s %s" % (self.path.script("kafka-run-class.sh", node),
self.java_class_name())
cmd += " --consumer.config %s" % MirrorMaker.CONSUMER_CONFIG
cmd += " --producer.config %s" % MirrorMaker.PRODUCER_CONFIG
cmd += " --offset.commit.interval.ms %s" % str(self.offset_commit_interval_ms)
if isinstance(self.num_streams, int):
cmd += " --num.streams %d" % self.num_streams
else:
# config num_streams separately on each node
cmd += " --num.streams %d" % self.num_streams[self.idx(node) - 1]
if self.whitelist is not None:
cmd += " --whitelist=\"%s\"" % self.whitelist
cmd += " 1>> %s 2>> %s &" % (MirrorMaker.LOG_FILE, MirrorMaker.LOG_FILE)
return cmd
def pids(self, node):
return node.account.java_pids(self.java_class_name())
def alive(self, node):
return len(self.pids(node)) > 0
def start_node(self, node):
node.account.ssh("mkdir -p %s" % MirrorMaker.PERSISTENT_ROOT, allow_fail=False)
node.account.ssh("mkdir -p %s" % MirrorMaker.LOG_DIR, allow_fail=False)
self.security_config = self.source.security_config.client_config()
self.security_config.setup_node(node)
# Create, upload one consumer config file for source cluster
consumer_props = self.render("mirror_maker_consumer.properties")
consumer_props += str(self.security_config)
node.account.create_file(MirrorMaker.CONSUMER_CONFIG, consumer_props)
self.logger.info("Mirrormaker consumer props:\n" + consumer_props)
# Create, upload producer properties file for target cluster
producer_props = self.render('mirror_maker_producer.properties')
producer_props += str(self.security_config)
self.logger.info("Mirrormaker producer props:\n" + producer_props)
node.account.create_file(MirrorMaker.PRODUCER_CONFIG, producer_props)
# Create and upload log properties
log_config = self.render('tools_log4j.properties', log_file=MirrorMaker.LOG_FILE)
node.account.create_file(MirrorMaker.LOG4J_CONFIG, log_config)
# Run mirror maker
cmd = self.start_cmd(node)
self.logger.debug("Mirror maker command: %s", cmd)
node.account.ssh(cmd, allow_fail=False)
wait_until(lambda: self.alive(node), timeout_sec=30, backoff_sec=.5,
err_msg="Mirror maker took to long to start.")
self.logger.debug("Mirror maker is alive")
def stop_node(self, node, clean_shutdown=True):
node.account.kill_java_processes(self.java_class_name(), allow_fail=True,
clean_shutdown=clean_shutdown)
wait_until(lambda: not self.alive(node), timeout_sec=30, backoff_sec=.5,
err_msg="Mirror maker took to long to stop.")
def clean_node(self, node):
if self.alive(node):
self.logger.warn("%s %s was still alive at cleanup time. Killing forcefully..." %
(self.__class__.__name__, node.account))
node.account.kill_java_processes(self.java_class_name(), clean_shutdown=False,
allow_fail=True)
node.account.ssh("rm -rf %s" % MirrorMaker.PERSISTENT_ROOT, allow_fail=False)
self.security_config.clean_node(node)
def java_class_name(self):
return "kafka.tools.MirrorMaker"

View File

@@ -0,0 +1,14 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,228 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
from collections import defaultdict, namedtuple
import json
from threading import Thread
from select import select
import socket
MetricKey = namedtuple('MetricKey', ['host', 'client_id', 'name', 'group', 'tags'])
MetricValue = namedtuple('MetricValue', ['time', 'value'])
# Python's logging library doesn't define anything more detailed than DEBUG, but we'd like a finer-grained setting
# for highly detailed messages, e.g. logging every single incoming request.
TRACE = 5
class HttpMetricsCollector(object):
"""
HttpMetricsCollector enables collection of metrics from various Kafka clients instrumented with the
PushHttpMetricsReporter. It starts a web server locally and provides the necessary configuration for clients
to automatically report metrics data to this server. It also provides basic functionality for querying the
recorded metrics. This class can be used either as a mixin or standalone object.
"""
# The port opened on each worker node; connections to it are reverse-forwarded to the HTTP server's local port on this driver node
REMOTE_PORT = 6789
def __init__(self, **kwargs):
"""
Create a new HttpMetricsCollector
:param period: the period, in seconds, between metric updates that the generated reporter configuration requests.
Defaults to reporting once per second.
:param args:
:param kwargs:
"""
self._http_metrics_period = kwargs.pop('period', 1)
super(HttpMetricsCollector, self).__init__(**kwargs)
# TODO: currently we maintain just a simple map from all key info -> value. However, some key fields are far
# more common to filter on, so we'd want to index by them, e.g. host, client.id, metric name.
self._http_metrics = defaultdict(list)
self._httpd = HTTPServer(('', 0), _MetricsReceiver)
self._httpd.parent = self
self._httpd.metrics = self._http_metrics
self._http_metrics_thread = Thread(target=self._run_http_metrics_httpd,
name='http-metrics-thread[%s]' % str(self))
self._http_metrics_thread.start()
self._forwarders = {}
@property
def http_metrics_url(self):
"""
:return: the URL to use when reporting metrics
"""
return "http://%s:%d" % ("localhost", self.REMOTE_PORT)
@property
def http_metrics_client_configs(self):
"""
Get client configurations that can be used to report data to this collector. Put these in a properties file for
clients (e.g. console producer or consumer) to have them push metrics to this driver. Note that in some cases
(e.g. streams, connect) these settings may need to be prefixed.
:return: a dictionary of client configurations that will direct a client to report metrics to this collector
"""
return {
"metric.reporters": "org.apache.kafka.tools.PushHttpMetricsReporter",
"metrics.url": self.http_metrics_url,
"metrics.period": self._http_metrics_period,
}
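# Illustrative usage sketch (not part of this class; `collector` and `node` are assumed to be an
# HttpMetricsCollector instance and a ducktape node): render these configs into a properties file
# that a client such as the console producer can pick up.
#
#   client_props = "\n".join(["%s=%s" % (k, v) for k, v in collector.http_metrics_client_configs.items()])
#   node.account.create_file("/mnt/client.properties", client_props)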
def start_node(self, node):
local_port = self._httpd.socket.getsockname()[1]
self.logger.debug('HttpMetricsCollector listening on %s', local_port)
self._forwarders[self.idx(node)] = _ReverseForwarder(self.logger, node, self.REMOTE_PORT, local_port)
super(HttpMetricsCollector, self).start_node(node)
def stop(self):
super(HttpMetricsCollector, self).stop()
if self._http_metrics_thread:
self.logger.debug("Shutting down metrics httpd")
self._httpd.shutdown()
self._http_metrics_thread.join()
self.logger.debug("Finished shutting down metrics httpd")
def stop_node(self, node):
super(HttpMetricsCollector, self).stop_node(node)
idx = self.idx(node)
self._forwarders[idx].stop()
del self._forwarders[idx]
def metrics(self, host=None, client_id=None, name=None, group=None, tags=None):
"""
Get any collected metrics that match the specified parameters, yielding each as a tuple of
(key, [<timestamp, value>, ...]) values.
"""
for k, values in self._http_metrics.iteritems():
if ((host is None or host == k.host) and
(client_id is None or client_id == k.client_id) and
(name is None or name == k.name) and
(group is None or group == k.group) and
(tags is None or tags == k.tags)):
yield (k, values)
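# Illustrative usage sketch (not part of this class; `collector` and the metric name/group are
# assumptions for the example): query whatever was reported after a workload has run.
#
#   for key, values in collector.metrics(group='producer-metrics', name='record-send-rate'):
#       latest_time, latest_value = values[-1]
#       collector.logger.info("%s last reported %s at %s", key.name, latest_value, latest_time)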
def _run_http_metrics_httpd(self):
self._httpd.serve_forever()
class _MetricsReceiver(BaseHTTPRequestHandler):
"""
HTTP request handler that accepts requests from the PushHttpMetricsReporter and stores them back into the parent
HttpMetricsCollector
"""
def log_message(self, format, *args, **kwargs):
# Don't do any logging here so we get rid of the mostly useless per-request Apache log-style info that spams
# the debug log
pass
def do_POST(self):
data = self.rfile.read(int(self.headers['Content-Length']))
data = json.loads(data)
self.server.parent.logger.log(TRACE, "POST %s\n\n%s\n%s", self.path, self.headers,
json.dumps(data, indent=4, separators=(',', ': ')))
self.send_response(204)
self.end_headers()
client = data['client']
host = client['host']
client_id = client['client_id']
ts = client['time']
metrics = data['metrics']
for raw_metric in metrics:
name = raw_metric['name']
group = raw_metric['group']
# Convert to tuple of pairs because dicts & lists are unhashable
tags = tuple([(k, v) for k, v in raw_metric['tags'].iteritems()])
value = raw_metric['value']
key = MetricKey(host=host, client_id=client_id, name=name, group=group, tags=tags)
metric_value = MetricValue(time=ts, value=value)
self.server.metrics[key].append(metric_value)
class _ReverseForwarder(object):
"""
Runs reverse forwarding of a port on a node to a local port. This allows you to set up a server on the test driver
while assuming only the basic SSH access that ducktape guarantees is available for worker nodes.
"""
def __init__(self, logger, node, remote_port, local_port):
self.logger = logger
self._node = node
self._local_port = local_port
self._remote_port = remote_port
self.logger.debug('Forwarding %s port %d to driver port %d', node, remote_port, local_port)
self._stopping = False
self._transport = node.account.ssh_client.get_transport()
self._transport.request_port_forward('', remote_port)
self._accept_thread = Thread(target=self._accept)
self._accept_thread.start()
def stop(self):
self._stopping = True
self._accept_thread.join(30)
if self._accept_thread.isAlive():
raise RuntimeError("Failed to stop reverse forwarder on %s", self._node)
self._transport.cancel_port_forward('', self._remote_port)
def _accept(self):
while not self._stopping:
chan = self._transport.accept(1)
if chan is None:
continue
thr = Thread(target=self._handler, args=(chan,))
thr.setDaemon(True)
thr.start()
def _handler(self, chan):
sock = socket.socket()
try:
sock.connect(("localhost", self._local_port))
except Exception as e:
self.logger.error('Forwarding request to port %d failed: %r', self._local_port, e)
return
self.logger.log(TRACE, 'Connected! Tunnel open %r -> %r -> %d', chan.origin_addr, chan.getpeername(),
self._local_port)
while True:
r, w, x = select([sock, chan], [], [])
if sock in r:
data = sock.recv(1024)
if len(data) == 0:
break
chan.send(data)
if chan in r:
data = chan.recv(1024)
if len(data) == 0:
break
sock.send(data)
chan.close()
sock.close()
self.logger.log(TRACE, 'Tunnel closed from %r', chan.origin_addr)
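# Illustrative usage sketch (not part of this class): this mirrors roughly what
# HttpMetricsCollector.start_node/stop_node above do; `logger`, `node` and `local_port` are
# assumptions for the example.
#
#   forwarder = _ReverseForwarder(logger, node, remote_port=6789, local_port=local_port)
#   # ... run the workload that reports to http://localhost:6789 on the worker ...
#   forwarder.stop()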

View File

@@ -0,0 +1,141 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from ducktape.cluster.remoteaccount import RemoteCommandError
from ducktape.utils.util import wait_until
from kafkatest.version import get_version, V_0_11_0_0, DEV_BRANCH
class JmxMixin(object):
"""This mixin helps existing service subclasses start JmxTool on their worker nodes and collect jmx stats.
A couple things worth noting:
- this is not a service in its own right.
- we assume the service using JmxMixin also uses KafkaPathResolverMixin
- this uses the --wait option for JmxTool, so the list of object names must be explicit; no patterns are permitted
"""
def __init__(self, num_nodes, jmx_object_names=None, jmx_attributes=None, jmx_poll_ms=1000, root="/mnt"):
self.jmx_object_names = jmx_object_names
self.jmx_attributes = jmx_attributes or []
self.jmx_poll_ms = jmx_poll_ms
self.jmx_port = 9192
self.started = [False] * num_nodes
self.jmx_stats = [{} for x in range(num_nodes)]
self.maximum_jmx_value = {} # map from object_attribute_name to maximum value observed over time
self.average_jmx_value = {} # map from object_attribute_name to average value observed over time
self.jmx_tool_log = os.path.join(root, "jmx_tool.log")
self.jmx_tool_err_log = os.path.join(root, "jmx_tool.err.log")
def clean_node(self, node):
node.account.kill_java_processes(self.jmx_class_name(), clean_shutdown=False,
allow_fail=True)
idx = self.idx(node)
self.started[idx-1] = False
node.account.ssh("rm -f -- %s %s" % (self.jmx_tool_log, self.jmx_tool_err_log), allow_fail=False)
def start_jmx_tool(self, idx, node):
if self.jmx_object_names is None:
self.logger.debug("%s: Not starting jmx tool because no jmx objects are defined" % node.account)
return
if self.started[idx-1]:
self.logger.debug("%s: jmx tool has been started already on this node" % node.account)
return
# JmxTool is not particularly robust to slow-starting processes. In order to ensure JmxTool doesn't fail if the
# process we're trying to monitor takes a while before listening on the JMX port, wait until we can see that port
# listening before even launching JmxTool.
def check_jmx_port_listening():
return 0 == node.account.ssh("nc -z 127.0.0.1 %d" % self.jmx_port, allow_fail=True)
wait_until(check_jmx_port_listening, timeout_sec=30, backoff_sec=.1,
err_msg="%s: Never saw JMX port for %s start listening" % (node.account, self))
# To correctly wait for requested JMX metrics to be added we need the --wait option for JmxTool. This option was
# not added until 0.11.0.1, so any earlier versions need to use JmxTool from a newer version.
use_jmxtool_version = get_version(node)
if use_jmxtool_version <= V_0_11_0_0:
use_jmxtool_version = DEV_BRANCH
cmd = "%s %s " % (self.path.script("kafka-run-class.sh", use_jmxtool_version), self.jmx_class_name())
cmd += "--reporting-interval %d --jmx-url service:jmx:rmi:///jndi/rmi://127.0.0.1:%d/jmxrmi" % (self.jmx_poll_ms, self.jmx_port)
cmd += " --wait"
for jmx_object_name in self.jmx_object_names:
cmd += " --object-name %s" % jmx_object_name
cmd += " --attributes "
for jmx_attribute in self.jmx_attributes:
cmd += "%s," % jmx_attribute
cmd += " 1>> %s" % self.jmx_tool_log
cmd += " 2>> %s &" % self.jmx_tool_err_log
self.logger.debug("%s: Start JmxTool %d command: %s" % (node.account, idx, cmd))
node.account.ssh(cmd, allow_fail=False)
wait_until(lambda: self._jmx_has_output(node), timeout_sec=30, backoff_sec=.5, err_msg="%s: Jmx tool took too long to start" % node.account)
self.started[idx-1] = True
def _jmx_has_output(self, node):
"""Helper used as a proxy to determine whether jmx is running by that jmx_tool_log contains output."""
try:
node.account.ssh("test -s %s" % self.jmx_tool_log, allow_fail=False)
return True
except RemoteCommandError:
return False
def read_jmx_output(self, idx, node):
if not self.started[idx-1]:
return
object_attribute_names = []
cmd = "cat %s" % self.jmx_tool_log
self.logger.debug("Read jmx output %d command: %s", idx, cmd)
lines = [line for line in node.account.ssh_capture(cmd, allow_fail=False)]
assert len(lines) > 1, "There don't appear to be any samples in the jmx tool log: %s" % lines
for line in lines:
if "time" in line:
object_attribute_names = line.strip()[1:-1].split("\",\"")[1:]
continue
stats = [float(field) for field in line.split(',')]
time_sec = int(stats[0]/1000)
self.jmx_stats[idx-1][time_sec] = {name: stats[i+1] for i, name in enumerate(object_attribute_names)}
# Do not calculate the average and maximum of jmx stats until we have read output from all nodes.
# For a multi-node service, this means the results are only aggregated once the output of the last
# node has been read.
if any(len(time_to_stats) == 0 for time_to_stats in self.jmx_stats):
return
start_time_sec = min([min(time_to_stats.keys()) for time_to_stats in self.jmx_stats])
end_time_sec = max([max(time_to_stats.keys()) for time_to_stats in self.jmx_stats])
for name in object_attribute_names:
aggregates_per_time = []
for time_sec in xrange(start_time_sec, end_time_sec + 1):
# assume that value is 0 if it is not read by jmx tool at the given time. This is appropriate for metrics such as bandwidth
values_per_node = [time_to_stats.get(time_sec, {}).get(name, 0) for time_to_stats in self.jmx_stats]
# assume that value is aggregated across nodes by sum. This is appropriate for metrics such as bandwidth
aggregates_per_time.append(sum(values_per_node))
self.average_jmx_value[name] = sum(aggregates_per_time) / len(aggregates_per_time)
self.maximum_jmx_value[name] = max(aggregates_per_time)
def read_jmx_output_all_nodes(self):
for node in self.nodes:
self.read_jmx_output(self.idx(node), node)
def jmx_class_name(self):
return "kafka.tools.JmxTool"

View File

@@ -0,0 +1,19 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from performance import PerformanceService, throughput, latency, compute_aggregate_throughput
from end_to_end_latency import EndToEndLatencyService
from producer_performance import ProducerPerformanceService
from consumer_performance import ConsumerPerformanceService

View File

@@ -0,0 +1,187 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from kafkatest.services.performance import PerformanceService
from kafkatest.services.security.security_config import SecurityConfig
from kafkatest.version import DEV_BRANCH, V_0_9_0_0, V_2_0_0, LATEST_0_10_0
class ConsumerPerformanceService(PerformanceService):
"""
See ConsumerPerformance.scala as the source of truth on these settings, but for reference:
"zookeeper" "The connection string for the zookeeper connection in the form host:port. Multiple URLS can
be given to allow fail-over. This option is only used with the old consumer."
"broker-list", "A broker list to use for connecting if using the new consumer."
"topic", "REQUIRED: The topic to consume from."
"group", "The group id to consume on."
"fetch-size", "The amount of data to fetch in a single request."
"from-latest", "If the consumer does not already have an establishedoffset to consume from,
start with the latest message present in the log rather than the earliest message."
"socket-buffer-size", "The size of the tcp RECV size."
"threads", "Number of processing threads."
"num-fetch-threads", "Number of fetcher threads. Defaults to 1"
"new-consumer", "Use the new consumer implementation."
"consumer.config", "Consumer config properties file."
"""
# Root directory for persistent output
PERSISTENT_ROOT = "/mnt/consumer_performance"
LOG_DIR = os.path.join(PERSISTENT_ROOT, "logs")
STDOUT_CAPTURE = os.path.join(PERSISTENT_ROOT, "consumer_performance.stdout")
STDERR_CAPTURE = os.path.join(PERSISTENT_ROOT, "consumer_performance.stderr")
LOG_FILE = os.path.join(LOG_DIR, "consumer_performance.log")
LOG4J_CONFIG = os.path.join(PERSISTENT_ROOT, "tools-log4j.properties")
CONFIG_FILE = os.path.join(PERSISTENT_ROOT, "consumer.properties")
logs = {
"consumer_performance_output": {
"path": STDOUT_CAPTURE,
"collect_default": True},
"consumer_performance_stderr": {
"path": STDERR_CAPTURE,
"collect_default": True},
"consumer_performance_log": {
"path": LOG_FILE,
"collect_default": True}
}
def __init__(self, context, num_nodes, kafka, topic, messages, version=DEV_BRANCH, new_consumer=True, settings={}):
super(ConsumerPerformanceService, self).__init__(context, num_nodes)
self.kafka = kafka
self.security_config = kafka.security_config.client_config()
self.topic = topic
self.messages = messages
self.new_consumer = new_consumer
self.settings = settings
assert version >= V_0_9_0_0 or (not new_consumer), \
"new_consumer is only supported if version >= 0.9.0.0, version %s" % str(version)
assert version < V_2_0_0 or new_consumer, \
"new_consumer==false is only supported if version < 2.0.0, version %s" % str(version)
security_protocol = self.security_config.security_protocol
assert version >= V_0_9_0_0 or security_protocol == SecurityConfig.PLAINTEXT, \
"Security protocol %s is only supported if version >= 0.9.0.0, version %s" % (self.security_config, str(version))
# These less-frequently used settings can be updated manually after instantiation
self.fetch_size = None
self.socket_buffer_size = None
self.threads = None
self.num_fetch_threads = None
self.group = None
self.from_latest = None
for node in self.nodes:
node.version = version
def args(self, version):
"""Dictionary of arguments used to start the Consumer Performance script."""
args = {
'topic': self.topic,
'messages': self.messages,
}
if self.new_consumer:
if version <= LATEST_0_10_0:
args['new-consumer'] = ""
args['broker-list'] = self.kafka.bootstrap_servers(self.security_config.security_protocol)
else:
args['zookeeper'] = self.kafka.zk_connect_setting()
if self.fetch_size is not None:
args['fetch-size'] = self.fetch_size
if self.socket_buffer_size is not None:
args['socket-buffer-size'] = self.socket_buffer_size
if self.threads is not None:
args['threads'] = self.threads
if self.num_fetch_threads is not None:
args['num-fetch-threads'] = self.num_fetch_threads
if self.group is not None:
args['group'] = self.group
if self.from_latest:
args['from-latest'] = ""
return args
def start_cmd(self, node):
cmd = "export LOG_DIR=%s;" % ConsumerPerformanceService.LOG_DIR
cmd += " export KAFKA_OPTS=%s;" % self.security_config.kafka_opts
cmd += " export KAFKA_LOG4J_OPTS=\"-Dlog4j.configuration=file:%s\";" % ConsumerPerformanceService.LOG4J_CONFIG
cmd += " %s" % self.path.script("kafka-consumer-perf-test.sh", node)
for key, value in self.args(node.version).items():
cmd += " --%s %s" % (key, value)
if node.version >= V_0_9_0_0:
# This is only used for security settings
cmd += " --consumer.config %s" % ConsumerPerformanceService.CONFIG_FILE
for key, value in self.settings.items():
cmd += " %s=%s" % (str(key), str(value))
cmd += " 2>> %(stderr)s | tee -a %(stdout)s" % {'stdout': ConsumerPerformanceService.STDOUT_CAPTURE,
'stderr': ConsumerPerformanceService.STDERR_CAPTURE}
return cmd
def parse_results(self, line, version):
parts = line.split(',')
if version >= V_0_9_0_0:
result = {
'total_mb': float(parts[2]),
'mbps': float(parts[3]),
'records_per_sec': float(parts[5]),
}
else:
result = {
'total_mb': float(parts[3]),
'mbps': float(parts[4]),
'records_per_sec': float(parts[6]),
}
return result
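# Illustrative note (not part of this class; values are made up for the example): for
# version >= 0.9.0.0 the tool's final CSV line is expected to look roughly like
#
#   2019-01-01 00:00:00, 2019-01-01 00:00:10, 95.37, 9.54, 1000000, 100000.0
#
# i.e. start time, end time, total MB consumed, MB/sec, total records, records/sec, which is
# where the indices 2, 3 and 5 above come from.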
def _worker(self, idx, node):
node.account.ssh("mkdir -p %s" % ConsumerPerformanceService.PERSISTENT_ROOT, allow_fail=False)
log_config = self.render('tools_log4j.properties', log_file=ConsumerPerformanceService.LOG_FILE)
node.account.create_file(ConsumerPerformanceService.LOG4J_CONFIG, log_config)
node.account.create_file(ConsumerPerformanceService.CONFIG_FILE, str(self.security_config))
self.security_config.setup_node(node)
cmd = self.start_cmd(node)
self.logger.debug("Consumer performance %d command: %s", idx, cmd)
last = None
for line in node.account.ssh_capture(cmd):
last = line
# Parse and save the last line's information
self.results[idx-1] = self.parse_results(last, node.version)

View File

@@ -0,0 +1,124 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from kafkatest.services.performance import PerformanceService
from kafkatest.services.security.security_config import SecurityConfig
from kafkatest.version import DEV_BRANCH, V_0_9_0_0
class EndToEndLatencyService(PerformanceService):
MESSAGE_BYTES = 21 # 0.8.X messages are fixed at 21 bytes, so we'll match that for other versions
# Root directory for persistent output
PERSISTENT_ROOT = "/mnt/end_to_end_latency"
LOG_DIR = os.path.join(PERSISTENT_ROOT, "logs")
STDOUT_CAPTURE = os.path.join(PERSISTENT_ROOT, "end_to_end_latency.stdout")
STDERR_CAPTURE = os.path.join(PERSISTENT_ROOT, "end_to_end_latency.stderr")
LOG_FILE = os.path.join(LOG_DIR, "end_to_end_latency.log")
LOG4J_CONFIG = os.path.join(PERSISTENT_ROOT, "tools-log4j.properties")
CONFIG_FILE = os.path.join(PERSISTENT_ROOT, "client.properties")
logs = {
"end_to_end_latency_output": {
"path": STDOUT_CAPTURE,
"collect_default": True},
"end_to_end_latency_stderr": {
"path": STDERR_CAPTURE,
"collect_default": True},
"end_to_end_latency_log": {
"path": LOG_FILE,
"collect_default": True}
}
def __init__(self, context, num_nodes, kafka, topic, num_records, compression_type="none", version=DEV_BRANCH, acks=1):
super(EndToEndLatencyService, self).__init__(context, num_nodes,
root=EndToEndLatencyService.PERSISTENT_ROOT)
self.kafka = kafka
self.security_config = kafka.security_config.client_config()
security_protocol = self.security_config.security_protocol
if version < V_0_9_0_0:
assert security_protocol == SecurityConfig.PLAINTEXT, \
"Security protocol %s is only supported if version >= 0.9.0.0, version %s" % (self.security_config, str(version))
assert compression_type == "none", \
"Compression type %s is only supported if version >= 0.9.0.0, version %s" % (compression_type, str(version))
self.args = {
'topic': topic,
'num_records': num_records,
'acks': acks,
'compression_type': compression_type,
'kafka_opts': self.security_config.kafka_opts,
'message_bytes': EndToEndLatencyService.MESSAGE_BYTES
}
for node in self.nodes:
node.version = version
def start_cmd(self, node):
args = self.args.copy()
args.update({
'zk_connect': self.kafka.zk_connect_setting(),
'bootstrap_servers': self.kafka.bootstrap_servers(self.security_config.security_protocol),
'config_file': EndToEndLatencyService.CONFIG_FILE,
'kafka_run_class': self.path.script("kafka-run-class.sh", node),
'java_class_name': self.java_class_name()
})
cmd = "export KAFKA_LOG4J_OPTS=\"-Dlog4j.configuration=file:%s\"; " % EndToEndLatencyService.LOG4J_CONFIG
if node.version >= V_0_9_0_0:
cmd += "KAFKA_OPTS=%(kafka_opts)s %(kafka_run_class)s %(java_class_name)s " % args
cmd += "%(bootstrap_servers)s %(topic)s %(num_records)d %(acks)d %(message_bytes)d %(config_file)s" % args
else:
# Set fetch max wait to 0 to match behavior in later versions
cmd += "KAFKA_OPTS=%(kafka_opts)s %(kafka_run_class)s kafka.tools.TestEndToEndLatency " % args
cmd += "%(bootstrap_servers)s %(zk_connect)s %(topic)s %(num_records)d 0 %(acks)d" % args
cmd += " 2>> %(stderr)s | tee -a %(stdout)s" % {'stdout': EndToEndLatencyService.STDOUT_CAPTURE,
'stderr': EndToEndLatencyService.STDERR_CAPTURE}
return cmd
def _worker(self, idx, node):
node.account.ssh("mkdir -p %s" % EndToEndLatencyService.PERSISTENT_ROOT, allow_fail=False)
log_config = self.render('tools_log4j.properties', log_file=EndToEndLatencyService.LOG_FILE)
node.account.create_file(EndToEndLatencyService.LOG4J_CONFIG, log_config)
client_config = str(self.security_config)
if node.version >= V_0_9_0_0:
client_config += "compression_type=%(compression_type)s" % self.args
node.account.create_file(EndToEndLatencyService.CONFIG_FILE, client_config)
self.security_config.setup_node(node)
cmd = self.start_cmd(node)
self.logger.debug("End-to-end latency %d command: %s", idx, cmd)
results = {}
for line in node.account.ssh_capture(cmd):
if line.startswith("Avg latency:"):
results['latency_avg_ms'] = float(line.split()[2])
if line.startswith("Percentiles"):
results['latency_50th_ms'] = float(line.split()[3][:-1])
results['latency_99th_ms'] = float(line.split()[6][:-1])
results['latency_999th_ms'] = float(line.split()[9])
self.results[idx-1] = results
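# Illustrative note (not part of this class; values are made up for the example): the lines
# parsed above are expected to look roughly like
#
#   Avg latency: 2.2334 ms
#   Percentiles: 50th = 2, 99th = 5, 99.9th = 10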
def java_class_name(self):
return "kafka.tools.EndToEndLatency"

View File

@@ -0,0 +1,72 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ducktape.services.background_thread import BackgroundThreadService
from kafkatest.directory_layout.kafka_path import KafkaPathResolverMixin
class PerformanceService(KafkaPathResolverMixin, BackgroundThreadService):
def __init__(self, context=None, num_nodes=0, root="/mnt/*", stop_timeout_sec=30):
super(PerformanceService, self).__init__(context, num_nodes)
self.results = [None] * self.num_nodes
self.stats = [[] for x in range(self.num_nodes)]
self.stop_timeout_sec = stop_timeout_sec
self.root = root
def java_class_name(self):
"""
Returns the name of the Java class which this service creates. Subclasses should override
this method, so that we know the name of the java process to stop. If it is not
overridden, we will kill all java processes in PerformanceService#stop_node (for backwards
compatibility).
"""
return ""
def stop_node(self, node):
node.account.kill_java_processes(self.java_class_name(), clean_shutdown=True, allow_fail=True)
stopped = self.wait_node(node, timeout_sec=self.stop_timeout_sec)
assert stopped, "Node %s: did not stop within the specified timeout of %s seconds" % \
(str(node.account), str(self.stop_timeout_sec))
def clean_node(self, node):
node.account.kill_java_processes(self.java_class_name(), clean_shutdown=False, allow_fail=True)
node.account.ssh("rm -rf -- %s" % self.root, allow_fail=False)
def throughput(records_per_sec, mb_per_sec):
"""Helper method to ensure uniform representation of throughput data"""
return {
"records_per_sec": records_per_sec,
"mb_per_sec": mb_per_sec
}
def latency(latency_50th_ms, latency_99th_ms, latency_999th_ms):
"""Helper method to ensure uniform representation of latency data"""
return {
"latency_50th_ms": latency_50th_ms,
"latency_99th_ms": latency_99th_ms,
"latency_999th_ms": latency_999th_ms
}
def compute_aggregate_throughput(perf):
"""Helper method for computing throughput after running a performance service."""
aggregate_rate = sum([r['records_per_sec'] for r in perf.results])
aggregate_mbps = sum([r['mbps'] for r in perf.results])
return throughput(aggregate_rate, aggregate_mbps)
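# Illustrative usage sketch (not part of this module; `perf` is an assumed performance service
# instance whose per-node results contain 'records_per_sec' and 'mbps'):
#
#   perf.run()
#   summary = compute_aggregate_throughput(perf)
#   # summary == {"records_per_sec": <sum across nodes>, "mb_per_sec": <sum across nodes>}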

View File

@@ -0,0 +1,174 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import time
from ducktape.utils.util import wait_until
from ducktape.cluster.remoteaccount import RemoteCommandError
from kafkatest.directory_layout.kafka_path import TOOLS_JAR_NAME, TOOLS_DEPENDANT_TEST_LIBS_JAR_NAME
from kafkatest.services.monitor.http import HttpMetricsCollector
from kafkatest.services.performance import PerformanceService
from kafkatest.services.security.security_config import SecurityConfig
from kafkatest.version import DEV_BRANCH, V_0_9_0_0
class ProducerPerformanceService(HttpMetricsCollector, PerformanceService):
PERSISTENT_ROOT = "/mnt/producer_performance"
STDOUT_CAPTURE = os.path.join(PERSISTENT_ROOT, "producer_performance.stdout")
STDERR_CAPTURE = os.path.join(PERSISTENT_ROOT, "producer_performance.stderr")
LOG_DIR = os.path.join(PERSISTENT_ROOT, "logs")
LOG_FILE = os.path.join(LOG_DIR, "producer_performance.log")
LOG4J_CONFIG = os.path.join(PERSISTENT_ROOT, "tools-log4j.properties")
def __init__(self, context, num_nodes, kafka, topic, num_records, record_size, throughput, version=DEV_BRANCH, settings=None,
intermediate_stats=False, client_id="producer-performance"):
super(ProducerPerformanceService, self).__init__(context=context, num_nodes=num_nodes)
self.logs = {
"producer_performance_stdout": {
"path": ProducerPerformanceService.STDOUT_CAPTURE,
"collect_default": True},
"producer_performance_stderr": {
"path": ProducerPerformanceService.STDERR_CAPTURE,
"collect_default": True},
"producer_performance_log": {
"path": ProducerPerformanceService.LOG_FILE,
"collect_default": True}
}
self.kafka = kafka
self.security_config = kafka.security_config.client_config()
security_protocol = self.security_config.security_protocol
assert version >= V_0_9_0_0 or security_protocol == SecurityConfig.PLAINTEXT, \
"Security protocol %s is only supported if version >= 0.9.0.0, version %s" % (self.security_config, str(version))
self.args = {
'topic': topic,
'kafka_opts': self.security_config.kafka_opts,
'num_records': num_records,
'record_size': record_size,
'throughput': throughput
}
self.settings = settings or {}
self.intermediate_stats = intermediate_stats
self.client_id = client_id
for node in self.nodes:
node.version = version
def start_cmd(self, node):
args = self.args.copy()
args.update({
'bootstrap_servers': self.kafka.bootstrap_servers(self.security_config.security_protocol),
'client_id': self.client_id,
'kafka_run_class': self.path.script("kafka-run-class.sh", node),
'metrics_props': ' '.join(["%s=%s" % (k, v) for k, v in self.http_metrics_client_configs.iteritems()])
})
cmd = ""
if node.version < DEV_BRANCH:
# In order to ensure more consistent configuration between versions, always use the ProducerPerformance
# tool from the development branch
tools_jar = self.path.jar(TOOLS_JAR_NAME, DEV_BRANCH)
tools_dependant_libs_jar = self.path.jar(TOOLS_DEPENDANT_TEST_LIBS_JAR_NAME, DEV_BRANCH)
for jar in (tools_jar, tools_dependant_libs_jar):
cmd += "for file in %s; do CLASSPATH=$CLASSPATH:$file; done; " % jar
cmd += "export CLASSPATH; "
cmd += " export KAFKA_LOG4J_OPTS=\"-Dlog4j.configuration=file:%s\"; " % ProducerPerformanceService.LOG4J_CONFIG
cmd += "KAFKA_OPTS=%(kafka_opts)s KAFKA_HEAP_OPTS=\"-XX:+HeapDumpOnOutOfMemoryError\" %(kafka_run_class)s org.apache.kafka.tools.ProducerPerformance " \
"--topic %(topic)s --num-records %(num_records)d --record-size %(record_size)d --throughput %(throughput)d --producer-props bootstrap.servers=%(bootstrap_servers)s client.id=%(client_id)s %(metrics_props)s" % args
self.security_config.setup_node(node)
if self.security_config.security_protocol != SecurityConfig.PLAINTEXT:
self.settings.update(self.security_config.properties)
for key, value in self.settings.items():
cmd += " %s=%s" % (str(key), str(value))
cmd += " 2>>%s | tee %s" % (ProducerPerformanceService.STDERR_CAPTURE, ProducerPerformanceService.STDOUT_CAPTURE)
return cmd
def pids(self, node):
try:
cmd = "jps | grep -i ProducerPerformance | awk '{print $1}'"
pid_arr = [pid for pid in node.account.ssh_capture(cmd, allow_fail=True, callback=int)]
return pid_arr
except (RemoteCommandError, ValueError) as e:
return []
def alive(self, node):
return len(self.pids(node)) > 0
def _worker(self, idx, node):
node.account.ssh("mkdir -p %s" % ProducerPerformanceService.PERSISTENT_ROOT, allow_fail=False)
# Create and upload log properties
log_config = self.render('tools_log4j.properties', log_file=ProducerPerformanceService.LOG_FILE)
node.account.create_file(ProducerPerformanceService.LOG4J_CONFIG, log_config)
cmd = self.start_cmd(node)
self.logger.debug("Producer performance %d command: %s", idx, cmd)
# start ProducerPerformance process
start = time.time()
producer_output = node.account.ssh_capture(cmd)
wait_until(lambda: self.alive(node), timeout_sec=20, err_msg="ProducerPerformance failed to start")
# block until there is at least one line of output
first_line = next(producer_output, None)
if first_line is None:
raise Exception("No output from ProducerPerformance")
wait_until(lambda: not self.alive(node), timeout_sec=1200, backoff_sec=2, err_msg="ProducerPerformance failed to finish")
elapsed = time.time() - start
self.logger.debug("ProducerPerformance process ran for %s seconds" % elapsed)
# parse producer output from file
last = None
producer_output = node.account.ssh_capture("cat %s" % ProducerPerformanceService.STDOUT_CAPTURE)
for line in producer_output:
if self.intermediate_stats:
try:
self.stats[idx-1].append(self.parse_stats(line))
except:
# Sometimes there are extraneous log messages
pass
last = line
try:
self.results[idx-1] = self.parse_stats(last)
except:
raise Exception("Unable to parse aggregate performance statistics on node %d: %s" % (idx, last))
def parse_stats(self, line):
parts = line.split(',')
return {
'records': int(parts[0].split()[0]),
'records_per_sec': float(parts[1].split()[0]),
'mbps': float(parts[1].split('(')[1].split()[0]),
'latency_avg_ms': float(parts[2].split()[0]),
'latency_max_ms': float(parts[3].split()[0]),
'latency_50th_ms': float(parts[4].split()[0]),
'latency_95th_ms': float(parts[5].split()[0]),
'latency_99th_ms': float(parts[6].split()[0]),
'latency_999th_ms': float(parts[7].split()[0]),
}
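# Illustrative note (not part of this class; values are made up for the example): the summary
# line parsed above is expected to look roughly like
#
#   100000 records sent, 9999.0 records/sec (9.54 MB/sec), 1.50 ms avg latency, 30.00 ms max latency, 1 ms 50th, 2 ms 95th, 5 ms 99th, 10 ms 99.9th.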

View File

@@ -0,0 +1,108 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from kafkatest.services.monitor.jmx import JmxMixin
from kafkatest.services.streams import StreamsTestBaseService
from kafkatest.services.kafka import KafkaConfig
from kafkatest.services import streams_property
#
# Class used to start the simple Kafka Streams benchmark
#
class StreamsSimpleBenchmarkService(StreamsTestBaseService):
"""Base class for simple Kafka Streams benchmark"""
def __init__(self, test_context, kafka, test_name, num_threads, num_recs_or_wait_ms, key_skew, value_size):
super(StreamsSimpleBenchmarkService, self).__init__(test_context,
kafka,
"org.apache.kafka.streams.perf.SimpleBenchmark",
test_name,
num_recs_or_wait_ms,
key_skew,
value_size)
self.jmx_option = ""
if test_name.startswith('stream') or test_name.startswith('table'):
self.jmx_option = "stream-jmx"
JmxMixin.__init__(self,
num_nodes=1,
jmx_object_names=['kafka.streams:type=stream-thread-metrics,thread-id=simple-benchmark-StreamThread-%d' %(i+1) for i in range(num_threads)],
jmx_attributes=['process-latency-avg',
'process-rate',
'commit-latency-avg',
'commit-rate',
'poll-latency-avg',
'poll-rate'],
root=StreamsTestBaseService.PERSISTENT_ROOT)
if test_name.startswith('consume'):
self.jmx_option = "consumer-jmx"
JmxMixin.__init__(self,
num_nodes=1,
jmx_object_names=['kafka.consumer:type=consumer-fetch-manager-metrics,client-id=simple-benchmark-consumer'],
jmx_attributes=['records-consumed-rate'],
root=StreamsTestBaseService.PERSISTENT_ROOT)
self.num_threads = num_threads
def prop_file(self):
cfg = KafkaConfig(**{streams_property.STATE_DIR: self.PERSISTENT_ROOT,
streams_property.KAFKA_SERVERS: self.kafka.bootstrap_servers(),
streams_property.NUM_THREADS: self.num_threads})
return cfg.render()
def start_cmd(self, node):
if self.jmx_option != "":
args = self.args.copy()
args['jmx_port'] = self.jmx_port
args['config_file'] = self.CONFIG_FILE
args['stdout'] = self.STDOUT_FILE
args['stderr'] = self.STDERR_FILE
args['pidfile'] = self.PID_FILE
args['log4j'] = self.LOG4J_CONFIG_FILE
args['kafka_run_class'] = self.path.script("kafka-run-class.sh", node)
cmd = "( export JMX_PORT=%(jmx_port)s; export KAFKA_LOG4J_OPTS=\"-Dlog4j.configuration=file:%(log4j)s\"; " \
"INCLUDE_TEST_JARS=true %(kafka_run_class)s %(streams_class_name)s " \
" %(config_file)s %(user_test_args1)s %(user_test_args2)s %(user_test_args3)s" \
" %(user_test_args4)s & echo $! >&3 ) 1>> %(stdout)s 2>> %(stderr)s 3> %(pidfile)s" % args
else:
cmd = super(StreamsSimpleBenchmarkService, self).start_cmd(node)
return cmd
def start_node(self, node):
super(StreamsSimpleBenchmarkService, self).start_node(node)
if self.jmx_option != "":
self.start_jmx_tool(1, node)
def clean_node(self, node):
if self.jmx_option != "":
JmxMixin.clean_node(self, node)
super(StreamsSimpleBenchmarkService, self).clean_node(node)
def collect_data(self, node, tag=None):
# Collect the data and return it to the framework
output = node.account.ssh_capture("grep Performance %s" % self.STDOUT_FILE)
data = {}
for line in output:
parts = line.split(':')
data[tag + parts[0]] = parts[1]
return data
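# Illustrative usage sketch (not part of this class; `benchmark` and the tag are assumptions for
# the example): run the benchmark and gather the reported numbers from its node.
#
#   benchmark.run()
#   data = benchmark.collect_data(benchmark.nodes[0], tag="streams-")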

View File

@@ -0,0 +1,25 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Define the root logger with appender file
log4j.rootLogger = {{ log_level|default("INFO") }}, FILE
log4j.appender.FILE=org.apache.log4j.FileAppender
log4j.appender.FILE.File={{ log_file }}
log4j.appender.FILE.ImmediateFlush=true
# Set append to false so the log file is overwritten on each run
log4j.appender.FILE.Append=false
log4j.appender.FILE.layout=org.apache.log4j.PatternLayout
log4j.appender.FILE.layout.conversionPattern=[%d] %p %m (%c)%n

View File

@@ -0,0 +1,93 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ducktape.services.background_thread import BackgroundThreadService
from kafkatest.directory_layout.kafka_path import KafkaPathResolverMixin
from kafkatest.services.security.security_config import SecurityConfig
import re
class ReplicaVerificationTool(KafkaPathResolverMixin, BackgroundThreadService):
logs = {
"producer_log": {
"path": "/mnt/replica_verification_tool.log",
"collect_default": False}
}
def __init__(self, context, num_nodes, kafka, topic, report_interval_ms, security_protocol="PLAINTEXT", stop_timeout_sec=30):
super(ReplicaVerificationTool, self).__init__(context, num_nodes)
self.kafka = kafka
self.topic = topic
self.report_interval_ms = report_interval_ms
self.security_protocol = security_protocol
self.security_config = SecurityConfig(self.context, security_protocol)
self.partition_lag = {}
self.stop_timeout_sec = stop_timeout_sec
def _worker(self, idx, node):
cmd = self.start_cmd(node)
self.logger.debug("ReplicaVerificationTool %d command: %s" % (idx, cmd))
self.security_config.setup_node(node)
for line in node.account.ssh_capture(cmd):
self.logger.debug("Parsing line:{}".format(line))
parsed = re.search('.*max lag is (.+?) for partition ([a-zA-Z0-9._-]+-[0-9]+) at', line)
if parsed:
lag = int(parsed.group(1))
topic_partition = parsed.group(2)
self.logger.debug("Setting max lag for {} as {}".format(topic_partition, lag))
self.partition_lag[topic_partition] = lag
def get_lag_for_partition(self, topic, partition):
"""
Get latest lag for given topic-partition
Args:
topic: a topic
partition: a partition of the topic
"""
topic_partition = topic + '-' + str(partition)
lag = self.partition_lag.get(topic_partition, -1)
self.logger.debug("Returning lag for {} as {}".format(topic_partition, lag))
return lag
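# Illustrative usage sketch (not part of this class; the topic name and timeout are assumptions
# for the example, and wait_until comes from ducktape.utils.util): a test would usually poll this
# until the replicas catch up.
#
#   wait_until(lambda: tool.get_lag_for_partition("test-topic", 0) == 0,
#              timeout_sec=60, err_msg="Replicas never caught up")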
def start_cmd(self, node):
cmd = self.path.script("kafka-run-class.sh", node)
cmd += " %s" % self.java_class_name()
cmd += " --broker-list %s --topic-white-list %s --time -2 --report-interval-ms %s" % (self.kafka.bootstrap_servers(self.security_protocol), self.topic, self.report_interval_ms)
cmd += " 2>> /mnt/replica_verification_tool.log | tee -a /mnt/replica_verification_tool.log &"
return cmd
def stop_node(self, node):
node.account.kill_java_processes(self.java_class_name(), clean_shutdown=True,
allow_fail=True)
stopped = self.wait_node(node, timeout_sec=self.stop_timeout_sec)
assert stopped, "Node %s: did not stop within the specified timeout of %s seconds" % \
(str(node.account), str(self.stop_timeout_sec))
def clean_node(self, node):
node.account.kill_java_processes(self.java_class_name(), clean_shutdown=False,
allow_fail=True)
node.account.ssh("rm -rf /mnt/replica_verification_tool.log", allow_fail=False)
def java_class_name(self):
return "kafka.tools.ReplicaVerificationTool"

View File

@@ -0,0 +1,15 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,75 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from kafkatest.directory_layout.kafka_path import KafkaPathResolverMixin
class ACLs(KafkaPathResolverMixin):
def __init__(self, context):
self.context = context
def set_acls(self, protocol, kafka, topic, group):
node = kafka.nodes[0]
setting = kafka.zk_connect_setting()
# Set server ACLs
kafka_principal = "User:CN=systemtest" if protocol == "SSL" else "User:kafka"
self.acls_command(node, ACLs.add_cluster_acl(setting, kafka_principal))
self.acls_command(node, ACLs.broker_read_acl(setting, "*", kafka_principal))
# Set client ACLs
client_principal = "User:CN=systemtest" if protocol == "SSL" else "User:client"
self.acls_command(node, ACLs.produce_acl(setting, topic, client_principal))
self.acls_command(node, ACLs.consume_acl(setting, topic, group, client_principal))
def acls_command(self, node, properties):
cmd = "%s %s" % (self.path.script("kafka-acls.sh", node), properties)
node.account.ssh(cmd)
@staticmethod
def add_cluster_acl(zk_connect, principal="User:kafka"):
return "--authorizer-properties zookeeper.connect=%(zk_connect)s --add --cluster " \
"--operation=ClusterAction --allow-principal=%(principal)s " % {
'zk_connect': zk_connect,
'principal': principal
}
@staticmethod
def broker_read_acl(zk_connect, topic, principal="User:kafka"):
return "--authorizer-properties zookeeper.connect=%(zk_connect)s --add --topic=%(topic)s " \
"--operation=Read --allow-principal=%(principal)s " % {
'zk_connect': zk_connect,
'topic': topic,
'principal': principal
}
@staticmethod
def produce_acl(zk_connect, topic, principal="User:client"):
return "--authorizer-properties zookeeper.connect=%(zk_connect)s --add --topic=%(topic)s " \
"--producer --allow-principal=%(principal)s " % {
'zk_connect': zk_connect,
'topic': topic,
'principal': principal
}
@staticmethod
def consume_acl(zk_connect, topic, group, principal="User:client"):
return "--authorizer-properties zookeeper.connect=%(zk_connect)s --add --topic=%(topic)s " \
"--group=%(group)s --consumer --allow-principal=%(principal)s " % {
'zk_connect': zk_connect,
'topic': topic,
'group': group,
'principal': principal
}
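# Illustrative note (not part of this class; the ZooKeeper connect string is made up and the
# script's path prefix is elided): for a non-SSL cluster, the first acls_command call in
# set_acls above expands to roughly
#
#   kafka-acls.sh --authorizer-properties zookeeper.connect=worker1:2181 --add --cluster \
#       --operation=ClusterAction --allow-principal=User:kafka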

View File

@@ -0,0 +1,43 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
class ListenerSecurityConfig:
SASL_MECHANISM_PREFIXED_CONFIGS = ["connections.max.reauth.ms", "sasl.jaas.config",
"sasl.login.callback.handler.class", "sasl.login.class",
"sasl.server.callback.handler.class"]
def __init__(self, use_separate_interbroker_listener=False,
client_listener_overrides={}, interbroker_listener_overrides={}):
"""
:param bool use_separate_interbroker_listener - if set, will use a separate interbroker listener,
with security protocol set to interbroker_security_protocol value. If set, requires
interbroker_security_protocol to be provided.
Normally port name is the same as its security protocol, so setting security_protocol and
interbroker_security_protocol to the same value will lead to a single port being open and both client
and broker-to-broker communication will go over that port. This parameter allows
you to add an interbroker listener with the same security protocol as a client listener, but running on a
separate port.
:param dict client_listener_overrides - non-prefixed listener config overrides for named client listener
(for example 'sasl.jaas.config', 'ssl.keystore.location', 'sasl.login.callback.handler.class', etc).
:param dict interbroker_listener_overrides - non-prefixed listener config overrides for named interbroker
listener (for example 'sasl.jaas.config', 'ssl.keystore.location', 'sasl.login.callback.handler.class', etc).
"""
self.use_separate_interbroker_listener = use_separate_interbroker_listener
self.client_listener_overrides = client_listener_overrides
self.interbroker_listener_overrides = interbroker_listener_overrides
def requires_sasl_mechanism_prefix(self, config):
return config in ListenerSecurityConfig.SASL_MECHANISM_PREFIXED_CONFIGS
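# Illustrative usage sketch (not part of this class; the JAAS value is a placeholder for the
# example): enabling a dedicated interbroker listener with a SASL override.
#
#   listener_security_config = ListenerSecurityConfig(
#       use_separate_interbroker_listener=True,
#       interbroker_listener_overrides={'sasl.jaas.config': '<jaas config here>'})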

View File

@@ -0,0 +1,136 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import uuid
from io import open
from os import remove, close
from shutil import move
from tempfile import mkstemp
from ducktape.services.service import Service
from kafkatest.directory_layout.kafka_path import KafkaPathResolverMixin, CORE_LIBS_JAR_NAME, CORE_DEPENDANT_TEST_LIBS_JAR_NAME
from kafkatest.version import DEV_BRANCH
class MiniKdc(KafkaPathResolverMixin, Service):
logs = {
"minikdc_log": {
"path": "/mnt/minikdc/minikdc.log",
"collect_default": True}
}
WORK_DIR = "/mnt/minikdc"
PROPS_FILE = "/mnt/minikdc/minikdc.properties"
KEYTAB_FILE = "/mnt/minikdc/keytab"
KRB5CONF_FILE = "/mnt/minikdc/krb5.conf"
LOG_FILE = "/mnt/minikdc/minikdc.log"
LOCAL_KEYTAB_FILE = None
LOCAL_KRB5CONF_FILE = None
@staticmethod
def _set_local_keytab_file(local_scratch_dir):
"""Set MiniKdc.LOCAL_KEYTAB_FILE exactly once per test.
LOCAL_KEYTAB_FILE is currently used like a global variable to provide a mechanism to share the
location of the local keytab file among all services which might need it.
Since individual ducktape tests are each run in a subprocess forked from the ducktape main process,
class variables set at class load time are duplicated between test processes. This leads to collisions
if test subprocesses are run in parallel, so we defer setting these class variables until after the test itself
begins to run.
"""
if MiniKdc.LOCAL_KEYTAB_FILE is None:
MiniKdc.LOCAL_KEYTAB_FILE = os.path.join(local_scratch_dir, "keytab")
return MiniKdc.LOCAL_KEYTAB_FILE
@staticmethod
def _set_local_krb5conf_file(local_scratch_dir):
"""Set MiniKdc.LOCAL_KRB5CONF_FILE exactly once per test.
See _set_local_keytab_file for details why we do this.
"""
if MiniKdc.LOCAL_KRB5CONF_FILE is None:
MiniKdc.LOCAL_KRB5CONF_FILE = os.path.join(local_scratch_dir, "krb5conf")
return MiniKdc.LOCAL_KRB5CONF_FILE
def __init__(self, context, kafka_nodes, extra_principals=""):
super(MiniKdc, self).__init__(context, 1)
self.kafka_nodes = kafka_nodes
self.extra_principals = extra_principals
# context.local_scratch_dir uses a ducktape feature:
# each test_context object has a unique local scratch directory which is available for the duration of the test
# and is automatically garbage collected after the test finishes
MiniKdc._set_local_keytab_file(context.local_scratch_dir)
MiniKdc._set_local_krb5conf_file(context.local_scratch_dir)
def replace_in_file(self, file_path, pattern, subst):
fh, abs_path = mkstemp()
with open(abs_path, 'w') as new_file:
with open(file_path) as old_file:
for line in old_file:
new_file.write(line.replace(pattern, subst))
close(fh)
remove(file_path)
move(abs_path, file_path)
def start_node(self, node):
node.account.ssh("mkdir -p %s" % MiniKdc.WORK_DIR, allow_fail=False)
props_file = self.render('minikdc.properties', node=node)
node.account.create_file(MiniKdc.PROPS_FILE, props_file)
self.logger.info("minikdc.properties")
self.logger.info(props_file)
kafka_principals = ' '.join(['kafka/' + kafka_node.account.hostname for kafka_node in self.kafka_nodes])
principals = 'client ' + kafka_principals + ' ' + self.extra_principals
self.logger.info("Starting MiniKdc with principals " + principals)
core_libs_jar = self.path.jar(CORE_LIBS_JAR_NAME, DEV_BRANCH)
core_dependant_test_libs_jar = self.path.jar(CORE_DEPENDANT_TEST_LIBS_JAR_NAME, DEV_BRANCH)
cmd = "for file in %s; do CLASSPATH=$CLASSPATH:$file; done;" % core_libs_jar
cmd += " for file in %s; do CLASSPATH=$CLASSPATH:$file; done;" % core_dependant_test_libs_jar
cmd += " export CLASSPATH;"
cmd += " %s kafka.security.minikdc.MiniKdc %s %s %s %s 1>> %s 2>> %s &" % (self.path.script("kafka-run-class.sh", node), MiniKdc.WORK_DIR, MiniKdc.PROPS_FILE, MiniKdc.KEYTAB_FILE, principals, MiniKdc.LOG_FILE, MiniKdc.LOG_FILE)
self.logger.debug("Attempting to start MiniKdc on %s with command: %s" % (str(node.account), cmd))
with node.account.monitor_log(MiniKdc.LOG_FILE) as monitor:
node.account.ssh(cmd)
monitor.wait_until("MiniKdc Running", timeout_sec=60, backoff_sec=1, err_msg="MiniKdc didn't finish startup")
node.account.copy_from(MiniKdc.KEYTAB_FILE, MiniKdc.LOCAL_KEYTAB_FILE)
node.account.copy_from(MiniKdc.KRB5CONF_FILE, MiniKdc.LOCAL_KRB5CONF_FILE)
# KDC is set to bind openly (via 0.0.0.0). Change krb5.conf to hold the specific KDC address
self.replace_in_file(MiniKdc.LOCAL_KRB5CONF_FILE, '0.0.0.0', node.account.hostname)
def stop_node(self, node):
self.logger.info("Stopping %s on %s" % (type(self).__name__, node.account.hostname))
node.account.kill_java_processes("MiniKdc", clean_shutdown=True, allow_fail=False)
def clean_node(self, node):
node.account.kill_java_processes("MiniKdc", clean_shutdown=False, allow_fail=True)
node.account.ssh("rm -rf " + MiniKdc.WORK_DIR, allow_fail=False)
if os.path.exists(MiniKdc.LOCAL_KEYTAB_FILE):
os.remove(MiniKdc.LOCAL_KEYTAB_FILE)
if os.path.exists(MiniKdc.LOCAL_KRB5CONF_FILE):
os.remove(MiniKdc.LOCAL_KRB5CONF_FILE)
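# Illustrative sketch only (not part of the original service): how a test might start a
# MiniKdc by hand before bringing up SASL-enabled brokers. `test_context` and
# `kafka_service` are assumed to be provided by the surrounding ducktape test, and
# `extra_principals` is an optional space-separated list of additional principal names.
def _example_start_minikdc(test_context, kafka_service, extra_principals=""):
    mini_kdc = MiniKdc(test_context, kafka_service.nodes, extra_principals=extra_principals)
    # start() invokes start_node() above: it renders minikdc.properties, waits for the
    # "MiniKdc Running" log line, and copies the keytab and krb5.conf to the local scratch dir.
    mini_kdc.start()
    return mini_kdc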

View File

@@ -0,0 +1,352 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import subprocess
from tempfile import mkdtemp
from shutil import rmtree
from ducktape.template import TemplateRenderer
from kafkatest.services.security.minikdc import MiniKdc
from kafkatest.services.security.listener_security_config import ListenerSecurityConfig
import itertools
class SslStores(object):
def __init__(self, local_scratch_dir, logger=None):
self.logger = logger
self.ca_crt_path = os.path.join(local_scratch_dir, "test.ca.crt")
self.ca_jks_path = os.path.join(local_scratch_dir, "test.ca.jks")
self.ca_passwd = "test-ca-passwd"
self.truststore_path = os.path.join(local_scratch_dir, "test.truststore.jks")
self.truststore_passwd = "test-ts-passwd"
self.keystore_passwd = "test-ks-passwd"
# Zookeeper TLS (as of v3.5.6) does not support a key password different from the keystore password
self.key_passwd = self.keystore_passwd
# Allow up to one hour of clock skew between host and VMs
self.startdate = "-1H"
for file in [self.ca_crt_path, self.ca_jks_path, self.truststore_path]:
if os.path.exists(file):
os.remove(file)
def generate_ca(self):
"""
Generate CA private key and certificate.
"""
self.runcmd("keytool -genkeypair -alias ca -keyalg RSA -keysize 2048 -keystore %s -storetype JKS -storepass %s -keypass %s -dname CN=SystemTestCA -startdate %s --ext bc=ca:true" % (self.ca_jks_path, self.ca_passwd, self.ca_passwd, self.startdate))
self.runcmd("keytool -export -alias ca -keystore %s -storepass %s -storetype JKS -rfc -file %s" % (self.ca_jks_path, self.ca_passwd, self.ca_crt_path))
def generate_truststore(self):
"""
Generate JKS truststore containing CA certificate.
"""
self.runcmd("keytool -importcert -alias ca -file %s -keystore %s -storepass %s -storetype JKS -noprompt" % (self.ca_crt_path, self.truststore_path, self.truststore_passwd))
def generate_and_copy_keystore(self, node):
"""
Generate JKS keystore with certificate signed by the test CA.
The generated certificate has the node's hostname as a DNS SubjectAlternativeName.
"""
ks_dir = mkdtemp(dir="/tmp")
ks_path = os.path.join(ks_dir, "test.keystore.jks")
csr_path = os.path.join(ks_dir, "test.kafka.csr")
crt_path = os.path.join(ks_dir, "test.kafka.crt")
self.runcmd("keytool -genkeypair -alias kafka -keyalg RSA -keysize 2048 -keystore %s -storepass %s -storetype JKS -keypass %s -dname CN=systemtest -ext SAN=DNS:%s -startdate %s" % (ks_path, self.keystore_passwd, self.key_passwd, self.hostname(node), self.startdate))
self.runcmd("keytool -certreq -keystore %s -storepass %s -storetype JKS -keypass %s -alias kafka -file %s" % (ks_path, self.keystore_passwd, self.key_passwd, csr_path))
self.runcmd("keytool -gencert -keystore %s -storepass %s -storetype JKS -alias ca -infile %s -outfile %s -dname CN=systemtest -ext SAN=DNS:%s -startdate %s" % (self.ca_jks_path, self.ca_passwd, csr_path, crt_path, self.hostname(node), self.startdate))
self.runcmd("keytool -importcert -keystore %s -storepass %s -storetype JKS -alias ca -file %s -noprompt" % (ks_path, self.keystore_passwd, self.ca_crt_path))
self.runcmd("keytool -importcert -keystore %s -storepass %s -storetype JKS -keypass %s -alias kafka -file %s -noprompt" % (ks_path, self.keystore_passwd, self.key_passwd, crt_path))
node.account.copy_to(ks_path, SecurityConfig.KEYSTORE_PATH)
# generate ZooKeeper client TLS config file for encryption-only (no client cert) use case
str = """zookeeper.clientCnxnSocket=org.apache.zookeeper.ClientCnxnSocketNetty
zookeeper.ssl.client.enable=true
zookeeper.ssl.truststore.location=%s
zookeeper.ssl.truststore.password=%s
""" % (SecurityConfig.TRUSTSTORE_PATH, self.truststore_passwd)
node.account.create_file(SecurityConfig.ZK_CLIENT_TLS_ENCRYPT_ONLY_CONFIG_PATH, str)
# also generate ZooKeeper client TLS config file for mutual authentication use case
str = """zookeeper.clientCnxnSocket=org.apache.zookeeper.ClientCnxnSocketNetty
zookeeper.ssl.client.enable=true
zookeeper.ssl.truststore.location=%s
zookeeper.ssl.truststore.password=%s
zookeeper.ssl.keystore.location=%s
zookeeper.ssl.keystore.password=%s
""" % (SecurityConfig.TRUSTSTORE_PATH, self.truststore_passwd, SecurityConfig.KEYSTORE_PATH, self.keystore_passwd)
node.account.create_file(SecurityConfig.ZK_CLIENT_MUTUAL_AUTH_CONFIG_PATH, str)
rmtree(ks_dir)
def hostname(self, node):
""" Hostname which may be overridden for testing validation failures
"""
return node.account.hostname
def runcmd(self, cmd):
if self.logger:
self.logger.log(logging.DEBUG, cmd)
proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
stdout, stderr = proc.communicate()
if proc.returncode != 0:
raise RuntimeError("Command '%s' returned non-zero exit status %d: %s" % (cmd, proc.returncode, stdout))
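# Illustrative sketch only (not shipped with this module): the call order a test is
# expected to follow when provisioning TLS stores with SslStores -- generate the CA and
# truststore once per test, then a signed keystore per node. `local_scratch_dir`,
# `nodes` and `logger` are assumed to be provided by the caller.
def _example_provision_ssl_stores(local_scratch_dir, nodes, logger=None):
    stores = SslStores(local_scratch_dir, logger)
    stores.generate_ca()           # CA key pair in test.ca.jks, certificate exported to test.ca.crt
    stores.generate_truststore()   # truststore holding the CA certificate
    for node in nodes:
        # per-host keystore signed by the CA (SAN=DNS:<hostname>), copied to the node
        # together with the ZooKeeper client TLS config files
        stores.generate_and_copy_keystore(node)
    return stores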
class SecurityConfig(TemplateRenderer):
PLAINTEXT = 'PLAINTEXT'
SSL = 'SSL'
SASL_PLAINTEXT = 'SASL_PLAINTEXT'
SASL_SSL = 'SASL_SSL'
SASL_MECHANISM_GSSAPI = 'GSSAPI'
SASL_MECHANISM_PLAIN = 'PLAIN'
SASL_MECHANISM_SCRAM_SHA_256 = 'SCRAM-SHA-256'
SASL_MECHANISM_SCRAM_SHA_512 = 'SCRAM-SHA-512'
SCRAM_CLIENT_USER = "kafka-client"
SCRAM_CLIENT_PASSWORD = "client-secret"
SCRAM_BROKER_USER = "kafka-broker"
SCRAM_BROKER_PASSWORD = "broker-secret"
CONFIG_DIR = "/mnt/security"
KEYSTORE_PATH = "/mnt/security/test.keystore.jks"
TRUSTSTORE_PATH = "/mnt/security/test.truststore.jks"
ZK_CLIENT_TLS_ENCRYPT_ONLY_CONFIG_PATH = "/mnt/security/zk_client_tls_encrypt_only_config.properties"
ZK_CLIENT_MUTUAL_AUTH_CONFIG_PATH = "/mnt/security/zk_client_mutual_auth_config.properties"
JAAS_CONF_PATH = "/mnt/security/jaas.conf"
KRB5CONF_PATH = "/mnt/security/krb5.conf"
KEYTAB_PATH = "/mnt/security/keytab"
# This is initialized only when the first instance of SecurityConfig is created
ssl_stores = None
def __init__(self, context, security_protocol=None, interbroker_security_protocol=None,
client_sasl_mechanism=SASL_MECHANISM_GSSAPI, interbroker_sasl_mechanism=SASL_MECHANISM_GSSAPI,
zk_sasl=False, zk_tls=False, template_props="", static_jaas_conf=True, jaas_override_variables=None,
listener_security_config=ListenerSecurityConfig()):
"""
Initialize the security properties for the node and copy
keystore and truststore to the remote node if the transport protocol
is SSL. If security_protocol is None, the protocol specified in the
template properties file is used. If no protocol is specified in the
template properties either, PLAINTEXT is used as default.
"""
self.context = context
if not SecurityConfig.ssl_stores:
# This generates keystore/truststore files in a local scratch directory which gets
# automatically destroyed after the test is run
# Creating within the scratch directory allows us to run tests in parallel without fear of collision
SecurityConfig.ssl_stores = SslStores(context.local_scratch_dir, context.logger)
SecurityConfig.ssl_stores.generate_ca()
SecurityConfig.ssl_stores.generate_truststore()
if security_protocol is None:
security_protocol = self.get_property('security.protocol', template_props)
if security_protocol is None:
security_protocol = SecurityConfig.PLAINTEXT
elif security_protocol not in [SecurityConfig.PLAINTEXT, SecurityConfig.SSL, SecurityConfig.SASL_PLAINTEXT, SecurityConfig.SASL_SSL]:
raise Exception("Invalid security.protocol in template properties: " + security_protocol)
if interbroker_security_protocol is None:
interbroker_security_protocol = security_protocol
self.interbroker_security_protocol = interbroker_security_protocol
self.has_sasl = self.is_sasl(security_protocol) or self.is_sasl(interbroker_security_protocol) or zk_sasl
self.has_ssl = self.is_ssl(security_protocol) or self.is_ssl(interbroker_security_protocol) or zk_tls
self.zk_sasl = zk_sasl
self.zk_tls = zk_tls
self.static_jaas_conf = static_jaas_conf
self.listener_security_config = listener_security_config
self.properties = {
'security.protocol' : security_protocol,
'ssl.keystore.location' : SecurityConfig.KEYSTORE_PATH,
'ssl.keystore.password' : SecurityConfig.ssl_stores.keystore_passwd,
'ssl.key.password' : SecurityConfig.ssl_stores.key_passwd,
'ssl.truststore.location' : SecurityConfig.TRUSTSTORE_PATH,
'ssl.truststore.password' : SecurityConfig.ssl_stores.truststore_passwd,
'ssl.endpoint.identification.algorithm' : 'HTTPS',
'sasl.mechanism' : client_sasl_mechanism,
'sasl.mechanism.inter.broker.protocol' : interbroker_sasl_mechanism,
'sasl.kerberos.service.name' : 'kafka'
}
self.properties.update(self.listener_security_config.client_listener_overrides)
self.jaas_override_variables = jaas_override_variables or {}
def client_config(self, template_props="", node=None, jaas_override_variables=None):
# If node is not specified, use static jaas config which will be created later.
# Otherwise use static JAAS configuration files with SASL_SSL and sasl.jaas.config
# property with SASL_PLAINTEXT so that both code paths are tested by existing tests.
# Note that this is an arbitrary choice and it is possible to run all tests with
# either static or dynamic jaas config files if required.
static_jaas_conf = node is None or (self.has_sasl and self.has_ssl)
return SecurityConfig(self.context, self.security_protocol,
client_sasl_mechanism=self.client_sasl_mechanism,
template_props=template_props,
static_jaas_conf=static_jaas_conf,
jaas_override_variables=jaas_override_variables,
listener_security_config=self.listener_security_config)
def enable_security_protocol(self, security_protocol):
self.has_sasl = self.has_sasl or self.is_sasl(security_protocol)
self.has_ssl = self.has_ssl or self.is_ssl(security_protocol)
def setup_ssl(self, node):
node.account.ssh("mkdir -p %s" % SecurityConfig.CONFIG_DIR, allow_fail=False)
node.account.copy_to(SecurityConfig.ssl_stores.truststore_path, SecurityConfig.TRUSTSTORE_PATH)
SecurityConfig.ssl_stores.generate_and_copy_keystore(node)
def setup_sasl(self, node):
node.account.ssh("mkdir -p %s" % SecurityConfig.CONFIG_DIR, allow_fail=False)
jaas_conf_file = "jaas.conf"
java_version = node.account.ssh_capture("java -version")
jaas_conf = None
if 'sasl.jaas.config' not in self.properties:
jaas_conf = self.render_jaas_config(
jaas_conf_file,
{
'node': node,
'is_ibm_jdk': any('IBM' in line for line in java_version),
'SecurityConfig': SecurityConfig,
'client_sasl_mechanism': self.client_sasl_mechanism,
'enabled_sasl_mechanisms': self.enabled_sasl_mechanisms
}
)
else:
jaas_conf = self.properties['sasl.jaas.config']
if self.static_jaas_conf:
node.account.create_file(SecurityConfig.JAAS_CONF_PATH, jaas_conf)
elif 'sasl.jaas.config' not in self.properties:
self.properties['sasl.jaas.config'] = jaas_conf.replace("\n", " \\\n")
if self.has_sasl_kerberos:
node.account.copy_to(MiniKdc.LOCAL_KEYTAB_FILE, SecurityConfig.KEYTAB_PATH)
node.account.copy_to(MiniKdc.LOCAL_KRB5CONF_FILE, SecurityConfig.KRB5CONF_PATH)
def render_jaas_config(self, jaas_conf_file, config_variables):
"""
Renders the JAAS config file contents
:param jaas_conf_file: name of the JAAS config template file
:param config_variables: dict of variables used in the template
:return: the rendered template string
"""
variables = config_variables.copy()
variables.update(self.jaas_override_variables) # override variables
return self.render(jaas_conf_file, **variables)
def setup_node(self, node):
if self.has_ssl:
self.setup_ssl(node)
if self.has_sasl:
self.setup_sasl(node)
def setup_credentials(self, node, path, zk_connect, broker):
if broker:
self.maybe_create_scram_credentials(node, zk_connect, path, self.interbroker_sasl_mechanism,
SecurityConfig.SCRAM_BROKER_USER, SecurityConfig.SCRAM_BROKER_PASSWORD)
else:
self.maybe_create_scram_credentials(node, zk_connect, path, self.client_sasl_mechanism,
SecurityConfig.SCRAM_CLIENT_USER, SecurityConfig.SCRAM_CLIENT_PASSWORD)
def maybe_create_scram_credentials(self, node, zk_connect, path, mechanism, user_name, password):
if self.has_sasl and self.is_sasl_scram(mechanism):
cmd = "%s --zookeeper %s --entity-name %s --entity-type users --alter --add-config %s=[password=%s]" % \
(path.script("kafka-configs.sh", node), zk_connect,
user_name, mechanism, password)
node.account.ssh(cmd)
def clean_node(self, node):
if self.security_protocol != SecurityConfig.PLAINTEXT:
node.account.ssh("rm -rf %s" % SecurityConfig.CONFIG_DIR, allow_fail=False)
def get_property(self, prop_name, template_props=""):
"""
Get property value from the string representation of
a properties file.
"""
value = None
for line in template_props.split("\n"):
items = line.split("=")
if len(items) == 2 and items[0].strip() == prop_name:
value = str(items[1].strip())
return value
def is_ssl(self, security_protocol):
return security_protocol == SecurityConfig.SSL or security_protocol == SecurityConfig.SASL_SSL
def is_sasl(self, security_protocol):
return security_protocol == SecurityConfig.SASL_PLAINTEXT or security_protocol == SecurityConfig.SASL_SSL
def is_sasl_scram(self, sasl_mechanism):
return sasl_mechanism == SecurityConfig.SASL_MECHANISM_SCRAM_SHA_256 or sasl_mechanism == SecurityConfig.SASL_MECHANISM_SCRAM_SHA_512
@property
def security_protocol(self):
return self.properties['security.protocol']
@property
def client_sasl_mechanism(self):
return self.properties['sasl.mechanism']
@property
def interbroker_sasl_mechanism(self):
return self.properties['sasl.mechanism.inter.broker.protocol']
@property
def enabled_sasl_mechanisms(self):
return set([self.client_sasl_mechanism, self.interbroker_sasl_mechanism])
@property
def has_sasl_kerberos(self):
return self.has_sasl and (SecurityConfig.SASL_MECHANISM_GSSAPI in self.enabled_sasl_mechanisms)
@property
def kafka_opts(self):
if self.has_sasl:
if self.static_jaas_conf:
return "\"-Djava.security.auth.login.config=%s -Djava.security.krb5.conf=%s\"" % (SecurityConfig.JAAS_CONF_PATH, SecurityConfig.KRB5CONF_PATH)
else:
return "\"-Djava.security.krb5.conf=%s\"" % SecurityConfig.KRB5CONF_PATH
else:
return ""
def props(self, prefix=''):
"""
Return properties as string with line separators, optionally with a prefix.
This is used to append security config properties to
a properties file.
:param prefix: prefix to add to each property
:return: a string containing line-separated properties
"""
if self.security_protocol == SecurityConfig.PLAINTEXT:
return ""
if self.has_sasl and not self.static_jaas_conf and 'sasl.jaas.config' not in self.properties:
raise Exception("JAAS configuration property has not yet been initialized")
config_lines = (prefix + key + "=" + value for key, value in self.properties.iteritems())
# Extra blank lines ensure this can be appended/prepended safely
return "\n".join(itertools.chain([""], config_lines, [""]))
def __str__(self):
"""
Return properties as a string with line separators.
"""
return self.props()
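# Illustrative sketch only (not part of the original module): the typical way a service
# consumes SecurityConfig -- construct it from the test context, provision the node, then
# append the rendered properties to its own config file. `context`, `node` and
# `base_properties` are assumed to come from the surrounding ducktape service.
def _example_apply_security_config(context, node, base_properties):
    security_config = SecurityConfig(context, SecurityConfig.SASL_SSL,
                                     client_sasl_mechanism=SecurityConfig.SASL_MECHANISM_SCRAM_SHA_256)
    # copies the truststore/keystore to the node and writes the JAAS configuration
    security_config.setup_node(node)
    # props() is line-separated and padded with blank lines, so it is safe to append
    return base_properties + security_config.props()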

View File

@@ -0,0 +1,108 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE
* file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file
* to You under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the
* License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
*/
{% if static_jaas_conf %}
KafkaClient {
{% endif %}
{% if "GSSAPI" in client_sasl_mechanism %}
{% if is_ibm_jdk %}
com.ibm.security.auth.module.Krb5LoginModule required debug=false
credsType=both
useKeytab="file:/mnt/security/keytab"
principal="client@EXAMPLE.COM";
{% else %}
com.sun.security.auth.module.Krb5LoginModule required debug=false
doNotPrompt=true
useKeyTab=true
storeKey=true
keyTab="/mnt/security/keytab"
principal="client@EXAMPLE.COM";
{% endif %}
{% elif client_sasl_mechanism == "PLAIN" %}
org.apache.kafka.common.security.plain.PlainLoginModule required
username="client"
password="client-secret";
{% elif "SCRAM-SHA-256" in client_sasl_mechanism or "SCRAM-SHA-512" in client_sasl_mechanism %}
org.apache.kafka.common.security.scram.ScramLoginModule required
username="{{ SecurityConfig.SCRAM_CLIENT_USER }}"
password="{{ SecurityConfig.SCRAM_CLIENT_PASSWORD }}";
{% endif %}
{% if static_jaas_conf %}
};
KafkaServer {
{% if "GSSAPI" in enabled_sasl_mechanisms %}
{% if is_ibm_jdk %}
com.ibm.security.auth.module.Krb5LoginModule required debug=false
credsType=both
useKeytab="file:/mnt/security/keytab"
principal="kafka/{{ node.account.hostname }}@EXAMPLE.COM";
{% else %}
com.sun.security.auth.module.Krb5LoginModule required debug=false
doNotPrompt=true
useKeyTab=true
storeKey=true
keyTab="/mnt/security/keytab"
principal="kafka/{{ node.account.hostname }}@EXAMPLE.COM";
{% endif %}
{% endif %}
{% if "PLAIN" in enabled_sasl_mechanisms %}
org.apache.kafka.common.security.plain.PlainLoginModule required
username="kafka"
password="kafka-secret"
user_client="client-secret"
user_kafka="kafka-secret";
{% endif %}
{% if "SCRAM-SHA-256" in client_sasl_mechanism or "SCRAM-SHA-512" in client_sasl_mechanism %}
org.apache.kafka.common.security.scram.ScramLoginModule required
username="{{ SecurityConfig.SCRAM_BROKER_USER }}"
password="{{ SecurityConfig.SCRAM_BROKER_PASSWORD }}";
{% endif %}
};
{% if zk_sasl %}
Client {
{% if is_ibm_jdk %}
com.ibm.security.auth.module.Krb5LoginModule required debug=false
credsType=both
useKeytab="file:/mnt/security/keytab"
principal="zkclient@EXAMPLE.COM";
{% else %}
com.sun.security.auth.module.Krb5LoginModule required
useKeyTab=true
keyTab="/mnt/security/keytab"
storeKey=true
useTicketCache=false
principal="zkclient@EXAMPLE.COM";
{% endif %}
};
Server {
{% if is_ibm_jdk %}
com.ibm.security.auth.module.Krb5LoginModule required debug=false
credsType=both
useKeytab="file:/mnt/security/keytab"
principal="zookeeper/{{ node.account.hostname }}@EXAMPLE.COM";
{% else %}
com.sun.security.auth.module.Krb5LoginModule required
useKeyTab=true
keyTab="/mnt/security/keytab"
storeKey=true
useTicketCache=false
principal="zookeeper/{{ node.account.hostname }}@EXAMPLE.COM";
{% endif %}
};
{% endif %}
{% endif %}

View File

@@ -0,0 +1,17 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
kdc.bind.address=0.0.0.0

View File

@@ -0,0 +1,701 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path
import signal
import streams_property
import consumer_property
from ducktape.services.service import Service
from ducktape.utils.util import wait_until
from kafkatest.directory_layout.kafka_path import KafkaPathResolverMixin
from kafkatest.services.kafka import KafkaConfig
from kafkatest.services.monitor.jmx import JmxMixin
from kafkatest.version import LATEST_0_10_0, LATEST_0_10_1
STATE_DIR = "state.dir"
class StreamsTestBaseService(KafkaPathResolverMixin, JmxMixin, Service):
"""Base class for Streams Test services providing some common settings and functionality"""
PERSISTENT_ROOT = "/mnt/streams"
# The log file contains normal log4j logs written using a file appender. stdout and stderr are handled separately
CONFIG_FILE = os.path.join(PERSISTENT_ROOT, "streams.properties")
LOG_FILE = os.path.join(PERSISTENT_ROOT, "streams.log")
STDOUT_FILE = os.path.join(PERSISTENT_ROOT, "streams.stdout")
STDERR_FILE = os.path.join(PERSISTENT_ROOT, "streams.stderr")
JMX_LOG_FILE = os.path.join(PERSISTENT_ROOT, "jmx_tool.log")
JMX_ERR_FILE = os.path.join(PERSISTENT_ROOT, "jmx_tool.err.log")
LOG4J_CONFIG_FILE = os.path.join(PERSISTENT_ROOT, "tools-log4j.properties")
PID_FILE = os.path.join(PERSISTENT_ROOT, "streams.pid")
CLEAN_NODE_ENABLED = True
logs = {
"streams_config": {
"path": CONFIG_FILE,
"collect_default": True},
"streams_config.1": {
"path": CONFIG_FILE + ".1",
"collect_default": True},
"streams_config.0-1": {
"path": CONFIG_FILE + ".0-1",
"collect_default": True},
"streams_config.1-1": {
"path": CONFIG_FILE + ".1-1",
"collect_default": True},
"streams_log": {
"path": LOG_FILE,
"collect_default": True},
"streams_stdout": {
"path": STDOUT_FILE,
"collect_default": True},
"streams_stderr": {
"path": STDERR_FILE,
"collect_default": True},
"streams_log.1": {
"path": LOG_FILE + ".1",
"collect_default": True},
"streams_stdout.1": {
"path": STDOUT_FILE + ".1",
"collect_default": True},
"streams_stderr.1": {
"path": STDERR_FILE + ".1",
"collect_default": True},
"streams_log.2": {
"path": LOG_FILE + ".2",
"collect_default": True},
"streams_stdout.2": {
"path": STDOUT_FILE + ".2",
"collect_default": True},
"streams_stderr.2": {
"path": STDERR_FILE + ".2",
"collect_default": True},
"streams_log.3": {
"path": LOG_FILE + ".3",
"collect_default": True},
"streams_stdout.3": {
"path": STDOUT_FILE + ".3",
"collect_default": True},
"streams_stderr.3": {
"path": STDERR_FILE + ".3",
"collect_default": True},
"streams_log.0-1": {
"path": LOG_FILE + ".0-1",
"collect_default": True},
"streams_stdout.0-1": {
"path": STDOUT_FILE + ".0-1",
"collect_default": True},
"streams_stderr.0-1": {
"path": STDERR_FILE + ".0-1",
"collect_default": True},
"streams_log.0-2": {
"path": LOG_FILE + ".0-2",
"collect_default": True},
"streams_stdout.0-2": {
"path": STDOUT_FILE + ".0-2",
"collect_default": True},
"streams_stderr.0-2": {
"path": STDERR_FILE + ".0-2",
"collect_default": True},
"streams_log.0-3": {
"path": LOG_FILE + ".0-3",
"collect_default": True},
"streams_stdout.0-3": {
"path": STDOUT_FILE + ".0-3",
"collect_default": True},
"streams_stderr.0-3": {
"path": STDERR_FILE + ".0-3",
"collect_default": True},
"streams_log.0-4": {
"path": LOG_FILE + ".0-4",
"collect_default": True},
"streams_stdout.0-4": {
"path": STDOUT_FILE + ".0-4",
"collect_default": True},
"streams_stderr.0-4": {
"path": STDERR_FILE + ".0-4",
"collect_default": True},
"streams_log.0-5": {
"path": LOG_FILE + ".0-5",
"collect_default": True},
"streams_stdout.0-5": {
"path": STDOUT_FILE + ".0-5",
"collect_default": True},
"streams_stderr.0-5": {
"path": STDERR_FILE + ".0-5",
"collect_default": True},
"streams_log.0-6": {
"path": LOG_FILE + ".0-6",
"collect_default": True},
"streams_stdout.0-6": {
"path": STDOUT_FILE + ".0-6",
"collect_default": True},
"streams_stderr.0-6": {
"path": STDERR_FILE + ".0-6",
"collect_default": True},
"streams_log.1-1": {
"path": LOG_FILE + ".1-1",
"collect_default": True},
"streams_stdout.1-1": {
"path": STDOUT_FILE + ".1-1",
"collect_default": True},
"streams_stderr.1-1": {
"path": STDERR_FILE + ".1-1",
"collect_default": True},
"streams_log.1-2": {
"path": LOG_FILE + ".1-2",
"collect_default": True},
"streams_stdout.1-2": {
"path": STDOUT_FILE + ".1-2",
"collect_default": True},
"streams_stderr.1-2": {
"path": STDERR_FILE + ".1-2",
"collect_default": True},
"streams_log.1-3": {
"path": LOG_FILE + ".1-3",
"collect_default": True},
"streams_stdout.1-3": {
"path": STDOUT_FILE + ".1-3",
"collect_default": True},
"streams_stderr.1-3": {
"path": STDERR_FILE + ".1-3",
"collect_default": True},
"streams_log.1-4": {
"path": LOG_FILE + ".1-4",
"collect_default": True},
"streams_stdout.1-4": {
"path": STDOUT_FILE + ".1-4",
"collect_default": True},
"streams_stderr.1-4": {
"path": STDERR_FILE + ".1-4",
"collect_default": True},
"streams_log.1-5": {
"path": LOG_FILE + ".1-5",
"collect_default": True},
"streams_stdout.1-5": {
"path": STDOUT_FILE + ".1-5",
"collect_default": True},
"streams_stderr.1-5": {
"path": STDERR_FILE + ".1-5",
"collect_default": True},
"streams_log.1-6": {
"path": LOG_FILE + ".1-6",
"collect_default": True},
"streams_stdout.1-6": {
"path": STDOUT_FILE + ".1-6",
"collect_default": True},
"streams_stderr.1-6": {
"path": STDERR_FILE + ".1-6",
"collect_default": True},
"jmx_log": {
"path": JMX_LOG_FILE,
"collect_default": True},
"jmx_err": {
"path": JMX_ERR_FILE,
"collect_default": True},
}
def __init__(self, test_context, kafka, streams_class_name, user_test_args1, user_test_args2=None, user_test_args3=None, user_test_args4=None):
Service.__init__(self, test_context, num_nodes=1)
self.kafka = kafka
self.args = {'streams_class_name': streams_class_name,
'user_test_args1': user_test_args1,
'user_test_args2': user_test_args2,
'user_test_args3': user_test_args3,
'user_test_args4': user_test_args4}
self.log_level = "DEBUG"
@property
def node(self):
return self.nodes[0]
def pids(self, node):
try:
pids = [pid for pid in node.account.ssh_capture("cat " + self.PID_FILE, callback=str)]
return [int(pid) for pid in pids]
except Exception as exception:
self.logger.debug(str(exception))
return []
def stop_nodes(self, clean_shutdown=True):
for node in self.nodes:
self.stop_node(node, clean_shutdown)
def stop_node(self, node, clean_shutdown=True):
self.logger.info((clean_shutdown and "Cleanly" or "Forcibly") + " stopping Streams Test on " + str(node.account))
pids = self.pids(node)
sig = signal.SIGTERM if clean_shutdown else signal.SIGKILL
for pid in pids:
node.account.signal(pid, sig, allow_fail=True)
if clean_shutdown:
for pid in pids:
wait_until(lambda: not node.account.alive(pid), timeout_sec=120, err_msg="Streams Test process on " + str(node.account) + " took too long to exit")
node.account.ssh("rm -f " + self.PID_FILE, allow_fail=False)
def restart(self):
# We don't want to do any clean up here, just restart the process.
for node in self.nodes:
self.logger.info("Restarting Kafka Streams on " + str(node.account))
self.stop_node(node)
self.start_node(node)
def abortThenRestart(self):
# We don't want to do any clean up here, just abort then restart the process. The running service is killed immediately.
for node in self.nodes:
self.logger.info("Aborting Kafka Streams on " + str(node.account))
self.stop_node(node, False)
self.logger.info("Restarting Kafka Streams on " + str(node.account))
self.start_node(node)
def wait(self, timeout_sec=1440):
for node in self.nodes:
self.wait_node(node, timeout_sec)
def wait_node(self, node, timeout_sec=None):
for pid in self.pids(node):
wait_until(lambda: not node.account.alive(pid), timeout_sec=timeout_sec, err_msg="Streams Test process on " + str(node.account) + " took too long to exit")
def clean_node(self, node):
node.account.kill_process("streams", clean_shutdown=False, allow_fail=True)
if self.CLEAN_NODE_ENABLED:
node.account.ssh("rm -rf " + self.PERSISTENT_ROOT, allow_fail=False)
def start_cmd(self, node):
args = self.args.copy()
args['config_file'] = self.CONFIG_FILE
args['stdout'] = self.STDOUT_FILE
args['stderr'] = self.STDERR_FILE
args['pidfile'] = self.PID_FILE
args['log4j'] = self.LOG4J_CONFIG_FILE
args['kafka_run_class'] = self.path.script("kafka-run-class.sh", node)
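# The command below backgrounds the streams class inside a subshell, writes its pid to
# the pid file via file descriptor 3, and appends stdout/stderr to the capture files.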
cmd = "( export KAFKA_LOG4J_OPTS=\"-Dlog4j.configuration=file:%(log4j)s\"; " \
"INCLUDE_TEST_JARS=true %(kafka_run_class)s %(streams_class_name)s " \
" %(config_file)s %(user_test_args1)s %(user_test_args2)s %(user_test_args3)s" \
" %(user_test_args4)s & echo $! >&3 ) 1>> %(stdout)s 2>> %(stderr)s 3> %(pidfile)s" % args
self.logger.info("Executing streams cmd: " + cmd)
return cmd
def prop_file(self):
cfg = KafkaConfig(**{streams_property.STATE_DIR: self.PERSISTENT_ROOT, streams_property.KAFKA_SERVERS: self.kafka.bootstrap_servers()})
return cfg.render()
def start_node(self, node):
node.account.mkdirs(self.PERSISTENT_ROOT)
prop_file = self.prop_file()
node.account.create_file(self.CONFIG_FILE, prop_file)
node.account.create_file(self.LOG4J_CONFIG_FILE, self.render('tools_log4j.properties', log_file=self.LOG_FILE))
self.logger.info("Starting StreamsTest process on " + str(node.account))
with node.account.monitor_log(self.STDOUT_FILE) as monitor:
node.account.ssh(self.start_cmd(node))
monitor.wait_until('StreamsTest instance started', timeout_sec=60, err_msg="Never saw message indicating StreamsTest finished startup on " + str(node.account))
if len(self.pids(node)) == 0:
raise RuntimeError("No process ids recorded")
class StreamsSmokeTestBaseService(StreamsTestBaseService):
"""Base class for Streams Smoke Test services providing some common settings and functionality"""
def __init__(self, test_context, kafka, command, processing_guarantee = 'at_least_once', num_threads = 3, replication_factor = 3):
super(StreamsSmokeTestBaseService, self).__init__(test_context,
kafka,
"org.apache.kafka.streams.tests.StreamsSmokeTest",
command)
self.NUM_THREADS = num_threads
self.PROCESSING_GUARANTEE = processing_guarantee
self.KAFKA_STREAMS_VERSION = ""
self.UPGRADE_FROM = None
self.REPLICATION_FACTOR = replication_factor
def set_version(self, kafka_streams_version):
self.KAFKA_STREAMS_VERSION = kafka_streams_version
def set_upgrade_from(self, upgrade_from):
self.UPGRADE_FROM = upgrade_from
def prop_file(self):
properties = {streams_property.STATE_DIR: self.PERSISTENT_ROOT,
streams_property.KAFKA_SERVERS: self.kafka.bootstrap_servers(),
"processing.guarantee": self.PROCESSING_GUARANTEE,
streams_property.NUM_THREADS: self.NUM_THREADS,
"replication.factor": self.REPLICATION_FACTOR,
"num.standby.replicas": 2,
"buffered.records.per.partition": 100,
"commit.interval.ms": 1000,
"auto.offset.reset": "earliest",
"acks": "all"}
if self.UPGRADE_FROM is not None:
properties['upgrade.from'] = self.UPGRADE_FROM
cfg = KafkaConfig(**properties)
return cfg.render()
def start_cmd(self, node):
args = self.args.copy()
args['config_file'] = self.CONFIG_FILE
args['stdout'] = self.STDOUT_FILE
args['stderr'] = self.STDERR_FILE
args['pidfile'] = self.PID_FILE
args['log4j'] = self.LOG4J_CONFIG_FILE
args['version'] = self.KAFKA_STREAMS_VERSION
args['kafka_run_class'] = self.path.script("kafka-run-class.sh", node)
cmd = "( export KAFKA_LOG4J_OPTS=\"-Dlog4j.configuration=file:%(log4j)s\";" \
" INCLUDE_TEST_JARS=true UPGRADE_KAFKA_STREAMS_TEST_VERSION=%(version)s" \
" bash -x %(kafka_run_class)s %(streams_class_name)s" \
" %(config_file)s %(user_test_args1)s" \
" & echo $! >&3 ) " \
"1>> %(stdout)s 2>> %(stderr)s 3> %(pidfile)s" % args
self.logger.info("Executing streams cmd: " + cmd)
return cmd
class StreamsEosTestBaseService(StreamsTestBaseService):
"""Base class for Streams EOS Test services providing some common settings and functionality"""
clean_node_enabled = True
def __init__(self, test_context, kafka, command):
super(StreamsEosTestBaseService, self).__init__(test_context,
kafka,
"org.apache.kafka.streams.tests.StreamsEosTest",
command)
def clean_node(self, node):
if self.clean_node_enabled:
super(StreamsEosTestBaseService, self).clean_node(node)
class StreamsSmokeTestDriverService(StreamsSmokeTestBaseService):
def __init__(self, test_context, kafka):
super(StreamsSmokeTestDriverService, self).__init__(test_context, kafka, "run")
self.DISABLE_AUTO_TERMINATE = ""
def disable_auto_terminate(self):
self.DISABLE_AUTO_TERMINATE = "disableAutoTerminate"
def start_cmd(self, node):
args = self.args.copy()
args['config_file'] = self.CONFIG_FILE
args['stdout'] = self.STDOUT_FILE
args['stderr'] = self.STDERR_FILE
args['pidfile'] = self.PID_FILE
args['log4j'] = self.LOG4J_CONFIG_FILE
args['disable_auto_terminate'] = self.DISABLE_AUTO_TERMINATE
args['kafka_run_class'] = self.path.script("kafka-run-class.sh", node)
cmd = "( export KAFKA_LOG4J_OPTS=\"-Dlog4j.configuration=file:%(log4j)s\"; " \
"INCLUDE_TEST_JARS=true %(kafka_run_class)s %(streams_class_name)s " \
" %(config_file)s %(user_test_args1)s %(disable_auto_terminate)s" \
" & echo $! >&3 ) 1>> %(stdout)s 2>> %(stderr)s 3> %(pidfile)s" % args
return cmd
class StreamsSmokeTestJobRunnerService(StreamsSmokeTestBaseService):
def __init__(self, test_context, kafka, processing_guarantee = 'at_least_once', num_threads = 3, replication_factor = 3):
super(StreamsSmokeTestJobRunnerService, self).__init__(test_context, kafka, "process", processing_guarantee = processing_guarantee, num_threads = num_threads, replication_factor = replication_factor)
class StreamsSmokeTestEOSJobRunnerService(StreamsSmokeTestBaseService):
def __init__(self, test_context, kafka):
super(StreamsSmokeTestEOSJobRunnerService, self).__init__(test_context, kafka, "process-eos")
class StreamsEosTestDriverService(StreamsEosTestBaseService):
def __init__(self, test_context, kafka):
super(StreamsEosTestDriverService, self).__init__(test_context, kafka, "run")
class StreamsEosTestJobRunnerService(StreamsEosTestBaseService):
def __init__(self, test_context, kafka):
super(StreamsEosTestJobRunnerService, self).__init__(test_context, kafka, "process")
class StreamsComplexEosTestJobRunnerService(StreamsEosTestBaseService):
def __init__(self, test_context, kafka):
super(StreamsComplexEosTestJobRunnerService, self).__init__(test_context, kafka, "process-complex")
class StreamsEosTestVerifyRunnerService(StreamsEosTestBaseService):
def __init__(self, test_context, kafka):
super(StreamsEosTestVerifyRunnerService, self).__init__(test_context, kafka, "verify")
class StreamsComplexEosTestVerifyRunnerService(StreamsEosTestBaseService):
def __init__(self, test_context, kafka):
super(StreamsComplexEosTestVerifyRunnerService, self).__init__(test_context, kafka, "verify-complex")
class StreamsSmokeTestShutdownDeadlockService(StreamsSmokeTestBaseService):
def __init__(self, test_context, kafka):
super(StreamsSmokeTestShutdownDeadlockService, self).__init__(test_context, kafka, "close-deadlock-test")
class StreamsBrokerCompatibilityService(StreamsTestBaseService):
def __init__(self, test_context, kafka, eosEnabled):
super(StreamsBrokerCompatibilityService, self).__init__(test_context,
kafka,
"org.apache.kafka.streams.tests.BrokerCompatibilityTest",
eosEnabled)
class StreamsBrokerDownResilienceService(StreamsTestBaseService):
def __init__(self, test_context, kafka, configs):
super(StreamsBrokerDownResilienceService, self).__init__(test_context,
kafka,
"org.apache.kafka.streams.tests.StreamsBrokerDownResilienceTest",
configs)
def start_cmd(self, node):
args = self.args.copy()
args['config_file'] = self.CONFIG_FILE
args['stdout'] = self.STDOUT_FILE
args['stderr'] = self.STDERR_FILE
args['pidfile'] = self.PID_FILE
args['log4j'] = self.LOG4J_CONFIG_FILE
args['kafka_run_class'] = self.path.script("kafka-run-class.sh", node)
cmd = "( export KAFKA_LOG4J_OPTS=\"-Dlog4j.configuration=file:%(log4j)s\"; " \
"INCLUDE_TEST_JARS=true %(kafka_run_class)s %(streams_class_name)s " \
" %(config_file)s %(user_test_args1)s %(user_test_args2)s %(user_test_args3)s" \
" %(user_test_args4)s & echo $! >&3 ) 1>> %(stdout)s 2>> %(stderr)s 3> %(pidfile)s" % args
self.logger.info("Executing: " + cmd)
return cmd
class StreamsStandbyTaskService(StreamsTestBaseService):
def __init__(self, test_context, kafka, configs):
super(StreamsStandbyTaskService, self).__init__(test_context,
kafka,
"org.apache.kafka.streams.tests.StreamsStandByReplicaTest",
configs)
class StreamsOptimizedUpgradeTestService(StreamsTestBaseService):
def __init__(self, test_context, kafka):
super(StreamsOptimizedUpgradeTestService, self).__init__(test_context,
kafka,
"org.apache.kafka.streams.tests.StreamsOptimizedTest",
"")
self.OPTIMIZED_CONFIG = 'none'
self.INPUT_TOPIC = None
self.AGGREGATION_TOPIC = None
self.REDUCE_TOPIC = None
self.JOIN_TOPIC = None
def prop_file(self):
properties = {streams_property.STATE_DIR: self.PERSISTENT_ROOT,
streams_property.KAFKA_SERVERS: self.kafka.bootstrap_servers()}
properties['topology.optimization'] = self.OPTIMIZED_CONFIG
properties['input.topic'] = self.INPUT_TOPIC
properties['aggregation.topic'] = self.AGGREGATION_TOPIC
properties['reduce.topic'] = self.REDUCE_TOPIC
properties['join.topic'] = self.JOIN_TOPIC
cfg = KafkaConfig(**properties)
return cfg.render()
class StreamsUpgradeTestJobRunnerService(StreamsTestBaseService):
def __init__(self, test_context, kafka):
super(StreamsUpgradeTestJobRunnerService, self).__init__(test_context,
kafka,
"org.apache.kafka.streams.tests.StreamsUpgradeTest",
"")
self.UPGRADE_FROM = None
self.UPGRADE_TO = None
self.extra_properties = {}
def set_config(self, key, value):
self.extra_properties[key] = value
def set_version(self, kafka_streams_version):
self.KAFKA_STREAMS_VERSION = kafka_streams_version
def set_upgrade_from(self, upgrade_from):
self.UPGRADE_FROM = upgrade_from
def set_upgrade_to(self, upgrade_to):
self.UPGRADE_TO = upgrade_to
def prop_file(self):
properties = self.extra_properties.copy()
properties[streams_property.STATE_DIR] = self.PERSISTENT_ROOT
properties[streams_property.KAFKA_SERVERS] = self.kafka.bootstrap_servers()
if self.UPGRADE_FROM is not None:
properties['upgrade.from'] = self.UPGRADE_FROM
if self.UPGRADE_TO == "future_version":
properties['test.future.metadata'] = "any_value"
cfg = KafkaConfig(**properties)
return cfg.render()
def start_cmd(self, node):
args = self.args.copy()
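# Streams test clients from 0.10.0/0.10.1 still take a ZooKeeper connect string as an
# extra command-line argument; for newer versions an empty string is substituted so the
# argument is effectively omitted.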
if self.KAFKA_STREAMS_VERSION == str(LATEST_0_10_0) or self.KAFKA_STREAMS_VERSION == str(LATEST_0_10_1):
args['zk'] = self.kafka.zk.connect_setting()
else:
args['zk'] = ""
args['config_file'] = self.CONFIG_FILE
args['stdout'] = self.STDOUT_FILE
args['stderr'] = self.STDERR_FILE
args['pidfile'] = self.PID_FILE
args['log4j'] = self.LOG4J_CONFIG_FILE
args['version'] = self.KAFKA_STREAMS_VERSION
args['kafka_run_class'] = self.path.script("kafka-run-class.sh", node)
cmd = "( export KAFKA_LOG4J_OPTS=\"-Dlog4j.configuration=file:%(log4j)s\"; " \
"INCLUDE_TEST_JARS=true UPGRADE_KAFKA_STREAMS_TEST_VERSION=%(version)s " \
" %(kafka_run_class)s %(streams_class_name)s %(zk)s %(config_file)s " \
" & echo $! >&3 ) 1>> %(stdout)s 2>> %(stderr)s 3> %(pidfile)s" % args
self.logger.info("Executing: " + cmd)
return cmd
class StreamsNamedRepartitionTopicService(StreamsTestBaseService):
def __init__(self, test_context, kafka):
super(StreamsNamedRepartitionTopicService, self).__init__(test_context,
kafka,
"org.apache.kafka.streams.tests.StreamsNamedRepartitionTest",
"")
self.ADD_ADDITIONAL_OPS = 'false'
self.INPUT_TOPIC = None
self.AGGREGATION_TOPIC = None
def prop_file(self):
properties = {streams_property.STATE_DIR: self.PERSISTENT_ROOT,
streams_property.KAFKA_SERVERS: self.kafka.bootstrap_servers()}
properties['input.topic'] = self.INPUT_TOPIC
properties['aggregation.topic'] = self.AGGREGATION_TOPIC
properties['add.operations'] = self.ADD_ADDITIONAL_OPS
cfg = KafkaConfig(**properties)
return cfg.render()
class StaticMemberTestService(StreamsTestBaseService):
def __init__(self, test_context, kafka, group_instance_id, num_threads):
super(StaticMemberTestService, self).__init__(test_context,
kafka,
"org.apache.kafka.streams.tests.StaticMemberTestClient",
"")
self.INPUT_TOPIC = None
self.GROUP_INSTANCE_ID = group_instance_id
self.NUM_THREADS = num_threads
def prop_file(self):
properties = {streams_property.STATE_DIR: self.PERSISTENT_ROOT,
streams_property.KAFKA_SERVERS: self.kafka.bootstrap_servers(),
streams_property.NUM_THREADS: self.NUM_THREADS,
consumer_property.GROUP_INSTANCE_ID: self.GROUP_INSTANCE_ID,
consumer_property.SESSION_TIMEOUT_MS: 60000}
properties['input.topic'] = self.INPUT_TOPIC
cfg = KafkaConfig(**properties)
return cfg.render()
class CooperativeRebalanceUpgradeService(StreamsTestBaseService):
def __init__(self, test_context, kafka):
super(CooperativeRebalanceUpgradeService, self).__init__(test_context,
kafka,
"org.apache.kafka.streams.tests.StreamsUpgradeToCooperativeRebalanceTest",
"")
self.UPGRADE_FROM = None
# these properties will be overridden in test
self.SOURCE_TOPIC = None
self.SINK_TOPIC = None
self.TASK_DELIMITER = "#"
self.REPORT_INTERVAL = None
self.standby_tasks = None
self.active_tasks = None
self.upgrade_phase = None
def set_tasks(self, task_string):
label = "TASK-ASSIGNMENTS:"
task_string_substr = task_string[len(label):]
all_tasks = task_string_substr.split(self.TASK_DELIMITER)
self.active_tasks = set(all_tasks[0].split(","))
if len(all_tasks) > 1:
self.standby_tasks = set(all_tasks[1].split(","))
def set_version(self, kafka_streams_version):
self.KAFKA_STREAMS_VERSION = kafka_streams_version
def set_upgrade_phase(self, upgrade_phase):
self.upgrade_phase = upgrade_phase
def start_cmd(self, node):
args = self.args.copy()
if self.KAFKA_STREAMS_VERSION == str(LATEST_0_10_0) or self.KAFKA_STREAMS_VERSION == str(LATEST_0_10_1):
args['zk'] = self.kafka.zk.connect_setting()
else:
args['zk'] = ""
args['config_file'] = self.CONFIG_FILE
args['stdout'] = self.STDOUT_FILE
args['stderr'] = self.STDERR_FILE
args['pidfile'] = self.PID_FILE
args['log4j'] = self.LOG4J_CONFIG_FILE
args['version'] = self.KAFKA_STREAMS_VERSION
args['kafka_run_class'] = self.path.script("kafka-run-class.sh", node)
cmd = "( export KAFKA_LOG4J_OPTS=\"-Dlog4j.configuration=file:%(log4j)s\"; " \
"INCLUDE_TEST_JARS=true UPGRADE_KAFKA_STREAMS_TEST_VERSION=%(version)s " \
" %(kafka_run_class)s %(streams_class_name)s %(zk)s %(config_file)s " \
" & echo $! >&3 ) 1>> %(stdout)s 2>> %(stderr)s 3> %(pidfile)s" % args
self.logger.info("Executing: " + cmd)
return cmd
def prop_file(self):
properties = {streams_property.STATE_DIR: self.PERSISTENT_ROOT,
streams_property.KAFKA_SERVERS: self.kafka.bootstrap_servers()}
if self.UPGRADE_FROM is not None:
properties['upgrade.from'] = self.UPGRADE_FROM
else:
try:
del properties['upgrade.from']
except KeyError:
self.logger.info("Key 'upgrade.from' not there, better safe than sorry")
if self.upgrade_phase is not None:
properties['upgrade.phase'] = self.upgrade_phase
properties['source.topic'] = self.SOURCE_TOPIC
properties['sink.topic'] = self.SINK_TOPIC
properties['task.delimiter'] = self.TASK_DELIMITER
properties['report.interval'] = self.REPORT_INTERVAL
cfg = KafkaConfig(**properties)
return cfg.render()
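# Illustrative sketch only (not part of the original module): one possible pairing of the
# smoke-test services, loosely following how the system tests use them -- a driver node in
# "run" mode alongside a job runner in "process" mode. `test_context` and `kafka` are
# assumed to be supplied by the surrounding ducktape test.
def _example_run_smoke_test(test_context, kafka):
    driver = StreamsSmokeTestDriverService(test_context, kafka)
    processor = StreamsSmokeTestJobRunnerService(test_context, kafka)
    driver.start()
    processor.start()
    # wait() blocks until the recorded pids exit (see wait_node above)
    driver.wait()
    processor.stop()
    return driver, processor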

View File

@@ -0,0 +1,22 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Define Streams configuration property names here.
"""
STATE_DIR = "state.dir"
KAFKA_SERVERS = "bootstrap.servers"
NUM_THREADS = "num.stream.threads"
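# Illustrative sketch only: these constants are used as keys when rendering a Streams
# properties file (mirroring prop_file() in streams.py); the argument values here are
# placeholders supplied by the caller.
def _example_streams_props(state_dir, bootstrap_servers, num_threads):
    from kafkatest.services.kafka import KafkaConfig
    return KafkaConfig(**{STATE_DIR: state_dir,
                          KAFKA_SERVERS: bootstrap_servers,
                          NUM_THREADS: num_threads}).render()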

View File

@@ -0,0 +1,29 @@
##
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##
# Define the root logger with appender file
log4j.rootLogger = {{ log_level|default("INFO") }}, FILE
log4j.appender.FILE=org.apache.log4j.FileAppender
log4j.appender.FILE.File={{ log_file }}
log4j.appender.FILE.ImmediateFlush=true
log4j.appender.FILE.Append=true
log4j.appender.FILE.layout=org.apache.log4j.PatternLayout
log4j.appender.FILE.layout.conversionPattern=[%d] %p %m (%c)%n
log4j.logger.org.apache.zookeeper=ERROR
log4j.logger.org.reflections=ERROR

View File

@@ -0,0 +1,24 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
group.id={{ group_id|default('test-consumer-group') }}
{% if client_id is defined and client_id is not none %}
client.id={{ client_id }}
{% endif %}
{% if consumer_metadata_max_age_ms is defined and consumer_metadata_max_age_ms is not none %}
metadata.max.age.ms={{ consumer_metadata_max_age_ms }}
{% endif %}

View File

@@ -0,0 +1,27 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# see kafka.consumer.ConsumerConfig for more details
bootstrap.servers={{ source.bootstrap_servers(security_config.security_protocol) }}
{% if source_auto_offset_reset is defined and source_auto_offset_reset is not none %}
auto.offset.reset={{ source_auto_offset_reset|default('latest') }}
{% endif %}
group.id={{ group_id|default('test-consumer-group') }}
{% if partition_assignment_strategy is defined and partition_assignment_strategy is not none %}
partition.assignment.strategy={{ partition_assignment_strategy }}
{% endif %}

View File

@@ -0,0 +1,20 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
bootstrap.servers = {{ target.bootstrap_servers(security_config.security_protocol) }}
{% if producer_interceptor_classes is defined and producer_interceptor_classes is not none %}
interceptor.classes={{ producer_interceptor_classes }}
{% endif %}

View File

@@ -0,0 +1,17 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# see kafka.producer.ProducerConfig for more details
request.timeout.ms={{ request_timeout_ms }}

View File

@@ -0,0 +1,31 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Define the root logger with appender file
log4j.rootLogger = {{ log_level|default("INFO") }}, FILE
{% if loggers is defined %}
{% for logger, log_level in loggers.iteritems() %}
log4j.logger.{{ logger }}={{ log_level }}
{% endfor %}
{% endif %}
log4j.appender.FILE=org.apache.log4j.FileAppender
log4j.appender.FILE.File={{ log_file }}
log4j.appender.FILE.ImmediateFlush=true
# Set the append to true
log4j.appender.FILE.Append=true
log4j.appender.FILE.layout=org.apache.log4j.PatternLayout
log4j.appender.FILE.layout.conversionPattern=[%d] %p %m (%c)%n

View File

@@ -0,0 +1,40 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
dataDir=/mnt/zookeeper/data
{% if zk_client_port %}
clientPort=2181
{% endif %}
{% if zk_client_secure_port %}
secureClientPort=2182
serverCnxnFactory=org.apache.zookeeper.server.NettyServerCnxnFactory
authProvider.x509=org.apache.zookeeper.server.auth.X509AuthenticationProvider
ssl.keyStore.location=/mnt/security/test.keystore.jks
ssl.keyStore.password=test-ks-passwd
ssl.keyStore.type=JKS
ssl.trustStore.location=/mnt/security/test.truststore.jks
ssl.trustStore.password=test-ts-passwd
ssl.trustStore.type=JKS
{% if zk_tls_encrypt_only %}
ssl.clientAuth=none
{% endif %}
{% endif %}
maxClientCnxns=0
initLimit=5
syncLimit=2
quorumListenOnAllIPs=true
{% for node in nodes %}
server.{{ loop.index }}={{ node.account.hostname }}:2888:3888
{% endfor %}

View File

@@ -0,0 +1,204 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import json
import signal
from ducktape.utils.util import wait_until
from ducktape.services.background_thread import BackgroundThreadService
from kafkatest.directory_layout.kafka_path import KafkaPathResolverMixin
from ducktape.cluster.remoteaccount import RemoteCommandError
class TransactionalMessageCopier(KafkaPathResolverMixin, BackgroundThreadService):
"""This service wraps org.apache.kafka.tools.TransactionalMessageCopier for
use in system testing.
"""
PERSISTENT_ROOT = "/mnt/transactional_message_copier"
STDOUT_CAPTURE = os.path.join(PERSISTENT_ROOT, "transactional_message_copier.stdout")
STDERR_CAPTURE = os.path.join(PERSISTENT_ROOT, "transactional_message_copier.stderr")
LOG_DIR = os.path.join(PERSISTENT_ROOT, "logs")
LOG_FILE = os.path.join(LOG_DIR, "transactional_message_copier.log")
LOG4J_CONFIG = os.path.join(PERSISTENT_ROOT, "tools-log4j.properties")
logs = {
"transactional_message_copier_stdout": {
"path": STDOUT_CAPTURE,
"collect_default": True},
"transactional_message_copier_stderr": {
"path": STDERR_CAPTURE,
"collect_default": True},
"transactional_message_copier_log": {
"path": LOG_FILE,
"collect_default": True}
}
def __init__(self, context, num_nodes, kafka, transactional_id, consumer_group,
input_topic, input_partition, output_topic, max_messages=-1,
transaction_size=1000, transaction_timeout=None, enable_random_aborts=True,
use_group_metadata=False, group_mode=False):
super(TransactionalMessageCopier, self).__init__(context, num_nodes)
self.kafka = kafka
self.transactional_id = transactional_id
self.consumer_group = consumer_group
self.transaction_size = transaction_size
self.transaction_timeout = transaction_timeout
self.input_topic = input_topic
self.input_partition = input_partition
self.output_topic = output_topic
self.max_messages = max_messages
self.message_copy_finished = False
self.consumed = -1
self.remaining = -1
self.stop_timeout_sec = 60
self.enable_random_aborts = enable_random_aborts
self.use_group_metadata = use_group_metadata
self.group_mode = group_mode
self.loggers = {
"org.apache.kafka.clients.producer": "TRACE",
"org.apache.kafka.clients.consumer": "TRACE"
}
def _worker(self, idx, node):
node.account.ssh("mkdir -p %s" % TransactionalMessageCopier.PERSISTENT_ROOT,
allow_fail=False)
# Create and upload log properties
log_config = self.render('tools_log4j.properties',
log_file=TransactionalMessageCopier.LOG_FILE)
node.account.create_file(TransactionalMessageCopier.LOG4J_CONFIG, log_config)
# Configure security
self.security_config = self.kafka.security_config.client_config(node=node)
self.security_config.setup_node(node)
cmd = self.start_cmd(node, idx)
self.logger.debug("TransactionalMessageCopier %d command: %s" % (idx, cmd))
try:
for line in node.account.ssh_capture(cmd):
line = line.strip()
data = self.try_parse_json(line)
if data is not None:
with self.lock:
self.remaining = int(data["remaining"])
self.consumed = int(data["consumed"])
self.logger.info("%s: consumed %d, remaining %d" %
(self.transactional_id, self.consumed, self.remaining))
if "shutdown_complete" in data:
if self.remaining == 0:
# We are only finished if the remaining
# messages at the time of shutdown is 0.
#
# Otherwise a clean shutdown would still print
# a 'shutdown complete' message even though
# there are unprocessed messages, causing
# tests to fail.
self.logger.info("%s : Finished message copy" % self.transactional_id)
self.message_copy_finished = True
else:
self.logger.info("%s : Shut down without finishing message copy." %\
self.transactional_id)
except RemoteCommandError as e:
self.logger.debug("Got exception while reading output from copier, \
probably because it was SIGKILL'd (exit code 137): %s" % str(e))
def start_cmd(self, node, idx):
cmd = "export LOG_DIR=%s;" % TransactionalMessageCopier.LOG_DIR
cmd += " export KAFKA_OPTS=%s;" % self.security_config.kafka_opts
cmd += " export KAFKA_LOG4J_OPTS=\"-Dlog4j.configuration=file:%s\"; " % TransactionalMessageCopier.LOG4J_CONFIG
cmd += self.path.script("kafka-run-class.sh", node) + " org.apache.kafka.tools." + "TransactionalMessageCopier"
cmd += " --broker-list %s" % self.kafka.bootstrap_servers(self.security_config.security_protocol)
cmd += " --transactional-id %s" % self.transactional_id
cmd += " --consumer-group %s" % self.consumer_group
cmd += " --input-topic %s" % self.input_topic
cmd += " --output-topic %s" % self.output_topic
cmd += " --input-partition %s" % str(self.input_partition)
cmd += " --transaction-size %s" % str(self.transaction_size)
if self.transaction_timeout is not None:
cmd += " --transaction-timeout %s" % str(self.transaction_timeout)
if self.enable_random_aborts:
cmd += " --enable-random-aborts"
if self.use_group_metadata:
cmd += " --use-group-metadata"
if self.group_mode:
cmd += " --group-mode"
if self.max_messages > 0:
cmd += " --max-messages %s" % str(self.max_messages)
cmd += " 2>> %s | tee -a %s &" % (TransactionalMessageCopier.STDERR_CAPTURE, TransactionalMessageCopier.STDOUT_CAPTURE)
return cmd
def clean_node(self, node):
self.kill_node(node, clean_shutdown=False)
node.account.ssh("rm -rf " + self.PERSISTENT_ROOT, allow_fail=False)
self.security_config.clean_node(node)
def pids(self, node):
try:
cmd = "jps | grep -i TransactionalMessageCopier | awk '{print $1}'"
pid_arr = [pid for pid in node.account.ssh_capture(cmd, allow_fail=True, callback=int)]
return pid_arr
except (RemoteCommandError, ValueError) as e:
self.logger.error("Could not list pids: %s" % str(e))
return []
def alive(self, node):
return len(self.pids(node)) > 0
def kill_node(self, node, clean_shutdown=True):
pids = self.pids(node)
sig = signal.SIGTERM if clean_shutdown else signal.SIGKILL
for pid in pids:
node.account.signal(pid, sig)
wait_until(lambda: len(self.pids(node)) == 0, timeout_sec=60, err_msg="Message Copier failed to stop")
def stop_node(self, node, clean_shutdown=True):
self.kill_node(node, clean_shutdown)
stopped = self.wait_node(node, timeout_sec=self.stop_timeout_sec)
assert stopped, "Node %s: did not stop within the specified timeout of %s seconds" % \
(str(node.account), str(self.stop_timeout_sec))
def restart(self, clean_shutdown):
if self.is_done:
return
node = self.nodes[0]
with self.lock:
self.consumed = -1
self.remaining = -1
self.stop_node(node, clean_shutdown)
self.start_node(node)
def try_parse_json(self, string):
"""Try to parse a string as json. Return None if not parseable."""
try:
record = json.loads(string)
return record
except ValueError:
self.logger.debug("Could not parse as json: %s" % str(string))
return None
@property
def is_done(self):
return self.message_copy_finished
def progress_percent(self):
with self.lock:
if self.remaining < 0:
return 0
if self.consumed + self.remaining == 0:
return 100
return (float(self.consumed)/float(self.consumed + self.remaining)) * 100
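# Worked example (not part of the original file): with consumed=750 and
# remaining=250, progress_percent() returns 750 / (750 + 250) * 100 = 75.0.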

View File

@@ -0,0 +1,14 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,56 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ducktape.services.service import Service
from kafkatest.services.trogdor.task_spec import TaskSpec
class ConsumeBenchWorkloadSpec(TaskSpec):
def __init__(self, start_ms, duration_ms, consumer_node, bootstrap_servers,
target_messages_per_sec, max_messages, active_topics,
consumer_conf, common_client_conf, admin_client_conf, consumer_group=None, threads_per_worker=1):
super(ConsumeBenchWorkloadSpec, self).__init__(start_ms, duration_ms)
self.message["class"] = "org.apache.kafka.trogdor.workload.ConsumeBenchSpec"
self.message["consumerNode"] = consumer_node
self.message["bootstrapServers"] = bootstrap_servers
self.message["targetMessagesPerSec"] = target_messages_per_sec
self.message["maxMessages"] = max_messages
self.message["consumerConf"] = consumer_conf
self.message["adminClientConf"] = admin_client_conf
self.message["commonClientConf"] = common_client_conf
self.message["activeTopics"] = active_topics
self.message["threadsPerWorker"] = threads_per_worker
if consumer_group is not None:
self.message["consumerGroup"] = consumer_group
class ConsumeBenchWorkloadService(Service):
def __init__(self, context, kafka):
Service.__init__(self, context, num_nodes=1)
self.bootstrap_servers = kafka.bootstrap_servers(validate=False)
self.consumer_node = self.nodes[0].account.hostname
def free(self):
Service.free(self)
def wait_node(self, node, timeout_sec=None):
pass
def stop_node(self, node):
pass
def clean_node(self, node):
pass

View File

@@ -0,0 +1,48 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from kafkatest.services.trogdor.task_spec import TaskSpec
class DegradedNetworkFaultSpec(TaskSpec):
"""
The specification for a network degradation fault.
Degrades the network so that traffic on a subset of nodes has higher latency
"""
def __init__(self, start_ms, duration_ms):
"""
Create a new DegradedNetworkFaultSpec.
:param start_ms: The start time, as described in task_spec.py
:param duration_ms: The duration in milliseconds.
"""
super(DegradedNetworkFaultSpec, self).__init__(start_ms, duration_ms)
self.message["class"] = "org.apache.kafka.trogdor.fault.DegradedNetworkFaultSpec"
self.message["nodeSpecs"] = {}
def add_node_spec(self, node, networkDevice, latencyMs=0, rateLimitKbit=0):
"""
Add a node spec to this fault spec
:param node: The node name which is to be degraded
:param networkDevice: The network device name (e.g., eth0) to apply the degradation to
:param latencyMs: Optional. How much latency to add to each packet
:param rateLimitKbit: Optional. Maximum throughput in kilobits per second to allow
:return:
"""
self.message["nodeSpecs"][node] = {
"rateLimitKbit": rateLimitKbit, "latencyMs": latencyMs, "networkDevice": networkDevice
}
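# Illustrative usage sketch (not part of the original file); the node name,
# device and numbers below are hypothetical examples.
#
#   spec = DegradedNetworkFaultSpec(start_ms=0, duration_ms=60000)
#   spec.add_node_spec("knode01", "eth0", latencyMs=100, rateLimitKbit=1000)
#   # spec.message now holds the payload later submitted via TrogdorService.create_task()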

View File

@@ -0,0 +1,46 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from kafkatest.services.trogdor.task_spec import TaskSpec
class FilesUnreadableFaultSpec(TaskSpec):
"""
The specification for a fault which makes files unreadable.
"""
def __init__(self, start_ms, duration_ms, node_names, mount_path,
prefix, error_code):
"""
Create a new FilesUnreadableFaultSpec.
:param start_ms: The start time, as described in task_spec.py
:param duration_ms: The duration in milliseconds.
:param node_names: The names of the node(s) to create the fault on.
:param mount_path: The mount path.
:param prefix: The prefix within the mount point to make unreadable.
:param error_code: The error code to use.
"""
super(FilesUnreadableFaultSpec, self).__init__(start_ms, duration_ms)
self.message["class"] = "org.apache.kafka.trogdor.fault.FilesUnreadableFaultSpec"
self.message["nodeNames"] = node_names
self.message["mountPath"] = mount_path
self.message["prefix"] = prefix
self.message["errorCode"] = error_code
self.kibosh_message = {}
self.kibosh_message["type"] = "unreadable"
self.kibosh_message["prefix"] = prefix
self.kibosh_message["code"] = error_code

View File

@@ -0,0 +1,156 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os.path
from ducktape.errors import TimeoutError  # wait_until raises ducktape's TimeoutError, not the builtin
from ducktape.services.service import Service
from ducktape.utils import util
class KiboshService(Service):
"""
Kibosh is a fault-injecting FUSE filesystem.
Attributes:
INSTALL_ROOT The path of where Kibosh is installed.
BINARY_NAME The Kibosh binary name.
BINARY_PATH The path to the kibosh binary.
"""
INSTALL_ROOT = "/opt/kibosh/build"
BINARY_NAME = "kibosh"
BINARY_PATH = os.path.join(INSTALL_ROOT, BINARY_NAME)
def __init__(self, context, nodes, target, mirror, persist="/mnt/kibosh"):
"""
Create a Kibosh service.
:param context: The TestContext object.
:param nodes: The nodes to put the Kibosh FS on. Kibosh allocates no
nodes of its own.
:param target: The target directory, which Kibosh exports a view of.
:param mirror: The mirror directory, where Kibosh injects faults.
:param persist: Where the log files and pid files will be created.
"""
Service.__init__(self, context, num_nodes=0)
if (len(nodes) == 0):
raise RuntimeError("You must supply at least one node to run the service on.")
for node in nodes:
self.nodes.append(node)
self.target = target
self.mirror = mirror
self.persist = persist
self.control_path = os.path.join(self.mirror, "kibosh_control")
self.pidfile_path = os.path.join(self.persist, "pidfile")
self.stdout_stderr_path = os.path.join(self.persist, "kibosh-stdout-stderr.log")
self.log_path = os.path.join(self.persist, "kibosh.log")
self.logs = {
"kibosh-stdout-stderr.log": {
"path": self.stdout_stderr_path,
"collect_default": True},
"kibosh.log": {
"path": self.log_path,
"collect_default": True}
}
def free(self):
"""Clear the nodes list."""
# Because the filesystem runs on nodes which have been allocated by other services, those nodes
# are not deallocated here.
self.nodes = []
Service.free(self)
def kibosh_running(self, node):
return 0 == node.account.ssh("test -e '%s'" % self.control_path, allow_fail=True)
def start_node(self, node):
node.account.mkdirs(self.persist)
cmd = "sudo -E "
cmd += " %s" % KiboshService.BINARY_PATH
cmd += " --target %s" % self.target
cmd += " --pidfile %s" % self.pidfile_path
cmd += " --log %s" % self.log_path
cmd += " --control-mode 666"
cmd += " --verbose"
cmd += " %s" % self.mirror
cmd += " &> %s" % self.stdout_stderr_path
node.account.ssh(cmd)
util.wait_until(lambda: self.kibosh_running(node), 20, backoff_sec=.1,
err_msg="Timed out waiting for kibosh to start on %s" % node.account.hostname)
def pids(self, node):
return [pid for pid in node.account.ssh_capture("test -e '%s' && test -e /proc/$(cat '%s')" %
(self.pidfile_path, self.pidfile_path), allow_fail=True)]
def wait_node(self, node, timeout_sec=None):
return len(self.pids(node)) == 0
def kibosh_process_running(self, node):
# Note: despite its name, this returns True once no kibosh pids remain, i.e. the
# process has stopped; the wait_until calls in stop_node poll for exactly that.
return len(self.pids(node)) == 0
def stop_node(self, node):
"""Halt kibosh process(es) on this node."""
node.account.logger.debug("stop_node(%s): unmounting %s" % (node.name, self.mirror))
node.account.ssh("sudo fusermount -u %s" % self.mirror, allow_fail=True)
# Wait for the kibosh process to terminate.
try:
util.wait_until(lambda: self.kibosh_process_running(node), 20, backoff_sec=.1,
err_msg="Timed out waiting for kibosh to stop on %s" % node.account.hostname)
except TimeoutError:
# If the process won't terminate, use kill -9 to shut it down.
node.account.logger.debug("stop_node(%s): killing the kibosh process managing %s" % (node.name, self.mirror))
node.account.ssh("sudo kill -9 %s" % (" ".join(self.pids(node))), allow_fail=True)
node.account.ssh("sudo fusermount -u %s" % self.mirror)
util.wait_until(lambda: self.kibosh_process_running(node), 20, backoff_sec=.1,
err_msg="Timed out waiting for kibosh to stop on %s" % node.account.hostname)
def clean_node(self, node):
"""Clean up persistent state on this node - e.g. service logs, configuration files etc."""
self.stop_node(node)
node.account.ssh("rm -rf -- %s" % self.persist)
def set_faults(self, node, specs):
"""
Set the currently active faults.
:param node: The node.
:param specs: An array of FaultSpec objects describing the faults.
"""
if len(specs) == 0:
obj_json = "{}"
else:
fault_array = [spec.kibosh_message for spec in specs]
obj = { 'faults': fault_array }
obj_json = json.dumps(obj)
node.account.create_file(self.control_path, obj_json)
def get_fault_json(self, node):
"""
Return a JSON string which contains the currently active faults.
:param node: The node.
:returns: The fault JSON describing the faults.
"""
iter = node.account.ssh_capture("cat '%s'" % self.control_path)
text = ""
for line in iter:
text = "%s%s" % (text, line.rstrip("\r\n"))
return text
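# Illustrative usage sketch (not part of the original file), pairing this service
# with FilesUnreadableFaultSpec from files_unreadable_fault_spec.py; the node name,
# prefix and errno value (5, i.e. EIO) are hypothetical examples.
#
#   fault = FilesUnreadableFaultSpec(start_ms=0, duration_ms=30000,
#                                    node_names=["knode01"], mount_path=kibosh.mirror,
#                                    prefix="/logs", error_code=5)
#   kibosh.set_faults(node, [fault])  # writes {"faults": [fault.kibosh_message]} to the control file
#   kibosh.set_faults(node, [])       # clears the active faults again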

View File

@@ -0,0 +1,39 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from kafkatest.services.trogdor.task_spec import TaskSpec
class NetworkPartitionFaultSpec(TaskSpec):
"""
The specification for a network partition fault.
Network partition faults fracture the network into different partitions
that cannot communicate with each other.
"""
def __init__(self, start_ms, duration_ms, partitions):
"""
Create a new NetworkPartitionFaultSpec.
:param start_ms: The start time, as described in task_spec.py
:param duration_ms: The duration in milliseconds.
:param partitions: An array of arrays describing the partitions.
The inner arrays may contain either node names,
or ClusterNode objects.
"""
super(NetworkPartitionFaultSpec, self).__init__(start_ms, duration_ms)
self.message["class"] = "org.apache.kafka.trogdor.fault.NetworkPartitionFaultSpec"
self.message["partitions"] = [TaskSpec.to_node_names(p) for p in partitions]

View File

@@ -0,0 +1,35 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from kafkatest.services.trogdor.task_spec import TaskSpec
class NoOpTaskSpec(TaskSpec):
"""
The specification for a no-op task.
No-op faults are used to test Trogdor. They don't do anything,
but must be propagated to all Trogdor agents.
"""
def __init__(self, start_ms, duration_ms):
"""
Create a new NoOpFault.
:param start_ms: The start time, as described in task_spec.py
:param duration_ms: The duration in milliseconds.
"""
super(NoOpTaskSpec, self).__init__(start_ms, duration_ms)
self.message["class"] = "org.apache.kafka.trogdor.task.NoOpTaskSpec";

View File

@@ -0,0 +1,38 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from kafkatest.services.trogdor.task_spec import TaskSpec
class ProcessStopFaultSpec(TaskSpec):
"""
The specification for a process stop fault.
"""
def __init__(self, start_ms, duration_ms, nodes, java_process_name):
"""
Create a new ProcessStopFaultSpec.
:param start_ms: The start time, as described in task_spec.py
:param duration_ms: The duration in milliseconds.
:param nodes: An array describing the nodes to stop processes on. The array
may contain either node names, or ClusterNode objects.
:param java_process_name: The name of the java process to stop. This is the name which
is reported by jps, etc., not the OS-level process name.
"""
super(ProcessStopFaultSpec, self).__init__(start_ms, duration_ms)
self.message["class"] = "org.apache.kafka.trogdor.fault.ProcessStopFaultSpec"
self.message["nodeNames"] = TaskSpec.to_node_names(nodes)
self.message["javaProcessName"] = java_process_name

View File

@@ -0,0 +1,56 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ducktape.services.service import Service
from kafkatest.services.trogdor.task_spec import TaskSpec
class ProduceBenchWorkloadSpec(TaskSpec):
def __init__(self, start_ms, duration_ms, producer_node, bootstrap_servers,
target_messages_per_sec, max_messages, producer_conf, admin_client_conf,
common_client_conf, inactive_topics, active_topics,
transaction_generator=None):
super(ProduceBenchWorkloadSpec, self).__init__(start_ms, duration_ms)
self.message["class"] = "org.apache.kafka.trogdor.workload.ProduceBenchSpec"
self.message["producerNode"] = producer_node
self.message["bootstrapServers"] = bootstrap_servers
self.message["targetMessagesPerSec"] = target_messages_per_sec
self.message["maxMessages"] = max_messages
self.message["producerConf"] = producer_conf
self.message["transactionGenerator"] = transaction_generator
self.message["adminClientConf"] = admin_client_conf
self.message["commonClientConf"] = common_client_conf
self.message["inactiveTopics"] = inactive_topics
self.message["activeTopics"] = active_topics
class ProduceBenchWorkloadService(Service):
def __init__(self, context, kafka):
Service.__init__(self, context, num_nodes=1)
self.bootstrap_servers = kafka.bootstrap_servers(validate=False)
self.producer_node = self.nodes[0].account.hostname
def free(self):
Service.free(self)
def wait_node(self, node, timeout_sec=None):
pass
def stop_node(self, node):
pass
def clean_node(self, node):
pass

View File

@@ -0,0 +1,49 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ducktape.services.service import Service
from kafkatest.services.trogdor.task_spec import TaskSpec
class RoundTripWorkloadSpec(TaskSpec):
def __init__(self, start_ms, duration_ms, client_node, bootstrap_servers,
target_messages_per_sec, max_messages, active_topics):
super(RoundTripWorkloadSpec, self).__init__(start_ms, duration_ms)
self.message["class"] = "org.apache.kafka.trogdor.workload.RoundTripWorkloadSpec"
self.message["clientNode"] = client_node
self.message["bootstrapServers"] = bootstrap_servers
self.message["targetMessagesPerSec"] = target_messages_per_sec
self.message["maxMessages"] = max_messages
self.message["activeTopics"] = active_topics
class RoundTripWorkloadService(Service):
def __init__(self, context, kafka):
Service.__init__(self, context, num_nodes=1)
self.bootstrap_servers = kafka.bootstrap_servers(validate=False)
self.client_node = self.nodes[0].account.hostname
def free(self):
Service.free(self)
def wait_node(self, node, timeout_sec=None):
pass
def stop_node(self, node):
pass
def clean_node(self, node):
pass

View File

@@ -0,0 +1,54 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
class TaskSpec(object):
"""
The base class for a task specification.
MAX_DURATION_MS The longest duration we should use for a task specification.
"""
MAX_DURATION_MS=10000000
def __init__(self, start_ms, duration_ms):
"""
Create a new task specification.
:param start_ms: The target start time in milliseconds since the epoch.
:param duration_ms: The duration in milliseconds.
"""
self.message = {
'startMs': start_ms,
'durationMs': duration_ms
}
@staticmethod
def to_node_names(nodes):
"""
Convert an array of nodes or node names to an array of node names.
"""
node_names = []
for obj in nodes:
if isinstance(obj, basestring):
node_names.append(obj)
else:
node_names.append(obj.name)
return node_names
def __str__(self):
return json.dumps(self.message)
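# Illustrative example (not part of the original file): what str() of a spec looks
# like, using NoOpTaskSpec from no_op_task_spec.py; key order may differ by Python version.
#
#   str(NoOpTaskSpec(start_ms=0, duration_ms=10000))
#   # -> '{"startMs": 0, "durationMs": 10000, "class": "org.apache.kafka.trogdor.task.NoOpTaskSpec"}'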

View File

@@ -0,0 +1,23 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
log4j.rootLogger=DEBUG, mylogger
log4j.logger.kafka=DEBUG
log4j.logger.org.apache.kafka=DEBUG
log4j.logger.org.eclipse=INFO
log4j.appender.mylogger=org.apache.log4j.FileAppender
log4j.appender.mylogger.File={{ log_path }}
log4j.appender.mylogger.layout=org.apache.log4j.PatternLayout
log4j.appender.mylogger.layout.ConversionPattern=[%d] %p %m (%c)%n

View File

@@ -0,0 +1,354 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os.path
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3 import Retry
from ducktape.services.service import Service
from ducktape.utils.util import wait_until
from kafkatest.directory_layout.kafka_path import KafkaPathResolverMixin
class TrogdorService(KafkaPathResolverMixin, Service):
"""
A ducktape service for running the trogdor fault injection daemons.
Attributes:
PERSISTENT_ROOT The root filesystem path to store service files under.
COORDINATOR_STDOUT_STDERR The path where we store the coordinator's stdout/stderr output.
AGENT_STDOUT_STDERR The path where we store the agent's stdout/stderr output.
COORDINATOR_LOG The path where we store the coordinator's log4j output.
AGENT_LOG The path where we store the agent's log4j output.
AGENT_LOG4J_PROPERTIES The path to the agent log4j.properties file for log config.
COORDINATOR_LOG4J_PROPERTIES The path to the coordinator log4j.properties file for log config.
CONFIG_PATH The path to the trogdor configuration file.
DEFAULT_AGENT_PORT The default port to use for trogdor_agent daemons.
DEFAULT_COORDINATOR_PORT The default port to use for trogdor_coordinator daemons.
REQUEST_TIMEOUT The request timeout in seconds to use for REST requests.
REQUEST_HEADERS The request headers to use when communicating with trogdor.
"""
PERSISTENT_ROOT="/mnt/trogdor"
COORDINATOR_STDOUT_STDERR = os.path.join(PERSISTENT_ROOT, "trogdor-coordinator-stdout-stderr.log")
AGENT_STDOUT_STDERR = os.path.join(PERSISTENT_ROOT, "trogdor-agent-stdout-stderr.log")
COORDINATOR_LOG = os.path.join(PERSISTENT_ROOT, "trogdor-coordinator.log")
AGENT_LOG = os.path.join(PERSISTENT_ROOT, "trogdor-agent.log")
COORDINATOR_LOG4J_PROPERTIES = os.path.join(PERSISTENT_ROOT, "trogdor-coordinator-log4j.properties")
AGENT_LOG4J_PROPERTIES = os.path.join(PERSISTENT_ROOT, "trogdor-agent-log4j.properties")
CONFIG_PATH = os.path.join(PERSISTENT_ROOT, "trogdor.conf")
DEFAULT_AGENT_PORT=8888
DEFAULT_COORDINATOR_PORT=8889
REQUEST_TIMEOUT=5
REQUEST_HEADERS = {"Content-type": "application/json"}
logs = {
"trogdor_coordinator_stdout_stderr": {
"path": COORDINATOR_STDOUT_STDERR,
"collect_default": True},
"trogdor_agent_stdout_stderr": {
"path": AGENT_STDOUT_STDERR,
"collect_default": True},
"trogdor_coordinator_log": {
"path": COORDINATOR_LOG,
"collect_default": True},
"trogdor_agent_log": {
"path": AGENT_LOG,
"collect_default": True},
}
def __init__(self, context, agent_nodes=None, client_services=None,
agent_port=DEFAULT_AGENT_PORT, coordinator_port=DEFAULT_COORDINATOR_PORT):
"""
Create a Trogdor service.
:param context: The test context.
:param agent_nodes: The nodes to run the agents on.
:param client_services: Services whose nodes we should run agents on.
:param agent_port: The port to use for the trogdor_agent daemons.
:param coordinator_port: The port to use for the trogdor_coordinator daemons.
"""
Service.__init__(self, context, num_nodes=1)
self.coordinator_node = self.nodes[0]
if client_services is not None:
for client_service in client_services:
for node in client_service.nodes:
self.nodes.append(node)
if agent_nodes is not None:
for agent_node in agent_nodes:
self.nodes.append(agent_node)
if (len(self.nodes) == 1):
raise RuntimeError("You must supply at least one agent node to run the service on.")
self.agent_port = agent_port
self.coordinator_port = coordinator_port
def free(self):
# We only want to deallocate the coordinator node, not the agent nodes. So we
# change self.nodes to include only the coordinator node, and then invoke
# the base class' free method.
if self.coordinator_node is not None:
self.nodes = [self.coordinator_node]
self.coordinator_node = None
Service.free(self)
def _create_config_dict(self):
"""
Create a dictionary with the Trogdor configuration.
:return: The configuration dictionary.
"""
dict_nodes = {}
for node in self.nodes:
dict_nodes[node.name] = {
"hostname": node.account.ssh_hostname,
}
if node.name == self.coordinator_node.name:
dict_nodes[node.name]["trogdor.coordinator.port"] = self.coordinator_port
else:
dict_nodes[node.name]["trogdor.agent.port"] = self.agent_port
return {
"platform": "org.apache.kafka.trogdor.basic.BasicPlatform",
"nodes": dict_nodes,
}
def start_node(self, node):
node.account.mkdirs(TrogdorService.PERSISTENT_ROOT)
# Create the configuration file on the node.
config_str = json.dumps(self._create_config_dict(), indent=2)
self.logger.info("Creating configuration file %s with %s" % (TrogdorService.CONFIG_PATH, config_str))
node.account.create_file(TrogdorService.CONFIG_PATH, config_str)
if self.is_coordinator(node):
self._start_coordinator_node(node)
else:
self._start_agent_node(node)
def _start_coordinator_node(self, node):
node.account.create_file(TrogdorService.COORDINATOR_LOG4J_PROPERTIES,
self.render('log4j.properties',
log_path=TrogdorService.COORDINATOR_LOG))
self._start_trogdor_daemon("coordinator", TrogdorService.COORDINATOR_STDOUT_STDERR,
TrogdorService.COORDINATOR_LOG4J_PROPERTIES,
TrogdorService.COORDINATOR_LOG, node)
self.logger.info("Started trogdor coordinator on %s." % node.name)
def _start_agent_node(self, node):
node.account.create_file(TrogdorService.AGENT_LOG4J_PROPERTIES,
self.render('log4j.properties',
log_path=TrogdorService.AGENT_LOG))
self._start_trogdor_daemon("agent", TrogdorService.AGENT_STDOUT_STDERR,
TrogdorService.AGENT_LOG4J_PROPERTIES,
TrogdorService.AGENT_LOG, node)
self.logger.info("Started trogdor agent on %s." % node.name)
def _start_trogdor_daemon(self, daemon_name, stdout_stderr_capture_path,
log4j_properties_path, log_path, node):
cmd = "export KAFKA_LOG4J_OPTS='-Dlog4j.configuration=file:%s'; " % log4j_properties_path
cmd += "%s %s --%s.config %s --node-name %s 1>> %s 2>> %s &" % \
(self.path.script("trogdor.sh", node),
daemon_name,
daemon_name,
TrogdorService.CONFIG_PATH,
node.name,
stdout_stderr_capture_path,
stdout_stderr_capture_path)
node.account.ssh(cmd)
with node.account.monitor_log(log_path) as monitor:
monitor.wait_until("Starting %s process." % daemon_name, timeout_sec=60, backoff_sec=.10,
err_msg=("%s on %s didn't finish startup" % (daemon_name, node.name)))
def wait_node(self, node, timeout_sec=None):
if self.is_coordinator(node):
return len(node.account.java_pids(self.coordinator_class_name())) == 0
else:
return len(node.account.java_pids(self.agent_class_name())) == 0
def stop_node(self, node):
"""Halt trogdor processes on this node."""
if self.is_coordinator(node):
node.account.kill_java_processes(self.coordinator_class_name())
else:
node.account.kill_java_processes(self.agent_class_name())
def clean_node(self, node):
"""Clean up persistent state on this node - e.g. service logs, configuration files etc."""
self.stop_node(node)
node.account.ssh("rm -rf -- %s" % TrogdorService.PERSISTENT_ROOT)
def _coordinator_url(self, path):
return "http://%s:%d/coordinator/%s" % \
(self.coordinator_node.account.ssh_hostname, self.coordinator_port, path)
def request_session(self):
"""
Creates a new request session which will retry for a while.
"""
session = requests.Session()
session.mount('http://',
HTTPAdapter(max_retries=Retry(total=5, backoff_factor=0.3)))
return session
def _coordinator_post(self, path, message):
"""
Make a POST request to the Trogdor coordinator.
:param path: The URL path to use.
:param message: The message object to send.
:return: The response as an object.
"""
url = self._coordinator_url(path)
self.logger.info("POST %s %s" % (url, message))
response = self.request_session().post(url, json=message,
timeout=TrogdorService.REQUEST_TIMEOUT,
headers=TrogdorService.REQUEST_HEADERS)
response.raise_for_status()
return response.json()
def _coordinator_put(self, path, message):
"""
Make a PUT request to the Trogdor coordinator.
:param path: The URL path to use.
:param message: The message object to send.
:return: The response as an object.
"""
url = self._coordinator_url(path)
self.logger.info("PUT %s %s" % (url, message))
response = self.request_session().put(url, json=message,
timeout=TrogdorService.REQUEST_TIMEOUT,
headers=TrogdorService.REQUEST_HEADERS)
response.raise_for_status()
return response.json()
def _coordinator_get(self, path, message):
"""
Make a GET request to the Trogdor coordinator.
:param path: The URL path to use.
:param message: The message object to send.
:return: The response as an object.
"""
url = self._coordinator_url(path)
self.logger.info("GET %s %s" % (url, message))
response = self.request_session().get(url, json=message,
timeout=TrogdorService.REQUEST_TIMEOUT,
headers=TrogdorService.REQUEST_HEADERS)
response.raise_for_status()
return response.json()
def create_task(self, id, spec):
"""
Create a new task.
:param id: The task id.
:param spec: The task spec.
"""
self._coordinator_post("task/create", { "id": id, "spec": spec.message})
return TrogdorTask(id, self)
def stop_task(self, id):
"""
Stop a task.
:param id: The task id.
"""
self._coordinator_put("task/stop", { "id": id })
def tasks(self):
"""
Get the tasks which are on the coordinator.
:returns: A map of task id strings to task state objects.
Task state objects contain a 'spec' field with the spec
and a 'state' field with the state.
"""
return self._coordinator_get("tasks", {})
def is_coordinator(self, node):
return node == self.coordinator_node
def agent_class_name(self):
return "org.apache.kafka.trogdor.agent.Agent"
def coordinator_class_name(self):
return "org.apache.kafka.trogdor.coordinator.Coordinator"
class TrogdorTask(object):
PENDING_STATE = "PENDING"
RUNNING_STATE = "RUNNING"
STOPPING_STATE = "STOPPING"
DONE_STATE = "DONE"
def __init__(self, id, trogdor):
self.id = id
self.trogdor = trogdor
def task_state_or_error(self):
task_state = self.trogdor.tasks()["tasks"][self.id]
if task_state is None:
raise RuntimeError("Coordinator did not know about %s." % self.id)
error = task_state.get("error")
if error is None or error == "":
return task_state["state"], None
else:
return None, error
def done(self):
"""
Check if this task is done.
:raises RuntimeError: If the task encountered an error.
:returns: True if the task is in DONE_STATE;
False if it is in a different state.
"""
(task_state, error) = self.task_state_or_error()
if task_state is not None:
return task_state == TrogdorTask.DONE_STATE
else:
raise RuntimeError("Failed to gracefully stop %s: got task error: %s" % (self.id, error))
def running(self):
"""
Check if this task is running.
:raises RuntimeError: If the task encountered an error.
:returns: True if the task is in RUNNING_STATE;
False if it is in a different state.
"""
(task_state, error) = self.task_state_or_error()
if task_state is not None:
return task_state == TrogdorTask.RUNNING_STATE
else:
raise RuntimeError("Failed to start %s: got task error: %s" % (self.id, error))
def stop(self):
"""
Stop this task.
:raises RuntimeError: If the task encountered an error.
"""
if self.done():
return
self.trogdor.stop_task(self.id)
def wait_for_done(self, timeout_sec=360):
wait_until(lambda: self.done(),
timeout_sec=timeout_sec,
err_msg="%s failed to finish in the expected amount of time." % self.id)

View File

@@ -0,0 +1,330 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from kafkatest.directory_layout.kafka_path import TOOLS_JAR_NAME, TOOLS_DEPENDANT_TEST_LIBS_JAR_NAME
from kafkatest.version import DEV_BRANCH, LATEST_0_8_2
from ducktape.cluster.remoteaccount import RemoteCommandError
import importlib
import os
import subprocess
import signal
"""This module abstracts the implementation of a verifiable client, allowing
client developers to plug in their own client for all kafkatests that make
use of either the VerifiableConsumer or VerifiableProducer classes.
A verifiable client class must implement exec_cmd() and pids().
This file provides:
* VerifiableClientMixin class: to be used for creating new verifiable client classes
* VerifiableClientJava class: the default Java verifiable clients
* VerifiableClientApp class: uses global configuration to specify
the command to execute and optional "pids" command, deploy script, etc.
Config syntax (pass as --global <json_or_jsonfile>):
{"Verifiable(Producer|Consumer|Client)": {
"class": "kafkatest.services.verifiable_client.VerifiableClientApp",
"exec_cmd": "/vagrant/x/myclient --some --standard --args",
"pids": "pgrep -f ...", // optional
"deploy": "/vagrant/x/mydeploy.sh", // optional
"kill_signal": 2 // optional clean_shutdown kill signal (SIGINT in this case)
}}
* VerifiableClientDummy class: testing dummy
==============================
Verifiable client requirements
==============================
There are currently two verifiable client specifications:
* VerifiableConsumer
* VerifiableProducer
Common requirements for both:
* One-way communication (client -> tests) through new-line delimited
JSON objects on stdout (details below).
* Log/debug to stderr
Common communication for both:
* `{ "name": "startup_complete" }` - Client succesfully started
* `{ "name": "shutdown_complete" }` - Client succesfully terminated (after receiving SIGINT/SIGTERM)
==================
VerifiableConsumer
==================
Command line arguments:
* `--group-id <group-id>`
* `--topic <topic>`
* `--broker-list <brokers>`
* `--session-timeout <n>`
* `--enable-autocommit`
* `--max-messages <n>`
* `--assignment-strategy <s>`
* `--consumer.config <config-file>` - consumer config properties (typically empty)
Environment variables:
* `LOG_DIR` - log output directory. Typically not needed if logs are written to stderr.
* `KAFKA_OPTS` - Security config properties (Java client syntax)
* `KAFKA_LOG4J_OPTS` - Java log4j options (can be ignored)
Client communication:
* `{ "name": "offsets_committed", "success": bool, "error": "<errstr>", "offsets": [ { "topic": "<t>", "partition": <p>, "offset": <o> } ] }` - offset commit results, should be emitted for each committed offset. Emit prior to partitions_revoked.
* `{ "name": "records_consumed", "partitions": [ { "topic": "<t>", "partition": <p>, "minOffset": <o>, "maxOffset": <o> } ], "count": <total_consumed> }` - per-partition delta stats from last records_consumed. Emit every 1000 messages, or 1s. Emit prior to partitions_assigned, partitions_revoked and offsets_committed.
* `{ "name": "partitions_revoked", "partitions": [ { "topic": "<t>", "partition": <p> } ] }` - rebalance: revoked partitions
* `{ "name": "partitions_assigned", "partitions": [ { "topic": "<t>", "partition": <p> } ] }` - rebalance: assigned partitions
==================
VerifiableProducer
==================
Command line arguments:
* `--topic <topic>`
* `--broker-list <brokers>`
* `--max-messages <n>`
* `--throughput <msgs/s>`
* `--producer.config <config-file>` - producer config properties (typically empty)
Environment variables:
* `LOG_DIR` - log output directory. Typically not needed if logs are written to stderr.
* `KAFKA_OPTS` - Security config properties (Java client syntax)
* `KAFKA_LOG4J_OPTS` - Java log4j options (can be ignored)
Client communication:
* `{ "name": "producer_send_error", "message": "<error msg>", "topic": "<t>", "key": "<msg key>", "value": "<msg value>" }` - emit on produce error.
* `{ "name": "producer_send_success", "topic": "<t>", "partition": <p>, "offset": <o>, "key": "<msg key>", "value": "<msg value>" }` - emit on produce success.
===========
Development
===========
**Logs:**
During development of kafkatest clients it is generally a good idea to
enable collection of the client's stdout and stderr logs for troubleshooting.
Do this by setting "collect_default" to True for verifiable_consumer_stdout
and .._stderr in verifiable_consumer.py and verifiable_producer.py
**Deployment:**
There's currently no automatic way of deploying 3rd party kafkatest clients
on the VM instance so this needs to be done (at least partially) manually for
now.
One way to do this is logging in to a worker (`vagrant ssh worker1`), downloading
and building the kafkatest client under /vagrant (which maps to the kafka root
directory on the host and is shared with all VM instances).
Also make sure to install any system-level dependencies on each instance.
Then use /vagrant/..../yourkafkatestclient as your run-time path since it will
now be available on all instances.
The VerifiableClientApp automates the per-worker deployment with the optional
"deploy": "/vagrant/../deploy_script.sh" globals configuration property, this
script will be called on the VM just prior to executing the client.
"""
def create_verifiable_client_implementation(context, parent):
"""Factory for generating a verifiable client implementation class instance
:param parent: parent class instance, either VerifiableConsumer or VerifiableProducer
This will first check for a fully qualified client implementation class name
in context.globals as "Verifiable<type>" where <type> is "Producer" or "Consumer",
followed by "VerifiableClient" (which should implement both).
The global object layout is: {"class": "<full class name>", "..anything..": ..}.
If present, construct a new instance, else defaults to VerifiableClientJava
"""
# Default class
obj = {"class": "kafkatest.services.verifiable_client.VerifiableClientJava"}
parent_name = parent.__class__.__name__.rsplit('.', 1)[-1]
for k in [parent_name, "VerifiableClient"]:
if k in context.globals:
obj = context.globals[k]
break
if "class" not in obj:
raise SyntaxError('%s (or VerifiableClient) expected object format: {"class": "full.class.path", ..}' % parent_name)
clname = obj["class"]
# Using the fully qualified classname, import the implementation class
if clname.find('.') == -1:
raise SyntaxError("%s (or VerifiableClient) must specify full class path (including module)" % parent_name)
(module_name, clname) = clname.rsplit('.', 1)
cluster_mod = importlib.import_module(module_name)
impl_class = getattr(cluster_mod, clname)
return impl_class(parent, obj)
class VerifiableClientMixin (object):
"""
Verifiable client mixin class
"""
@property
def impl (self):
"""
:return: Return (and create if necessary) the Verifiable client implementation object.
"""
# Add _impl attribute to parent Verifiable(Consumer|Producer) object.
if not hasattr(self, "_impl"):
setattr(self, "_impl", create_verifiable_client_implementation(self.context, self))
if hasattr(self.context, "logger") and self.context.logger is not None:
self.context.logger.debug("Using client implementation %s for %s" % (self._impl.__class__.__name__, self.__class__.__name__))
return self._impl
def exec_cmd (self, node):
"""
:return: command string to execute client.
Environment variables will be prepended and command line arguments
appended to this string later by start_cmd().
This method should also take care of deploying the client on the instance, if necessary.
"""
raise NotImplementedError()
def pids (self, node):
""" :return: list of pids for this client instance on node """
raise NotImplementedError()
def kill_signal (self, clean_shutdown=True):
""" :return: the kill signal to terminate the application. """
if not clean_shutdown:
return signal.SIGKILL
return self.conf.get("kill_signal", signal.SIGTERM)
class VerifiableClientJava (VerifiableClientMixin):
"""
Verifiable Consumer and Producer using the official Java client.
"""
def __init__(self, parent, conf=None):
"""
:param parent: The parent instance, either VerifiableConsumer or VerifiableProducer
:param conf: Optional conf object (the --globals VerifiableX object)
"""
super(VerifiableClientJava, self).__init__()
self.parent = parent
self.java_class_name = parent.java_class_name()
self.conf = conf
def exec_cmd (self, node):
""" :return: command to execute to start instance
Translates Verifiable* to the corresponding Java client class name """
cmd = ""
if self.java_class_name == 'VerifiableProducer' and node.version <= LATEST_0_8_2:
# 0.8.2.X releases do not have VerifiableProducer.java, so cheat and add
# the tools jar from trunk to the classpath
tools_jar = self.parent.path.jar(TOOLS_JAR_NAME, DEV_BRANCH)
tools_dependant_libs_jar = self.parent.path.jar(TOOLS_DEPENDANT_TEST_LIBS_JAR_NAME, DEV_BRANCH)
cmd += "for file in %s; do CLASSPATH=$CLASSPATH:$file; done; " % tools_jar
cmd += "for file in %s; do CLASSPATH=$CLASSPATH:$file; done; " % tools_dependant_libs_jar
cmd += "export CLASSPATH; "
cmd += self.parent.path.script("kafka-run-class.sh", node) + " org.apache.kafka.tools." + self.java_class_name
return cmd
def pids (self, node):
""" :return: pid(s) for this client intstance on node """
try:
cmd = "jps | grep -i " + self.java_class_name + " | awk '{print $1}'"
pid_arr = [pid for pid in node.account.ssh_capture(cmd, allow_fail=True, callback=int)]
return pid_arr
except (RemoteCommandError, ValueError) as e:
return []
class VerifiableClientDummy (VerifiableClientMixin):
"""
Dummy class for testing the pluggable framework
"""
def __init__(self, parent, conf=None):
"""
:param parent: The parent instance, either VerifiableConsumer or VerifiableProducer
:param conf: Optional conf object (the --globals VerifiableX object)
"""
super(VerifiableClientDummy, self).__init__()
self.parent = parent
self.conf = conf
def exec_cmd (self, node):
""" :return: command to execute to start instance """
return 'echo -e \'{"name": "shutdown_complete" }\n\' ; echo ARGS:'
def pids (self, node):
""" :return: pid(s) for this client intstance on node """
return []
class VerifiableClientApp (VerifiableClientMixin):
"""
VerifiableClient using --global settings for exec_cmd, pids and deploy.
By using this a verifiable client application can be used through simple
--globals configuration rather than implementing a Python class.
"""
def __init__(self, parent, conf):
"""
:param parent: The parent instance, either VerifiableConsumer or VerifiableProducer
:param conf: Optional conf object (the --globals VerifiableX object)
"""
super(VerifiableClientApp, self).__init__()
self.parent = parent
# "VerifiableConsumer" or "VerifiableProducer"
self.name = self.parent.__class__.__name__
self.conf = conf
if "exec_cmd" not in self.conf:
raise SyntaxError("%s requires \"exec_cmd\": .. to be set in --globals %s object" % \
(self.__class__.__name__, self.name))
def exec_cmd (self, node):
""" :return: command to execute to start instance """
self.deploy(node)
return self.conf["exec_cmd"]
def pids (self, node):
""" :return: pid(s) for this client intstance on node """
cmd = self.conf.get("pids", "pgrep -f '" + self.conf["exec_cmd"] + "'")
try:
pid_arr = [pid for pid in node.account.ssh_capture(cmd, allow_fail=True, callback=int)]
self.parent.context.logger.info("%s pids are: %s" % (str(node.account), pid_arr))
return pid_arr
except (subprocess.CalledProcessError, ValueError) as e:
return []
def deploy (self, node):
""" Call deploy script specified by "deploy" --global key
This optional script is run on the VM instance just prior to
executing `exec_cmd` to deploy the kafkatest client.
The script path must be as seen by the VM instance, e.g. /vagrant/.... """
if "deploy" not in self.conf:
return
script_cmd = self.conf["deploy"]
self.parent.context.logger.debug("Deploying %s: %s" % (self, script_cmd))
r = node.account.ssh(script_cmd)

View File

@@ -0,0 +1,418 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
from ducktape.services.background_thread import BackgroundThreadService
from kafkatest.directory_layout.kafka_path import KafkaPathResolverMixin
from kafkatest.services.kafka import TopicPartition
from kafkatest.services.verifiable_client import VerifiableClientMixin
from kafkatest.version import DEV_BRANCH, V_2_3_0, V_2_3_1, V_0_10_0_0
class ConsumerState:
Started = 1
Dead = 2
Rebalancing = 3
Joined = 4
class ConsumerEventHandler(object):
def __init__(self, node, verify_offsets, idx):
self.node = node
self.idx = idx
self.state = ConsumerState.Dead
self.revoked_count = 0
self.assigned_count = 0
self.assignment = []
self.position = {}
self.committed = {}
self.total_consumed = 0
self.verify_offsets = verify_offsets
def handle_shutdown_complete(self):
self.state = ConsumerState.Dead
self.assignment = []
self.position = {}
def handle_startup_complete(self):
self.state = ConsumerState.Started
def handle_offsets_committed(self, event, node, logger):
if event["success"]:
for offset_commit in event["offsets"]:
if offset_commit.get("error", "") != "":
logger.debug("%s: Offset commit failed for: %s" % (str(node.account), offset_commit))
continue
topic = offset_commit["topic"]
partition = offset_commit["partition"]
tp = TopicPartition(topic, partition)
offset = offset_commit["offset"]
assert tp in self.assignment, \
"Committed offsets for partition %s not assigned (current assignment: %s)" % \
(str(tp), str(self.assignment))
assert tp in self.position, "No previous position for %s: %s" % (str(tp), event)
assert self.position[tp] >= offset, \
"The committed offset %d was greater than the current position %d for partition %s" % \
(offset, self.position[tp], str(tp))
self.committed[tp] = offset
def handle_records_consumed(self, event, logger):
assert self.state == ConsumerState.Joined, \
"Consumed records should only be received when joined (current state: %s)" % str(self.state)
for record_batch in event["partitions"]:
tp = TopicPartition(topic=record_batch["topic"],
partition=record_batch["partition"])
min_offset = record_batch["minOffset"]
max_offset = record_batch["maxOffset"]
assert tp in self.assignment, \
"Consumed records for partition %s which is not assigned (current assignment: %s)" % \
(str(tp), str(self.assignment))
if tp not in self.position or self.position[tp] == min_offset:
self.position[tp] = max_offset + 1
else:
msg = "Consumed from an unexpected offset (%d, %d) for partition %s" % \
(self.position.get(tp), min_offset, str(tp))
if self.verify_offsets:
raise AssertionError(msg)
else:
if tp in self.position:
self.position[tp] = max_offset + 1
logger.warn(msg)
self.total_consumed += event["count"]
def handle_partitions_revoked(self, event):
self.revoked_count += 1
self.state = ConsumerState.Rebalancing
self.position = {}
def handle_partitions_assigned(self, event):
self.assigned_count += 1
self.state = ConsumerState.Joined
assignment = []
for topic_partition in event["partitions"]:
topic = topic_partition["topic"]
partition = topic_partition["partition"]
assignment.append(TopicPartition(topic, partition))
self.assignment = assignment
def handle_kill_process(self, clean_shutdown):
# if the shutdown was clean, then we expect the explicit
# shutdown event from the consumer
if not clean_shutdown:
self.handle_shutdown_complete()
def current_assignment(self):
return list(self.assignment)
def current_position(self, tp):
if tp in self.position:
return self.position[tp]
else:
return None
def last_commit(self, tp):
if tp in self.committed:
return self.committed[tp]
else:
return None
class VerifiableConsumer(KafkaPathResolverMixin, VerifiableClientMixin, BackgroundThreadService):
"""This service wraps org.apache.kafka.tools.VerifiableConsumer for use in
system testing.
NOTE: this class should be treated as a PUBLIC API. Downstream users use
this service both directly and through class extension, so care must be
taken to ensure compatibility.
"""
PERSISTENT_ROOT = "/mnt/verifiable_consumer"
STDOUT_CAPTURE = os.path.join(PERSISTENT_ROOT, "verifiable_consumer.stdout")
STDERR_CAPTURE = os.path.join(PERSISTENT_ROOT, "verifiable_consumer.stderr")
LOG_DIR = os.path.join(PERSISTENT_ROOT, "logs")
LOG_FILE = os.path.join(LOG_DIR, "verifiable_consumer.log")
LOG4J_CONFIG = os.path.join(PERSISTENT_ROOT, "tools-log4j.properties")
CONFIG_FILE = os.path.join(PERSISTENT_ROOT, "verifiable_consumer.properties")
logs = {
"verifiable_consumer_stdout": {
"path": STDOUT_CAPTURE,
"collect_default": False},
"verifiable_consumer_stderr": {
"path": STDERR_CAPTURE,
"collect_default": False},
"verifiable_consumer_log": {
"path": LOG_FILE,
"collect_default": True}
}
def __init__(self, context, num_nodes, kafka, topic, group_id,
static_membership=False, max_messages=-1, session_timeout_sec=30, enable_autocommit=False,
assignment_strategy=None,
version=DEV_BRANCH, stop_timeout_sec=30, log_level="INFO", jaas_override_variables=None,
on_record_consumed=None, reset_policy="earliest", verify_offsets=True):
"""
:param jaas_override_variables: A dict of variables to be used in the jaas.conf template file
"""
super(VerifiableConsumer, self).__init__(context, num_nodes)
self.log_level = log_level
self.kafka = kafka
self.topic = topic
self.group_id = group_id
self.reset_policy = reset_policy
self.static_membership = static_membership
self.max_messages = max_messages
self.session_timeout_sec = session_timeout_sec
self.enable_autocommit = enable_autocommit
self.assignment_strategy = assignment_strategy
self.prop_file = ""
self.stop_timeout_sec = stop_timeout_sec
self.on_record_consumed = on_record_consumed
self.verify_offsets = verify_offsets
self.event_handlers = {}
self.global_position = {}
self.global_committed = {}
self.jaas_override_variables = jaas_override_variables or {}
for node in self.nodes:
node.version = version
def java_class_name(self):
return "VerifiableConsumer"
def _worker(self, idx, node):
with self.lock:
if node not in self.event_handlers:
self.event_handlers[node] = ConsumerEventHandler(node, self.verify_offsets, idx)
handler = self.event_handlers[node]
node.account.ssh("mkdir -p %s" % VerifiableConsumer.PERSISTENT_ROOT, allow_fail=False)
# Create and upload log properties
log_config = self.render('tools_log4j.properties', log_file=VerifiableConsumer.LOG_FILE)
node.account.create_file(VerifiableConsumer.LOG4J_CONFIG, log_config)
# Create and upload config file
self.security_config = self.kafka.security_config.client_config(self.prop_file, node,
self.jaas_override_variables)
self.security_config.setup_node(node)
self.prop_file += str(self.security_config)
self.logger.info("verifiable_consumer.properties:")
self.logger.info(self.prop_file)
node.account.create_file(VerifiableConsumer.CONFIG_FILE, self.prop_file)
self.security_config.setup_node(node)
# apply group.instance.id to the node for static membership validation
node.group_instance_id = None
if self.static_membership:
assert node.version >= V_2_3_0, \
"Version %s does not support static membership (must be 2.3 or higher)" % str(node.version)
node.group_instance_id = self.group_id + "-instance-" + str(idx)
if self.assignment_strategy:
assert node.version >= V_0_10_0_0, \
"Version %s does not setting an assignment strategy (must be 0.10.0 or higher)" % str(node.version)
cmd = self.start_cmd(node)
self.logger.debug("VerifiableConsumer %d command: %s" % (idx, cmd))
for line in node.account.ssh_capture(cmd):
event = self.try_parse_json(node, line.strip())
if event is not None:
with self.lock:
name = event["name"]
if name == "shutdown_complete":
handler.handle_shutdown_complete()
elif name == "startup_complete":
handler.handle_startup_complete()
elif name == "offsets_committed":
handler.handle_offsets_committed(event, node, self.logger)
self._update_global_committed(event)
elif name == "records_consumed":
handler.handle_records_consumed(event, self.logger)
self._update_global_position(event, node)
elif name == "record_data" and self.on_record_consumed:
self.on_record_consumed(event, node)
elif name == "partitions_revoked":
handler.handle_partitions_revoked(event)
elif name == "partitions_assigned":
handler.handle_partitions_assigned(event)
else:
self.logger.debug("%s: ignoring unknown event: %s" % (str(node.account), event))
def _update_global_position(self, consumed_event, node):
for consumed_partition in consumed_event["partitions"]:
tp = TopicPartition(consumed_partition["topic"], consumed_partition["partition"])
if tp in self.global_committed:
# verify that the position never gets behind the current commit.
if self.global_committed[tp] > consumed_partition["minOffset"]:
msg = "Consumed position %d is behind the current committed offset %d for partition %s" % \
(consumed_partition["minOffset"], self.global_committed[tp], str(tp))
if self.verify_offsets:
raise AssertionError(msg)
else:
self.logger.warn(msg)
# the consumer cannot generally guarantee that the position increases monotonically
# without gaps in the face of hard failures, so we only log a warning when this happens
if tp in self.global_position and self.global_position[tp] != consumed_partition["minOffset"]:
self.logger.warn("%s: Expected next consumed offset of %d for partition %s, but instead saw %d" %
(str(node.account), self.global_position[tp], str(tp), consumed_partition["minOffset"]))
self.global_position[tp] = consumed_partition["maxOffset"] + 1
def _update_global_committed(self, commit_event):
if commit_event["success"]:
for offset_commit in commit_event["offsets"]:
tp = TopicPartition(offset_commit["topic"], offset_commit["partition"])
offset = offset_commit["offset"]
assert self.global_position[tp] >= offset, \
"Committed offset %d for partition %s is ahead of the current position %d" % \
(offset, str(tp), self.global_position[tp])
self.global_committed[tp] = offset
def start_cmd(self, node):
cmd = ""
cmd += "export LOG_DIR=%s;" % VerifiableConsumer.LOG_DIR
cmd += " export KAFKA_OPTS=%s;" % self.security_config.kafka_opts
cmd += " export KAFKA_LOG4J_OPTS=\"-Dlog4j.configuration=file:%s\"; " % VerifiableConsumer.LOG4J_CONFIG
cmd += self.impl.exec_cmd(node)
if self.on_record_consumed:
cmd += " --verbose"
if node.group_instance_id:
cmd += " --group-instance-id %s" % node.group_instance_id
elif node.version == V_2_3_0 or node.version == V_2_3_1:
# In 2.3, --group-instance-id was required, but would be left empty
# if `None` is passed as the argument value
cmd += " --group-instance-id None"
if self.assignment_strategy:
cmd += " --assignment-strategy %s" % self.assignment_strategy
if self.enable_autocommit:
cmd += " --enable-autocommit "
cmd += " --reset-policy %s --group-id %s --topic %s --broker-list %s --session-timeout %s" % \
(self.reset_policy, self.group_id, self.topic,
self.kafka.bootstrap_servers(self.security_config.security_protocol),
self.session_timeout_sec*1000)
if self.max_messages > 0:
cmd += " --max-messages %s" % str(self.max_messages)
cmd += " --consumer.config %s" % VerifiableConsumer.CONFIG_FILE
cmd += " 2>> %s | tee -a %s &" % (VerifiableConsumer.STDOUT_CAPTURE, VerifiableConsumer.STDOUT_CAPTURE)
return cmd
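# For orientation, a hedged example of what the assembled command above might look
# like on a worker node (hostnames, ports, and the exec wrapper are illustrative
# assumptions; the real values come from the cluster and self.impl.exec_cmd):
#
#   export LOG_DIR=/mnt/verifiable_consumer/logs; export KAFKA_OPTS=...; \
#   export KAFKA_LOG4J_OPTS="-Dlog4j.configuration=file:/mnt/verifiable_consumer/tools-log4j.properties"; \
#   <exec_cmd> --reset-policy earliest --group-id test_group --topic test_topic \
#     --broker-list worker1:9092 --session-timeout 30000 \
#     --consumer.config /mnt/verifiable_consumer/verifiable_consumer.properties \
#     2>> /mnt/verifiable_consumer/verifiable_consumer.stdout | tee -a /mnt/verifiable_consumer/verifiable_consumer.stdout &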
def pids(self, node):
return self.impl.pids(node)
def try_parse_json(self, node, string):
"""Try to parse a string as json. Return None if not parseable."""
try:
return json.loads(string)
except ValueError:
self.logger.debug("%s: Could not parse as json: %s" % (str(node.account), str(string)))
return None
def stop_all(self):
for node in self.nodes:
self.stop_node(node)
def kill_node(self, node, clean_shutdown=True, allow_fail=False):
sig = self.impl.kill_signal(clean_shutdown)
for pid in self.pids(node):
node.account.signal(pid, sig, allow_fail)
with self.lock:
self.event_handlers[node].handle_kill_process(clean_shutdown)
def stop_node(self, node, clean_shutdown=True):
self.kill_node(node, clean_shutdown=clean_shutdown)
stopped = self.wait_node(node, timeout_sec=self.stop_timeout_sec)
assert stopped, "Node %s: did not stop within the specified timeout of %s seconds" % \
(str(node.account), str(self.stop_timeout_sec))
def clean_node(self, node):
self.kill_node(node, clean_shutdown=False)
node.account.ssh("rm -rf " + self.PERSISTENT_ROOT, allow_fail=False)
self.security_config.clean_node(node)
def current_assignment(self):
with self.lock:
return { handler.node: handler.current_assignment() for handler in self.event_handlers.itervalues() }
def current_position(self, tp):
with self.lock:
if tp in self.global_position:
return self.global_position[tp]
else:
return None
def owner(self, tp):
with self.lock:
for handler in self.event_handlers.itervalues():
if tp in handler.current_assignment():
return handler.node
return None
def last_commit(self, tp):
with self.lock:
if tp in self.global_committed:
return self.global_committed[tp]
else:
return None
def total_consumed(self):
with self.lock:
return sum(handler.total_consumed for handler in self.event_handlers.itervalues())
def num_rebalances(self):
with self.lock:
return max(handler.assigned_count for handler in self.event_handlers.itervalues())
def num_revokes_for_alive(self, keep_alive=1):
with self.lock:
return max([handler.revoked_count for handler in self.event_handlers.itervalues()
if handler.idx <= keep_alive])
def joined_nodes(self):
with self.lock:
return [handler.node for handler in self.event_handlers.itervalues()
if handler.state == ConsumerState.Joined]
def rebalancing_nodes(self):
with self.lock:
return [handler.node for handler in self.event_handlers.itervalues()
if handler.state == ConsumerState.Rebalancing]
def dead_nodes(self):
with self.lock:
return [handler.node for handler in self.event_handlers.itervalues()
if handler.state == ConsumerState.Dead]
def alive_nodes(self):
with self.lock:
return [handler.node for handler in self.event_handlers.itervalues()
if handler.state != ConsumerState.Dead]
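# Hedged usage sketch: how a ducktape test might drive this service end to end.
# The `test_context` and `kafka` objects are assumptions supplied by the enclosing
# test; only methods defined on VerifiableConsumer above are used.
def _example_run_verifiable_consumer(test_context, kafka):
    from ducktape.utils.util import wait_until
    consumer = VerifiableConsumer(test_context, num_nodes=1, kafka=kafka,
                                  topic="test_topic", group_id="test_group_id",
                                  max_messages=1000)
    consumer.start()
    wait_until(lambda: consumer.total_consumed() >= 1000, timeout_sec=60,
               err_msg="Timed out waiting for the consumer to consume 1000 messages")
    consumer.stop_all()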

View File

@@ -0,0 +1,315 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import time
from ducktape.cluster.remoteaccount import RemoteCommandError
from ducktape.services.background_thread import BackgroundThreadService
from kafkatest.directory_layout.kafka_path import KafkaPathResolverMixin
from kafkatest.services.kafka import TopicPartition
from kafkatest.services.verifiable_client import VerifiableClientMixin
from kafkatest.utils import is_int, is_int_with_prefix
from kafkatest.version import DEV_BRANCH
class VerifiableProducer(KafkaPathResolverMixin, VerifiableClientMixin, BackgroundThreadService):
"""This service wraps org.apache.kafka.tools.VerifiableProducer for use in
system testing.
NOTE: this class should be treated as a PUBLIC API. Downstream users use
this service both directly and through class extension, so care must be
taken to ensure compatibility.
"""
PERSISTENT_ROOT = "/mnt/verifiable_producer"
STDOUT_CAPTURE = os.path.join(PERSISTENT_ROOT, "verifiable_producer.stdout")
STDERR_CAPTURE = os.path.join(PERSISTENT_ROOT, "verifiable_producer.stderr")
LOG_DIR = os.path.join(PERSISTENT_ROOT, "logs")
LOG_FILE = os.path.join(LOG_DIR, "verifiable_producer.log")
LOG4J_CONFIG = os.path.join(PERSISTENT_ROOT, "tools-log4j.properties")
CONFIG_FILE = os.path.join(PERSISTENT_ROOT, "verifiable_producer.properties")
logs = {
"verifiable_producer_stdout": {
"path": STDOUT_CAPTURE,
"collect_default": False},
"verifiable_producer_stderr": {
"path": STDERR_CAPTURE,
"collect_default": False},
"verifiable_producer_log": {
"path": LOG_FILE,
"collect_default": True}
}
def __init__(self, context, num_nodes, kafka, topic, max_messages=-1, throughput=100000,
message_validator=is_int, compression_types=None, version=DEV_BRANCH, acks=None,
stop_timeout_sec=150, request_timeout_sec=30, log_level="INFO",
enable_idempotence=False, offline_nodes=[], create_time=-1, repeating_keys=None,
jaas_override_variables=None, kafka_opts_override="", client_prop_file_override="",
retries=None):
"""
Args:
:param max_messages number of messages to be produced per producer
:param message_validator checks for an expected format of messages produced. There are
currently two:
* is_int is an integer format; this is the default and expected to be used if
num_nodes = 1
* is_int_with_prefix is recommended if num_nodes > 1, because otherwise each producer
will produce exactly the same messages, and validation may not detect missing messages.
:param compression_types If None, all producers will not use compression; or a list of compression types,
one per producer (could be "none").
:param jaas_override_variables A dict of variables to be used in the jaas.conf template file
:param kafka_opts_override Override parameters of the KAFKA_OPTS environment variable
:param client_prop_file_override Override client.properties file used by the consumer
"""
super(VerifiableProducer, self).__init__(context, num_nodes)
self.log_level = log_level
self.kafka = kafka
self.topic = topic
self.max_messages = max_messages
self.throughput = throughput
self.message_validator = message_validator
self.compression_types = compression_types
if self.compression_types is not None:
assert len(self.compression_types) == num_nodes, "Specify one compression type per node"
for node in self.nodes:
node.version = version
self.acked_values = []
self.acked_values_by_partition = {}
self._last_acked_offsets = {}
self.not_acked_values = []
self.produced_count = {}
self.clean_shutdown_nodes = set()
self.acks = acks
self.stop_timeout_sec = stop_timeout_sec
self.request_timeout_sec = request_timeout_sec
self.enable_idempotence = enable_idempotence
self.offline_nodes = offline_nodes
self.create_time = create_time
self.repeating_keys = repeating_keys
self.jaas_override_variables = jaas_override_variables or {}
self.kafka_opts_override = kafka_opts_override
self.client_prop_file_override = client_prop_file_override
self.retries = retries
def java_class_name(self):
return "VerifiableProducer"
def prop_file(self, node):
idx = self.idx(node)
prop_file = self.render('producer.properties', request_timeout_ms=(self.request_timeout_sec * 1000))
prop_file += "\n{}".format(str(self.security_config))
if self.compression_types is not None:
compression_index = idx - 1
self.logger.info("VerifiableProducer (index = %d) will use compression type = %s", idx,
self.compression_types[compression_index])
prop_file += "\ncompression.type=%s\n" % self.compression_types[compression_index]
return prop_file
def _worker(self, idx, node):
node.account.ssh("mkdir -p %s" % VerifiableProducer.PERSISTENT_ROOT, allow_fail=False)
# Create and upload log properties
log_config = self.render('tools_log4j.properties', log_file=VerifiableProducer.LOG_FILE)
node.account.create_file(VerifiableProducer.LOG4J_CONFIG, log_config)
# Configure security
self.security_config = self.kafka.security_config.client_config(node=node,
jaas_override_variables=self.jaas_override_variables)
self.security_config.setup_node(node)
# Create and upload config file
if self.client_prop_file_override:
producer_prop_file = self.client_prop_file_override
else:
producer_prop_file = self.prop_file(node)
if self.acks is not None:
self.logger.info("VerifiableProducer (index = %d) will use acks = %s", idx, self.acks)
producer_prop_file += "\nacks=%s\n" % self.acks
if self.enable_idempotence:
self.logger.info("Setting up an idempotent producer")
producer_prop_file += "\nmax.in.flight.requests.per.connection=5\n"
producer_prop_file += "\nretries=1000000\n"
producer_prop_file += "\nenable.idempotence=true\n"
elif self.retries is not None:
self.logger.info("VerifiableProducer (index = %d) will use retries = %s", idx, self.retries)
producer_prop_file += "\nretries=%s\n" % self.retries
producer_prop_file += "\ndelivery.timeout.ms=%s\n" % (self.request_timeout_sec * 1000 * self.retries)
self.logger.info("verifiable_producer.properties:")
self.logger.info(producer_prop_file)
node.account.create_file(VerifiableProducer.CONFIG_FILE, producer_prop_file)
cmd = self.start_cmd(node, idx)
self.logger.debug("VerifiableProducer %d command: %s" % (idx, cmd))
self.produced_count[idx] = 0
last_produced_time = time.time()
prev_msg = None
for line in node.account.ssh_capture(cmd):
line = line.strip()
data = self.try_parse_json(line)
if data is not None:
with self.lock:
if data["name"] == "producer_send_error":
data["node"] = idx
self.not_acked_values.append(self.message_validator(data["value"]))
self.produced_count[idx] += 1
elif data["name"] == "producer_send_success":
partition = TopicPartition(data["topic"], data["partition"])
value = self.message_validator(data["value"])
self.acked_values.append(value)
if partition not in self.acked_values_by_partition:
self.acked_values_by_partition[partition] = []
self.acked_values_by_partition[partition].append(value)
self._last_acked_offsets[partition] = data["offset"]
self.produced_count[idx] += 1
# Log information if there is a large gap between successively acknowledged messages
t = time.time()
time_delta_sec = t - last_produced_time
if time_delta_sec > 2 and prev_msg is not None:
self.logger.debug(
"Time delta between successively acked messages is large: " +
"delta_t_sec: %s, prev_message: %s, current_message: %s" % (str(time_delta_sec), str(prev_msg), str(data)))
last_produced_time = t
prev_msg = data
elif data["name"] == "shutdown_complete":
if node in self.clean_shutdown_nodes:
raise Exception("Unexpected shutdown event from producer, already shutdown. Producer index: %d" % idx)
self.clean_shutdown_nodes.add(node)
def _has_output(self, node):
"""Helper used as a proxy to determine whether jmx is running by that jmx_tool_log contains output."""
try:
node.account.ssh("test -z \"$(cat %s)\"" % VerifiableProducer.STDOUT_CAPTURE, allow_fail=False)
return False
except RemoteCommandError:
return True
def start_cmd(self, node, idx):
cmd = "export LOG_DIR=%s;" % VerifiableProducer.LOG_DIR
if self.kafka_opts_override:
cmd += " export KAFKA_OPTS=\"%s\";" % self.kafka_opts_override
else:
cmd += " export KAFKA_OPTS=%s;" % self.security_config.kafka_opts
cmd += " export KAFKA_LOG4J_OPTS=\"-Dlog4j.configuration=file:%s\"; " % VerifiableProducer.LOG4J_CONFIG
cmd += self.impl.exec_cmd(node)
cmd += " --topic %s --broker-list %s" % (self.topic, self.kafka.bootstrap_servers(self.security_config.security_protocol, True, self.offline_nodes))
if self.max_messages > 0:
cmd += " --max-messages %s" % str(self.max_messages)
if self.throughput > 0:
cmd += " --throughput %s" % str(self.throughput)
if self.message_validator == is_int_with_prefix:
cmd += " --value-prefix %s" % str(idx)
if self.acks is not None:
cmd += " --acks %s " % str(self.acks)
if self.create_time > -1:
cmd += " --message-create-time %s " % str(self.create_time)
if self.repeating_keys is not None:
cmd += " --repeating-keys %s " % str(self.repeating_keys)
cmd += " --producer.config %s" % VerifiableProducer.CONFIG_FILE
cmd += " 2>> %s | tee -a %s &" % (VerifiableProducer.STDOUT_CAPTURE, VerifiableProducer.STDOUT_CAPTURE)
return cmd
def kill_node(self, node, clean_shutdown=True, allow_fail=False):
sig = self.impl.kill_signal(clean_shutdown)
for pid in self.pids(node):
node.account.signal(pid, sig, allow_fail)
def pids(self, node):
return self.impl.pids(node)
def alive(self, node):
return len(self.pids(node)) > 0
@property
def last_acked_offsets(self):
with self.lock:
return self._last_acked_offsets
@property
def acked(self):
with self.lock:
return self.acked_values
@property
def acked_by_partition(self):
with self.lock:
return self.acked_values_by_partition
@property
def not_acked(self):
with self.lock:
return self.not_acked_values
@property
def num_acked(self):
with self.lock:
return len(self.acked_values)
@property
def num_not_acked(self):
with self.lock:
return len(self.not_acked_values)
def each_produced_at_least(self, count):
with self.lock:
for idx in range(1, self.num_nodes + 1):
if self.produced_count.get(idx) is None or self.produced_count[idx] < count:
return False
return True
def stop_node(self, node):
# There is a race condition on shutdown if using `max_messages` since the
# VerifiableProducer will shut down automatically when all messages have been
# written. In this case, the process will be gone and the signal will fail.
allow_fail = self.max_messages > 0
self.kill_node(node, clean_shutdown=True, allow_fail=allow_fail)
stopped = self.wait_node(node, timeout_sec=self.stop_timeout_sec)
assert stopped, "Node %s: did not stop within the specified timeout of %s seconds" % \
(str(node.account), str(self.stop_timeout_sec))
def clean_node(self, node):
self.kill_node(node, clean_shutdown=False, allow_fail=False)
node.account.ssh("rm -rf " + self.PERSISTENT_ROOT, allow_fail=False)
self.security_config.clean_node(node)
def try_parse_json(self, string):
"""Try to parse a string as json. Return None if not parseable."""
try:
record = json.loads(string)
return record
except ValueError:
self.logger.debug("Could not parse as json: %s" % str(string))
return None
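# Hedged usage sketch: driving the producer from a ducktape test. The `test_context`
# and `kafka` objects are assumptions provided by the test; everything else uses
# methods and properties defined on VerifiableProducer above.
def _example_run_verifiable_producer(test_context, kafka):
    from ducktape.utils.util import wait_until
    producer = VerifiableProducer(test_context, num_nodes=1, kafka=kafka,
                                  topic="test_topic", max_messages=1000, throughput=100)
    producer.start()
    wait_until(lambda: producer.each_produced_at_least(1000), timeout_sec=60,
               err_msg="Timed out waiting for the producer to send 1000 messages")
    producer.stop_node(producer.nodes[0])
    # Every send attempt is either acked or recorded as a send error
    assert producer.num_acked + producer.num_not_acked >= 1000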

View File

@@ -0,0 +1,251 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
import time
from ducktape.services.service import Service
from ducktape.utils.util import wait_until
from ducktape.cluster.remoteaccount import RemoteCommandError
from kafkatest.directory_layout.kafka_path import KafkaPathResolverMixin
from kafkatest.services.security.security_config import SecurityConfig
from kafkatest.version import DEV_BRANCH
class ZookeeperService(KafkaPathResolverMixin, Service):
ROOT = "/mnt/zookeeper"
DATA = os.path.join(ROOT, "data")
HEAP_DUMP_FILE = os.path.join(ROOT, "zk_heap_dump.bin")
logs = {
"zk_log": {
"path": "%s/zk.log" % ROOT,
"collect_default": True},
"zk_data": {
"path": DATA,
"collect_default": False},
"zk_heap_dump_file": {
"path": HEAP_DUMP_FILE,
"collect_default": True}
}
def __init__(self, context, num_nodes, zk_sasl = False, zk_client_port = True, zk_client_secure_port = False,
zk_tls_encrypt_only = False):
"""
:type context
"""
self.kafka_opts = ""
self.zk_sasl = zk_sasl
if not zk_client_port and not zk_client_secure_port:
raise Exception("Cannot disable both ZK clientPort and clientSecurePort")
self.zk_client_port = zk_client_port
self.zk_client_secure_port = zk_client_secure_port
self.zk_tls_encrypt_only = zk_tls_encrypt_only
super(ZookeeperService, self).__init__(context, num_nodes)
@property
def security_config(self):
return SecurityConfig(self.context, zk_sasl=self.zk_sasl, zk_tls=self.zk_client_secure_port)
@property
def security_system_properties(self):
return "-Dzookeeper.authProvider.sasl=org.apache.zookeeper.server.auth.SASLAuthenticationProvider " \
"-DjaasLoginRenew=3600000 " \
"-Djava.security.auth.login.config=%s " \
"-Djava.security.krb5.conf=%s " % (self.security_config.JAAS_CONF_PATH, self.security_config.KRB5CONF_PATH)
@property
def zk_principals(self):
return " zkclient " + ' '.join(['zookeeper/' + zk_node.account.hostname for zk_node in self.nodes])
def restart_cluster(self):
for node in self.nodes:
self.restart_node(node)
def restart_node(self, node):
"""Restart the given node."""
self.stop_node(node)
self.start_node(node)
def start_node(self, node):
idx = self.idx(node)
self.logger.info("Starting ZK node %d on %s", idx, node.account.hostname)
node.account.ssh("mkdir -p %s" % ZookeeperService.DATA)
node.account.ssh("echo %d > %s/myid" % (idx, ZookeeperService.DATA))
self.security_config.setup_node(node)
config_file = self.render('zookeeper.properties')
self.logger.info("zookeeper.properties:")
self.logger.info(config_file)
node.account.create_file("%s/zookeeper.properties" % ZookeeperService.ROOT, config_file)
heap_kafka_opts = "-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=%s" % self.logs["zk_heap_dump_file"]["path"]
other_kafka_opts = self.kafka_opts + ' ' + self.security_system_properties \
if self.security_config.zk_sasl else self.kafka_opts
start_cmd = "export KAFKA_OPTS=\"%s %s\";" % (heap_kafka_opts, other_kafka_opts)
start_cmd += "%s " % self.path.script("zookeeper-server-start.sh", node)
start_cmd += "%s/zookeeper.properties &>> %s &" % (ZookeeperService.ROOT, self.logs["zk_log"]["path"])
node.account.ssh(start_cmd)
wait_until(lambda: self.listening(node), timeout_sec=30, err_msg="Zookeeper node failed to start")
def listening(self, node):
try:
port = 2181 if self.zk_client_port else 2182
cmd = "nc -z %s %s" % (node.account.hostname, port)
node.account.ssh_output(cmd, allow_fail=False)
self.logger.debug("Zookeeper started accepting connections at: '%s:%s')", node.account.hostname, port)
return True
except (RemoteCommandError, ValueError) as e:
return False
def pids(self, node):
return node.account.java_pids(self.java_class_name())
def alive(self, node):
return len(self.pids(node)) > 0
def stop_node(self, node):
idx = self.idx(node)
self.logger.info("Stopping %s node %d on %s" % (type(self).__name__, idx, node.account.hostname))
node.account.kill_java_processes(self.java_class_name(), allow_fail=False)
node.account.kill_java_processes(self.java_cli_class_name(), allow_fail=False)
wait_until(lambda: not self.alive(node), timeout_sec=5, err_msg="Timed out waiting for zookeeper to stop.")
def clean_node(self, node):
self.logger.info("Cleaning ZK node %d on %s", self.idx(node), node.account.hostname)
if self.alive(node):
self.logger.warn("%s %s was still alive at cleanup time. Killing forcefully..." %
(self.__class__.__name__, node.account))
node.account.kill_java_processes(self.java_class_name(),
clean_shutdown=False, allow_fail=True)
node.account.kill_java_processes(self.java_cli_class_name(),
clean_shutdown=False, allow_fail=False)
node.account.ssh("rm -rf -- %s" % ZookeeperService.ROOT, allow_fail=False)
# force_tls is a necessary option for the case where we define both encrypted and non-encrypted ports
def connect_setting(self, chroot=None, force_tls=False):
if chroot and not chroot.startswith("/"):
raise Exception("ZK chroot must start with '/', invalid chroot: %s" % chroot)
chroot = '' if chroot is None else chroot
return ','.join([node.account.hostname + (':2182' if not self.zk_client_port or force_tls else ':2181') + chroot
for node in self.nodes])
def zkTlsConfigFileOption(self, forZooKeeperMain=False):
if not self.zk_client_secure_port:
return ""
return ("-zk-tls-config-file " if forZooKeeperMain else "--zk-tls-config-file ") + \
(SecurityConfig.ZK_CLIENT_TLS_ENCRYPT_ONLY_CONFIG_PATH if self.zk_tls_encrypt_only else SecurityConfig.ZK_CLIENT_MUTUAL_AUTH_CONFIG_PATH)
#
# This call is used to simulate a rolling upgrade to enable/disable
# the use of ZooKeeper ACLs.
#
def zookeeper_migration(self, node, zk_acl):
la_migra_cmd = "export KAFKA_OPTS=\"%s\";" % \
self.security_system_properties if self.security_config.zk_sasl else ""
la_migra_cmd += "%s --zookeeper.acl=%s --zookeeper.connect=%s %s" % \
(self.path.script("zookeeper-security-migration.sh", node), zk_acl,
self.connect_setting(force_tls=self.zk_client_secure_port),
self.zkTlsConfigFileOption())
node.account.ssh(la_migra_cmd)
def _check_chroot(self, chroot):
if chroot and not chroot.startswith("/"):
raise Exception("ZK chroot must start with '/', invalid chroot: %s" % chroot)
def query(self, path, chroot=None):
"""
Queries zookeeper for data associated with 'path' and returns all fields in the schema
"""
self._check_chroot(chroot)
chroot_path = ('' if chroot is None else chroot) + path
kafka_run_class = self.path.script("kafka-run-class.sh", DEV_BRANCH)
cmd = "%s %s -server %s %s get %s" % \
(kafka_run_class, self.java_cli_class_name(), self.connect_setting(force_tls=self.zk_client_secure_port),
self.zkTlsConfigFileOption(True),
chroot_path)
self.logger.debug(cmd)
node = self.nodes[0]
result = None
for line in node.account.ssh_capture(cmd, allow_fail=True):
# loop through all lines in the output, but only hold on to the first match
if result is None:
match = re.match("^({.+})$", line)
if match is not None:
result = match.groups()[0]
return result
def create(self, path, chroot=None, value=""):
"""
Create an znode at the given path
"""
self._check_chroot(chroot)
chroot_path = ('' if chroot is None else chroot) + path
kafka_run_class = self.path.script("kafka-run-class.sh", DEV_BRANCH)
cmd = "%s %s -server %s %s create %s '%s'" % \
(kafka_run_class, self.java_cli_class_name(), self.connect_setting(force_tls=self.zk_client_secure_port),
self.zkTlsConfigFileOption(True),
chroot_path, value)
self.logger.debug(cmd)
output = self.nodes[0].account.ssh_output(cmd)
self.logger.debug(output)
def describe(self, topic):
"""
Describe the given topic using the ConfigCommand CLI
"""
kafka_run_class = self.path.script("kafka-run-class.sh", DEV_BRANCH)
cmd = "%s kafka.admin.ConfigCommand --zookeeper %s %s --describe --topic %s" % \
(kafka_run_class, self.connect_setting(force_tls=self.zk_client_secure_port),
self.zkTlsConfigFileOption(),
topic)
self.logger.debug(cmd)
output = self.nodes[0].account.ssh_output(cmd)
self.logger.debug(output)
def list_acls(self, topic):
"""
List ACLs for the given topic using the AclCommand CLI
"""
kafka_run_class = self.path.script("kafka-run-class.sh", DEV_BRANCH)
cmd = "%s kafka.admin.AclCommand --authorizer-properties zookeeper.connect=%s %s --list --topic %s" % \
(kafka_run_class, self.connect_setting(force_tls=self.zk_client_secure_port),
self.zkTlsConfigFileOption(),
topic)
self.logger.debug(cmd)
output = self.nodes[0].account.ssh_output(cmd)
self.logger.debug(output)
def java_class_name(self):
""" The class name of the Zookeeper quorum peers. """
return "org.apache.zookeeper.server.quorum.QuorumPeerMain"
def java_cli_class_name(self):
""" The class name of the Zookeeper tool within Kafka. """
return "org.apache.zookeeper.ZooKeeperMainWithTlsSupportForKafka"