Compare commits


2 Commits

Author   SHA1         Message               Date
leewei   e510b8e2db   Fix gradle depends    2023-02-20 17:14:27 +08:00
leewei   469baad65b   Add km module kafka   2023-02-14 14:57:39 +08:00
285 changed files with 18282 additions and 40 deletions

1
.gitignore vendored
View File

@@ -1,4 +1,3 @@
.gradle/
dist
*classes

View File

@@ -751,7 +751,7 @@ project(':core') {
compile libs.scalaLogging
compile libs.slf4jApi
compile libs.slf4jlog4j
// compile 'com.alibaba:fastjson:1.2.83'
compile 'com.alibaba:fastjson:1.2.83'
compile 'net.java.dev.jna:jna:5.4.0'
compile 'org.apache.mina:mina-core:2.0.10'
compile 'com.googlecode.concurrentlinkedhashmap:concurrentlinkedhashmap-lru:1.4.2'

55
cmd.txt Normal file
View File

@@ -0,0 +1,55 @@
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
./bin/kafka-topics.sh --create --zookeeper 10.179.133.194:2181/yh --replication-factor 1 --partitions 1 --topic test
bin/kafka-topics.sh --list --zookeeper 10.179.133.194:2181/yh
bin/kafka-console-consumer.sh --bootstrap-server 10.96.85.50:9092 --from-beginning --topic test --consumer.config config/consumer.properties
bin/kafka-console-producer.sh --broker-list 10.96.85.50:9092 --topic test --producer.config config/producer.properties
bin/kafka-acls.sh --authorizer-properties zookeeper.connect=10.179.133.194:2181/yh --add --allow-principal User:kafka --consumer --topic test --group DemoConsumer
bin/kafka-acls.sh --authorizer-properties zookeeper.connect=10.179.133.194:2181/yh --add --allow-principal User:kafka --producer --topic test
bin/kafka-acls.sh --authorizer-properties zookeeper.connect=10.179.133.194:2181/yh --allow-principal User:ANONYMOUS --consumer --topic=* --group=* --add
bin/kafka-acls.sh --authorizer-properties zookeeper.connect=10.179.133.194:2181/yh --allow-principal User:ANONYMOUS --producer --topic=* --add
bin/kafka-acls.sh --authorizer-properties zookeeper.connect=10.179.133.194:2181/yh --allow-principal User:ANONYMOUS --cluster --operation ClusterAction --add
bin/kafka-configs.sh --zookeeper 10.179.133.194:2181/yh --alter --add-config 'producer_byte_rate=10,consumer_byte_rate=1000' --entity-type users --entity-name kafka
bin/kafka-configs.sh --zookeeper 10.179.162.171:2181/yh2 --alter --add-config 'producer_byte_rate=10,consumer_byte_rate=1000' --entity-type clients --entity-name kafka.test1
kafka acls configuration
https://blog.csdn.net/ahzsg1314/article/details/54140909
./gradlew core:test -Dtest.single=ParitionControlTest
./gradlew --max-workers 1 core:test ZookeeperConsumerConnectorTest
./gradlew assemble
#./gradlew test --max-workers 1
#./gradlew unitTest --max-workers 1 --fail-fast
#./gradlew test --continue -x rat -x checkstyleMain -x checkstyleTest -x spotbugsMain
#./gradlew clients:processMessages
./gradlew clean releaseTarGz
#IDEA settings
# main class: kafka.Kafka
# vm options: -ea -Dkafka.logs.dir=logs -Dlog4j.configuration=file:config/log4j.properties -Djava.security.auth.login.config=config/kafka_server_jaas.conf
# program arguments: config/server.properties

View File

@@ -0,0 +1,21 @@
// Licensed to the Apache Software Foundation (ASF) under one or more
// contributor license agreements. See the NOTICE file distributed with
//this work for additional information regarding copyright ownership.
// The ASF licenses this file to You under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
KafkaClient {
org.apache.kafka.common.security.plain.PlainLoginModule required
username="admin"
password="admin";
};
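Not part of the diff: a hedged sketch of the client-side settings that would normally go with a KafkaClient JAAS section like the one above. The JAAS file path, broker address and SASL port are assumptions pieced together from cmd.txt and the server.properties diff below.
import java.util.Properties;

public class SaslClientProps {
    public static Properties build() {
        // Make the JVM load the KafkaClient section above (the path is an assumption).
        System.setProperty("java.security.auth.login.config", "config/kafka_client_jaas.conf");
        Properties props = new Properties();
        props.put("bootstrap.servers", "10.96.85.50:9093"); // assumed SASL listener port
        props.put("security.protocol", "SASL_PLAINTEXT");   // matches listeners=SASL_PLAINTEXT://:9093
        props.put("sasl.mechanism", "PLAIN");               // matches the PlainLoginModule above
        return props;
    }
}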

View File

@@ -11,9 +11,17 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
KafkaServer {
org.apache.kafka.common.security.scram.ScramLoginModule required
username="admin"
password="123456";
};
KafkaServer {
com.didichuxing.datachannel.kafka.security.sasl.plain.PlainLoginModule required
username="admin"
password="admin"
user_admin="admin"
user_kafka="12345";
//kafka.security.sasl.DidiLoginModule required
//username="admin"
//password="admin"
//user_admin="admin"
//user_kafka="12345";
};

View File

@@ -35,13 +35,12 @@ auto.create.topics.enable=false
#listeners=PLAINTEXT://:9092
listeners=SASL_PLAINTEXT://:9093,PLAINTEXT://:9092
sasl.enabled.mechanisms=PLAIN
security.inter.broker.protocol=SASL_PLAINTEXT
sasl.mechanism.inter.broker.protocol=SCRAM-SHA-256
sasl.enabled.mechanisms=SCRAM-SHA-256
authorizer.class.name=kafka.security.authorizer.AclAuthorizer
super.users=User:admin
#security.inter.broker.protocol=PLAINTEXT
sasl.mechanism.inter.broker.protocol=PLAIN
authorizer.class.name=com.didichuxing.datachannel.kafka.security.authorizer.DidiAuthorizer
# Hostname and port the broker will advertise to producers and consumers. If not set,
# it uses the value for "listeners" if configured. Otherwise, it will use the value
@@ -133,7 +132,7 @@ log.retention.check.interval.ms=300000
# server. e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002".
# You can also append an optional chroot string to the urls to specify the
# root directory for all kafka znodes.
zookeeper.connect=127.0.0.1:2181/kafka
zookeeper.connect=10.179.162.171:2181/yh2
# Timeout in ms for connecting to zookeeper
zookeeper.connection.timeout.ms=5000
@@ -147,12 +146,8 @@ zookeeper.connection.timeout.ms=5000
# However, in production environments the default value of 3 seconds is more suitable as this will help to avoid unnecessary, and potentially expensive, rebalances during application startup.
group.initial.rebalance.delay.ms=0
cluster.id=101
# cluster.id.update=true
gateway.url=http://10.179.32.80:8888
cluster.id=0
max.throttle.time.ms=2000
max.sessions.per.user=2000
kafkaexmetrics.enable=true
num.replica.fetchers=4
didi.mirror.num.fetchers=4

View File

@@ -0,0 +1,29 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.cache;
/**
* Exception class for Datacache
*/
public class CacheException extends RuntimeException {
public CacheException(String message) {
super(message);
}
}

View File

@@ -0,0 +1,75 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.cache;
import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* This class watches for commit timestamp changes.
* When it receives a data change event, it notifies the DataCache to commit its cached data.
*/
class CommitTimeWatcher implements Watcher {
private static final Logger log = LoggerFactory.getLogger(CommitTimeWatcher.class);
final private ZkUtil zkUtil;
final private DataCache dataCache;
final String path;
public CommitTimeWatcher(ZkUtil zkUtil, DataCache dateCache) throws Exception {
this.zkUtil = zkUtil;
this.path = zkUtil.getCommitTimestampPath(dateCache.getName());
this.dataCache = dateCache;
watch();
}
public void watch() throws Exception {
zkUtil.getZooKeeper().exists(path, this);
}
@Override
public void process(WatchedEvent event) {
try {
if (event.getType() == Event.EventType.NodeDataChanged) {
long timestamp = Long.parseLong(new String(zkUtil.getZooKeeper().getData(path, this, null)));
if (timestamp <= 0) {
log.error("zkpath: {} invalid value: {}.", path, timestamp);
return;
}
log.info("zkpath: {} value changed. value: {}.", path, timestamp);
this.dataCache.commitCache(timestamp);
} else if (event.getType() != Event.EventType.None && event.getType() != Event.EventType.ChildWatchRemoved) {
zkUtil.getZooKeeper().exists(path, this);
}
} catch (Exception e) {
log.error("internal zk error: ", e);
}
}
public void unwatch() {
try {
zkUtil.getZooKeeper().removeWatches(path, this, WatcherType.Data, true);
} catch (Exception e) {
log.error("internal zk error: ", e);
}
}
}

View File

@@ -0,0 +1,524 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.cache;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashMap;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.util.Iterator;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
/**
* DataCache is a distributed data cache utility for distributed systems; an instance can be deployed on every node.
* The data source can be a SQL server or any other backend; DataCache retrieves the data through a DataProvider.
* On startup it loads all data from the data source, then schedules a task that incrementally
* pulls updates from the data source to keep the cache fresh.
* Data synchronization is coordinated through zookeeper, which stores the sync (uncommit) timestamp and the commit timestamp.
* When new data is received it is stored in the uncommit cache, where it is not yet visible to users, and the
* uncommitTimestamp is written to zookeeper. That triggers a commit timestamp update; once the commit timestamp
* advances, the uncommit cache is merged into the main cache and the data becomes visible.
* The main cache is a HashMap: you can specify the key type and the value type,
* and read data as from a normal HashMap.
* @param <KeyType>
* @param <ValueType>
*/
public class DataCache<KeyType, ValueType> {
private static final Logger log = LoggerFactory.getLogger(DataCache.class);
//datacache name.
final private String name;
//unique id for instance of Datacache.
final private int nodeId;
//retrieve data from datasource
final private DataProvider dataProvider;
//run sync data task
final private ScheduledExecutorService schedule;
//sync data frequency
final private int syncFrequnceMs;
final private int checkFrequnceMs;
//main cache
final private AtomicReference<HashMap<KeyType, ValueType>> mainCache;
//uncommit cache.
final private Deque<DataRecord> uncommitCache;
final private ZkUtil zkUtil;
final private AtomicReference<ScheduledFuture<?>> future;
// sync timestamp; updated whenever new data is retrieved.
private long uncommitTimestamp;
// commit timestamp; updated on each commit.
private long commitTimestamp;
private final CommitTimeWatcher commitTimeWatcher;
private final ReloadWatcher reloadWatcher;
private final LeaderWatcher leaderdWatcher;
//make commit, sync and load operations mutually exclusive
private final Lock lock = new ReentrantLock();
private long lastCheckTimestamp;
private long lastSyncTimestamp;
private long lastLogTimestamp;
public DataCache(String name, int nodeId, DataProvider dataProvider, ScheduledExecutorService schedule,
ZkUtil zkUtil, int syncFrequnceMs, int checkFrequnceMs) {
this.name = name;
this.nodeId = nodeId;
this.dataProvider = dataProvider;
this.schedule = schedule;
this.syncFrequnceMs = syncFrequnceMs;
mainCache = new AtomicReference<>(null);
uncommitCache = new ArrayDeque<>();
this.checkFrequnceMs = checkFrequnceMs;
this.zkUtil = zkUtil;
future = new AtomicReference<>();
this.lastCheckTimestamp = System.currentTimeMillis();
this.lastSyncTimestamp = System.currentTimeMillis();
this.lastLogTimestamp = System.currentTimeMillis();
lock.lock();
if (name == null || name.isEmpty()) {
throw new CacheException(String.format("DataCache:%s node %d invalid mainCache name", name, nodeId));
}
if (nodeId < 0) {
throw new CacheException(String.format("DataCache:%s node %d invalid mainCache nodeId", name, nodeId));
}
try {
//add zk watcher to watch for commit timestamp changes
commitTimeWatcher = new CommitTimeWatcher(zkUtil, this);
//add zk watcher to watch for forced reloads
reloadWatcher = new ReloadWatcher(zkUtil, this);
//add zk watcher to check whether this node is the leader
leaderdWatcher = new LeaderWatcher(zkUtil, this);
// init cache sync info in zookeeper
zkUtil.initCache(nodeId, name);
} catch (Exception e) {
log.error("DataCache:{} node {} zookeeper init exception: ", name, nodeId, e);
throw new CacheException(String.format("DataCache:%s node %d zookeeper init exception: %s", name,
nodeId, e.getMessage()));
}
this.commitTimestamp = loadCommitTimestamp();
if (commitTimestamp == 0) {
throw new CacheException(String.format("DataCache:%s node %d load timestamp failed", name, nodeId));
}
//load the cache from the data provider
boolean loaded = load();
if (!loaded) {
throw new CacheException(String.format("DataCache:%s node %d load data failed", name, nodeId));
}
lock.unlock();
//schedule auto incremental update and self check
startAutoTask();
log.info("DataCache:{} node {} cache initialized.", name, nodeId);
}
private long loadCommitTimestamp() {
return zkUtil.getCommitTimestamp(name);
}
private boolean load() {
log.info("DataCache:{} node {} load cache start.", name, nodeId);
HashMap<KeyType, ValueType> hashmap = new HashMap<>();
long timestamp;
List<DataRecord> uncommitEntries = new ArrayList<>();
try {
//get all data up to now
Dataset dataset = dataProvider.fetchData(0, System.currentTimeMillis());
if (dataset == null) {
throw new CacheException(String.format("DataCache:%s node %d Fetch new data should not return null",
name, nodeId));
}
List<DataRecord> entries = dataset.getEntries();
// add all the data from the data provider into the cache,
// checking the timestamp of each record:
// if the record's timestamp is less than the commit timestamp,
// add it to the main cache; otherwise add it to the uncommit cache.
for (DataRecord<KeyType, ValueType> entry : entries) {
KeyType key = entry.getKey();
ValueType value = entry.getValue();
if (entry.getTimestamp() < commitTimestamp) {
if (entry.getOperation() == DataRecord.Operation.delete) {
hashmap.remove(key);
} else {
hashmap.put(key, value);
}
} else {
uncommitEntries.add(entry);
}
}
timestamp = dataset.getTimestamp();
} catch (Exception e) {
log.error("DataCache:{} node {} load cache exception: ", name, nodeId, e);
return false;
}
//load data successful, update data and status.
uncommitTimestamp = uncommitEntries.isEmpty() ? commitTimestamp : timestamp;
uncommitCache.clear();
uncommitCache.addAll(uncommitEntries);
mainCache.set(hashmap);
//set uncommitTimestamp in zookeeper; this operation will trigger a commit timestamp update.
zkUtil.setUncommentTimestamp(nodeId, name, uncommitTimestamp);
log.info("DataCache:{} node {} load cache finished. main cache entries: {}, " +
"uncommit cache entries: {}, uncommit time {}, commit time {}",
name, nodeId, mainCache.get().size(), uncommitCache.size(), uncommitTimestamp, commitTimestamp);
return true;
}
private void sync() {
log.info("DataCache:{} node {} sync cache start.", name, nodeId);
try {
int entries = fetchNewData();
log.info("DataCache:{} node {} sync cache finished, uncommit time {}, {} entries.",
name, nodeId, uncommitTimestamp, entries);
if (leaderdWatcher.isLeader()) {
if (commitTimestamp < uncommitTimestamp) {
log.info("DataCache:{} node {} leader update commit time.", name, nodeId);
zkUtil.updateCommitTimestamp(nodeId, name, uncommitTimestamp, commitTimestamp);
}
}
} catch (Exception e) {
log.error("DataCache:{} node {} sync cache exception: ", name, nodeId, e);
}
}
private int fetchNewData() throws Exception {
//get the newest data from uncommitTimestamp.
Dataset dataset = dataProvider.fetchData(uncommitTimestamp, System.currentTimeMillis());
if (dataset == null) {
throw new CacheException("Fetch new data should not return null");
}
if (dataset.getTimestamp() < uncommitTimestamp) {
throw new CacheException("Dataset timestamp should greater equal than uncommit time");
}
List<DataRecord> entries = dataset.getEntries();
lastSyncTimestamp = dataset.getTimestamp();
//there is no data to update
if (entries.isEmpty()) {
if (uncommitCache.isEmpty()) {
uncommitTimestamp = commitTimestamp;
}
return 0;
}
//all synced data is added to the uncommit cache; it will be added to the main cache
// when the commit function is called.
lock.lock();
try {
uncommitCache.addAll(entries);
} finally {
lock.unlock();
}
//set uncommitTimestamp in zookeeper; this operation will trigger a commit timestamp update.
zkUtil.setUncommentTimestamp(nodeId, name, dataset.getTimestamp());
uncommitTimestamp = dataset.getTimestamp();
return entries.size();
}
private void check() {
long currentTimestamp = System.currentTimeMillis();
//check sync data is working
long behind = currentTimestamp - lastSyncTimestamp;
if (behind > syncFrequnceMs + 1000) {
log.error("DataCache:{} node {} sync is far behind now, behind {}s, last synctime {}",
name, nodeId, behind / 1000, lastSyncTimestamp);
}
//check commit is working
behind = uncommitTimestamp - commitTimestamp;
if (behind > 2L * syncFrequnceMs + 1000 && lastSyncTimestamp - uncommitTimestamp > 2L * syncFrequnceMs + 1000) {
log.warn("DataCache:{} node {} commit is far behind now, behind {}s, uncommit time {}, commit time {}",
name, nodeId, behind / 1000, uncommitTimestamp, commitTimestamp);
}
//check zk
if (!zkUtil.isAvailable() || !zkUtil.isExistWatches()) {
log.error("DataCache:{} node {} zookeeper connection is not good.", name, nodeId);
reloadWatcher();
}
if (currentTimestamp - lastLogTimestamp > 15L * syncFrequnceMs) {
log.trace("DataCache:{} node {} main cache: [{}]", name, nodeId, mainCachetoString());
lastLogTimestamp = currentTimestamp;
}
if (currentTimestamp - lastCheckTimestamp < checkFrequnceMs) {
return;
}
try {
// check that the main cache matches the data provider as of the commit timestamp;
// on a mismatch, force a reload of the cache.
long currentCommitTimestamp = commitTimestamp;
log.info("DataCache:{} node {} check cache fetch data. commit timestamp: {}", name, nodeId, currentCommitTimestamp);
Dataset dataset = dataProvider.fetchData(0, currentCommitTimestamp);
if (dataset == null) {
throw new CacheException("Fetch new data should not return null");
}
List<DataRecord> entries = dataset.getEntries();
HashMap<KeyType, ValueType> hashmap = new HashMap<>();
for (DataRecord<KeyType, ValueType> entry : entries) {
if (entry.getOperation() == DataRecord.Operation.delete) {
hashmap.remove(entry.getKey());
} else {
hashmap.put(entry.getKey(), entry.getValue());
}
}
lock.lock();
try {
if (currentCommitTimestamp == commitTimestamp) {
log.info("DataCache:{} node {} check cache data. commit timestamp: {}", name, nodeId, commitTimestamp);
for (Map.Entry<KeyType, ValueType> entry : hashmap.entrySet()) {
KeyType key = entry.getKey();
ValueType value = entry.getValue();
ValueType cahcedValue = mainCache.get().get(key);
if (cahcedValue == null || !cahcedValue.equals(value)) {
//cache does not match the data source, should reload data
log.error("DataCache:{} node {} check cache failed. key: {}, map value: [{}], db value: [{}]",
name, nodeId, key, cahcedValue == null ? "null" : cahcedValue.toString(),
value == null ? "null" : value.toString());
load();
break;
}
}
lastCheckTimestamp = currentTimestamp;
} else {
log.info("DataCache:{} node {} ignore check cache, check time: {}, committimestamp: {} ",
name, nodeId, currentCommitTimestamp, commitTimestamp);
}
} finally {
lock.unlock();
}
} catch (Exception e) {
log.error("DataCache:{} node {} check cache exception:{} ", name, nodeId, e);
}
log.info("DataCache:{} node {} check cache finished.", name, nodeId);
}
private void startAutoTask() {
// 1. sync data from data provider
// 2. check main cache
if (future.get() != null && future.get().isCancelled()) {
return;
}
log.trace("DataCache:{} node {} schedule auto task.", name, nodeId);
future.set(schedule.schedule(() -> {
try {
sync();
check();
} catch (Throwable e) {
log.error("DataCache:{} node {} auto task run unknown exception: ", name, nodeId, e);
} finally {
startAutoTask();
}
}, syncFrequnceMs, TimeUnit.MILLISECONDS));
}
// This function is triggered by a zookeeper watcher when the commit timestamp changes.
// It takes all the data older than the commit timestamp out of the uncommit cache and puts
// it into the main cache.
private void commit(long timestamp) {
log.info("DataCache:{} node {} commit begin.", name, nodeId);
if (commitTimestamp > timestamp) {
log.warn("DataCache:{} node {} the time should greater than commit time", name, nodeId);
return;
}
//no data to update
if (uncommitCache.isEmpty()) {
commitTimestamp = timestamp;
log.info("DataCache:{} node {} commit cache finished, commitTimestamp:{}", name, nodeId, timestamp);
return;
}
List<DataRecord> entries = new ArrayList<>();
int numEntries = 0;
Iterator<DataRecord> iterator = uncommitCache.iterator();
while (iterator.hasNext()) {
DataRecord<KeyType, ValueType> entry = iterator.next();
if (entry.getTimestamp() < timestamp) {
entries.add(entry);
iterator.remove();
numEntries++;
} else {
break;
}
}
//nothing to commit
if (numEntries == 0) {
commitTimestamp = timestamp;
log.info("DataCache:{} node {} commit cache finished, commitTimestamp:{}", name, nodeId, timestamp);
return;
}
HashMap<KeyType, ValueType> hashmap = new HashMap<>(mainCache.get().size() + entries.size());
hashmap.putAll(mainCache.get());
for (DataRecord<KeyType, ValueType> entry : entries) {
KeyType key = entry.getKey();
ValueType value = entry.getValue();
if (DataRecord.Operation.delete == entry.getOperation()) {
hashmap.remove(key);
} else {
hashmap.put(key, value);
}
}
mainCache.set(hashmap);
commitTimestamp = timestamp;
log.info("DataCache:{} node {} commit cache finished, commitTimestamp:{}, {} entries.",
name, nodeId, timestamp, numEntries);
}
public int getNodeId() {
return nodeId;
}
public String getName() {
return name;
}
public int size() {
return mainCache.get().size();
}
public long getCommitTimestamp() {
return commitTimestamp;
}
// Get a value from the main cache by the given key.
public ValueType get(KeyType key) {
return mainCache.get().get(key);
}
// This function is triggered by a zookeeper watcher when the reload flag is set.
// It is the way to manually refresh the cache when the auto sync task has problems.
public void reload() {
log.info("DataCache:{} node {} reload data begin.", name, nodeId);
//shutdown schedule task
future.get().cancel(true);
future.set(null);
//load all data from the data provider as of the commit timestamp in zk
long timestamp = loadCommitTimestamp();
if (timestamp == 0) {
log.error("DataCache:{} node {} load commit timestamp failed.", name, nodeId);
return;
}
commitTimestamp = timestamp;
log.trace("DataCache:{} node {} old main cache: [{}]", nodeId, name, mainCachetoString());
lock.lock();
try {
load();
} finally {
lock.unlock();
}
log.trace("DataCache:{} node {} new main cache: [{}]", name, nodeId, mainCachetoString());
//start the scheduled task
startAutoTask();
log.info("DataCache:{} node {} reload data completed.", name, nodeId);
}
public void commitCache(long timestamp) {
lock.lock();
try {
commit(timestamp);
} finally {
lock.unlock();
}
}
//shut down the cache and clean up resources
public void stop() {
commitTimeWatcher.unwatch();
reloadWatcher.unwatch();
leaderdWatcher.unwatch();
future.get().cancel(true);
uncommitCache.clear();
mainCache.get().clear();
log.info("DataCache:{} node {} cache shutdown.", name, nodeId);
}
public void reloadWatcher() {
try {
commitTimeWatcher.unwatch();
reloadWatcher.unwatch();
leaderdWatcher.unwatch();
zkUtil.refreshzookeeper();
if (zkUtil.isAvailable()) {
zkUtil.setUncommentTimestamp(nodeId, name, uncommitTimestamp);
long committimestamp = loadCommitTimestamp();
commitCache(committimestamp);
commitTimeWatcher.watch();
reloadWatcher.watch();
leaderdWatcher.watch();
}
} catch (Exception e) {
log.info("DataCache:{} node {} cache reload watchers exception . {}", name, nodeId, e);
}
}
private String mainCachetoString() {
StringBuilder sb = new StringBuilder();
for (Map.Entry<KeyType, ValueType> entry : mainCache.get().entrySet()) {
sb.append(String.format("[key: [%s], value: [%s]] ", entry.getKey(), entry.getValue()));
}
return sb.toString();
}
}
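Not part of the diff: a hypothetical wiring sketch showing how a DataCache instance could be assembled from the pieces in this package. The cache name, node id, intervals and the printed key are made-up values; the DataProvider and ZkUtil are assumed to be built elsewhere (see the sketches after DataProvider and ZkClientCreator below).
import com.didichuxing.datachannel.kafka.cache.DataCache;
import com.didichuxing.datachannel.kafka.cache.DataProvider;
import com.didichuxing.datachannel.kafka.cache.ZkUtil;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;

public class DataCacheWiring {
    public static DataCache<String, String> build(DataProvider provider, ZkUtil zkUtil) {
        ScheduledExecutorService pool = Executors.newSingleThreadScheduledExecutor();
        // name, nodeId, provider, scheduler, zk helper, sync every 30s, self-check every 5min
        DataCache<String, String> cache = new DataCache<>("demo-cache", 0, provider, pool, zkUtil, 30_000, 300_000);
        // Reads always hit the committed main cache; uncommitted records stay invisible.
        System.out.println(cache.get("some-key"));
        return cache;
    }
}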

View File

@@ -0,0 +1,35 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.cache;
/**
* The interface DataCache uses to retrieve data.
* Make sure every record carries a modification timestamp.
* The initial load and the incremental updates both go through fetchData,
* just with different time ranges.
*/
public interface DataProvider {
/**
* Get data whose timestamp is greater than or equal to startTime and less than endTime, i.e. [startTime, endTime).
* @param startTime
* @param endTime
* @throws Exception
*/
public Dataset fetchData(long startTime, long endTime) throws Exception;
}
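Not part of the diff: a minimal in-memory DataProvider sketch illustrating the [startTime, endTime) contract; the backing list and record values are made up.
import com.didichuxing.datachannel.kafka.cache.DataProvider;
import com.didichuxing.datachannel.kafka.cache.DataRecord;
import com.didichuxing.datachannel.kafka.cache.Dataset;
import java.util.ArrayList;
import java.util.List;

public class InMemoryDataProvider implements DataProvider {
    private final List<DataRecord> records = new ArrayList<>();

    public void add(String key, String value, long timestamp) {
        records.add(new DataRecord<>(key, value, DataRecord.Operation.update, timestamp));
    }

    @Override
    public Dataset fetchData(long startTime, long endTime) {
        List<DataRecord> hit = new ArrayList<>();
        for (DataRecord record : records) {
            // keep records whose timestamp falls in [startTime, endTime)
            if (record.getTimestamp() >= startTime && record.getTimestamp() < endTime) {
                hit.add(record);
            }
        }
        // The Dataset timestamp is the end of the fetched range, as the real providers do.
        return new Dataset(hit, endTime);
    }
}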

View File

@@ -0,0 +1,79 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.cache;
import com.alibaba.fastjson.JSONObject;
/**
* DataCache Entry type
*/
public class DataRecord<KeyType, ValueType> {
final private KeyType key;
final private ValueType value;
final private Operation operation;
final private long timestamp;
public DataRecord(KeyType key, ValueType value, Operation operation, long timestamp) {
this.key = key;
this.value = value;
this.operation = operation;
this.timestamp = timestamp;
}
public DataRecord(KeyType key, ValueType value, JSONObject json) {
this.key = key;
this.value = value;
Long timestamp = json.getLong("timestamp");
if (timestamp == null) {
throw new IllegalArgumentException("missing timestamp");
}
Integer operation = json.getInteger("operation");
if (operation == null) {
throw new IllegalArgumentException("missing operation");
}
if (operation < 0 || operation > 2) {
throw new IllegalArgumentException("invalid operation");
}
this.operation = Operation.values()[operation];
this.timestamp = timestamp;
}
public KeyType getKey() {
return key;
}
public ValueType getValue() {
return value;
}
public long getTimestamp() {
return timestamp;
}
public Operation getOperation() {
return operation;
}
public enum Operation {
create,
update,
delete
}
}
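Not part of the diff: a small sketch of the JSON shape the second constructor expects, with a made-up key and value. "timestamp" is a long and "operation" is an index into the Operation enum (0 = create, 1 = update, 2 = delete).
import com.alibaba.fastjson.JSONObject;
import com.didichuxing.datachannel.kafka.cache.DataRecord;

public class DataRecordJsonExample {
    public static DataRecord<String, String> fromJson() {
        JSONObject json = new JSONObject();
        json.put("timestamp", System.currentTimeMillis());
        json.put("operation", 1);  // update
        // Key and value are supplied by the caller; only timestamp and operation come from the JSON.
        return new DataRecord<>("topicA", "some value", json);
    }
}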

View File

@@ -0,0 +1,50 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.cache;
import java.util.ArrayList;
import java.util.List;
/**
* DataProvider fetch data result.
*/
public class Dataset {
//all the entries should be sorted by timestamp
private List<DataRecord> entries = new ArrayList<>();
// the endTime passed to fetchData
private long timestamp;
public Dataset(long timestamp) {
this.timestamp = timestamp;
}
public Dataset(List<DataRecord> entries, long timestamp) {
this.entries = entries;
this.timestamp = timestamp;
}
public List<DataRecord> getEntries() {
return entries;
}
public long getTimestamp() {
return timestamp;
}
}

View File

@@ -0,0 +1,101 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.cache;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher;
import org.apache.zookeeper.data.Stat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
/**
* This class watches for node membership changes.
* It selects the first node as the DataCache leader.
*/
class LeaderWatcher implements Watcher {
private static final Logger log = LoggerFactory.getLogger(LeaderWatcher.class);
final private ZkUtil zkUtil;
final private String nodeId;
final private String path;
private boolean isLeader;
private boolean stop;
public LeaderWatcher(ZkUtil zkUtil, DataCache dataCache) throws Exception {
this.zkUtil = zkUtil;
this.nodeId = String.valueOf(dataCache.getNodeId());
this.path = zkUtil.getUncommitTimestampParentPath(dataCache.getName());
watch();
}
public void watch() throws Exception {
Stat exists = zkUtil.getZooKeeper().exists(path, this);
if (exists != null) {
zkUtil.getZooKeeper().getChildren(path, this);
}
}
public boolean isLeader() {
return isLeader;
}
@Override
public void process(WatchedEvent event) {
try {
if (event.getType() == Event.EventType.NodeChildrenChanged) {
List<String> currentChilds = zkUtil.getZooKeeper().getChildren(path, this);
log.info("zkpath: {} child changed. value: {}.", path, currentChilds);
if (currentChilds.isEmpty()) {
log.error("zkpath: {} unkown exception", path);
return;
}
String id = currentChilds.get(0);
if (nodeId.equals(id) && !isLeader) {
isLeader = true;
log.info("zkpath: {} node {} become leader.", path, nodeId);
}
if (!nodeId.equals(id) && isLeader) {
isLeader = false;
log.info("zkpath: {} node {} become not leader.", path, nodeId);
}
} else if (event.getType() != Event.EventType.None && event.getType() != Event.EventType.ChildWatchRemoved) {
zkUtil.getZooKeeper().getChildren(path, this);
}
} catch (KeeperException.SessionExpiredException | KeeperException.ConnectionLossException e) {
log.info("ZooKeeper is lost connection", e);
} catch (Exception e) {
log.error("internal zk error: ", e);
}
}
public void unwatch() {
try {
zkUtil.getZooKeeper().removeWatches(path, this, WatcherType.Children, true);
} catch (Exception e) {
log.error("internal zk error: ", e);
}
}
}

View File

@@ -0,0 +1,89 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.cache;
import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.concurrent.atomic.AtomicBoolean;
/**
* This class watches for reload status changes.
* If the DataCache has a problem and cannot sync data normally,
* a user can write a node id to the zookeeper path to force the DataCache to reload;
* when the watcher receives the event it triggers the reload on the matching node.
*/
class ReloadWatcher implements Watcher {
private static final Logger log = LoggerFactory.getLogger(ReloadWatcher.class);
final private ZkUtil zkUtil;
final private DataCache dataCache;
final private String path;
private AtomicBoolean isReloading = new AtomicBoolean();
public ReloadWatcher(ZkUtil zkUtil, DataCache dataCache) throws Exception {
this.zkUtil = zkUtil;
this.path = zkUtil.getReloadPath(dataCache.getName());
this.dataCache = dataCache;
watch();
}
public void watch() throws Exception {
zkUtil.getZooKeeper().exists(path, this);
}
@Override
public void process(WatchedEvent event) {
try {
if (event.getType() == Event.EventType.NodeDataChanged || event.getType() == Event.EventType.NodeCreated) {
String path = event.getPath();
int nodeId = Integer.parseInt(new String(zkUtil.getZooKeeper().getData(path, this, null)));
if (nodeId < 0) {
log.error("zkpath: {} invalid value: {}.", path, nodeId);
return;
}
if (!isReloading.compareAndSet(false, true)) {
return;
}
log.info("zkpath: {} value changed. value: {}.", path, nodeId);
if (nodeId == dataCache.getNodeId()) {
dataCache.reload();
}
} else if (event.getType() != Event.EventType.None && event.getType() != Event.EventType.ChildWatchRemoved) {
zkUtil.getZooKeeper().exists(path, this);
}
} catch (Exception e) {
log.error("internal zk error: ", e);
} finally {
isReloading.set(false);
}
}
public void unwatch() {
try {
zkUtil.getZooKeeper().removeWatches(path, this, WatcherType.Data, true);
} catch (Exception e) {
log.error("internal zk error: ", e);
}
}
}

View File

@@ -0,0 +1,37 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.cache;
import org.apache.zookeeper.ZooKeeper;
/**
* Factory interface used by ZkUtil to obtain the shared ZooKeeper client.
* All DataCache instances share the connection created through this interface,
* and ZkUtil calls it again whenever the connection needs to be refreshed.
*/
public interface ZkClientCreator {
public ZooKeeper zookeeper();
}
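Not part of the diff: a minimal ZkClientCreator sketch. The connect string and session timeout are assumptions (the connect string reuses the zookeeper.connect value from the server.properties diff above).
import com.didichuxing.datachannel.kafka.cache.ZkClientCreator;
import org.apache.zookeeper.ZooKeeper;

public class SimpleZkClientCreator implements ZkClientCreator {
    @Override
    public ZooKeeper zookeeper() {
        try {
            // 30s session timeout; connection-state events are ignored here for brevity.
            return new ZooKeeper("10.179.162.171:2181/yh2", 30_000, event -> { });
        } catch (Exception e) {
            throw new RuntimeException("failed to connect to zookeeper", e);
        }
    }
}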

View File

@@ -0,0 +1,287 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.cache;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.ZooDefs;
import org.apache.zookeeper.ZooKeeper;
import org.apache.zookeeper.data.Stat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.lang.reflect.Method;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
/**
* This class is used to manage DataCache metadata in zookeeper.
* It contains the uncommit timestamp, the commit timestamp and the reload flag.
* The zookeeper structure:
* /DataCache/ root path for all caches
* /DataCache/{name} root path of one cache
* /DataCache/{name}/commit stores the commit timestamp
* /DataCache/{name}/sync stores all nodes, each of which identifies a client
* /DataCache/{name}/sync/{nodeId} stores one node's uncommit timestamp
* /DataCache/{name}/reload stores the node id that should be force-reloaded
* name: cache name, nodeId: client node id
*/
public class ZkUtil {
private static final Logger log = LoggerFactory.getLogger(ZkUtil.class);
static final String PATH_SEPRATOR = "/";
static final String ROOT_PATH = "/DataCache";
static final String COMMIT_PATH = "commit";
static final String SYNC_PATH = "sync";
static final String RELOAD_PATH = "reload";
final private ZkClientCreator zkClientCreator;
private ZooKeeper zooKeeper = null;
private List<String> dataCacheWatches = new ArrayList<>();
private String name = "";
public ZkUtil(ZkClientCreator zkClientCreator) {
//init the zookeeper connection; all caches share this connection
this.zkClientCreator = zkClientCreator;
refreshzookeeper();
}
private String getRootPath(String name) {
return ROOT_PATH + PATH_SEPRATOR + name;
}
public String getCommitTimestampPath(String name) {
return ROOT_PATH + PATH_SEPRATOR + name + PATH_SEPRATOR + COMMIT_PATH;
}
private String getUncommitTimestampPath(int id, String name) {
return ROOT_PATH + PATH_SEPRATOR + name + PATH_SEPRATOR + SYNC_PATH + PATH_SEPRATOR + id;
}
public String getUncommitTimestampParentPath(String name) {
return ROOT_PATH + PATH_SEPRATOR + name + PATH_SEPRATOR + SYNC_PATH;
}
public String getReloadPath(String name) {
return ROOT_PATH + PATH_SEPRATOR + name + PATH_SEPRATOR + RELOAD_PATH;
}
private void setCommitTimestamp(String name, long timestamp) throws Exception {
//set commit timestamp to zk path: /DataCache/{name}/commit
String commitTimestampPath = getCommitTimestampPath(name);
Stat node = zooKeeper.exists(commitTimestampPath, false);
if (node == null) {
zooKeeper.create(commitTimestampPath, String.valueOf(timestamp).getBytes(StandardCharsets.UTF_8),
ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
} else {
zooKeeper.setData(commitTimestampPath,
String.valueOf(timestamp).getBytes(StandardCharsets.UTF_8), node.getVersion());
}
log.info("DataCache {} set commit timestmap: {}", name, timestamp);
}
private long getTimestamp(String path) throws Exception {
Stat node = zooKeeper.exists(path, false);
if (node == null) {
return 0;
} else {
return Long.parseLong(new String(zooKeeper.getData(path, false, node)));
}
}
private long getUncommitTimestamp(int id, String name) throws Exception {
//read uncommit timestamp to zk path: /DataCache/{name}/sync/{nodeId}
String uncommitTimestampPath = getUncommitTimestampPath(id, name);
long timestamp = getTimestamp(uncommitTimestampPath);
log.debug("DataCache {} get uncommit timestmap: {}", name, timestamp);
return timestamp;
}
public void updateCommitTimestamp(int id, String name, long uncommitTimestamp,
long lastCommitTimestamp) {
//lookup the minimal uncommit timestamp from all the node.
// and set the commit timestamp to the minimal value.
try {
String path = getUncommitTimestampParentPath(name);
List<String> list = zooKeeper.getChildren(path, false);
if (list.isEmpty()) {
log.error("DataCache {} update commit timestmap exception: no nodes found ", name);
return;
}
long commitTimestamp = uncommitTimestamp;
if (list.size() > 1) {
for (String entry : list) {
int nodeId = Integer.parseInt(entry);
if (nodeId == id) {
continue;
}
long timestamp = getUncommitTimestamp(nodeId, name);
if (timestamp < commitTimestamp) {
commitTimestamp = timestamp;
}
}
}
if (commitTimestamp > lastCommitTimestamp) {
log.info("DataCache {} update commit timestmap: {}", name, commitTimestamp);
setCommitTimestamp(name, commitTimestamp);
}
} catch (Exception e) {
log.error("DataCache {} update commit timestmap exception: {} ", name, e);
}
}
public void initCache(int id, String name) throws Exception {
try {
log.info("DataCache {} node {} init cache.", name, id);
dataCacheWatches.add("/DataCache/" + name + "/sync");
dataCacheWatches.add("/DataCache/" + name + "/commit");
dataCacheWatches.add("/DataCache/" + name + "/reload");
this.name = name;
//init root
if (zooKeeper.exists(ROOT_PATH, false) == null) {
try {
zooKeeper.create(ROOT_PATH, "".getBytes(StandardCharsets.UTF_8), ZooDefs.Ids.OPEN_ACL_UNSAFE,
CreateMode.PERSISTENT);
} catch (KeeperException.NodeExistsException exception) {
Thread.sleep(1000);
}
}
// init commit time
long timestatmp = System.currentTimeMillis();
String cacheRootPath = getRootPath(name);
if (zooKeeper.exists(cacheRootPath, false) == null) {
try {
zooKeeper.create(cacheRootPath, "".getBytes(StandardCharsets.UTF_8), ZooDefs.Ids.OPEN_ACL_UNSAFE,
CreateMode.PERSISTENT);
} catch (KeeperException.NodeExistsException exception) {
Thread.sleep(1000);
}
}
String uncommitTimestampParentPath = getUncommitTimestampParentPath(name);
if (zooKeeper.exists(uncommitTimestampParentPath, false) == null) {
try {
zooKeeper.create(uncommitTimestampParentPath,
"".getBytes(StandardCharsets.UTF_8), ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
} catch (KeeperException.NodeExistsException exception) {
Thread.sleep(1000);
}
}
//no nodes are connected; reset the commit time
List<String> list = zooKeeper.getChildren(uncommitTimestampParentPath, false);
if (list.isEmpty()) {
setCommitTimestamp(name, timestatmp);
} else {
if (getCommitTimestamp(name) == 0) {
setCommitTimestamp(name, timestatmp);
}
}
String uncommitTimestampPath = getUncommitTimestampPath(id, name);
if (zooKeeper.exists(uncommitTimestampPath, false) != null) {
throw new CacheException(String.format("DataCache %s node %d node Id is dupelicate name", name, id));
} else {
//avoid the commit time changing while the DataCache is loading
setUncommentTimestamp(id, name, timestatmp);
}
log.info("DataCache {} node {} init cache finished.", name, id);
} catch (CacheException e) {
throw e;
} catch (Exception e) {
log.error("DataCache {} node {} init cache exception {}: ", name, id, e);
throw new CacheException(String.format("DataCache {} init cache exception: %s", name, e.getMessage()));
}
}
public void setUncommentTimestamp(int id, String name, long uncommitTimestamp) {
//write uncommit timestamp to zk path: /DataCache/{name}/sync/{nodeId}
try {
long oldUncommitTimestamp = getUncommitTimestamp(id, name);
if (oldUncommitTimestamp < uncommitTimestamp) {
String uncommitTimestampPath = getUncommitTimestampPath(id, name);
Stat node = zooKeeper.exists(uncommitTimestampPath, false);
if (node == null) {
zooKeeper.create(uncommitTimestampPath, String.valueOf(uncommitTimestamp).getBytes(StandardCharsets.UTF_8),
ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
} else {
zooKeeper.setData(uncommitTimestampPath,
String.valueOf(uncommitTimestamp).getBytes(StandardCharsets.UTF_8), node.getVersion());
}
log.info("DataCache {} node {} set uncommit timestamp: {}", name, id, uncommitTimestamp);
}
} catch (Exception e) {
log.error("DataCache {} node {} set uncommit time exception: {}", name, id, e);
}
}
public long getCommitTimestamp(String name) {
//get commit timestamp to zk path: /DataCache/{name}/commit
try {
String commitTimestampPath = getCommitTimestampPath(name);
long timestamp = getTimestamp(commitTimestampPath);
log.debug("DataCache {} get commit timestamp: {}", name, timestamp);
return timestamp;
} catch (Exception e) {
log.error("DataCache {} get commit timestamp exception: {}", name, e);
}
return 0;
}
public boolean isAvailable() {
return zooKeeper.getState() == ZooKeeper.States.CONNECTED;
}
public boolean isExistWatches() {
try {
List<String> allWatches = new ArrayList<>();
Method dataWatchesMethod = zooKeeper.getClass().getDeclaredMethod("getDataWatches");
dataWatchesMethod.setAccessible(true);
allWatches.addAll((List) dataWatchesMethod.invoke(zooKeeper));
Method existWatchesMethod = zooKeeper.getClass().getDeclaredMethod("getExistWatches");
existWatchesMethod.setAccessible(true);
allWatches.addAll((List) existWatchesMethod.invoke(zooKeeper));
if (allWatches.containsAll(dataCacheWatches)) {
log.info("DataCache:{} zk contains all watches", this.name);
return true;
}
log.info("DataCache:{} zk not contains all watches", this.name);
} catch (Exception e) {
log.error("DataCache:{} isExistWatches exception: ", this.name, e);
}
return false;
}
public ZooKeeper getZooKeeper() {
return zooKeeper;
}
public void refreshzookeeper() {
if (zooKeeper == null || !isAvailable()) {
zooKeeper = zkClientCreator.zookeeper();
assert zooKeeper != null;
}
}
}

View File

@@ -0,0 +1,33 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.config;
public class GatewayConfigs {
public static final String TOPIC_HEART_BEAT_URL = "/api/v1/heartbeat/survive-user";
public static final String TOPIC_JMX_REPORT_URL = "/api/v1/report/jmx/topics";
public static String getTopicHeartBeatUrl(String prefix) {
return prefix + TOPIC_HEART_BEAT_URL;
}
public static String getTopicJmxReportUrl(String prefix) {
return prefix + TOPIC_JMX_REPORT_URL;
}
}
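Not part of the diff: a quick usage sketch; the gateway address reuses the gateway.url value from the server.properties diff above.
import com.didichuxing.datachannel.kafka.config.GatewayConfigs;

public class GatewayUrls {
    public static void main(String[] args) {
        String gateway = "http://10.179.32.80:8888";  // value of gateway.url in server.properties
        // -> http://10.179.32.80:8888/api/v1/heartbeat/survive-user
        System.out.println(GatewayConfigs.getTopicHeartBeatUrl(gateway));
        // -> http://10.179.32.80:8888/api/v1/report/jmx/topics
        System.out.println(GatewayConfigs.getTopicJmxReportUrl(gateway));
    }
}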

View File

@@ -0,0 +1,125 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.jmx;
import com.didichuxing.datachannel.kafka.metrics.AppIdHostTopicMetrics;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.Set;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
public class JmxConfigManager {
private static final Logger log = LoggerFactory.getLogger(JmxConfigManager.class);
final static private int DATA_SYNC_TIME_MS = 1 * 60 * 1000;
private Set<String> topicSet = null;
private Set<String> metricNameSet = null;
private JmxConfigProvider jmxConfigProvider = null;
private DateFormat dateFormat = new SimpleDateFormat("yy-MM-dd HH:mm:ss");
private DateFormat dayFormat = new SimpleDateFormat("yy-MM-dd");
private final String initTime = "23:00:00";
private JmxConfigManager() {}
public static JmxConfigManager getInstance() {
return JmxConfigManagerHolder.INSTANCE;
}
public void start(String clusterId, ScheduledExecutorService scheduledExecutorService, String gatewayUrl) {
jmxConfigProvider = new JmxConfigProvider(clusterId, gatewayUrl);
topicSet = new ConcurrentSkipListSet<>();
metricNameSet = new ConcurrentSkipListSet<>();
scheduledExecutorService.scheduleWithFixedDelay(() -> {
try {
getJmxConfigTopic();
} catch (Throwable t) {
log.error("Uncaught error from JmxConfigManager-getJmxConfigTopic: ", t);
}
}, DATA_SYNC_TIME_MS, DATA_SYNC_TIME_MS, TimeUnit.MILLISECONDS);
long initDelay = getTimeMillis(initTime) - System.currentTimeMillis();
long oneDay = 24 * 60 * 60 * 1000;
initDelay = initDelay > 0 ? initDelay : initDelay + oneDay;
scheduledExecutorService.scheduleAtFixedRate(() -> {
try {
clearJmxMetric();
} catch (Throwable t) {
log.error("Uncatch error from JmxConfigManager-clearJmxMetric: ", t);
}
}, initDelay, oneDay, TimeUnit.MILLISECONDS);
}
public Set<String> getTopicSet() {
return topicSet;
}
public void addMetricName(String metricName) {
metricNameSet.add(metricName);
}
public void shutdown() {
log.info("JmxConfig Manager shutdown");
}
private static class JmxConfigManagerHolder {
private static final JmxConfigManager INSTANCE = new JmxConfigManager();
}
private void getJmxConfigTopic() {
try {
String jmxConfigTopic = jmxConfigProvider.fetchData();
if (jmxConfigTopic == null || jmxConfigTopic.isEmpty()) {
topicSet.clear();
return;
}
topicSet = new ConcurrentSkipListSet<>(Arrays.asList(jmxConfigTopic.split(",")));
log.debug("TopicSet update success, topics:" + topicSet.toString());
} catch (Exception e) {
log.error("Get jmx config topic error, detail: ", e);
}
}
private void clearJmxMetric() {
Set<String> clearSet = metricNameSet;
metricNameSet = new ConcurrentSkipListSet<>();
clearSet.forEach(AppIdHostTopicMetrics::removeClientMetrics);
clearSet.clear();
}
synchronized private long getTimeMillis(String time) {
try {
Date currentDate = dateFormat.parse(dayFormat.format(new Date()) + " " + time);
return currentDate.getTime();
} catch (ParseException e) {
log.error("Parse date error, initTime: {}", initTime);
}
return 0;
}
}
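Not part of the diff: a hypothetical startup sketch. The cluster id and gateway URL reuse values from the server.properties diff above, and the scheduler is an assumption.
import com.didichuxing.datachannel.kafka.jmx.JmxConfigManager;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;

public class JmxConfigBootstrap {
    public static void main(String[] args) {
        ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(1);
        // Polls the gateway every minute for the topic list and clears stale metrics daily at 23:00.
        JmxConfigManager.getInstance().start("101", scheduler, "http://10.179.32.80:8888");
    }
}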

View File

@@ -0,0 +1,62 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.jmx;
import com.didichuxing.datachannel.kafka.cache.CacheException;
import com.didichuxing.datachannel.kafka.config.GatewayConfigs;
import com.didichuxing.datachannel.kafka.util.HttpUtils;
import com.didichuxing.datachannel.kafka.util.JsonUtils;
import com.didichuxing.datachannel.kafka.util.ResponseCommonResult;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.HashMap;
import java.util.Map;
public class JmxConfigProvider {
private static final Logger log = LoggerFactory.getLogger(JmxConfigProvider.class);
final private String clusterId;
final private String fetchDataUrl;
public JmxConfigProvider(String clusterId, String gatewayUrl) {
this.clusterId = clusterId;
this.fetchDataUrl = GatewayConfigs.getTopicJmxReportUrl(gatewayUrl);
}
public String fetchData() throws Exception {
log.debug("Fetch jmx config start");
Map<String, String> params = new HashMap<>();
params.put("clusterId", clusterId);
ResponseCommonResult httpResult = HttpUtils.get(fetchDataUrl, params, 0);
if (httpResult.getCode() == ResponseCommonResult.FAILED_STATUS) {
throw new Exception(String.format("Http response error, detail: %s", httpResult.toString()));
}
ResponseCommonResult configDataResult = JsonUtils.string2ResponseCommonResult(httpResult.getData().toString());
if (configDataResult.getCode() == ResponseCommonResult.FAILED_STATUS) {
throw new Exception(String.format("Get jmxConfigTopic error, detail: %s", configDataResult.toString()));
}
return configDataResult.getData().toString();
}
}

View File

@@ -0,0 +1,81 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.partition;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.didichuxing.datachannel.kafka.cache.CacheException;
import com.didichuxing.datachannel.kafka.cache.DataProvider;
import com.didichuxing.datachannel.kafka.cache.DataRecord;
import com.didichuxing.datachannel.kafka.cache.Dataset;
import com.didichuxing.datachannel.kafka.util.JsonUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.List;
/*
public class PartitionDataProvider implements DataProvider {
private static final Logger log = LoggerFactory.getLogger(PartitionDataProvider.class);
final static private int FETCDATA_TIMEOUT = 10000;
final static private String API_PATH = "/api/v1/partition/topicPartitions";
final private String clusterId;
final private String fetchDataUrl;
public PartitionDataProvider(String clusterId, String gatewayUrl) {
this.clusterId = clusterId;
this.fetchDataUrl= gatewayUrl + API_PATH;
}
@Override
public Dataset fetchData(long startTime, long endTime) throws Exception {
log.debug("Fetch data start: {} end {}", startTime, endTime);
String req = String.format("{\"clusterId\":%s,\"start\":%d,\"end\":%d}", clusterId, startTime, endTime);
List<DataRecord> entries = new ArrayList<>();
try {
JSONArray topicPartitions = JsonUtils.getJSONArray(fetchDataUrl, req, FETCDATA_TIMEOUT);
for (int i = 0; i < topicPartitions.size(); i++) {
JSONObject jsonTopicPartitions = topicPartitions.getJSONObject(i);
try {
PartitionMappingTable partitionMappingTable = new PartitionMappingTable((jsonTopicPartitions));
DataRecord<String, PartitionMappingTable> dataRecord = new
DataRecord<>(partitionMappingTable.getTopicName(), partitionMappingTable, jsonTopicPartitions);
entries.add(dataRecord);
} catch (IllegalArgumentException e) {
log.error("invalid data {}", jsonTopicPartitions.toJSONString());
}
}
if (topicPartitions.size() > 0) {
log.info("Fetch some new data total {}", topicPartitions.size());
} else {
log.info("No new data in data soucre");
}
} catch (Exception e) {
log.error("Fetch data error: ", e);
throw new CacheException("Fetch Data error " + e.getMessage());
}
Dataset dataset = new Dataset(entries, endTime);
return dataset;
}
}
*/

View File

@@ -0,0 +1,154 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.partition;
import com.alibaba.fastjson.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.List;
public class PartitionMappingTable {
/*
private static final Logger log = LoggerFactory.getLogger(PartitionRouter.class);
static final int DISABLE_PARTITION = -1;
final private String topicName;
private int [] mapToTable;
private int [] mapFromTable;
private int partitionSize;
private List<Integer> disablePartitions = new ArrayList<>();
public PartitionMappingTable(String topicName, List<Integer> disablePartitions) {
this.topicName = topicName;
this.disablePartitions = disablePartitions;
}
public PartitionMappingTable(JSONObject json) {
String topicName = json.getString("topicName");
if (topicName == null || topicName.equals("")) {
throw new IllegalArgumentException("missing topicName");
}
String disablePartitions = json.getString("disablePartitions");
if (disablePartitions == null || disablePartitions.equals("")) {
this.topicName = topicName;
return;
} else {
String []partitionList = disablePartitions.split(",");
if (partitionList.length == 0) {
throw new IllegalArgumentException("invalid disable partitions");
}
for (int i=0; i < partitionList.length; i++) {
Integer partitionValue = Integer.valueOf(partitionList[i]);
if (partitionValue == null) {
throw new IllegalArgumentException("invalid disable partitions");
}
this.disablePartitions.add(partitionValue);
}
}
this.topicName = topicName;
}
synchronized private void initMaps(int size) {
//partitionSize changed by other thread
if (size < partitionSize)
return;
// e.g. if the topic has 4 partitions 0,1,2,3 and partitions 1,2 are disabled:
// mapToTable [0,-1,-1,1]
// mapFromTable [0,3,-1,-1]
int partitions = PartitionRouter.getInstance().getNumberPartition(topicName);
mapToTable = new int[partitions];
mapFromTable = new int[partitions];
for (int i = 0; i < mapToTable.length; i++) {
mapToTable[i] = DISABLE_PARTITION;
mapFromTable[i] = DISABLE_PARTITION;
}
int mappedPartition = 0;
for (int i = 0; i < mapToTable.length; i++) {
if (isDisable(i)) {
continue;
} else {
mapToTable[i] = mappedPartition;
mapFromTable[mappedPartition] = i;
mappedPartition++;
}
}
partitionSize = partitions;
log.info("Init partition mapping table, topic: {}, number partitions: {}, disable partitions: {}",
topicName, partitionSize, disablePartitions);
}
public String getTopicName() {
return topicName;
}
public boolean isDisable(int partion) {
return disablePartitions.contains(partion);
}
public int getMapFromPartition(int partition) {
if (partition >= partitionSize) {
initMaps(partition);
}
return mapFromTable[partition];
}
public int getMapToPartition(int partition) {
if (partition >= partitionSize) {
initMaps(partition);
}
return mapToTable[partition];
}
public List<Integer> getDisablePartitions() {
return disablePartitions;
}
@Override
public boolean equals(Object obj) {
PartitionMappingTable mappingTable = (PartitionMappingTable)obj;
if (!topicName.equals(mappingTable.topicName)) {
return false;
}
if (mapToTable.length != mappingTable.mapToTable.length) {
return false;
}
for (int i=0; i<mapToTable.length; i++) {
if (mapToTable[i] != mappingTable.mapToTable[i]) {
return false;
}
}
return true;
}
@Override
public String toString() {
return String.format("topic: %s ,disable partitions: %s", topicName, disablePartitions);
}
*/
}
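Since the class above is commented out in this diff, here is a small standalone sketch (not the class itself) that reproduces the mapToTable/mapFromTable construction for the example in the comments: a topic with 4 partitions where partitions 1 and 2 are disabled.

import java.util.Arrays;
import java.util.List;

public class PartitionMappingSketch {
    static final int DISABLE_PARTITION = -1;

    public static void main(String[] args) {
        int partitions = 4;
        List<Integer> disablePartitions = Arrays.asList(1, 2);

        int[] mapToTable = new int[partitions];
        int[] mapFromTable = new int[partitions];
        Arrays.fill(mapToTable, DISABLE_PARTITION);
        Arrays.fill(mapFromTable, DISABLE_PARTITION);

        // mapToTable renumbers a real partition i to the client-visible partition;
        // mapFromTable reverses the mapping so produce requests land on the real partition.
        int mappedPartition = 0;
        for (int i = 0; i < partitions; i++) {
            if (disablePartitions.contains(i)) {
                continue;
            }
            mapToTable[i] = mappedPartition;
            mapFromTable[mappedPartition] = i;
            mappedPartition++;
        }
        System.out.println(Arrays.toString(mapToTable));   // [0, -1, -1, 1]
        System.out.println(Arrays.toString(mapFromTable)); // [0, 3, -1, -1]
    }
}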

View File

@@ -0,0 +1,283 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.partition;
import com.didichuxing.datachannel.kafka.cache.DataCache;
import com.didichuxing.datachannel.kafka.cache.DataProvider;
import com.didichuxing.datachannel.kafka.cache.ZkUtil;
import kafka.server.MetadataCache;
import org.apache.kafka.common.Node;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.protocol.Errors;
import org.apache.kafka.common.record.MemoryRecords;
import org.apache.kafka.common.requests.MetadataResponse;
import org.apache.kafka.common.requests.ProduceResponse;
import org.apache.zookeeper.ZooKeeper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
import java.util.concurrent.ScheduledExecutorService;
public class PartitionRouter {
/*
private static final Logger log = LoggerFactory.getLogger(PartitionRouter.class);
final private String CACHE_NAME = "kafka_partition";
//metadata cache used to get the number of partitions in a topic.
private MetadataCache metadataCache;
//maps a topic name to its partition mapping table.
private DataCache<String, PartitionMappingTable> cache = null;
private PartitionRouter(){}
static public PartitionRouter getInstance() {
return PartitionRouterHolder.INSTANCE;
}
public void start(String clusterId, int brokerId, ZooKeeper zooKeeper, MetadataCache metadataCache,
ScheduledExecutorService scheduledExecutorService, String gatewayUrl) throws Exception {
log.info("Partition Router startup");
ZkUtil zkUtil = new ZkUtil(zooKeeper);
DataProvider dataProvider = new PartitionDataProvider(clusterId, gatewayUrl);
cache = new DataCache<>(CACHE_NAME, brokerId, dataProvider, scheduledExecutorService,
zkUtil, 60000, 4*3600*1000);
this.metadataCache = metadataCache;
}
public void shutdown() {
log.info("Partition Router shutdown");
if (cache != null) {
cache.stop();
}
}
public MetadataResponse updateMetadataResponse(MetadataResponse response, int controllerId, int version) {
// this function updates the metadata response to route around the disabled partitions
boolean changed = false;
Collection<MetadataResponse.TopicMetadata> topicMetadatas = response.topicMetadata();
//look up all topics
for (MetadataResponse.TopicMetadata topicMetadata : topicMetadatas) {
changed = updateTopicMetaData(topicMetadata) | changed;
}
//if an update is needed, reconstruct the response
if (changed) {
return new MetadataResponse((List<Node>)response.brokers(), response.clusterId(),
controllerId, (List<MetadataResponse.TopicMetadata>)response.topicMetadata(), version);
}
return response;
}
private boolean updateTopicMetaData(MetadataResponse.TopicMetadata topicMetadata) {
if (topicMetadata.error() != Errors.NONE) {
return false;
}
//get partition mapping table.
PartitionMappingTable partitionMappingTable = cache.get(topicMetadata.topic());
if (partitionMappingTable == null) {
//no disabled partitions
return false;
}
List<MetadataResponse.PartitionMetadata> partitionMetadatas = topicMetadata.partitionMetadata();
List<MetadataResponse.PartitionMetadata> newPartitionMetadatas = new ArrayList<>();
//look up all partitions
boolean replace = false;
for (MetadataResponse.PartitionMetadata partitionMetadata : partitionMetadatas) {
replace = replace | updatePartitionMetaData(partitionMetadata, partitionMappingTable, newPartitionMetadatas);
}
// if there are disabled partitions, rebuild the topic metadata
if (replace) {
topicMetadata.partitionMetadata().clear();
if (!newPartitionMetadatas.isEmpty()) {
topicMetadata.partitionMetadata().addAll(newPartitionMetadatas);
} else {
log.error("Topic has not enable partitions: {}", topicMetadata.topic());
}
return true;
}
return false;
}
private boolean updatePartitionMetaData(MetadataResponse.PartitionMetadata partitionMetadata,
PartitionMappingTable partitionMappingTable,
List<MetadataResponse.PartitionMetadata> newPartitionMetadatas) {
if (partitionMetadata.error() != Errors.NONE) {
return false;
}
//partition is disabled
int partition = partitionMetadata.partition();
if (isDisable(partition, partitionMappingTable)) {
log.debug("Route topicpartition: {}:{} to disabled",
partitionMappingTable.getTopicName(), partitionMetadata.partition());
return true;
}
Node leaderNode = partitionMetadata.leader();
int newPartition = partitionMappingTable.getMapToPartition(partitionMetadata.partition());
//partition needs to be routed
if (newPartition != partitionMetadata.partition()) {
log.debug("Route topicpartition: {}:{} to {}", partitionMappingTable.getTopicName(),
partitionMetadata.partition(), newPartition);
//construct new partition metadata.
MetadataResponse.PartitionMetadata newPartitionMetadata = new MetadataResponse.PartitionMetadata(
partitionMetadata.error(), newPartition, leaderNode, partitionMetadata.replicas(),
partitionMetadata.isr());
newPartitionMetadatas.add(newPartitionMetadata);
} else {
newPartitionMetadatas.add(partitionMetadata);
}
return true;
}
public boolean updateProduceRequest(Map<TopicPartition, MemoryRecords> requestData,
HashMap<String, PartitionMappingTable> partitionMappingTables) {
Map<TopicPartition, MemoryRecords> topicPartitionMemoryRecordsMap = requestData;
//look up all TopicPartitions.
Iterator<Map.Entry<TopicPartition, MemoryRecords>> iterator = topicPartitionMemoryRecordsMap.entrySet().iterator();
Map<TopicPartition, MemoryRecords> newEntries = new HashMap<>();
while(iterator.hasNext()) {
Map.Entry<TopicPartition, MemoryRecords> entry = iterator.next();
TopicPartition topicPartition = entry.getKey();
//mapping table is present only for topics that have disabled partitions
PartitionMappingTable partitionMappingTable = partitionMappingTables.get(topicPartition.topic());
int fromPartition = partitionMappingTable.getMapFromPartition(topicPartition.partition());
//restore the original partition and update the request
if (fromPartition != topicPartition.partition()) {
log.debug("Restore topicpartition: {}:{} to {}", partitionMappingTable.getTopicName(),
fromPartition, topicPartition.partition());
TopicPartition realTopicPartion = new TopicPartition(topicPartition.topic(), fromPartition);
MemoryRecords memoryRecords = entry.getValue();
iterator.remove();
newEntries.put(realTopicPartion, memoryRecords);
}
}
topicPartitionMemoryRecordsMap.putAll(newEntries);
return !newEntries.isEmpty();
}
public ProduceResponse updateProduceRespones(ProduceResponse response, int version,
HashMap<String, PartitionMappingTable> partitionMappingTables) {
Map<TopicPartition, ProduceResponse.PartitionResponse> responses = response.responses();
Map<TopicPartition, ProduceResponse.PartitionResponse> newResponses = new HashMap<>();
//look up all topic partitions
Iterator<Map.Entry<TopicPartition, ProduceResponse.PartitionResponse>> iterator = responses.entrySet().iterator();
while(iterator.hasNext()) {
Map.Entry<TopicPartition, ProduceResponse.PartitionResponse> entry = iterator.next();
TopicPartition topicPartition = entry.getKey();
ProduceResponse.PartitionResponse partitionResponse = entry.getValue();
PartitionMappingTable partitionMappingTable = partitionMappingTables.get(topicPartition.topic());
updatePartitionResponse(topicPartition, partitionResponse, responses, newResponses, partitionMappingTable);
}
//if there are disabled partitions, reconstruct the response
if (!newResponses.isEmpty()) {
return new ProduceResponse(newResponses, response.getThrottleTime(), version);
}
return response;
}
private void updatePartitionResponse(TopicPartition topicPartition,
ProduceResponse.PartitionResponse partitionResponse,
Map<TopicPartition, ProduceResponse.PartitionResponse> responses,
Map<TopicPartition, ProduceResponse.PartitionResponse> newResponses,
PartitionMappingTable partitionMappingTable) {
//no disabled partitions
if (partitionMappingTable == null) {
return;
}
int toPartition = partitionMappingTable.getMapToPartition(topicPartition.partition());
//partition was routed, update the response
if (toPartition != topicPartition.partition()) {
log.debug("Route topicpartition: {}:{} to {}", partitionMappingTable.getTopicName(),
topicPartition.partition(), toPartition);
TopicPartition realTopicPartion = new TopicPartition(topicPartition.topic(), toPartition);
if (newResponses.isEmpty()) {
newResponses.putAll(responses);
}
newResponses.remove(topicPartition);
newResponses.put(realTopicPartion, partitionResponse) ;
}
}
public boolean getTopicPartitionIsDisable(TopicPartition topicPartition,
HashMap<String, PartitionMappingTable> partitionMappingTables) {
PartitionMappingTable partitionMappingTable = partitionMappingTables.get(topicPartition.topic());
if (partitionMappingTable == null || partitionMappingTable.getDisablePartitions().isEmpty()) {
return false;
}
return partitionMappingTable.getMapFromPartition(topicPartition.partition()) == -1;
}
public HashMap<String, PartitionMappingTable> getPartitionMappingTables(Set<TopicPartition> topicPartitions) {
HashMap<String,PartitionMappingTable> partitionMappingTables = new HashMap<>();
for (TopicPartition topicPartition: topicPartitions ) {
PartitionMappingTable partitionMappingTable = cache.get(topicPartition.topic());
if (partitionMappingTable != null && !partitionMappingTable.getDisablePartitions().isEmpty()) {
partitionMappingTables.put(topicPartition.topic(), partitionMappingTable);
}
}
return partitionMappingTables;
}
public int getNumberPartition(String topic) {
return metadataCache.getPartitionNumber(topic);
}
private boolean isDisable(int partion, PartitionMappingTable partitionMappingTable) {
return partitionMappingTable.isDisable(partion);
}
private static class PartitionRouterHolder{
private static final PartitionRouter INSTANCE = new PartitionRouter();
}
public class PartitionStatus {
private PartitionMappingTable mappingTable;
private boolean disable;
public PartitionStatus(PartitionMappingTable mappingTable, boolean disable) {
this.mappingTable = mappingTable;
this.disable = disable;
}
public PartitionMappingTable getMappingTable() {
return mappingTable;
}
public boolean isDisable() {
return disable;
}
}
*/
}

View File

@@ -17,7 +17,11 @@
package com.didichuxing.datachannel.kafka.report;
import com.alibaba.fastjson.JSON;
import com.didichuxing.datachannel.kafka.config.GatewayConfigs;
import com.didichuxing.datachannel.kafka.security.authorizer.SessionManager;
import com.didichuxing.datachannel.kafka.util.HttpUtils;
import com.didichuxing.datachannel.kafka.util.ResponseCommonResult;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -72,17 +76,20 @@ public class SessionReport {
Map<String, Map<String, List<String>>> resultMap = new HashMap<>();
resultMap.put("produce", topicProduceUser);
resultMap.put("fetch", topicFetchUser);
// String result = JSON.toJSONString(resultMap);
log.debug("Session report: {}", resultMap);
return "";
String result = JSON.toJSONString(resultMap);
log.debug("Session report: {}", result);
return result;
}
public void sendTopicHeartBeat(String clusterId, int brokerId) {
Map<String, String> paramMap = new HashMap<>();
paramMap.put("clusterId", clusterId);
paramMap.put("brokerId", String.valueOf(brokerId));
String data = getTopicHeartBeat();
String url = GatewayConfigs.getTopicHeartBeatUrl(topicHeartBeatUrlPrefix);
ResponseCommonResult response = HttpUtils.post(url, paramMap, data.getBytes(), 0);
if (response.getCode() != ResponseCommonResult.SUCCESS_STATUS) {
log.error("Report topic heart beat failed, detail: {}", response.toString());
}
}
public void shutdown() {

View File

@@ -0,0 +1,91 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.security.authorizer;
import com.alibaba.fastjson.JSONObject;
import java.util.ArrayList;
import java.util.List;
class AccessKey {
final private String topicName;
final private String userName;
final private Operation operation;
public AccessKey(String topicName, String userName, Operation operation) {
this.topicName = topicName;
this.userName = userName;
this.operation = operation;
}
static public List<AccessKey> createAccessKeysfromJson(JSONObject json) {
List<AccessKey> accessKeys = new ArrayList<>();
String topicName = json.getString("topicName");
if (topicName == null || topicName.equals("")) {
throw new IllegalArgumentException("missing topicName");
}
String userName = json.getString("username");
if (userName == null || userName.equals("")) {
throw new IllegalArgumentException("missing username");
}
Integer access = json.getInteger("access");
if (access == null) {
throw new IllegalArgumentException("missing access");
}
switch (access) {
case 0:
break;
case 1:
accessKeys.add(new AccessKey(topicName, userName, Operation.Read));
break;
case 2:
accessKeys.add(new AccessKey(topicName, userName, Operation.Write));
break;
case 3:
accessKeys.add(new AccessKey(topicName, userName, Operation.Read));
accessKeys.add(new AccessKey(topicName, userName, Operation.Write));
break;
default:
throw new IllegalArgumentException("missing operation");
}
return accessKeys;
}
@Override
public boolean equals(Object obj) {
AccessKey accessKey = (AccessKey)obj;
return this.topicName.equals(accessKey.topicName) &&
this.userName.equals(accessKey.userName) &&
this.operation.equals(accessKey.operation);
}
@Override
public int hashCode() {
return topicName.hashCode() + userName.hashCode() + operation.ordinal() * 4751;
}
@Override
public String toString() {
return String.format("username: %s, toppic: %s, operation: %s", userName, topicName, operation);
}
}
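A brief sketch of how the access field above expands into AccessKey entries (0 = none, 1 = read, 2 = write, 3 = read and write); the JSON values below are illustrative placeholders.

package com.didichuxing.datachannel.kafka.security.authorizer;

import com.alibaba.fastjson.JSONObject;
import java.util.List;

public class AccessKeyExample {
    public static void main(String[] args) {
        JSONObject acl = new JSONObject();
        acl.put("topicName", "test");
        acl.put("username", "kafka");
        acl.put("access", 3); // read and write
        // Expands into two keys: (test, kafka, Read) and (test, kafka, Write).
        List<AccessKey> keys = AccessKey.createAccessKeysfromJson(acl);
        keys.forEach(System.out::println);
    }
}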

View File

@@ -0,0 +1,127 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.security.authorizer;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.didichuxing.datachannel.kafka.cache.CacheException;
import com.didichuxing.datachannel.kafka.cache.DataProvider;
import com.didichuxing.datachannel.kafka.cache.DataRecord;
import com.didichuxing.datachannel.kafka.cache.Dataset;
import com.didichuxing.datachannel.kafka.util.JsonUtils;
import joptsimple.internal.Strings;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.List;
class AclDataProvider implements DataProvider {
private static final Logger log = LoggerFactory.getLogger(AclDataProvider.class);
final static private int FETCDATA_TIMEOUT = 10000;
final static private String API_PATH = "/api/v1/security/acls";
final private String clusterId;
final private String fetchDataUrl;
final private List<DataRecord<AccessKey, AccessStatus>> defualtAcls = new ArrayList<>();
public AclDataProvider(String clusterId, String gatewayUrl, List<String> acls) {
this.clusterId = clusterId;
this.fetchDataUrl = gatewayUrl + API_PATH;
if (acls != null) {
for (String acl : acls) {
var record = createAclRecord(acl);
if (record != null) {
defualtAcls.add(record);
}
}
}
}
@Override
public Dataset fetchData(long startTime, long endTime) throws Exception {
log.debug("Fetch data start: {} end {}", startTime, endTime);
//send request to kafka gateway
String req = String.format("{\"clusterId\":%s,\"start\":%d,\"end\":%d}", clusterId, startTime, endTime);
List<DataRecord> entries = new ArrayList<>();
if (startTime == 0) {
entries.addAll(defualtAcls);
}
JSONArray acls = JsonUtils.getJSONArray(fetchDataUrl, req, FETCDATA_TIMEOUT);
for (int i = 0; i < acls.size(); i++) {
JSONObject jsonAcl = acls.getJSONObject(i);
try {
List<AccessKey> accessKeys = AccessKey.createAccessKeysfromJson(jsonAcl);
for (AccessKey accessKey : accessKeys) {
DataRecord<AccessKey, AccessStatus> dataRecord = new
DataRecord<>(accessKey, AccessStatus.Allow, jsonAcl);
entries.add(dataRecord);
}
} catch (IllegalArgumentException e) {
log.error("invalid data {}", acls.toJSONString());
}
}
if (acls.size() > 0) {
log.info("Fetch some new data total {}", acls.size());
} else {
log.info("No new data in data soucre");
}
return new Dataset(entries, endTime);
}
DataRecord<AccessKey, AccessStatus> createAclRecord(String acl) {
//acl string should be topic:user:operation:status
//operation int: 1(read) 2(write)
//status int: 0(deny), 1(allow)
try {
String[] entry = acl.split(":");
if (entry.length != 4) {
throw new IllegalArgumentException("invalid acl string");
}
String topic = entry[0];
if (Strings.isNullOrEmpty(topic)) {
throw new IllegalArgumentException("missing topicName");
}
String user = entry[1];
if (Strings.isNullOrEmpty(user)) {
throw new IllegalArgumentException("missing user");
}
Operation operation = Operation.from(entry[2]);
if (operation == Operation.Other) {
throw new IllegalArgumentException("invalid operation");
}
AccessStatus access = AccessStatus.from(entry[3]);
if (access == AccessStatus.Continue) {
throw new IllegalArgumentException("invalid operation");
}
return new DataRecord<>(new AccessKey(topic, user, operation),
access, DataRecord.Operation.create, 0);
} catch (Exception e) {
log.error("parsing acl string error: ", e);
return null;
}
}
}
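For the default acl strings accepted by the constructor above, a hedged example of the topic:user:operation:status form (the topic, user, and gateway URL are placeholders):

package com.didichuxing.datachannel.kafka.security.authorizer;

import java.util.Arrays;
import java.util.List;

public class AclDataProviderExample {
    public static void main(String[] args) {
        // operation: 1 (read) or 2 (write); status: 0 (deny) or 1 (allow).
        List<String> defaultAcls = Arrays.asList(
                "test:kafka:1:1",   // allow user kafka to read topic test
                "test:kafka:2:1");  // allow user kafka to write topic test
        AclDataProvider provider = new AclDataProvider("1", "http://gateway.example.com", defaultAcls);
        // fetchData(0, endTime) seeds the cache with these defaults before
        // merging any acls returned by the gateway.
    }
}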

View File

@@ -0,0 +1,264 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.security.authorizer;
import com.didichuxing.datachannel.kafka.cache.DataCache;
import com.didichuxing.datachannel.kafka.cache.DataProvider;
import com.didichuxing.datachannel.kafka.cache.ZkUtil;
import kafka.network.RequestChannel;
import kafka.security.auth.Acl;
import kafka.security.auth.Authorizer;
import kafka.zk.KafkaZkClient;
import org.apache.kafka.common.security.auth.KafkaPrincipal;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.collection.immutable.Map;
import scala.collection.immutable.Set;
import java.util.List;
import java.util.concurrent.ScheduledExecutorService;
/**
* This class is used to check access control for every request.
* It uses DataCache to cache all the acls.
*/
public class DidiAuthorizer implements Authorizer {
private static final Logger log = LoggerFactory.getLogger("kafka.authorizer.logger");
private static final Logger userErrorlog = LoggerFactory.getLogger("userError");
final private String CACHE_NAME = "kafka_authorizer";
final private String WIDECARD = "*";
//access control map for normal user.
final private AccessStatus[][] ALLOW_USER_OPERATION_RESOURCE =
new AccessStatus[Operation.Other.ordinal() + 1][Resource.Type.Other.ordinal() + 1];
private DataCache<AccessKey, AccessStatus> aclCache = null;
public DidiAuthorizer() {
//init access control map. default is deny.
//Allow get topic metadata
ALLOW_USER_OPERATION_RESOURCE[Operation.Describe.ordinal()][Resource.Type.Topic.ordinal()] = AccessStatus.Allow;
//allow transaction
ALLOW_USER_OPERATION_RESOURCE[Operation.Describe.ordinal()][Resource.Type.TransactionalId.ordinal()] = AccessStatus.Allow;
ALLOW_USER_OPERATION_RESOURCE[Operation.Write.ordinal()][Resource.Type.TransactionalId.ordinal()] = AccessStatus.Allow;
//allow idempotent write
ALLOW_USER_OPERATION_RESOURCE[Operation.IdempotentWrite.ordinal()][Resource.Type.Cluster.ordinal()] = AccessStatus.Allow;
//Allow produce data
ALLOW_USER_OPERATION_RESOURCE[Operation.Write.ordinal()][Resource.Type.Topic.ordinal()] = AccessStatus.Continue;
//Allow consume data
ALLOW_USER_OPERATION_RESOURCE[Operation.Read.ordinal()][Resource.Type.Topic.ordinal()] = AccessStatus.Continue;
//allow consumer group
ALLOW_USER_OPERATION_RESOURCE[Operation.Read.ordinal()][Resource.Type.Group.ordinal()] = AccessStatus.Allow;
ALLOW_USER_OPERATION_RESOURCE[Operation.Describe.ordinal()][Resource.Type.Group.ordinal()] = AccessStatus.Allow;
}
public void start(String clusterId, int brokerId, KafkaZkClient zkClient,
ScheduledExecutorService scheduledExecutorService, String gatewayUrl, List<String> defaultAcls) {
log.info("Didi Authorizer startup");
ZkUtil zkUtil = new ZkUtil(zkClient::currentZooKeeper);
DataProvider dataProvider = new AclDataProvider(clusterId, gatewayUrl, defaultAcls);
aclCache = new DataCache<>(CACHE_NAME, brokerId, dataProvider, scheduledExecutorService,
zkUtil, 60000, 4 * 3600 * 1000);
}
public void stop() {
log.info("Didi Authorizer shutdown");
if (aclCache != null) {
aclCache.stop();
}
}
/**
* this function checks access for every request; it returns true if allowed, otherwise false
*/
@Override
public boolean authorize(RequestChannel.Session kafkaSession, kafka.security.auth.Operation kafkaOperation,
kafka.security.auth.Resource kafkaResource) {
Session session = kafkaSession.kafkaSession();
if (session == null) {
log.error("Unknow User = {} is denied Operation = {} on resource = {}",
kafkaSession.principal().getName(), kafkaOperation.name(), kafkaResource.name());
userErrorlog.error("Unknow User = {} is denied Operation = {} on resource = {}",
kafkaSession.principal().getName(), kafkaOperation.name(), kafkaResource.name());
return false;
}
Operation operation = Operation.from(kafkaOperation);
Resource resource = new Resource(kafkaResource);
AccessStatus accessStatus = checkAccess(session, resource, operation);
if (accessStatus == AccessStatus.Allow) {
log.debug("User = {} is allowed Operation = {} on resource = {}",
session.getUsername(), kafkaOperation.name(), kafkaResource.name());
return true;
} else {
log.debug("User = {} is denied Operation = {} on resource = {}",
session.getUsername(), kafkaOperation.name(), kafkaResource.name());
userErrorlog.error("User = {} is denied Operation = {} on resource = {}",
session.getUsername(), kafkaOperation.name(), kafkaResource.name());
return false;
}
}
@Override
public void addAcls(Set<Acl> acls, kafka.security.auth.Resource resource) {
}
@Override
public boolean removeAcls(Set<Acl> acls, kafka.security.auth.Resource resource) {
return false;
}
@Override
public boolean removeAcls(kafka.security.auth.Resource resource) {
return false;
}
@Override
public Set<Acl> getAcls(kafka.security.auth.Resource resource) {
return null;
}
@Override
public Map<kafka.security.auth.Resource, Set<Acl>> getAcls(KafkaPrincipal principal) {
return null;
}
@Override
public Map<kafka.security.auth.Resource, Set<Acl>> getAcls() {
return null;
}
@Override
public void close() {
}
@Override
public void configure(java.util.Map<String, ?> configs) {
}
/**
* This function checks access:
* 1. check whether the user is a super user; if yes, return allow.
* 2. check the resource and operation: if the resource is not a topic or group and the operation is not
* read, write, or describe, return deny. Normal users are only allowed these resources and operations.
* 3. check access from the session if it is cached there.
* 4. check access against the acls stored in the DataCache.
* @param session
* @param resource
* @param operation
* @return
*/
private AccessStatus checkAccess(Session session, Resource resource, Operation operation) {
//check super user: all operations are allowed.
if (session.getUser().isSuperUser()) {
if (resource.getType().ordinal() == Resource.Type.Topic.ordinal() &&
operation.ordinal() <= Operation.Write.ordinal()) {
//update session
long cacheTimestamp = aclCache.getCommitTimestamp();
AccessStatus result = checkAccessFromSession(session,
resource.getName(), operation, cacheTimestamp);
if (result == AccessStatus.Continue) {
session.setAccessStatus(resource.getName(), operation,
new AccessStatusAndTimestamp(AccessStatus.Allow, cacheTimestamp));
}
}
log.trace("Super user = {} is allowed Operation = {} on resource = {}",
session.getUsername(), operation, resource.getName());
return AccessStatus.Allow;
}
//check access for normal user.
// look up the ALLOW_USER_OPERATION_RESOURCE table; only produce and consume need to go to the next step.
AccessStatus result;
result = checkOperationAndResouce(session, operation, resource);
if (result != AccessStatus.Continue) {
log.trace("Normal user = {} is {} Operation = {} on resource = {}",
session.getUsername(), result == AccessStatus.Allow ? "allowed" : "denied",
operation, resource.getName());
return result;
}
long cacheTimestamp = aclCache.getCommitTimestamp();
//get the access status from the session, which caches it.
result = checkAccessFromSession(session, resource.getName(), operation, cacheTimestamp);
if (result != AccessStatus.Continue) {
return result;
}
//session cache expired, check access against the acl cache
result = checkAccessFromAcls(session, resource.getName(), operation, cacheTimestamp);
//update session access cache
session.setAccessStatus(resource.getName(), operation, new AccessStatusAndTimestamp(result, cacheTimestamp));
return result;
}
private AccessStatus checkAccessFromAcls(Session session, String topicname,
Operation operation, long cacheTimestamp) {
//normal acl indicating the user can access the topic
AccessKey accessKey = new AccessKey(topicname, session.getUsername(), operation);
AccessStatus status = aclCache.get(accessKey);
if (status != null) {
log.trace("match allow acl: User: {} Topic: {} Operation: {}", session.getUsername(),
topicname, operation);
return status;
}
//special acl indicating the user can access all topics
accessKey = new AccessKey(WIDECARD, session.getUsername(), operation);
status = aclCache.get(accessKey);
if (status != null) {
log.trace("match allow acl: User: {} Topic: * Operation: {}", session.getUsername(), operation);
return status;
}
//special acl indicating all users can access the topic
accessKey = new AccessKey(topicname, WIDECARD, operation);
status = aclCache.get(accessKey);
if (status != null) {
log.trace("match allow acl: User: * Topic: {} Operation: {}", topicname, operation);
return status;
}
return AccessStatus.Deny;
}
//callers of this function should not be super users.
private AccessStatus checkOperationAndResouce(Session session, Operation operation, Resource resource) {
return ALLOW_USER_OPERATION_RESOURCE[operation.ordinal()][resource.getType().ordinal()];
}
private AccessStatus checkAccessFromSession(Session session, String topicName,
Operation operation, long cacheTimestamp) {
AccessStatusAndTimestamp accessStatusAndTimestamp = session.getAccessStatus(topicName, operation);
//check whether the access status cached in the session is expired; if yes return continue, otherwise return the status.
if (accessStatusAndTimestamp != null && cacheTimestamp == accessStatusAndTimestamp.getAclTimestamp()) {
return accessStatusAndTimestamp.getAccessStatus();
}
return AccessStatus.Continue;
}
}
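To make the wildcard fallback in checkAccessFromAcls easier to follow, here is a minimal sketch that replaces the DataCache with a plain HashMap; it is illustrative only and not part of the authorizer.

package com.didichuxing.datachannel.kafka.security.authorizer;

import java.util.HashMap;
import java.util.Map;

public class AclLookupSketch {
    // Lookup order: exact (topic, user), then (*, user), then (topic, *), otherwise Deny.
    static AccessStatus check(Map<AccessKey, AccessStatus> acls, String topic, String user, Operation op) {
        AccessStatus status = acls.get(new AccessKey(topic, user, op));
        if (status != null) return status;
        status = acls.get(new AccessKey("*", user, op));
        if (status != null) return status;
        status = acls.get(new AccessKey(topic, "*", op));
        if (status != null) return status;
        return AccessStatus.Deny;
    }

    public static void main(String[] args) {
        Map<AccessKey, AccessStatus> acls = new HashMap<>();
        acls.put(new AccessKey("test", "*", Operation.Read), AccessStatus.Allow);
        System.out.println(check(acls, "test", "kafka", Operation.Read));  // Allow via the user wildcard
        System.out.println(check(acls, "test", "kafka", Operation.Write)); // Deny
    }
}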

View File

@@ -0,0 +1,64 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.security.authorizer;
class Resource {
final private String name;
final private Type type;
public Resource(kafka.security.auth.Resource resource) {
this.name = resource.name();
this.type = Type.fromString(resource);
}
public String getName() {
return name;
}
public Type getType() {
return type;
}
enum Type {
Topic,
Group,
Cluster,
TransactionalId,
DelegationToken,
Other;
static Type fromString(kafka.security.auth.Resource resource) {
String resourceTypeName = resource.resourceType().name();
if (resourceTypeName.equals(Topic.name())) {
return Topic;
} else if (resourceTypeName.equals(Group.name())) {
return Group;
} else if (resourceTypeName.equals(Cluster.name())) {
return Cluster;
} else if (resourceTypeName.equals(TransactionalId.name())) {
return TransactionalId;
} else if (resourceTypeName.equals(DelegationToken.name())) {
return DelegationToken;
} else {
return Other;
}
}
}
}

View File

@@ -183,13 +183,12 @@ public class SessionManager {
Session session = map.get(sessionKey);
if (session == null) {
// User user = LoginManager.getInstance().getUser(userName);
// if (user == null) {
// //log.warn("Session manager can't fount the user User = {}, Host = {}, generate tmp session", userName, hostAddress);
// session = new Session(new User(userName, "", false), hostAddress, kafkaSession.clientPort());
// return session;
// }
User user = new User(userName, "", false);
User user = LoginManager.getInstance().getUser(userName);
if (user == null) {
//log.warn("Session manager can't fount the user User = {}, Host = {}, generate tmp session", userName, hostAddress);
session = new Session(new User(userName, "", false), hostAddress, kafkaSession.clientPort());
return session;
}
session = new Session(user, hostAddress, kafkaSession.clientPort());
log.info("Session manager create session User = {}, Host = {}, Port = {}", userName, hostAddress, kafkaSession.clientPort());
map.putIfAbsent(sessionKey, session);

View File

@@ -17,6 +17,9 @@
package com.didichuxing.datachannel.kafka.security.login;
import com.didichuxing.datachannel.kafka.cache.DataCache;
import com.didichuxing.datachannel.kafka.cache.DataProvider;
import com.didichuxing.datachannel.kafka.cache.ZkUtil;
import kafka.cluster.Broker;
import kafka.cluster.EndPoint;
import kafka.zk.KafkaZkClient;
@@ -50,6 +53,9 @@ public class LoginManager {
private ListenerName listenerName;
//map user name to User.
private DataCache<String, User> cache = null;
private LoginManager(){}
static public LoginManager getInstance() {
@@ -68,6 +74,7 @@ public class LoginManager {
log.info("Login Manager startup");
this.listenerName = listenerName;
this.zkUtil = zkUtil;
ZkUtil dataCacheZkUtil = new ZkUtil(zkUtil::currentZooKeeper);
List<String> systemUsers = new ArrayList<>();
systemUsers.add(String.format("%s:%s:%s",
@@ -76,10 +83,20 @@ public class LoginManager {
admin_USER.getUsername(), admin_USER.getPassword(), admin_USER.isSuperUser()));
if (defaultUsers != null)
systemUsers.addAll(defaultUsers);
DataProvider dataProvider = new UserDataProvider(clusterId, gatewayUrl, systemUsers);
cache = new DataCache<>(CACHE_NAME, brokerId, dataProvider, scheduledExecutorService,
dataCacheZkUtil, 60000, 4 * 3600 * 1000);
if (cache.size() == 0) {
throw new RuntimeException(String.format("System don't have any users clusterId: %s", clusterId));
}
}
public void shutdown() {
log.info("Login Manager shutdown");
if (cache != null) {
cache.stop();
}
}
/**
@@ -89,7 +106,7 @@ public class LoginManager {
* @return login status
*/
public boolean login(String username, String password, String host) {
User user = getUser(username);
User user = cache.get(username);
if (user != null) {
if (username.equals(USER_ANONYMOUS)) {
userErrorlog.error("User = {} from {} login failed. no permission for this user.",
@@ -141,7 +158,11 @@ public class LoginManager {
* @return user
*/
public User getUser(String userName) {
return null;
//this case happens in testing
if (cache == null) {
return null;
}
return cache.get(userName);
}
private String hiddenText(String s) {

View File

@@ -17,6 +17,8 @@
package com.didichuxing.datachannel.kafka.security.login;
import com.alibaba.fastjson.JSONObject;
public class User {
private String username;
@@ -32,6 +34,25 @@ public class User {
this.superUser = superUser;
}
public User(JSONObject json) {
String username = json.getString("username");
if (username == null || username.equals("")) {
throw new IllegalArgumentException("missing username");
}
String password = json.getString("password");
if (password == null || password.equals("")) {
throw new IllegalArgumentException("missing password");
}
Integer userType = json.getInteger("userType");
if (userType == null) {
throw new IllegalArgumentException("missing user type");
}
this.username = username;
this.password = password;
this.superUser = userType != 0;
}
public String getUsername() {
return username;
}

View File

@@ -0,0 +1,117 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.security.login;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.didichuxing.datachannel.kafka.cache.CacheException;
import com.didichuxing.datachannel.kafka.cache.DataProvider;
import com.didichuxing.datachannel.kafka.cache.DataRecord;
import com.didichuxing.datachannel.kafka.cache.Dataset;
import com.didichuxing.datachannel.kafka.util.JsonUtils;
import joptsimple.internal.Strings;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.List;
class UserDataProvider implements DataProvider {
private static final Logger log = LoggerFactory.getLogger(UserDataProvider.class);
final static private int FETCDATA_TIMEOUT = 10000;
final static private String API_PATH = "/api/v1/security/users";
final private String clusterId;
final private String fetchDataUrl;
final private List<DataRecord<String, User>> defualtUsers = new ArrayList<>();
public UserDataProvider(String clusterId, String gatewayUrl, List<String> users) {
this.clusterId = clusterId;
this.fetchDataUrl = gatewayUrl + API_PATH;
if (users != null) {
for (String user : users) {
var record = createUserRecord(user);
if (record != null) {
defualtUsers.add(record);
}
}
}
}
@Override
public Dataset fetchData(long startTime, long endTime) throws Exception {
log.debug("Fetch data start: {} end {}", startTime, endTime);
//send requeset to kafka gateway
String req = String.format("{\"clusterId\":%s,\"start\":%d,\"end\":%d}", clusterId, startTime, endTime);
List<DataRecord> entries = new ArrayList<>();
//init system users when loading data from empty
if (startTime == 0) {
entries.addAll(defualtUsers);
}
JSONArray users = JsonUtils.getJSONArray(fetchDataUrl, req, FETCDATA_TIMEOUT);
for (int i = 0; i < users.size(); i++) {
JSONObject jsonUser = users.getJSONObject(i);
try {
User user = new User(jsonUser);
DataRecord<String, User> dataRecord = new DataRecord<>(user.getUsername(), user, jsonUser);
entries.add(dataRecord);
} catch (IllegalArgumentException e) {
log.error("invalid data {}", users.toJSONString());
}
}
if (users.size() > 0) {
log.info("Fetch some new data total {}", users.size());
} else {
log.info("No new data in data soucre");
}
return new Dataset(entries, endTime);
}
DataRecord<String, User> createUserRecord(String user) {
//user string should be name:passwd:issuper
try {
String[] entry = user.split(":");
if (entry.length != 3) {
throw new IllegalArgumentException("invalid user string");
}
String username = entry[0];
if (Strings.isNullOrEmpty(username)) {
throw new IllegalArgumentException("missing username");
}
String password = entry[1];
if (Strings.isNullOrEmpty(password)) {
throw new IllegalArgumentException("missing password");
}
boolean superUser = Boolean.parseBoolean(entry[2]);
return new DataRecord<>(username,
new User(username, password, superUser), DataRecord.Operation.create, 0);
} catch (Exception e) {
log.error("parsing user string error: ", e);
return null;
}
}
}
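Similarly, a hedged example of the name:password:isSuperUser form that the constructor above accepts for default users (all values and the gateway URL are placeholders):

package com.didichuxing.datachannel.kafka.security.login;

import java.util.Arrays;
import java.util.List;

public class UserDataProviderExample {
    public static void main(String[] args) {
        List<String> systemUsers = Arrays.asList(
                "admin:admin-secret:true",        // super user
                "appId-000001:app-secret:false");
        UserDataProvider provider = new UserDataProvider("1", "http://gateway.example.com", systemUsers);
        // fetchData(0, endTime) seeds the cache with these users before
        // merging any users returned by the gateway.
    }
}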

View File

@@ -0,0 +1,65 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.security.sasl.didi;
import org.apache.kafka.common.security.plain.internals.PlainSaslServerProvider;
import javax.security.auth.Subject;
import javax.security.auth.callback.CallbackHandler;
import javax.security.auth.login.LoginException;
import javax.security.auth.spi.LoginModule;
import java.util.Map;
public class DidiLoginModule implements LoginModule {
static {
DidiSaslServerProvider.initialize();
PlainSaslServerProvider.initialize();
}
@Override
public void initialize(Subject subject, CallbackHandler callbackHandler, Map<String, ?> sharedState, Map<String, ?> options) {
String username = (String) options.get("username");
if (username != null)
subject.getPublicCredentials().add(username);
String password = (String) options.get("password");
if (password != null)
subject.getPrivateCredentials().add(password);
}
@Override
public boolean login() throws LoginException {
return true;
}
@Override
public boolean commit() throws LoginException {
return true;
}
@Override
public boolean abort() throws LoginException {
return true;
}
@Override
public boolean logout() throws LoginException {
return true;
}
}

View File

@@ -0,0 +1,127 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.security.sasl.didi;
import javax.security.sasl.SaslClient;
import javax.security.sasl.SaslException;
import java.io.UnsupportedEncodingException;
public class DidiSaslClient implements SaslClient {
public static final String MECHANISM = "DIDI";
private boolean completed = false;
private byte[] pw;
private String authorizationID;
private String authenticationID;
private static byte SEP = 0;
DidiSaslClient(String var1, String var2, byte[] var3) throws SaslException {
if (var2 != null && var3 != null) {
this.authorizationID = var1;
this.authenticationID = var2;
this.pw = var3;
} else {
throw new SaslException("DIDI: authorization ID and password must be specified");
}
}
public String getMechanismName() {
return MECHANISM;
}
public boolean hasInitialResponse() {
return true;
}
public void dispose() throws SaslException {
this.clearPassword();
}
public byte[] evaluateChallenge(byte[] var1) throws SaslException {
if (this.completed) {
throw new IllegalStateException("DIDI authentication already completed");
} else {
this.completed = true;
try {
byte[] var2 = this.authorizationID != null ? this.authorizationID.getBytes("UTF8") : null;
byte[] var3 = this.authenticationID.getBytes("UTF8");
byte[] var4 = new byte[this.pw.length + var3.length + 2 + (var2 == null ? 0 : var2.length)];
int var5 = 0;
if (var2 != null) {
System.arraycopy(var2, 0, var4, 0, var2.length);
var5 = var2.length;
}
var4[var5++] = SEP;
System.arraycopy(var3, 0, var4, var5, var3.length);
var5 += var3.length;
var4[var5++] = SEP;
System.arraycopy(this.pw, 0, var4, var5, this.pw.length);
this.clearPassword();
return var4;
} catch (UnsupportedEncodingException var6) {
throw new SaslException("Cannot get UTF-8 encoding of ids", var6);
}
}
}
public boolean isComplete() {
return this.completed;
}
public byte[] unwrap(byte[] var1, int var2, int var3) throws SaslException {
if (this.completed) {
throw new SaslException("DIDI supports neither integrity nor privacy");
} else {
throw new IllegalStateException("DIDI authentication not completed");
}
}
public byte[] wrap(byte[] var1, int var2, int var3) throws SaslException {
if (this.completed) {
throw new SaslException("DIDI supports neither integrity nor privacy");
} else {
throw new IllegalStateException("DIDI authentication not completed");
}
}
public Object getNegotiatedProperty(String var1) {
if (this.completed) {
return var1.equals("javax.security.sasl.qop") ? "auth" : null;
} else {
throw new IllegalStateException("PLAIN authentication not completed");
}
}
private void clearPassword() {
if (this.pw != null) {
for (int var1 = 0; var1 < this.pw.length; ++var1) {
this.pw[var1] = 0;
}
this.pw = null;
}
}
protected void finalize() {
this.clearPassword();
}
}
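The DIDI mechanism reuses the SASL/PLAIN wire format ([authzid] UTF8NUL authcid UTF8NUL passwd, RFC 4616). Below is a small sketch of how the client response is assembled and how the server later splits it; the credentials are placeholders.

import java.nio.charset.StandardCharsets;

public class DidiSaslMessageSketch {
    public static void main(String[] args) {
        String authzid = "";          // empty authzid falls back to the authcid on the server
        String authcid = "appId";     // placeholder credentials
        String passwd = "appSecret";
        byte[] response = (authzid + "\u0000" + authcid + "\u0000" + passwd)
                .getBytes(StandardCharsets.UTF_8);
        // DidiSaslServer.evaluateResponse splits on NUL and expects exactly 3 tokens.
        String[] tokens = new String(response, StandardCharsets.UTF_8).split("\u0000");
        System.out.println(tokens.length);  // 3 (tokens[0] is the empty authzid)
        System.out.println(tokens[1]);      // appId
    }
}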

View File

@@ -0,0 +1,71 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.security.sasl.didi;
import javax.security.auth.callback.*;
import javax.security.sasl.SaslClient;
import javax.security.sasl.SaslClientFactory;
import javax.security.sasl.SaslException;
import java.io.IOException;
import java.util.Map;
public class DidiSaslClientFactory implements SaslClientFactory {
@Override
public SaslClient createSaslClient(String[] mechanisms, String authorizationId, String protocol, String serverName, Map<String, ?> props, CallbackHandler cbh) throws SaslException {
if (mechanisms.length == 1 && mechanisms[0].equals("DIDI")) {
Object[] paras = this.getUserInfo("DIDI", authorizationId, cbh);
return new DidiSaslClient(authorizationId, (String) paras[0], ((String) paras[1]).getBytes());
} else {
throw new SaslException("DidiSaslClientFactory only support mechanism:DIDI");
}
}
@Override
public String[] getMechanismNames(Map<String, ?> props) {
return new String[]{"DIDI"};
}
private Object[] getUserInfo(String mechanism, String authorizationID, CallbackHandler callback) throws SaslException {
if (callback == null) {
throw new SaslException("Callback handler to get username/password required");
} else {
try {
String namePrompt = mechanism + " authentication id: ";
String passwordPrompt = mechanism + " password: ";
NameCallback nameCallBack = authorizationID == null ? new NameCallback(namePrompt) : new NameCallback(namePrompt, authorizationID);
PasswordCallback passwdCallBack = new PasswordCallback(passwordPrompt, false);
callback.handle(new Callback[]{nameCallBack, passwdCallBack});
char[] pwdBytes = passwdCallBack.getPassword();
String password = null;
if (pwdBytes != null) {
password = new String(pwdBytes);
passwdCallBack.clearPassword();
}
String username = nameCallBack.getName();
return new Object[]{username, password};
} catch (IOException var11) {
throw new SaslException("Cannot get password", var11);
} catch (UnsupportedCallbackException var12) {
throw new SaslException("Cannot get userid/password", var12);
}
}
}
}

View File

@@ -0,0 +1,34 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.security.sasl.didi;
import java.security.Provider;
import java.security.Security;
public class DidiSaslClientProvider extends Provider {
private static final long serialVersionUID = 1L;
protected DidiSaslClientProvider() {
super("Simple SASL/DIDI Server Provider", 1.0, "Simple SASL/DIDI Server Provider for Kafka");
super.put("SaslClientFactory." + "DIDI", DidiSaslClientFactory.class.getName());
}
public static void initialize() {
Security.addProvider(new DidiSaslClientProvider());
}
}

View File

@@ -0,0 +1,137 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.security.sasl.didi;
import org.apache.kafka.common.security.JaasUtils;
import javax.security.auth.callback.CallbackHandler;
import javax.security.sasl.SaslException;
import javax.security.sasl.SaslServer;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
/**
* DidiSaslServer implementation for SASL/DIDI.
* Validates the appid and password via the kafka gateway.
*/
public class DidiSaslServer implements SaslServer {
public static final String MECHANISM = "DIDI";
private static final String JAAS_USER_PREFIX = "user_";
private boolean complete;
private String authorizationID;
public DidiSaslServer(CallbackHandler callbackHandler) {
}
@Override
public byte[] evaluateResponse(byte[] response) throws SaslException {
/*
* Message format (from https://tools.ietf.org/html/rfc4616):
*
* message = [authzid] UTF8NUL authcid UTF8NUL passwd
* authcid = 1*SAFE ; MUST accept up to 255 octets
* authzid = 1*SAFE ; MUST accept up to 255 octets
* passwd = 1*SAFE ; MUST accept up to 255 octets
* UTF8NUL = %x00 ; UTF-8 encoded NUL character
*
* SAFE = UTF1 / UTF2 / UTF3 / UTF4
* ;; any UTF-8 encoded Unicode character except NUL
*/
String[] tokens;
try {
tokens = new String(response, "UTF-8").split("\u0000");
} catch (UnsupportedEncodingException e) {
throw new SaslException("UTF-8 encoding not supported", e);
}
if (tokens.length != 3)
throw new SaslException("Invalid SASL/DIDI response: expected 3 tokens, got " + tokens.length);
authorizationID = tokens[0];
String username = tokens[1];
String password = tokens[2];
if (username.isEmpty()) {
throw new SaslException("Authentication failed: username not specified");
}
if (password.isEmpty()) {
throw new SaslException("Authentication failed: password not specified");
}
if (authorizationID.isEmpty())
authorizationID = username;
/*
try {
String expectedPassword = JaasUtils.defaultServerJaasConfigOption(JAAS_USER_PREFIX + username, DidiLoginModule.class.getName());
if (!password.equals(expectedPassword)) {
throw new SaslException("Authentication failed: Invalid username or password");
}
} catch (IOException e) {
throw new SaslException("Authentication failed: Invalid JAAS configuration", e);
}
*/
complete = true;
return new byte[0];
}
@Override
public String getAuthorizationID() {
if (!complete)
throw new IllegalStateException("Authentication exchange has not completed");
return authorizationID;
}
@Override
public String getMechanismName() {
return MECHANISM;
}
@Override
public Object getNegotiatedProperty(String propName) {
if (!complete)
throw new IllegalStateException("Authentication exchange has not completed");
return null;
}
@Override
public boolean isComplete() {
return complete;
}
@Override
public byte[] unwrap(byte[] incoming, int offset, int len) throws SaslException {
if (!complete)
throw new IllegalStateException("Authentication exchange has not completed");
return Arrays.copyOfRange(incoming, offset, offset + len);
}
@Override
public byte[] wrap(byte[] outgoing, int offset, int len) throws SaslException {
if (!complete)
throw new IllegalStateException("Authentication exchange has not completed");
return Arrays.copyOfRange(outgoing, offset, offset + len);
}
@Override
public void dispose() throws SaslException {
}
}
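
For reference, here is a minimal client-side sketch (assumed illustrative code, not part of this change; the class name and sample credentials are hypothetical) of how an initial response that `evaluateResponse` above can parse is framed, following the RFC 4616 layout `[authzid] UTF8NUL authcid UTF8NUL passwd`:

```java
import java.nio.charset.StandardCharsets;

public class DidiSaslResponseSketch {

    // Frame the token as [authzid] NUL authcid NUL passwd; "\0" is the UTF8NUL separator.
    static byte[] buildInitialResponse(String authzid, String username, String password) {
        String message = (authzid == null ? "" : authzid) + "\0" + username + "\0" + password;
        return message.getBytes(StandardCharsets.UTF_8);
    }

    public static void main(String[] args) {
        // An empty authzid makes the server fall back to the authenticated username.
        byte[] response = buildInitialResponse("", "0.test_user", "12345");
        System.out.println(response.length + " bytes");
    }
}
```

Splitting such a message on `"\u0000"` yields exactly the three tokens the server checks for.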

View File

@@ -0,0 +1,41 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.security.sasl.didi;
import javax.security.auth.callback.CallbackHandler;
import javax.security.sasl.SaslException;
import javax.security.sasl.SaslServer;
import javax.security.sasl.SaslServerFactory;
import java.util.Map;
public class DidiSaslServerFactory implements SaslServerFactory {
@Override
public SaslServer createSaslServer(String mechanism, String protocol, String serverName, Map<String, ?> props, CallbackHandler cbh) throws SaslException {
if (!"DIDI".equals(mechanism)) {
throw new SaslException(String.format("Mechanism \'%s\' is not supported. Only DIDI is supported.", mechanism));
}
return new DidiSaslServer(cbh);
}
@Override
public String[] getMechanismNames(Map<String, ?> props) {
return new String[]{DidiSaslServer.MECHANISM};
}
}

View File

@@ -0,0 +1,34 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.security.sasl.didi;
import java.security.Provider;
import java.security.Security;
public class DidiSaslServerProvider extends Provider {
private static final long serialVersionUID = 1L;
protected DidiSaslServerProvider() {
super("Simple SASL/DIDI Server Provider", 1.0, "Simple SASL/DIDI Server Provider for Kafka");
super.put("SaslServerFactory." + "DIDI", DidiSaslServerFactory.class.getName());
}
public static void initialize() {
Security.addProvider(new DidiSaslServerProvider());
}
}

View File

@@ -0,0 +1,294 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.util;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Map;
import java.util.Random;
/**
 * HTTP request utility.
 *
 *   // GET request; a timeout of 0 means a single attempt (no retry); returns a ResponseCommonResult wrapping the response text
 *   ResponseCommonResult result = HttpUtils.get("http://www.baidu.com", null, 0);
 *
 *   // GET request with a 5 s retry window
 *   Map<String, String> paramMap = new HashMap<>();
 *   paramMap.put("key", "value");
 *   HttpUtils.get("http://www.baidu.com", paramMap, 5000);
 *
 *   // POST request with a 10 s retry window
 *   ResponseCommonResult respText = HttpUtils.post("http://url", null, "data".getBytes(), 10000);
 *
 *   // Several other get(...) and post(...) overloads exist (e.g. passing extra request headers); see the implementation.
 *
 */
public class HttpUtils {
private static final Logger log = LoggerFactory.getLogger(HttpUtils.class);
// Maximum response length: 10 MB
private static final int RESPONSE_DATA_MAX_LENGTH = 10 * 1024 * 1024;
// Connect timeout in ms
private static int CONNECT_TIME_OUT = 5 * 1000;
// Read timeout in ms
private static int READ_TIME_OUT = 30 * 1000;
private static boolean fastFailed = true;
// Default content type (encoding)
private static String CONTENT_TYPE = "application/json;charset=UTF-8";
public static ResponseCommonResult get(String url, Map<String, String> params, int timeoutMs) {
return get(url, params, null, timeoutMs);
}
public static ResponseCommonResult get(String url, Map<String, String> params, Map<String, String> headers, int timeoutMs) {
return sendRequest(url, params, "GET", headers, null, timeoutMs);
}
public static ResponseCommonResult post(String url, Map<String, String> params, byte[] body, int timeoutMs) {
return post(url, params, null, body, timeoutMs);
}
public static ResponseCommonResult post(String url, Map<String, String> params, Map<String, String> headers, byte[] body, int timeoutMs) {
InputStream in = null;
if (body != null && body.length > 0) {
in = new ByteArrayInputStream(body);
}
return post(url, params, headers, in, timeoutMs);
}
public static ResponseCommonResult post(String url, Map<String, String> params, InputStream bodyStream, int timeoutMs) {
return post(url, params, null, bodyStream, timeoutMs);
}
public static ResponseCommonResult post(String url, Map<String, String> params, Map<String, String> headers, InputStream bodyStream, int timeoutMs) {
return sendRequest(url, params, "POST", headers, bodyStream, timeoutMs);
}
private static ResponseCommonResult sendRequest(String url, Map<String, String> params, String method, Map<String, String> headers, InputStream bodyStream, int timeoutMs) {
log.debug(String.format("sendRequest params detail. url:%s, params:%s, method:%s, headers:%s, bodyStream:%s, timeoutMs:%d", url, params == null ? "null" : params.toString(), method, headers == null ? "null" : headers.toString(), bodyStream == null ? "null" : bodyStream.toString(), timeoutMs));
try {
if (timeoutMs < 0) {
return ResponseCommonResult.failure("timeoutMs must be positive or 0");
}
ResponseCommonResult result = sendRequestInternal(url, params, method, headers, bodyStream);
if (timeoutMs == 0 || fastFailed) {
return result;
}
long timeoutTimestamp = System.currentTimeMillis() + timeoutMs;
// random sleep [1s,5s]
int backOffMs = (new Random().nextInt(5) + 1) * 1000;
while (result.getCode() != ResponseCommonResult.SUCCESS_STATUS) {
if (System.currentTimeMillis() > timeoutTimestamp) {
return result;
}
String parameters = params == null ? "null" : params.toString();
String body = bodyStream == null ? "null" : bodyStream.toString();
log.warn("send request failed, request url:{}, request parameters:{}, body:{}, backOff milliseconds:{}", url, parameters, body, backOffMs);
try {
Thread.sleep(backOffMs);
} catch (InterruptedException e) {
log.error("Interrupted when do url request, detail: ", e);
}
result = sendRequestInternal(url, params, method, headers, bodyStream);
backOffMs = (new Random().nextInt(5) + 1) * 1000;
}
return result;
} catch (Exception e) {
return ResponseCommonResult.failure(e.getMessage());
}
}
/**
 * @param url        the request URL; only http and https are supported
 * @param params     query-string parameters; may be null
 * @param method     GET or POST
 * @param headers    request headers (override the defaults); may be null
 * @param bodyStream request body; the stream is closed automatically; may be null
 * @return the response body text wrapped in a ResponseCommonResult
 * @throws Exception if the HTTP response code is not 200, or on any other error
 */
private static ResponseCommonResult sendRequestInternal(String url, Map<String, String> params, String method, Map<String, String> headers, InputStream bodyStream) throws Exception {
assertUrlValid(url);
HttpURLConnection conn = null;
try {
String paramUrl = url;
if (params != null) {
paramUrl = setUrlParams(url, params);
}
// Open the connection
URL urlObj = new URL(paramUrl);
conn = (HttpURLConnection) urlObj.openConnection();
// Apply the default connection properties
setDefaultProperties(conn);
// Set the request method
if (method != null && method.length() > 0) {
conn.setRequestMethod(method);
}
// Add the request headers
if (headers != null && headers.size() > 0) {
for (Map.Entry<String, String> entry : headers.entrySet()) {
conn.setRequestProperty(entry.getKey(), entry.getValue());
}
}
// Write the request body
if (bodyStream != null) {
conn.setDoOutput(true);
copyStreamAndClose(bodyStream, conn.getOutputStream());
}
// Read the response code
int code = conn.getResponseCode();
// Read the response content length
long contentLength = conn.getContentLengthLong();
// Open the response body stream
InputStream in = conn.getInputStream();
if (contentLength > RESPONSE_DATA_MAX_LENGTH) {
throw new IOException(String.format("Response content length too large: %d", contentLength));
}
String resultStr = handleResponseBodyToString(in);
// Any non-200 response is treated as an error
if (code != HttpURLConnection.HTTP_OK) {
throw new IOException(String.format("Http Error: %d, detail: %s", code, resultStr));
}
return ResponseCommonResult.success(resultStr);
} catch (Exception e) {
return ResponseCommonResult.failure(e.getMessage());
} finally {
closeConnection(conn);
}
}
private static void assertUrlValid(String url) throws IllegalAccessException {
    if (url == null)
        throw new IllegalAccessException("url cannot be null");
    String lowerCaseUrl = url.toLowerCase();
    if (!lowerCaseUrl.startsWith("http://") && !lowerCaseUrl.startsWith("https://"))
        throw new IllegalAccessException(String.format("Only support http or https url:%s", url));
}
private static String setUrlParams(String url, Map<String, String> params) {
String paramUrl = url + "?";
for (Map.Entry<String, String> entry: params.entrySet()) {
String kv = entry.getKey() + "=" + entry.getValue() + "&";
paramUrl += kv;
}
return paramUrl.substring(0, paramUrl.length() - 1);
}
private static void setDefaultProperties(HttpURLConnection conn) {
// Connect timeout
conn.setConnectTimeout(CONNECT_TIME_OUT);
// Read timeout
conn.setReadTimeout(READ_TIME_OUT);
// Content type (encoding)
conn.setRequestProperty("Content-Type", CONTENT_TYPE);
}
private static String handleResponseBodyToString(InputStream in) throws Exception {
ByteArrayOutputStream bytesOut = null;
try {
bytesOut = new ByteArrayOutputStream();
// Read the response body
copyStreamAndClose(in, bytesOut);
// Raw bytes of the response body
byte[] contentBytes = bytesOut.toByteArray();
return new String(contentBytes, "utf-8");
} finally {
closeStream(bytesOut);
}
}
private static void copyStreamAndClose(InputStream in, OutputStream out) {
try {
byte[] buf = new byte[1024];
int len = -1;
while ((len = in.read(buf)) != -1) {
out.write(buf, 0, len);
}
out.flush();
} catch (Exception e) {
log.error("copy stream fail", e);
} finally {
closeStream(in);
closeStream(out);
}
}
private static void closeConnection(HttpURLConnection conn) {
if (conn != null) {
try {
conn.disconnect();
} catch (Exception e) {
log.error("close httpURLConnection fail", e);
}
}
}
private static void closeStream(Closeable stream) {
if (stream != null) {
try {
stream.close();
} catch (Exception e) {
log.error("close stream fail", e);
}
}
}
public static void setFastFailed(boolean value) {
fastFailed = value;
}
}
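
A minimal usage sketch for `HttpUtils` (assumed caller code; the endpoints are hypothetical and not part of this change), showing how `timeoutMs` interacts with the retry loop once `setFastFailed(false)` has been called at broker startup:

```java
import java.util.HashMap;
import java.util.Map;

// Assumes this class sits in (or imports from) com.didichuxing.datachannel.kafka.util.
public class HttpUtilsUsageSketch {
    public static void main(String[] args) {
        Map<String, String> params = new HashMap<>();
        params.put("clusterId", "0");

        // GET with timeoutMs = 0: a single attempt, the result is returned immediately.
        ResponseCommonResult getResult = HttpUtils.get("http://localhost:8080/api/v1/ping", params, 0);

        // POST with timeoutMs = 5000: while fastFailed is false, failed attempts are retried
        // with a random 1-5 s backoff until one succeeds or the 5 s window elapses.
        byte[] body = "{\"brokerId\":0}".getBytes();
        ResponseCommonResult postResult =
                HttpUtils.post("http://localhost:8080/api/v1/heartbeat/survive-user", null, body, 5000);

        System.out.println(getResult.getCode() + " / " + postResult.getCode());
    }
}
```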

View File

@@ -0,0 +1,68 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.util;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.didichuxing.datachannel.kafka.cache.CacheException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.HashMap;
import java.util.Map;
public class JsonUtils {
private static final Logger log = LoggerFactory.getLogger(JSONObject.class);
public static Map<String, Object> jsonString2Map(String jsonString) {
Map<String, Object> resultMap = new HashMap<>();
return resultMap;
}
public static ResponseCommonResult string2ResponseCommonResult(String jsonString) {
return JSON.parseObject(jsonString, ResponseCommonResult.class);
}
public static JSONArray getJSONArray(String url, String req, int timeoutMs) throws Exception {
//send request to kafka gateway
ResponseCommonResult resp =
HttpUtils.post(url, null, req.getBytes(), timeoutMs);
if (resp == null || resp.getCode() == ResponseCommonResult.FAILED_STATUS) {
throw new CacheException(String.format("send request Data failed: %s %s", url, req));
}
String respStr = (String) resp.getData();
if (respStr == null || respStr.equals("")) {
throw new CacheException(String.format("Invalid response: %s", resp));
}
JSONObject respJson = JSON.parseObject(respStr);
if (respJson == null || !respJson.containsKey("data")) {
throw new CacheException(String.format("Invalid data: missing 'data' resp: %s", resp));
}
JSONObject data = respJson.getJSONObject("data");
if (data == null || !data.containsKey("rows")) {
throw new CacheException(String.format("Invalid data: missing 'rows' resp: %s", resp));
}
JSONArray records = data.getJSONArray("rows");
return records;
}
}
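
For reference, a small sketch (assumed sample input, not part of this change) of the gateway response envelope that `getJSONArray` unwraps: a top-level `data` object holding a `rows` array:

```java
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;

public class GatewayEnvelopeSketch {
    public static void main(String[] args) {
        // Shape of a successful gateway reply (values are made up for illustration).
        String respStr = "{\"code\":0,\"message\":\"\",\"data\":{\"rows\":[{\"username\":\"test_user\"}]}}";
        JSONObject respJson = JSON.parseObject(respStr);
        JSONArray rows = respJson.getJSONObject("data").getJSONArray("rows");
        System.out.println(rows.size());   // prints 1
    }
}
```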

View File

@@ -0,0 +1,100 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.didichuxing.datachannel.kafka.util;
import com.alibaba.fastjson.JSONObject;
public class ResponseCommonResult<T> {
public static final int SUCCESS_STATUS = 0;
public static final int FAILED_STATUS = -1;
public static final String SUCCESS_MESSAGE = "process succeeded!";
public static final String FAILED_MESSAGE = "process failed!";
private int code;
private String message;
private T data;
public static <T> ResponseCommonResult<T> success(T data) {
ResponseCommonResult<T> responseCommonResult = new ResponseCommonResult<T>();
responseCommonResult.setMessage(SUCCESS_MESSAGE);
responseCommonResult.setCode(SUCCESS_STATUS);
responseCommonResult.setData(data);
return responseCommonResult;
}
public static <T> ResponseCommonResult<T> success() {
ResponseCommonResult<T> responseCommonResult = new ResponseCommonResult<T>();
responseCommonResult.setCode(SUCCESS_STATUS);
responseCommonResult.setMessage(SUCCESS_MESSAGE);
return responseCommonResult;
}
public static <T> ResponseCommonResult<T> failure() {
ResponseCommonResult<T> responseCommonResult = new ResponseCommonResult<T>();
responseCommonResult.setMessage(FAILED_MESSAGE);
responseCommonResult.setCode(FAILED_STATUS);
return responseCommonResult;
}
public static <T> ResponseCommonResult<T> failure(String message) {
ResponseCommonResult<T> responseCommonResult = new ResponseCommonResult<T>();
responseCommonResult.setMessage(message);
responseCommonResult.setCode(FAILED_STATUS);
return responseCommonResult;
}
public int getCode() {
return code;
}
public void setCode(int code) {
this.code = code;
}
public String getMessage() {
return message;
}
public void setMessage(String message) {
this.message = message;
}
public T getData() {
return data;
}
public void setData(T data) {
this.data = data;
}
@Override
public String toString() {
JSONObject jsonObject = new JSONObject();
jsonObject.put("code", code);
jsonObject.put("message", message);
jsonObject.put("data", data);
return jsonObject.toJSONString();
}
}

View File

@@ -345,6 +345,7 @@ object KafkaConfig {
/** ********* didi kafka Configuration ***********/
val ClusterIdProp = "cluster.id"
val ClusterIdUpdateProp = "cluster.id.update"
val GatewayUrlProp = "gateway.url"
val DiskLoadProtectorEnableProp = "diskloadprotector.enable"
val SessionReportTimeMsProp = "session.report.ms"
val MaxSessionsPerUserProp = "max.sessions.per.user"
@@ -630,6 +631,7 @@ object KafkaConfig {
"start from " + MaxReservedBrokerIdProp + " + 1."
val ClusterIdDoc = "The cluster id for this kafka cluster. If unset, a random cluster id will be generated."
val ClusterIdUpdateDoc = "Force update of the cluster id for this kafka cluster. If unset, the broker does not start up when the cluster id mismatches."
val GatewayUrlDoc = "The kafka gateway server url for this kafka cluster. If unset, kafka gateway features are disabled."
val MessageMaxBytesDoc = TopicConfig.MAX_MESSAGE_BYTES_DOC +
s"This can be set per topic with the topic level <code>${TopicConfig.MAX_MESSAGE_BYTES_CONFIG}</code> config."
val NumNetworkThreadsDoc = "The number of threads that the server uses for receiving requests from the network and sending responses to the network"
@@ -1009,6 +1011,7 @@ object KafkaConfig {
/** ********* General Configuration ***********/
.define(ClusterIdProp, INT, -1, HIGH, ClusterIdDoc)
.define(ClusterIdUpdateProp, BOOLEAN, false, LOW, ClusterIdUpdateDoc)
.define(GatewayUrlProp, STRING, "", HIGH, GatewayUrlDoc)
.define(DiskLoadProtectorEnableProp, BOOLEAN, true, MEDIUM, "")
.define(SessionReportTimeMsProp, INT, 5*60*1000, MEDIUM, "")
.define(KafkaExMetricsEnableAllProp, BOOLEAN, false, MEDIUM, "")
@@ -1434,6 +1437,7 @@ class KafkaConfig(val props: java.util.Map[_, _], doLog: Boolean, dynamicConfigO
/** ********* didi kafka Configuration ***********/
var clusterId: Int = getInt(KafkaConfig.ClusterIdProp)
var clusterIdUpdate: Boolean = getBoolean(KafkaConfig.ClusterIdUpdateProp)
var gatewayUrl: String = getString(KafkaConfig.GatewayUrlProp)
var diskLoadProtectorEnable: Boolean = getBoolean(KafkaConfig.DiskLoadProtectorEnableProp)
var sessionReportTimeMs: Int = getInt(KafkaConfig.SessionReportTimeMsProp)
var kafkaExMetricsEnableAll: Boolean = getBoolean(KafkaConfig.KafkaExMetricsEnableAllProp)

View File

@@ -19,9 +19,10 @@ package kafka.server
import com.didichuxing.datachannel.kafka.metrics.KafkaExMetrics
import com.didichuxing.datachannel.kafka.report.SessionReport
import com.didichuxing.datachannel.kafka.security.authorizer.SessionManager
import com.didichuxing.datachannel.kafka.security.authorizer.{DidiAuthorizer, SessionManager}
import com.didichuxing.datachannel.kafka.security.login.LoginManager
import com.didichuxing.datachannel.kafka.server.{DiskLoadProtector, OSUtil}
import com.didichuxing.datachannel.kafka.util.HttpUtils
import kafka.api.{KAFKA_0_9_0, KAFKA_2_2_IV0, KAFKA_2_4_IV1}
import kafka.cluster.Broker
import kafka.common.{GenerateBrokerIdException, InconsistentBrokerIdException, InconsistentBrokerMetadataException, InconsistentClusterIdException}
@@ -33,6 +34,7 @@ import kafka.log.{LogConfig, LogManager}
import kafka.metrics.{KafkaMetricsGroup, KafkaMetricsReporter}
import kafka.network.SocketServer
import kafka.security.CredentialProvider
import kafka.security.authorizer.AuthorizerWrapper
import kafka.utils._
import kafka.zk.{BrokerInfo, KafkaZkClient}
import org.apache.kafka.clients._
@@ -317,11 +319,14 @@ class KafkaServer(val config: KafkaConfig, time: Time = Time.SYSTEM, threadNameP
tokenManager = new DelegationTokenManager(config, tokenCache, time , zkClient)
tokenManager.startup()
// LoginManager.getInstance().start(clusterId, config.brokerId, config.interBrokerListenerName, zkClient, kafkaGatewayScheduler.getExecutor,
// gatewayUrl, users.asJava)
val gatewayUrl = config.gatewayUrl
//val users = List("kafka:12345:false")
val users = List()
LoginManager.getInstance().start(clusterId, config.brokerId, config.interBrokerListenerName, zkClient, kafkaGatewayScheduler.getExecutor,
gatewayUrl, users.asJava)
SessionManager.getInstance().start(kafkaGatewayScheduler.getExecutor, config.maxSessionsPerUser)
// SessionReport.getInstance().start(clusterId, config.brokerId, config.gatewayUrl, config.sessionReportTimeMs,
// kafkaGatewayScheduler.getExecutor)
SessionReport.getInstance().start(clusterId, config.brokerId, config.gatewayUrl, config.sessionReportTimeMs,
kafkaGatewayScheduler.getExecutor)
com.didichuxing.datachannel.kafka.security.login.SecurityUtils.start(config)
/* start kafka controller */
@@ -348,7 +353,21 @@ class KafkaServer(val config: KafkaConfig, time: Time = Time.SYSTEM, threadNameP
authorizer.foreach(_.configure(config.originals))
val authorizerFutures: Map[Endpoint, CompletableFuture[Void]] = authorizer match {
case Some(authZ) =>
authZ.start(brokerInfo.broker.toServerInfo(clusterId, config)).asScala.mapValues(_.toCompletableFuture).toMap
authZ match {
case authZWrapper:AuthorizerWrapper =>
authZWrapper.baseAuthorizer match {
case didiAuthorizer: DidiAuthorizer =>
//val acls = List("*:kafka:Read:Allow", "*:kafka:Write:Allow")
val acls = List()
didiAuthorizer.start(clusterId, config.brokerId, zkClient, kafkaGatewayScheduler.getExecutor,
gatewayUrl, acls.asJava)
case _ =>
}
authZ.start(brokerInfo.broker.toServerInfo(clusterId, config)).asScala.mapValues(_.toCompletableFuture).toMap
case _ =>
authZ.start(brokerInfo.broker.toServerInfo(clusterId, config)).asScala.mapValues(_.toCompletableFuture).toMap
}
case None =>
brokerInfo.broker.endPoints.map { ep => ep.toJava -> CompletableFuture.completedFuture[Void](null) }.toMap
}
@@ -403,6 +422,7 @@ class KafkaServer(val config: KafkaConfig, time: Time = Time.SYSTEM, threadNameP
shutdownLatch = new CountDownLatch(1)
startupComplete.set(true)
isStartingUp.set(false)
HttpUtils.setFastFailed(false);
AppInfoParser.registerAppInfo(jmxPrefix, config.brokerId.toString, metrics, time.milliseconds())
info("started")
}

View File

@@ -0,0 +1,92 @@
package com.didichuxing.datachannel.kafka.cache;
import kafka.zk.ZooKeeperTestHarness;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import static org.junit.Assert.*;
public class DataCacheTest extends ZooKeeperTestHarness {
private DataCache<String, Integer> cache = null;
private DataCache<String, Integer> cache2 = null;
private TestDataProvider dataProvider = null;
private boolean checkSyncTime;
public void setUp() {
super.setUp();
String node = System.getProperty("node");
int nodeId = 0;
if (node != null) {
nodeId = Integer.parseInt(node);
}
ScheduledThreadPoolExecutor scheduledThreadPoolExecutor = new ScheduledThreadPoolExecutor(5);
dataProvider = new TestDataProvider(true);
cache = new DataCache<>("ice", nodeId, dataProvider, scheduledThreadPoolExecutor,
new ZkUtil(()->zkClient().currentZooKeeper()), 3000, 10000);
cache2 = new DataCache<>("ice", nodeId+1, dataProvider, scheduledThreadPoolExecutor,
new ZkUtil(()->zkClient().currentZooKeeper()), 3000, 10000);
}
@Test
public void testCache() throws Exception {
for (int i=0; i< 600; i++) {
String key = dataProvider.getRandomKey();
long commitTimestamp = cache.getCommitTimestamp();
TestDataProvider.Element element = dataProvider.getElement(key, commitTimestamp);
Integer value = cache.get(key);
Integer value2 = null;
if (element != null) {
value2 = element.value;
}
Thread.sleep(100);
if (commitTimestamp < cache.getCommitTimestamp()) {
continue;
}
if (element == null) {
if (value != null) {
System.out.println("key: " + key + " ,map value: " + value + " ,element value: " +
value2 + ", timestamp: " + commitTimestamp);
}
assertNull(value);
} else {
if (value == null || element.value != value) {
System.out.println("key: " + key + " ,map value: " + value + " ,element value: " +
value2 + ", timestamp: " + commitTimestamp);
}
assertNotNull(value);
assertEquals(element.value, value.intValue());
}
long timestamp = dataProvider.getLastTimestamp();
long timeSpan = (timestamp - commitTimestamp)/1000;
if (timeSpan < 20) {
checkSyncTime = true;
}
if (checkSyncTime) {
if (timeSpan >= 20) {
System.out.println("now: " + timestamp + ", commit: " + commitTimestamp);
}
Assert.assertTrue(timeSpan < 20);
}
}
}
@Override
public void tearDown() {
if (cache != null) {
cache.stop();
cache2.stop();
}
super.tearDown();
}
}

View File

@@ -0,0 +1,110 @@
package com.didichuxing.datachannel.kafka.cache;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
public class TestDataProvider implements DataProvider {
private final Random random = new Random();
private List<Element> data = new ArrayList<>();
public TestDataProvider(boolean create) {
    if (create) {
        createElements();
    }
    loadElements();
}
private void createElements() {
List<Element> elements = new ArrayList<>();
long timestamp = System.currentTimeMillis();
for (int i= -1800; i < 3600*2; i++) {
Element element = new Element();
element.key = String.format("appid_%04d", random.nextInt(3000));
element.value = random.nextInt(100000);
element.timestamp = timestamp + (i + 3-random.nextInt(6))*1000;
elements.add(element);
}
elements.sort((f1, f2)->{ return (int)(f1.timestamp - f2.timestamp); });
data = elements;
}
private void loadElements() {
}
@Override
public Dataset fetchData(long startTime, long endTime) throws Exception {
if (startTime == 0) {
List<DataRecord> entries = new ArrayList<>();
for (Element element : data) {
if (element.timestamp < endTime) {
DataRecord record = new DataRecord<String, Integer>(element.key, element.value,
DataRecord.Operation.update, element.timestamp);
entries.add(record);
} else {
break;
}
}
Dataset dataset = new Dataset(entries, endTime);
return dataset;
}else {
long now = System.currentTimeMillis();
long timestamp = startTime;
List<DataRecord> entries = new ArrayList<>();
for (Element element : data) {
if (element.timestamp < startTime) {
continue;
} else if (element.timestamp >= now) {
break;
} else {
timestamp = element.timestamp;
DataRecord record = new DataRecord<String, Integer>(element.key, element.value,
DataRecord.Operation.update, element.timestamp);
entries.add(record);
}
}
Dataset dataset = new Dataset(entries, timestamp);
return dataset;
}
}
public String getRandomKey() {
String key = String.format("appid_%04d", random.nextInt(3000));
return key;
}
public Element getElement(String key, long timestamp) {
Element el = null;
for (Element element : data) {
if (element.timestamp < timestamp) {
if (element.key.equals(key)) {
el = element;
}
} else {
break;
}
}
return el;
}
public long getLastTimestamp() {
long now = System.currentTimeMillis();
Element el = null;
for (Element element : data) {
if (element.timestamp >= now) {
break;
}
el = element;
}
return el.timestamp;
}
public class Element {
long timestamp;
String key;
int value;
}
}

View File

@@ -0,0 +1,67 @@
package com.didichuxing.datachannel.kafka.report;
import com.alibaba.fastjson.JSONObject;
import com.didichuxing.datachannel.kafka.security.authorizer.DidiAuthorizer;
import com.didichuxing.datachannel.kafka.security.authorizer.SessionManager;
import com.didichuxing.datachannel.kafka.security.login.LoginManager;
import kafka.network.RequestChannel;
import kafka.security.auth.*;
import kafka.zk.ZooKeeperTestHarness;
import org.apache.kafka.common.security.auth.KafkaPrincipal;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import java.net.InetAddress;
import java.util.List;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import static org.junit.Assert.assertTrue;
public class SessionReportTest extends ZooKeeperTestHarness {
private DidiAuthorizer didiAuthorizer;
@Before
public void setUp() {
super.setUp();
ScheduledThreadPoolExecutor scheduledThreadPoolExecutor = new ScheduledThreadPoolExecutor(1);
LoginManager.getInstance().start("0", 0, zkClient(), scheduledThreadPoolExecutor, "",
List.of("kafka:*:false"));
SessionManager.getInstance().start(scheduledThreadPoolExecutor);
didiAuthorizer = new DidiAuthorizer();
didiAuthorizer.start("0", 0, zkClient(), scheduledThreadPoolExecutor, "",
List.of("*:kafka:Read:Allow", "*:kafka:Write:Allow"));
}
@Test
public void testGetTopicHeartBeat() throws Exception {
RequestChannel.Session session =
new RequestChannel.Session(
new KafkaPrincipal(KafkaPrincipal.USER_TYPE, "kafka"), InetAddress.getLocalHost(), -1);
Operation operation1 = Read$.MODULE$;
Resource resource1 = new Resource(Topic$.MODULE$, "test");
didiAuthorizer.authorize(session, operation1, resource1);
Operation operation2 = Write$.MODULE$;
Resource resource2 = new Resource(Topic$.MODULE$, "test");
didiAuthorizer.authorize(session, operation2, resource2);
String key = "kafka#" + InetAddress.getLocalHost().getHostAddress() + "#unknown#unknown";
SessionReport sessionReport = SessionReport.getInstance();
JSONObject jsonObject = JSONObject.parseObject(sessionReport.getTopicHeartBeat());
String produceArray = JSONObject.parseObject(jsonObject.get("produce").toString()).get("test").toString();
String fetchArray = JSONObject.parseObject(jsonObject.get("fetch").toString()).get("test").toString();
assertTrue("produce size should be one", JSONObject.parseArray(produceArray).size() == 1 && JSONObject.parseArray(produceArray).get(0).toString().equals(key));
assertTrue("fetch size should be one", JSONObject.parseArray(produceArray).size() == 1 && JSONObject.parseArray(fetchArray).get(0).toString().equals(key));
}
@After
public void stop() throws Exception{
SessionManager.getInstance().shutdown();
didiAuthorizer.stop();
}
}

View File

@@ -0,0 +1,52 @@
package com.didichuxing.datachannel.kafka.security;
import com.didichuxing.datachannel.kafka.security.authorizer.DidiAuthorizer;
import com.didichuxing.datachannel.kafka.security.authorizer.SessionManager;
import com.didichuxing.datachannel.kafka.security.login.LoginManager;
import kafka.network.RequestChannel;
import kafka.security.auth.*;
import kafka.zk.ZooKeeperTestHarness;
import org.apache.kafka.common.security.auth.KafkaPrincipal;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import java.io.FileReader;
import java.net.InetAddress;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import static org.junit.Assert.*;
public class DidiAuthorizerTest extends ZooKeeperTestHarness {
private DidiAuthorizer didiAuthorizer;
@Before
public void setUp() {
super.setUp();
ScheduledThreadPoolExecutor scheduledThreadPoolExecutor = new ScheduledThreadPoolExecutor(1);
SessionManager.getInstance().start(scheduledThreadPoolExecutor);
didiAuthorizer = new DidiAuthorizer();
didiAuthorizer.start("0", 0, zkClient(), scheduledThreadPoolExecutor, "",
List.of("*:kafka:Read:Allow", "*:kafka:Write:Allow"));
}
@Test
public void testAuthorize() throws Exception {
RequestChannel.Session session =
new RequestChannel.Session(
new KafkaPrincipal(KafkaPrincipal.USER_TYPE, "kafka"), InetAddress.getByName("localhost"), -1);
Operation operation = Read$.MODULE$;
Resource resource = new Resource(Topic$.MODULE$, "test");
assertTrue(didiAuthorizer.authorize(session, operation, resource));
}
@After
public void stop() throws Exception{
LoginManager.getInstance().shutdown();
SessionManager.getInstance().shutdown();
didiAuthorizer.stop();
}
}

View File

@@ -0,0 +1,47 @@
package com.didichuxing.datachannel.kafka.security;
import com.didichuxing.datachannel.kafka.security.login.LoginManager;
import com.didichuxing.datachannel.kafka.security.login.User;
import kafka.zk.ZooKeeperTestHarness;
import org.junit.*;
import java.util.List;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import static org.junit.Assert.*;
public class LoginTest extends ZooKeeperTestHarness {
@Before
public void setUp() {
super.setUp();
ScheduledThreadPoolExecutor scheduledThreadPoolExecutor = new ScheduledThreadPoolExecutor(1);
LoginManager.getInstance().start("0", 0, zkClient(), scheduledThreadPoolExecutor, "",
List.of("kafka-admin:diditest@bdt:true", "kafka:12345:false"));
}
@Test
public void testUsers() throws Exception {
assertFalse(LoginManager.getInstance().login("admin", "12345", ""));
User user = LoginManager.getInstance().getUser("admin");
assertTrue(user.isSuperUser());
assertTrue(LoginManager.getInstance().login("kafka-admin", "diditest@bdt", ""));
user = LoginManager.getInstance().getUser("kafka-admin");
assertTrue(user.isSuperUser());
user = LoginManager.getInstance().getUser("ANONYMOUS");
assertTrue(user.isSuperUser());
assertTrue(LoginManager.getInstance().login("kafka", "12345", ""));
assertFalse(LoginManager.getInstance().login("kafka", "123456", ""));
user = LoginManager.getInstance().getUser("kafka");
assertFalse(user.isSuperUser());
}
@After
public void stop() throws Exception{
LoginManager.getInstance().shutdown();
}
}

View File

@@ -0,0 +1,309 @@
package com.didichuxing.datachannel.kafka.server;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.didichuxing.datachannel.kafka.util.KafkaUtils;
import com.didichuxing.datachannel.kafka.util.ScalaUtil;
import com.sun.net.httpserver.HttpExchange;
import com.sun.net.httpserver.HttpServer;
import kafka.server.KafkaConfig;
import kafka.utils.TestUtils;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.clients.producer.*;
import org.apache.kafka.common.network.ListenerName;
import org.apache.kafka.common.protocol.ApiKeys;
import org.apache.kafka.common.security.auth.SecurityProtocol;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Option;
import scala.collection.Seq;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.time.Duration;
import java.util.List;
import java.util.Properties;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
public class KafkaGatewayTest extends kafka.integration.KafkaServerTestHarness {
protected static final Logger log = LoggerFactory.getLogger(KafkaGatewayTest.class);
private KafkaGatewayServics kafkaGatewayServics;
private final String testUserName = "test_user";
private final String testUserPassword = "12345";
private final String testTopic = "test_0";
private JSONObject sessionReportResult;
@Override
public void setUp() {
createKafkaGatewayServices();
super.setUp();
}
void createKafkaGatewayServices() {
try {
kafkaGatewayServics = new KafkaGatewayServics();
} catch (IOException e) {
e.printStackTrace();
}
assertNotNull(kafkaGatewayServics);
assertNotNull(kafkaGatewayServics.getKafkaGatewayUrl());
}
@Override
public Seq<KafkaConfig> generateConfigs() {
Properties properties = TestUtils.createBrokerConfig(0, zkConnect(),
true, true, TestUtils.RandomPort(), Option.apply(SecurityProtocol.SASL_PLAINTEXT),
Option.apply(null), Option.apply(null), true, true, TestUtils.RandomPort(), false,
TestUtils.RandomPort(), false, TestUtils.RandomPort(), Option.apply(null), 1, false, 1, (short) 1);
properties.setProperty(KafkaConfig.GatewayUrlProp(), kafkaGatewayServics.getKafkaGatewayUrl());
properties.setProperty(KafkaConfig.ClusterIdProp(), "0");
properties.setProperty(KafkaConfig.SessionReportTimeMsProp(), "30000");
properties.setProperty(KafkaConfig.KafkaExMetricsEnableAllProp(), "true");
properties.setProperty("authorizer.class.name", "com.didichuxing.datachannel.kafka.security.authorizer.DidiAuthorizer");
/*
listeners=SASL_PLAINTEXT://:9093,PLAINTEXT://:9092
security.inter.broker.protocol=SASL_PLAINTEXT
#security.inter.broker.protocol=PLAINTEXT
sasl.mechanism.inter.broker.protocol=PLAIN
authorizer.class.name=com.didichuxing.datachannel.kafka.security.authorizer.DidiAuthorizer
*/
configSaslServer(properties);
var config = KafkaConfig.fromProps(properties);
return ScalaUtil.toSeq(List.of(config));
}
private KafkaProducer<String, String> createProducer() {
Properties props = new Properties();
props.put("bootstrap.servers", TestUtils.bootstrapServers(servers(),
ListenerName.forSecurityProtocol(SecurityProtocol.SASL_PLAINTEXT)));
props.put("request.timeout.ms", "5000");
//props.put("buffer.memory", 2*1024*1024);
props.put("compression.type", "lz4"); //压缩方式
//props.put("batch.size", 1024);
props.put("linger.ms", 1000 );
props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
configSaslClient(props, "0", testUserName, testUserPassword);
return new KafkaProducer<>(props);
}
private KafkaConsumer<String, String> createConsumer() {
Properties props = new Properties();
props.put("bootstrap.servers", TestUtils.bootstrapServers(servers(),
ListenerName.forSecurityProtocol(SecurityProtocol.SASL_PLAINTEXT)));
props.put(ConsumerConfig.GROUP_ID_CONFIG, "cg-007");
props.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "true");
props.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "1000");
props.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, "30000");
props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
props.put(ConsumerConfig.FETCH_MAX_BYTES_CONFIG, 999);
//props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest");
props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer");
props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer");
configSaslClient(props, "0", testUserName, testUserPassword);
return new KafkaConsumer<String, String>(props);
}
private static void configSaslServer(Properties properties) {
properties.put("sasl.enabled.mechanisms", "PLAIN"); //安全认证机制
properties.put("sasl.mechanism.inter.broker.protocol", "PLAIN");
String jaas_config = "com.didichuxing.datachannel.kafka.security.sasl.plain.PlainLoginModule " +
"required username=\"admin\" password=\"*\";";
properties.put("listener.name.sasl_plaintext.plain.sasl.jaas.config", jaas_config);
}
private static void configSaslClient(Properties properties, String clusterId, String username, String password) {
properties.put("security.protocol", "SASL_PLAINTEXT"); //安全认证协议
properties.put("sasl.mechanism", "PLAIN"); //安全认证机制
String format = "org.apache.kafka.common.security.plain.PlainLoginModule required username=\"%s.%s\" password=\"%s\";";
String jaas_config = String.format(format, clusterId, username, password);
properties.put("sasl.jaas.config", jaas_config);
}
@Override
public void tearDown() {
super.tearDown();
kafkaGatewayServics.stop();
}
@Test
public void testProducerAndConsumer() throws Exception {
long startTime = System.currentTimeMillis();
TestUtils.createTopic(zkClient(), testTopic, 1, 1, servers(), new Properties());
int numRecords = 50;
final int[] success = {0};
var producer = createProducer();
for (int i = 0; i < numRecords; i++) {
final int index = i;
var f = producer.send(new ProducerRecord<String, String>(testTopic, "hello world"), new Callback() {
@Override
public void onCompletion(RecordMetadata recordMetadata, Exception e) {
if (null != e) {
System.out.println(e.getMessage());
} else {
success[0]++;
assertEquals(recordMetadata.offset(), index);
}
}
});
f.get();
}
producer.close();
assertEquals(numRecords, success[0]);
KafkaConsumer<String, String> consumer = createConsumer();
consumer.subscribe(List.of(testTopic));
for (int i=0; i<success[0]; ){
ConsumerRecords<String, String> records = consumer.poll(Duration.ofMillis(1000));
for (ConsumerRecord<String, String> record : records) {
assertEquals(record.offset(), i);
i++;
}
}
consumer.close();
verifySessionReport();
}
private void verifySessionReport() throws Exception{
synchronized (kafkaGatewayServics) {
kafkaGatewayServics.wait(30000);
}
assertNotNull(sessionReportResult);
JSONObject produceInfo = sessionReportResult.getJSONObject("produce");
JSONArray connInfo = produceInfo.getJSONArray(testTopic);
String value = connInfo.getString(0);
assertEquals(value, "test_user#127.0.0.1#" +
KafkaUtils.apiVersionToKafkaVersion(ApiKeys.PRODUCE.id, ApiKeys.PRODUCE.latestVersion()) +
"#consumer-cg-007-1"
);
JSONObject fetchInfo = sessionReportResult.getJSONObject("fetch");
connInfo = fetchInfo.getJSONArray(testTopic);
value = connInfo.getString(0);
assertEquals(value, "test_user#127.0.0.1#" +
KafkaUtils.apiVersionToKafkaVersion(ApiKeys.FETCH.id, ApiKeys.FETCH.latestVersion()) +
"#consumer-cg-007-1"
);
sessionReportResult = null;
}
class KafkaGatewayServics {
private final Thread httpServerThread;
private final HttpServer httpServer;
private final String kafkaGatewayUrl;
public KafkaGatewayServics() throws IOException {
httpServer = HttpServer.create(new InetSocketAddress(0), 0);
httpServer.createContext("/api/v1/security/users", (HttpExchange ex) -> {
try {
String requestBody = new String(ex.getRequestBody().readAllBytes());
JSONObject jsonObject = JSONObject.parseObject(requestBody);
log.info("recieve request {} {}", ex.getRequestURI(), jsonObject.toJSONString());
JSONArray users = new JSONArray();
JSONObject user = new JSONObject();
user.put("username", testUserName);
user.put("password", testUserPassword);
user.put("userType", "0");
user.put("timestamp", System.currentTimeMillis() - 5000);
user.put("operation", "0");
users.add(user);
JSONObject resp = new JSONObject();
resp.put("code", 0);
resp.put("message", "");
JSONObject data = new JSONObject();
data.put("rows", users);
resp.put("data", data);
String respData = resp.toJSONString();
ex.sendResponseHeaders(200, respData.length());
var output = ex.getResponseBody();
output.write(respData.getBytes());
ex.close();
}catch(Exception e) {
log.info("handle request exception: ", e);
}
});
httpServer.createContext("/api/v1/security/acls", (HttpExchange ex) -> {
try {
String requestBody = new String(ex.getRequestBody().readAllBytes());
JSONObject jsonObject = JSONObject.parseObject(requestBody);
log.info("recieve request {} {}", ex.getRequestURI(), jsonObject.toJSONString());
JSONArray acls = new JSONArray();
JSONObject acl = new JSONObject();
acl.put("topicName", testTopic);
acl.put("username", testUserName);
acl.put("access", "3");
acl.put("timestamp", System.currentTimeMillis()-5000);
acl.put("operation", "0");
acls.add(acl);
JSONObject resp = new JSONObject();
resp.put("code", 0);
resp.put("message", "");
JSONObject data = new JSONObject();
data.put("rows", acls);
resp.put("data", data);
String respData = resp.toJSONString();
ex.sendResponseHeaders(200, respData.length());
var output = ex.getResponseBody();
output.write(respData.getBytes());
ex.close();
}catch (Exception e) {
log.info("handle request exception: ", e);
}
});
httpServer.createContext("/api/v1/heartbeat/survive-user", (HttpExchange ex) -> {
try {
String requestBody = new String(ex.getRequestBody().readAllBytes());
JSONObject jsonObject = JSONObject.parseObject(requestBody);
log.info("recieve request {} {}", ex.getRequestURI(), jsonObject.toJSONString());
sessionReportResult = jsonObject;
synchronized (kafkaGatewayServics) {
kafkaGatewayServics.notify();
}
JSONObject resp = new JSONObject();
resp.put("code", 0);
resp.put("message", "");
resp.put("data", "");
String respData = resp.toJSONString();
ex.sendResponseHeaders(200, respData.length());
var output = ex.getResponseBody();
output.write(respData.getBytes());
output.close();
ex.close();
}catch (Exception e) {
log.info("handle request exception: ", e);
}
});
httpServer.setExecutor(null);
httpServerThread = new Thread(httpServer::start);
var address = httpServer.getAddress();
kafkaGatewayUrl = String.format("http://localhost:%s", address.getPort());
httpServerThread.start();
}
public void stop() {
try {
httpServer.stop(0);
httpServerThread.join();
}catch (Exception e) {
log.error("stop exception: ", e);
}
}
public String getKafkaGatewayUrl() {
return kafkaGatewayUrl;
}
}
}

View File

@@ -0,0 +1,31 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE
* file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file
* to You under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the
* License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
*/
package com.didichuxing.datachannel.kafka.util;
import com.alibaba.fastjson.JSONObject;
import org.junit.Test;
import static org.junit.Assert.assertEquals;
public class JsonUtilsTest {
@Test
public void testString2ResponseCommonResult() {
String msg = "success message";
String jsonString = JSONObject.toJSONString(ResponseCommonResult.success(msg));
ResponseCommonResult result = JsonUtils.string2ResponseCommonResult(jsonString);
assertEquals(ResponseCommonResult.SUCCESS_STATUS, result.getCode());
assertEquals(msg, result.getData().toString());
}
}

View File

@@ -28,3 +28,5 @@ log4j.appender.stdout.layout.ConversionPattern=[%d][%p][%t](%F:%L): %m%n
# zkclient can be verbose, during debugging it is common to adjust it separately
log4j.logger.org.apache.zookeeper=WARN
#log4j.logger.com.didichuxing.datachannel.kafka.server.KafkaGatewayTest=DEBUG

View File

@@ -607,6 +607,7 @@ class KafkaConfigTest {
case KafkaConfig.BrokerIdProp => assertPropertyInvalid(getBaseProperties(), name, "not_a_number")
case KafkaConfig.ClusterIdProp => assertPropertyInvalid(getBaseProperties(), name, "not_a_number")
case KafkaConfig.GatewayUrlProp =>
case KafkaConfig.DiskLoadProtectorEnableProp=>
case KafkaConfig.SessionReportTimeMsProp=>
case KafkaConfig.KafkaExMetricsEnableAllProp=>

View File

@@ -0,0 +1,243 @@
## Preface
We previously analyzed [Kafka source: Controller startup and election flow analysis](). During that analysis, after a Broker is elected Controller it has to initialize the Controller context, and I skipped the details of the network communication between the Controller and the Brokers because that part deserves its own write-up. So today we take a close look at **the network communication between the Controller and the Brokers**.
## Source analysis
### 1. Entry point: ControllerChannelManager.startup()
Call chain:
->`KafkaController.processStartup`
->`KafkaController.elect()`
->`KafkaController.onControllerFailover()`
->`KafkaController.initializeControllerContext()`
```scala
def startup() = {
// call addNewBroker for every live broker
controllerContext.liveOrShuttingDownBrokers.foreach(addNewBroker)
brokerLock synchronized {
//start the request-send threads
brokerStateInfo.foreach(brokerState => startRequestSendThread(brokerState._1))
}
}
```
### 2. addNewBroker: building the connection info for each broker
> For every live broker, objects such as the `NetworkClient`, the `RequestSendThread` and so on are wrapped into a `ControllerBrokerStateInfo` object;
> `brokerStateInfo` holds these objects as a map with key = brokerId and value = `ControllerBrokerStateInfo`
```scala
private def addNewBroker(broker: Broker): Unit = {
// some code omitted
val threadName = threadNamePrefix match {
case None => s"Controller-${config.brokerId}-to-broker-${broker.id}-send-thread"
case Some(name) => s"$name:Controller-${config.brokerId}-to-broker-${broker.id}-send-thread"
}
val requestRateAndQueueTimeMetrics = newTimer(
RequestRateAndQueueTimeMetricName, TimeUnit.MILLISECONDS, TimeUnit.SECONDS, brokerMetricTags(broker.id)
)
//build the request-send thread
val requestThread = new RequestSendThread(config.brokerId, controllerContext, messageQueue, networkClient,
brokerNode, config, time, requestRateAndQueueTimeMetrics, stateChangeLogger, threadName)
requestThread.setDaemon(false)
val queueSizeGauge = newGauge(QueueSizeMetricName, () => messageQueue.size, brokerMetricTags(broker.id))
//wrap the objects and cache them in brokerStateInfo
brokerStateInfo.put(broker.id, ControllerBrokerStateInfo(networkClient, brokerNode, messageQueue,
requestThread, queueSizeGauge, requestRateAndQueueTimeMetrics, reconfigurableChannelBuilder))
}
```
1. Every live broker is wrapped into a `ControllerBrokerStateInfo` object and kept in the cache; that object contains the `RequestSendThread` request-send thread. When the send thread actually runs is analyzed below.
2. `messageQueue` is a blocking queue holding the pending requests. Each entry is a `QueueItem` that wraps the request API (`ApiKeys`), the `AbstractControlRequest` request builder, the `AbstractResponse` callback and the `enqueueTimeMs` enqueue time (a simplified sketch follows this list).
3. `RequestSendThread` is the thread that sends the requests; all network traffic to the brokers goes through it, for example the `UPDATE_METADATA` metadata-update request sent to the brokers (including the Controller's own broker) shown in the figure below.
![image](https://img-blog.csdnimg.cn/20210611174518555.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
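To make the structure above concrete, here is a simplified Java sketch (illustrative only, not the actual Kafka Scala code) of the per-broker queue: each broker gets a blocking queue of pending `QueueItem`-style entries, and a dedicated send thread drains it.
```java
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.function.Consumer;

class QueueItemSketch {
    final String apiKey;            // e.g. "UPDATE_METADATA"
    final Object requestBuilder;    // stands in for AbstractControlRequest.Builder
    final Consumer<Object> callback;
    final long enqueueTimeMs;

    QueueItemSketch(String apiKey, Object requestBuilder, Consumer<Object> callback, long enqueueTimeMs) {
        this.apiKey = apiKey;
        this.requestBuilder = requestBuilder;
        this.callback = callback;
        this.enqueueTimeMs = enqueueTimeMs;
    }
}

class BrokerStateSketch {
    // One queue per broker; the send thread take()s from it and blocks while it is empty.
    final BlockingQueue<QueueItemSketch> messageQueue = new LinkedBlockingQueue<>();
}
```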
### 3. startRequestSendThread: starting the network request threads
>Start the request-send thread for every broker connection
```scala
protected def startRequestSendThread(brokerId: Int): Unit = {
val requestThread = brokerStateInfo(brokerId).requestSendThread
if (requestThread.getState == Thread.State.NEW)
requestThread.start()
}
```
The thread's work loop (parts of the code are omitted below):
```scala
override def doWork(): Unit = {
def backoff(): Unit = pause(100, TimeUnit.MILLISECONDS)
//take a pending request from the blocking queue, if any
val QueueItem(apiKey, requestBuilder, callback, enqueueTimeMs) = queue.take()
requestRateAndQueueTimeMetrics.update(time.milliseconds() - enqueueTimeMs, TimeUnit.MILLISECONDS)
var clientResponse: ClientResponse = null
try {
var isSendSuccessful = false
while (isRunning && !isSendSuccessful) {
// if a broker goes down for a long time, then at some point the controller's zookeeper listener will trigger a
// removeBroker which will invoke shutdown() on this thread. At that point, we will stop retrying.
try {
//check that the broker connection is healthy; if not, retry
if (!brokerReady()) {
isSendSuccessful = false
backoff()
}
else {
//build the client request
val clientRequest = networkClient.newClientRequest(brokerNode.idString, requestBuilder,
time.milliseconds(), true)
//send the request over the network
clientResponse = NetworkClientUtils.sendAndReceive(networkClient, clientRequest, time)
isSendSuccessful = true
}
} catch {
}
if (clientResponse != null) {
val requestHeader = clientResponse.requestHeader
val api = requestHeader.apiKey
if (api != ApiKeys.LEADER_AND_ISR && api != ApiKeys.STOP_REPLICA && api != ApiKeys.UPDATE_METADATA)
throw new KafkaException(s"Unexpected apiKey received: $apiKey")
if (callback != null) {
callback(clientResponse.responseBody)
}
}
} catch {
}
}
```
1. Take a request from the `queue`; if one is available it is processed, otherwise the thread blocks.
2. Check whether the target broker is reachable; if not, it keeps retrying until, at some point, the Controller's ZooKeeper listener triggers a `removeBroker`, which calls shutdown() on this thread and stops the retries.
3. Send the request.
4. If the request fails, reconnect to the broker and send it again.
5. On success, invoke the callback.
6. Note that <font color="red">for a request initiated by the Controller, if the ApiKeys in the response is not one of `LEADER_AND_ISR`, `STOP_REPLICA` or `UPDATE_METADATA`, an exception is thrown and the callback is not invoked.</font> It is a bit odd that, if the Controller is only allowed to issue these requests, the check is done after the response comes back rather than before the request is sent. **My guess is that the broker echoes the ApiKeys in the response so that the Controller's callback can branch on it, while the Controller still only wants to expose those three APIs to the brokers.** (A condensed sketch of this send loop follows the list.)
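The send loop described above boils down to the following condensed Java sketch (illustrative only, not the real `RequestSendThread`): block on the queue, retry with a backoff until the request goes through, then invoke the callback.
```java
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.function.BooleanSupplier;

class RequestSendLoopSketch {
    interface Item {
        boolean trySend() throws Exception;   // connect + sendAndReceive, true on success
        void onResponse();                    // the registered callback
    }

    void doWork(BlockingQueue<Item> queue, BooleanSupplier isRunning) throws InterruptedException {
        Item item = queue.take();                         // blocks while the queue is empty
        boolean sent = false;
        while (isRunning.getAsBoolean() && !sent) {
            try {
                sent = item.trySend();
            } catch (Exception e) {
                sent = false;                             // treat any failure as "retry"
            }
            if (!sent) {
                TimeUnit.MILLISECONDS.sleep(100);         // backoff(), then try again
            }
        }
        if (sent) {
            item.onResponse();                            // callback only on success
        }
    }
}
```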
### 4. Adding requests to the RequestSendThread queue
> After the thread above has started, the queue has no pending requests yet, so when do requests get added?
Every enqueue path ultimately goes through the method below; searching for its callers shows where requests originate:
```scala
def sendRequest(brokerId: Int, request: AbstractControlRequest.Builder[_ <: AbstractControlRequest],
callback: AbstractResponse => Unit = null): Unit = {
brokerLock synchronized {
val stateInfoOpt = brokerStateInfo.get(brokerId)
stateInfoOpt match {
case Some(stateInfo) =>
stateInfo.messageQueue.put(QueueItem(request.apiKey, request, callback, time.milliseconds()))
case None =>
warn(s"Not sending request $request to broker $brokerId, since it is offline.")
}
}
}
```
**Here is an example** 🌰: the Controller sending an `UPDATE_METADATA` request to a broker;
![image](https://img-blog.csdnimg.cn/20210611182731937.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
![image](https://img-blog.csdnimg.cn/20210611183114551.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
1. You can see that `sendRequest` is called, with the request's ApiKey=`UPDATE_METADATA`.
2. The callback is as shown above: it adds an `UpdateMetadataResponseReceived` event to the event manager `ControllerChannelManager`.
3. Once the request succeeds, the callback from step 2 is invoked, the `UpdateMetadataResponseReceived` event is added to the event manager and is executed shortly afterwards (after queueing).
4. The place where it is handled is shown below; it does not do much, it only logs when the response carries an error.
![image](https://img-blog.csdnimg.cn/2021061118385771.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
### 5. Brokers receiving the Controller's requests
> We said above that the Controller sends requests to all brokers (including itself); so where do the brokers receive those requests? Let's take a look.
We already covered this part in [Kafka source: TopicCommand topic creation analysis](); the handling flow is the same.
Continuing the example above 🌰, once the request is sent, the broker handles it in `apis.handle(request)` inside `KafkaRequestHandler.run`;
![image](https://img-blog.csdnimg.cn/20210611184840506.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
You can see that every API request is dispatched here; locate the `UPDATE_METADATA` handling branch.
We will not dive into that handler here, as it is beyond the scope of this article.
### 6. Broker going offline
Let's simulate a broker crash by manually deleting the broker's node under `/brokers/ids/` in ZooKeeper; since the Controller has a `watch` on that node, it receives the change notification and calls `KafkaController.processBrokerChange()`;
```scala
private def processBrokerChange(): Unit = {
if (!isActive) return
val curBrokerAndEpochs = zkClient.getAllBrokerAndEpochsInCluster
val curBrokerIdAndEpochs = curBrokerAndEpochs map { case (broker, epoch) => (broker.id, epoch) }
val curBrokerIds = curBrokerIdAndEpochs.keySet
val liveOrShuttingDownBrokerIds = controllerContext.liveOrShuttingDownBrokerIds
val newBrokerIds = curBrokerIds -- liveOrShuttingDownBrokerIds
val deadBrokerIds = liveOrShuttingDownBrokerIds -- curBrokerIds
val bouncedBrokerIds = (curBrokerIds & liveOrShuttingDownBrokerIds)
.filter(brokerId => curBrokerIdAndEpochs(brokerId) > controllerContext.liveBrokerIdAndEpochs(brokerId))
val newBrokerAndEpochs = curBrokerAndEpochs.filter { case (broker, _) => newBrokerIds.contains(broker.id) }
val bouncedBrokerAndEpochs = curBrokerAndEpochs.filter { case (broker, _) => bouncedBrokerIds.contains(broker.id) }
val newBrokerIdsSorted = newBrokerIds.toSeq.sorted
val deadBrokerIdsSorted = deadBrokerIds.toSeq.sorted
val liveBrokerIdsSorted = curBrokerIds.toSeq.sorted
val bouncedBrokerIdsSorted = bouncedBrokerIds.toSeq.sorted
info(s"Newly added brokers: ${newBrokerIdsSorted.mkString(",")}, " +
s"deleted brokers: ${deadBrokerIdsSorted.mkString(",")}, " +
s"bounced brokers: ${bouncedBrokerIdsSorted.mkString(",")}, " +
s"all live brokers: ${liveBrokerIdsSorted.mkString(",")}")
newBrokerAndEpochs.keySet.foreach(controllerChannelManager.addBroker)
bouncedBrokerIds.foreach(controllerChannelManager.removeBroker)
bouncedBrokerAndEpochs.keySet.foreach(controllerChannelManager.addBroker)
deadBrokerIds.foreach(controllerChannelManager.removeBroker)
if (newBrokerIds.nonEmpty) {
controllerContext.addLiveBrokersAndEpochs(newBrokerAndEpochs)
onBrokerStartup(newBrokerIdsSorted)
}
if (bouncedBrokerIds.nonEmpty) {
controllerContext.removeLiveBrokers(bouncedBrokerIds)
onBrokerFailure(bouncedBrokerIdsSorted)
controllerContext.addLiveBrokersAndEpochs(bouncedBrokerAndEpochs)
onBrokerStartup(bouncedBrokerIdsSorted)
}
if (deadBrokerIds.nonEmpty) {
controllerContext.removeLiveBrokers(deadBrokerIds)
onBrokerFailure(deadBrokerIdsSorted)
}
if (newBrokerIds.nonEmpty || deadBrokerIds.nonEmpty || bouncedBrokerIds.nonEmpty) {
info(s"Updated broker epochs cache: ${controllerContext.liveBrokerIdAndEpochs}")
}
}
```
1. 这里会去zk里面获取所有的Broker信息; 并将得到的数据跟当前Controller缓存中的所有Broker信息做对比;
2. 如果有新上线的Broker,则会执行 Broker上线的流程
3. 如果有删除的Broker,则执行Broker下线的流程; 比如`removeLiveBrokers`
收到节点删除通知之后, Controller 会认为该Broker已经下线; 即使那台Broker服务本身是正常的,它也仍旧提供不了服务
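`processBrokerChange`里判断"新上线 / 已下线 / 重启过"靠的就是几组集合运算,可以用一个可独立运行的小例子来体会(示例数据为假设):
```scala
// 独立小示例: 用集合运算区分 新上线 / 已下线 / 重启过(epoch变大) 的Broker
object BrokerChangeDemo extends App {
  val curBrokerIdAndEpochs  = Map(1 -> 10L, 2 -> 25L, 4 -> 5L)   // 当前zk里的 brokerId -> epoch
  val liveBrokerIdAndEpochs = Map(1 -> 10L, 2 -> 20L, 3 -> 7L)   // Controller缓存里的 brokerId -> epoch

  val curBrokerIds  = curBrokerIdAndEpochs.keySet
  val liveBrokerIds = liveBrokerIdAndEpochs.keySet

  val newBrokerIds     = curBrokerIds -- liveBrokerIds                  // Set(4) 新上线
  val deadBrokerIds    = liveBrokerIds -- curBrokerIds                  // Set(3) 已下线
  val bouncedBrokerIds = (curBrokerIds & liveBrokerIds)
    .filter(id => curBrokerIdAndEpochs(id) > liveBrokerIdAndEpochs(id)) // Set(2) 重启过

  println(s"new=$newBrokerIds, dead=$deadBrokerIds, bounced=$bouncedBrokerIds")
}
```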
### 7. Broker上下线
本篇主要讲解**Controller与Brokers之间的网络通信**
故**Broker上下线**内容单独开一篇文章来详细讲解 [【kafka源码】Brokers的上下线流程](https://shirenchuang.blog.csdn.net/article/details/117846476)
## 源码总结
本篇文章内容比较简单, Controller和Broker之间的通信就是通过 `RequestSendThread` 这个线程来进行发送请求;
`RequestSendThread`维护的阻塞请求队列在没有任务的时候处于阻塞状态;
当有需要发起请求的时候,直接向`queue`中添加任务就行了;
Controller自身也是一个Broker,所以Controller发出的请求,自己也会收到并且执行
## Q&A
### 如果Controller与Broker网络连接不通会怎么办
> 会一直进行重试, 直到zookeeper发现与Broker的会话有问题,将这台Broker的节点移除; Controller收到通知后,会把与这台Broker对应的`RequestSendThread`线程shutdown,之后就不会再重试了; 如果zk跟Broker之间的网络通信是正常的,只是Controller发起的请求一直失败,则会一直重试下去
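下面用一段简化的骨架示意上面说的重试逻辑(非源码原文,`brokerReady`/`backoff`/`sendAndReceive` 等名称仅表意):
```scala
// RequestSendThread.doWork 的重试骨架(简化示意, 非源码原文)
val QueueItem(apiKey, requestBuilder, callback, enqueueTimeMs) = queue.take()
var isSendSuccessful = false
while (isRunning && !isSendSuccessful) {
  try {
    if (!brokerReady()) {
      backoff()                                     // 连接未就绪, 等一会儿再试
    } else {
      val response = sendAndReceive(requestBuilder) // 阻塞发送并等待响应
      isSendSuccessful = true
      if (callback != null) callback(response)      // 成功后执行回调
    }
  } catch {
    case _: Throwable =>
      networkClient.close(brokerNode.idString)      // 断开连接, 下个循环重连重试
      backoff()
  }
}
```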
### 如果手动将zk中的 /brokers/ids/ 下的子节点删除会怎么样?
>手动删除` /brokers/ids/Broker的ID`后, Controller收到变更通知,将该Broker在Controller中按下线逻辑处理; 所以该Broker已经游离于集群之外,即使它的服务还是正常的,也提供不了服务了; 只能重启该Broker重新注册;

View File

@@ -0,0 +1,289 @@
## 前言
>Controller中有两个状态机,分别是`ReplicaStateMachine 副本状态机`和`PartitionStateMachine 分区状态机`; 它们的作用是负责处理每个分区和副本在状态变更过程中要做的事情, 并且确保从上一个状态变更到下一个状态是合法的; 源码中你能看到很多地方只是进行状态流转, 所以我们要清楚每个流转都做了哪些事情, 这样阅读源码会更清晰
>
>----
>在之前的文章 [【kafka源码】Controller启动过程以及选举流程源码分析]() 中,我们有分析到,
>`replicaStateMachine.startup()`和`partitionStateMachine.startup()`
>即副本状态机和分区状态机的启动; 那我们就从这里开始好好讲讲这两个状态机
## 源码解析
<font color="red">如果觉得阅读源码解析太枯燥,请直接看 源码总结及其后面部分</font>
### ReplicaStateMachine 副本状态机
Controller 选举成功之后 调用`ReplicaStateMachine.startup`启动副本状态机
```scala
def startup(): Unit = {
//初始化所有副本的状态
initializeReplicaState()
val (onlineReplicas, offlineReplicas) = controllerContext.onlineAndOfflineReplicas
handleStateChanges(onlineReplicas.toSeq, OnlineReplica)
handleStateChanges(offlineReplicas.toSeq, OfflineReplica)
}
```
1. 初始化所有副本的状态:如果副本在线,则状态变更为`OnlineReplica`;否则变更为`ReplicaDeletionIneligible`(副本删除失败/不可删除状态); 判断副本是否在线的条件是: 副本所在Broker在线 && 副本没有被标记为已下线(Map `replicasOnOfflineDirs`用于维护处在已下线日志目录上的副本),一般情况下这个里面是被标记为删除的Topic
2. 执行状态变更处理器
#### ReplicaStateMachine状态变更处理器
>它确保每个状态转换都是从合法的先前状态转换到目标状态。有效的状态转换是:
>1. `NonExistentReplica --> NewReplica `-- 将 LeaderAndIsr 请求与当前领导者和 isr 发送到新副本,并将分区的 UpdateMetadata 请求发送到每个实时代理
>2. `NewReplica -> OnlineReplica` --如果需要,将新副本添加到分配的副本列表中
>3. `OnlineReplica,OfflineReplica -> OnlineReplica`--将带有当前领导者和 isr 的 LeaderAndIsr 请求发送到新副本,并将分区的 UpdateMetadata 请求发送到每个实时代理
>4. `NewReplica,OnlineReplica,OfflineReplica,ReplicaDeletionIneligible -> OfflineReplica`-- 向副本发送 `StopReplicaRequest`
> -- 从 isr 中删除此副本并将 LeaderAndIsr 请求(带有新的 isr发送到领导副本并将分区的 UpdateMetadata 请求发送到每个实时代理。
> 5. `OfflineReplica -> ReplicaDeletionStarted` -- 向副本发送 `StopReplicaRequest` (带 删除参数)
> 6. `ReplicaDeletionStarted -> ReplicaDeletionSuccessful` --在状态机中标记副本的状态
> 7. `ReplicaDeletionStarted -> ReplicaDeletionIneligible` --在状态机中标记副本的状态
> 8. `ReplicaDeletionSuccessful -> NonExistentReplica`--从内存分区副本分配缓存中删除副本
```scala
private def doHandleStateChanges(replicaId: Int, replicas: Seq[PartitionAndReplica], targetState: ReplicaState): Unit = {
//如果有副本没有设置状态,则初始化为`NonExistentReplica`
replicas.foreach(replica => controllerContext.putReplicaStateIfNotExists(replica, NonExistentReplica))
//校验状态流转是不是正确
val (validReplicas, invalidReplicas) = controllerContext.checkValidReplicaStateChange(replicas, targetState)
invalidReplicas.foreach(replica => logInvalidTransition(replica, targetState))
//代码省略,在下面细细说来
}
```
```scala
controllerBrokerRequestBatch.sendRequestsToBrokers(controllerContext.epoch)
```
1. 如果有副本没有设置状态,则初始化为`NonExistentReplica`
2. 校验状态流转是不是正确
3. 执行完了之后,还可能会尝试发一次`UPDATE_METADATA`请求
##### 先前状态 ==> OnlineReplica
可流转的状态有
1. `NewReplica`
2. `OnlineReplica`
3. `OfflineReplica`
4. `ReplicaDeletionIneligible`
###### NewReplica ==》OnlineReplica
>如果有需要,将新副本添加到分配的副本列表中;
>比如[【kafka源码】TopicCommand之创建Topic源码解析]()
```scala
case NewReplica =>
val assignment = controllerContext.partitionFullReplicaAssignment(partition)
if (!assignment.replicas.contains(replicaId)) {
error(s"Adding replica ($replicaId) that is not part of the assignment $assignment")
val newAssignment = assignment.copy(replicas = assignment.replicas :+ replicaId)
controllerContext.updatePartitionFullReplicaAssignment(partition, newAssignment)
}
```
###### 其他状态 ==》OnlineReplica
> 将带有当前领导者和 isr 的 LeaderAndIsr 请求发送到新副本,并将分区的 UpdateMetadata 请求发送到每个实时代理
```scala
case _ =>
controllerContext.partitionLeadershipInfo.get(partition) match {
case Some(leaderIsrAndControllerEpoch) =>
controllerBrokerRequestBatch.addLeaderAndIsrRequestForBrokers(Seq(replicaId),
replica.topicPartition,
leaderIsrAndControllerEpoch,
controllerContext.partitionFullReplicaAssignment(partition), isNew = false)
case None =>
}
```
##### 先前状态 ==> ReplicaDeletionIneligible
> 在内存`replicaStates`中更新一下副本状态为`ReplicaDeletionIneligible`
##### 先前状态 ==> OfflineReplica
>-- 向副本发送 StopReplicaRequest
从 isr 中删除此副本并将 LeaderAndIsr 请求(带有新的 isr发送到领导副本并将分区的 UpdateMetadata 请求发送到每个实时代理。
```scala
case OfflineReplica =>
      // 添加构建StopReplicaRequest请求的参数, deletePartition = false表示还不删除分区
validReplicas.foreach { replica =>
controllerBrokerRequestBatch.addStopReplicaRequestForBrokers(Seq(replicaId), replica.topicPartition, deletePartition = false)
}
val (replicasWithLeadershipInfo, replicasWithoutLeadershipInfo) = validReplicas.partition { replica =>
controllerContext.partitionLeadershipInfo.contains(replica.topicPartition)
}
//尝试从多个分区的 isr 中删除副本。从 isr 中删除副本会更新 Zookeeper 中的分区状态
//反复尝试从多个分区的 isr 中删除副本,直到没有更多剩余的分区可以重试。
//从/brokers/topics/test_create_topic13/partitions获取分区相关数据
//移除副本之后,重新写入到zk中
val updatedLeaderIsrAndControllerEpochs = removeReplicasFromIsr(replicaId, replicasWithLeadershipInfo.map(_.topicPartition))
updatedLeaderIsrAndControllerEpochs.foreach { case (partition, leaderIsrAndControllerEpoch) =>
if (!controllerContext.isTopicQueuedUpForDeletion(partition.topic)) {
val recipients = controllerContext.partitionReplicaAssignment(partition).filterNot(_ == replicaId)
controllerBrokerRequestBatch.addLeaderAndIsrRequestForBrokers(recipients,
partition,
leaderIsrAndControllerEpoch,
controllerContext.partitionFullReplicaAssignment(partition), isNew = false)
}
val replica = PartitionAndReplica(partition, replicaId)
val currentState = controllerContext.replicaState(replica)
logSuccessfulTransition(replicaId, partition, currentState, OfflineReplica)
controllerContext.putReplicaState(replica, OfflineReplica)
}
replicasWithoutLeadershipInfo.foreach { replica =>
val currentState = controllerContext.replicaState(replica)
logSuccessfulTransition(replicaId, replica.topicPartition, currentState, OfflineReplica)
controllerBrokerRequestBatch.addUpdateMetadataRequestForBrokers(controllerContext.liveOrShuttingDownBrokerIds.toSeq, Set(replica.topicPartition))
controllerContext.putReplicaState(replica, OfflineReplica)
}
```
1. 添加构建StopReplicaRequest请求的参数,`deletePartition = false`表示还不删除分区
2. 反复尝试从多个分区的 isr 中删除副本,直到没有更多剩余的分区可以重试。从`/brokers/topics/{TOPICNAME}/partitions`获取分区相关数据,经过计算后重新写入zk节点`/brokers/topics/{TOPICNAME}/partitions/{分区号}/state`; 当然内存中副本状态机的状态也会变更成 `OfflineReplica`;
3. 根据条件判断是否需要发送`LeaderAndIsrRequest`和`UpdateMetadataRequest`
4. 发送`StopReplicaRequests`请求;
##### 先前状态==>ReplicaDeletionStarted
> 向指定的副本发送 [StopReplicaRequest 请求]()(带 删除参数)
```scala
controllerBrokerRequestBatch.addStopReplicaRequestForBrokers(Seq(replicaId), replica.topicPartition, deletePartition = true)
```
##### 先前状态 ==> NewReplica
>一般情况下,创建Topic的时候会触发这个流转;
```scala
case NewReplica =>
validReplicas.foreach { replica =>
val partition = replica.topicPartition
val currentState = controllerContext.replicaState(replica)
controllerContext.partitionLeadershipInfo.get(partition) match {
case Some(leaderIsrAndControllerEpoch) =>
if (leaderIsrAndControllerEpoch.leaderAndIsr.leader == replicaId) {
val exception = new StateChangeFailedException(s"Replica $replicaId for partition $partition cannot be moved to NewReplica state as it is being requested to become leader")
logFailedStateChange(replica, currentState, OfflineReplica, exception)
} else {
controllerBrokerRequestBatch.addLeaderAndIsrRequestForBrokers(Seq(replicaId),
replica.topicPartition,
leaderIsrAndControllerEpoch,
controllerContext.partitionFullReplicaAssignment(replica.topicPartition),
isNew = true)
logSuccessfulTransition(replicaId, partition, currentState, NewReplica)
controllerContext.putReplicaState(replica, NewReplica)
}
case None =>
logSuccessfulTransition(replicaId, partition, currentState, NewReplica)
controllerContext.putReplicaState(replica, NewReplica)
}
}
```
1. 在内存中更新 副本状态;
2. 在某些情况下,将带有当前领导者和 isr 的 LeaderAndIsr 请求发送到新副本,并将分区的 UpdateMetadata 请求发送到每个实时代理
##### 先前状态 ==> NonExistentReplica
> 可流转的状态只有 `ReplicaDeletionSuccessful`; 从内存的分区副本分配缓存中删除该副本
### PartitionStateMachine分区状态机
`PartitionStateMachine.startup`
```scala
def startup(): Unit = {
initializePartitionState()
triggerOnlinePartitionStateChange()
}
```
`PartitionStateMachine.initializePartitionState()`
> 初始化分区状态
```scala
/**
* Invoked on startup of the partition's state machine to set the initial state for all existing partitions in
* zookeeper
*/
private def initializePartitionState(): Unit = {
for (topicPartition <- controllerContext.allPartitions) {
// check if leader and isr path exists for partition. If not, then it is in NEW state
//检查leader和isr路径是否存在
controllerContext.partitionLeadershipInfo.get(topicPartition) match {
case Some(currentLeaderIsrAndEpoch) =>
if (controllerContext.isReplicaOnline(currentLeaderIsrAndEpoch.leaderAndIsr.leader, topicPartition))
// leader is alive
controllerContext.putPartitionState(topicPartition, OnlinePartition)
else
controllerContext.putPartitionState(topicPartition, OfflinePartition)
case None =>
controllerContext.putPartitionState(topicPartition, NewPartition)
}
}
}
```
1. 如果分区不存在`LeaderIsr`,则状态是`NewPartition`
2. 如果分区存在`LeaderIsr`,就判断一下Leader是否存活
2.1 如果存活的话,状态是`OnlinePartition`
2.2 否则是`OfflinePartition`
`PartitionStateMachine. triggerOnlinePartitionStateChange()`
>尝试将所有处于 `NewPartition` 或 `OfflinePartition` 状态的分区移动到 `OnlinePartition` 状态,但属于要删除的主题的分区除外
```scala
def triggerOnlinePartitionStateChange(): Unit = {
val partitions = controllerContext.partitionsInStates(Set(OfflinePartition, NewPartition))
triggerOnlineStateChangeForPartitions(partitions)
}
private def triggerOnlineStateChangeForPartitions(partitions: collection.Set[TopicPartition]): Unit = {
// try to move all partitions in NewPartition or OfflinePartition state to OnlinePartition state except partitions
// that belong to topics to be deleted
val partitionsToTrigger = partitions.filter { partition =>
!controllerContext.isTopicQueuedUpForDeletion(partition.topic)
}.toSeq
handleStateChanges(partitionsToTrigger, OnlinePartition, Some(OfflinePartitionLeaderElectionStrategy(false)))
// TODO: If handleStateChanges catches an exception, it is not enough to bail out and log an error.
// It is important to trigger leader election for those partitions.
}
```
#### PartitionStateMachine状态变更处理器
`PartitionStateMachine.doHandleStateChanges`
`controllerBrokerRequestBatch.sendRequestsToBrokers(controllerContext.epoch)`
>它确保每个状态转换都是从合法的先前状态转换到目标状态。有效的状态转换是:
>1. `NonExistentPartition -> NewPartition:` 将分配的副本从 ZK 加载到控制器缓存
>2. `NewPartition -> OnlinePartition:` 将第一个活动副本指定为领导者,将所有活动副本指定为 isr将此分区的leader和isr写入ZK ;向每个实时副本发送 LeaderAndIsr 请求,向每个实时代理发送 UpdateMetadata 请求
>3. `OnlinePartition,OfflinePartition -> OnlinePartition` 为这个分区选择新的leader和isr以及一组副本来接收LeaderAndIsr请求并将leader和isr写入ZK
> 对于这个分区向每个接收副本发送LeaderAndIsr请求向每个live broker发送UpdateMetadata请求
> 4. `NewPartition,OnlinePartition,OfflinePartition -> OfflinePartition:` 将分区状态标记为 Offline
> 5. `OfflinePartition -> NonExistentPartition:` 将分区状态标记为 NonExistentPartition
>
##### 先前状态==》NewPartition
>将分配的副本从 ZK 加载到控制器缓存
##### 先前状态==》OnlinePartition
> 将第一个活动副本指定为领导者,将所有活动副本指定为 isr将此分区的leader和isr写入ZK ;向每个实时副本发送 LeaderAndIsr 请求向每个实时Broker发送 UpdateMetadata 请求
创建一个新的Topic的时候,我们主要看下面这个接口`initializeLeaderAndIsrForPartitions`
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210616183028700.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
0. 获取`leaderIsrAndControllerEpochs`; Leader为副本的第一个;
1. 向zk中写入`/brokers/topics/{topicName}/partitions/` 持久节点; 无数据
2. 向zk中写入`/brokers/topics/{topicName}/partitions/{分区号}` 持久节点; 无数据
3. 向zk中写入`/brokers/topics/{topicName}/partitions/{分区号}/state` 持久节点; 数据为`leaderIsrAndControllerEpoch`![在这里插入图片描述](https://img-blog.csdnimg.cn/20210616183747171.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
4. 向副本所属Broker发送[`leaderAndIsrRequest`]()请求
5. 向所有Broker发送[`UPDATE_METADATA` ]()请求
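上面第0步"Leader为副本的第一个",更准确地说是副本列表中第一个存活的副本; 下面是一个最小化的示意(非源码原文,方法名为假设):
```scala
// 简化示意: 新建分区时 leader 取副本列表中第一个存活的副本, isr 为全部存活副本
// replicas: 分配给该分区的副本列表; liveBrokerIds: 当前存活的Broker集合
def initialLeaderAndIsr(replicas: Seq[Int], liveBrokerIds: Set[Int]): Option[(Int, List[Int])] = {
  val liveReplicas = replicas.filter(liveBrokerIds.contains)
  liveReplicas.headOption.map(leader => (leader, liveReplicas.toList)) // (leader, isr)
}

// 例如: replicas = Seq(2, 0, 1), 存活Broker = Set(0, 1) => leader = 0, isr = List(0, 1)
```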
##### 先前状态==》OfflinePartition
>将分区状态标记为 Offline ; 在Map对象`partitionStates`中维护的; `NewPartition,OnlinePartition,OfflinePartition ` 可转;
##### 先前状态==》NonExistentPartition
>将分区状态标记为 NonExistentPartition; 同样在Map对象`partitionStates`中维护; 只有 `OfflinePartition` 可流转到该状态;
## 源码总结
## Q&A

View File

@@ -0,0 +1,339 @@
[TOC]
## 前言
>本篇文章,我们开始来分析分析Kafka的`Controller`部分的源码。Controller 作为 Kafka Server 端一个重要的组件,它的角色类似于其他分布式系统中 Master 的角色;跟其他系统不一样的是,Kafka 集群的任何一台 Broker 都可以作为 Controller,但是在一个集群中同时只会有一个 Controller 是 alive 状态。Controller 在集群中负责的事务很多,比如:集群 meta 信息的一致性保证、Partition leader 的选举、broker 上下线等,都是由 Controller 来具体负责。
## 源码分析
老样子,我们还是先来撸一遍源码之后,再进行总结
<font color="red">如果觉得阅读源码解析太枯燥,请直接看 **源码总结及其后面部分**</font>
### 1.源码入口KafkaServer.startup
我们在启动kafka服务的时候,最开始执行的是`KafkaServer.startup`方法; 这里面包含了kafka启动的所有流程; 我们主要看Controller的启动流程
```scala
def startup(): Unit = {
try {
//省略部分代码....
/* start kafka controller */
kafkaController = new KafkaController(config, zkClient, time, metrics, brokerInfo, brokerEpoch, tokenManager, threadNamePrefix)
kafkaController.startup()
//省略部分代码....
}
}
```
### 2. kafkaController.startup() 启动
```scala
/**
  每个Broker启动的时候都会调用; 注意这并不假设当前Broker就是Controller。
  它只是注册会话过期监听器, 并启动Controller的选举流程
*/
def startup() = {
//注册状态变更处理器; 这里是把`StateChangeHandler`这个处理器放到一个`stateChangeHandlers` Map中了
zkClient.registerStateChangeHandler(new StateChangeHandler {
override val name: String = StateChangeHandlers.ControllerHandler
override def afterInitializingSession(): Unit = {
eventManager.put(RegisterBrokerAndReelect)
}
override def beforeInitializingSession(): Unit = {
val queuedEvent = eventManager.clearAndPut(Expire)
// Block initialization of the new session until the expiration event is being handled,
// which ensures that all pending events have been processed before creating the new session
queuedEvent.awaitProcessing()
}
})
// 在事件管理器的队列里面放入 一个 Startup启动事件; 这个时候放入还不会执行;
eventManager.put(Startup)
//启动事件管理器,启动的是一个 `ControllerEventThread`的线程
eventManager.start()
}
```
1. `zkClient.registerStateChangeHandler` 注册一个`StateChangeHandler` 状态变更处理器; 有一个map `stateChangeHandlers`来维护这个处理器列表; 这个类型的处理器有下图三个方法,可以看到我们这里实现了`beforeInitializingSession``afterInitializingSession`方法,具体调用的时机,我后面再分析(监听zk的数据变更)![在这里插入图片描述](https://img-blog.csdnimg.cn/20210611112428811.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
2. `ControllerEventManager`是Controller的事件管理器; 里面维护了一个阻塞队列`queue`; 这个queue里面存放的是所有的Controller事件; 按顺序排队执行入队的事件; 上面的代码中`eventManager.put(Startup)` 在队列中放入了一个`Startup`启动事件; 所有的事件都是继承自`ControllerEvent`类的![在这里插入图片描述](https://img-blog.csdnimg.cn/20210611113223844.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
3. 启动事件管理器, 从待执行事件队列`queue`中获取事件进行执行; 刚刚不是加入了一个`Startup`事件么,这个线程就会执行该事件
### 3. ControllerEventThread 执行事件线程
` eventManager.start()` 之后执行了下面的方法
```scala
class ControllerEventThread(name: String) extends ShutdownableThread(name = name, isInterruptible = false) {
override def doWork(): Unit = {
//从待执行队列里面take一个事件; 没有事件的时候这里会阻塞
val dequeued = queue.take()
dequeued.event match {
case ShutdownEventThread => // The shutting down of the thread has been initiated at this point. Ignore this event.
case controllerEvent =>
          //获取事件对应的ControllerState; 不同事件对应的state不一样
_state = controllerEvent.state
eventQueueTimeHist.update(time.milliseconds() - dequeued.enqueueTimeMs)
try {
// 定义process方法; 最终执行的是 事件提供的process方法
def process(): Unit = dequeued.process(processor)
//根据state获取不同的KafkaTimer 主要是为了采集数据; 我们只要关注里面是执行了 process()方法就行了
rateAndTimeMetrics.get(state) match {
case Some(timer) => timer.time { process() }
case None => process()
}
} catch {
case e: Throwable => error(s"Uncaught error processing event $controllerEvent", e)
}
_state = ControllerState.Idle
}
}
}
}
```
1. `val dequeued = queue.take()`从待执行队列里面take一个事件; 没有事件的时候这里会阻塞
2. `dequeued.process(processor)`调用具体事件实现的 `process方法`如下图, 不过要注意的是这里使用了`CountDownLatch(1)`, 那肯定有个地方调用了`processingStarted.await()` 来等待这里的`process()执行完成`;上面的startUp方法就调用了; ![在这里插入图片描述](https://img-blog.csdnimg.cn/20210611114915829.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210611115440890.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
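为了更直观地理解"事件入队 + 单线程消费 + `CountDownLatch` 等待处理"这套机制,下面给出一个脱离Kafka、可独立运行的最小演示(非源码,类名和逻辑都做了极度简化):
```scala
import java.util.concurrent.{CountDownLatch, LinkedBlockingQueue}

object QueuedEventDemo extends App {
  // 模拟 QueuedEvent: 事件开始被处理时 countDown, awaitProcessing 可以等到处理开始
  final class QueuedEvent(val name: String) {
    private val processingStarted = new CountDownLatch(1)
    def process(): Unit = { processingStarted.countDown(); println(s"processing $name") }
    def awaitProcessing(): Unit = processingStarted.await()
  }

  // 模拟 ControllerEventManager 的阻塞队列 + 单线程顺序消费
  val queue = new LinkedBlockingQueue[QueuedEvent]()
  val worker = new Thread(() => while (true) queue.take().process())
  worker.setDaemon(true)
  worker.start()

  val event = new QueuedEvent("Expire")
  queue.put(event)
  event.awaitProcessing() // 类似 beforeInitializingSession 里的等待
  println("event handled, continue")
}
```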
### 4. processStartup 启动流程
启动Controller的流程
```scala
private def processStartup(): Unit = {
//注册znode变更事件和watch Controller节点是否在zk中存在
zkClient.registerZNodeChangeHandlerAndCheckExistence(controllerChangeHandler)
//选举逻辑
elect()
}
```
1. 注册`ZNodeChangeHandler` 节点变更事件处理器,在map `zNodeChangeHandlers`中保存了key=`/controller`;value=`ZNodeChangeHandler`的键值对; 其中`ZNodeChangeHandler`处理器有如下三个接口
![在这里插入图片描述](https://img-blog.csdnimg.cn/2021061111595850.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
2. 然后向zk发起一个`ExistsRequest(/controller)`的请求,去查询一下`/controller`节点是否存在; 并且如果不存在的话,就注册一个`watch` 监视这个节点;从下面的代码可以看出
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210611120501331.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210611120515913.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
因为上一步中我们在map `zNodeChangeHandlers`中保存了key=`/controller`; 所以上图中可知,需要注册`watch`来进行`/controller`节点的监控;
kafka是怎么实现监听的呢? 在构建`zookeeper`客户端的时候传入了自定义的`Watcher`
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210613104354849.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
![在这里插入图片描述](https://img-blog.csdnimg.cn/2021061310443834.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
3. 选举; 选举的过程其实就是几个Broker抢占式地去成为Controller: 谁先创建`/controller`这个节点,谁就成为Controller; 我们下面仔细分析一下选举过程
### 5. Controller的选举elect()
```scala
private def elect(): Unit = {
//去zk上获取 /controller 节点的数据 如果没有就赋值为-1
activeControllerId = zkClient.getControllerId.getOrElse(-1)
    //如果获取到了数据, 说明已经有Controller了, 直接结束选举流程
if (activeControllerId != -1) {
debug(s"Broker $activeControllerId has been elected as the controller, so stopping the election process.")
return
}
try {
//尝试去zk中写入自己的Brokerid作为Controller并且更新Controller epoch
val (epoch, epochZkVersion) = zkClient.registerControllerAndIncrementControllerEpoch(config.brokerId)
controllerContext.epoch = epoch
controllerContext.epochZkVersion = epochZkVersion
activeControllerId = config.brokerId
      //当选成功, 执行当选Controller之后的初始化逻辑
onControllerFailover()
    } catch {
      case e: ControllerMovedException =>
        //Controller已被别的Broker抢先注册, 尝试卸任Controller的职责
        maybeResign()
      //省略其他异常处理...
}
}
```
1. 去zk上获取` /controller `节点的数据 如果没有就赋值为-1
2. 如果获取到了数据说明已经有Controller注册成功了;直接结束选举流程
3. 尝试去zk中写入自己的Brokerid作为Controller并且更新Controller epoch
- 获取zk节点`/controller_epoch`, 这个节点是表示Controller变更的次数,如果没有的话就创建这个节点(**持久节点**); 起始`controller_epoch=0` `ControllerEpochZkVersion=0`
- 向zk发起一个`MultiRequest`请求;里面包含两个命令; 一个是向zk中创建`/controller`节点,节点内容是自己的brokerId;另一个命令是向`/controller_epoch`中更新数据; 数据+1 ;
- 如果写入过程中抛出异常提示说节点已经存在,说明别的Broker已经抢先成为Controller了; 这个时候会做一个检查`checkControllerAndEpoch` 来检查是不是别的Controller抢先了; 如果是的话就抛出`ControllerMovedException`异常; 抛出了这个异常之后,当前Broker会尝试的去卸任一下Controller的职责; 因为有可能他之前是Controller,Controller转移之后都需要尝试卸任一下
4. Controller确定之后,就是做一下当选成功之后的事情了 `onControllerFailover`
### 6. 当选Controller之后的处理 onControllerFailover
进入到`KafkaController.onControllerFailover`
```scala
private def onControllerFailover(): Unit = {
// 都是ZNodeChildChangeHandler处理器 含有接口 handleChildChange注册了不同事件的处理器
// 对应的事件分别有`BrokerChange`、`TopicChange`、`TopicDeletion`、`LogDirEventNotification`
val childChangeHandlers = Seq(brokerChangeHandler, topicChangeHandler, topicDeletionHandler, logDirEventNotificationHandler,
isrChangeNotificationHandler)
//把这些handle都维护在 map类型`zNodeChildChangeHandlers`中
childChangeHandlers.foreach(zkClient.registerZNodeChildChangeHandler)
//都是ZNodeChangeHandler处理器,含有增删改节点接口;
//分别对应的事件 `ReplicaLeaderElection`、`ZkPartitionReassignment`、``
val nodeChangeHandlers = Seq(preferredReplicaElectionHandler, partitionReassignmentHandler)
//把这些handle都维护在 map类型`zNodeChangeHandlers`中
nodeChangeHandlers.foreach(zkClient.registerZNodeChangeHandlerAndCheckExistence)
info("Deleting log dir event notifications")
//删除所有日志目录事件通知。 ;获取zk中节点`/log_dir_event_notification`的值;然后把节点下面的节点全部删除
zkClient.deleteLogDirEventNotifications(controllerContext.epochZkVersion)
info("Deleting isr change notifications")
// 删除节点 `/isr_change_notification`下的所有节点
zkClient.deleteIsrChangeNotifications(controllerContext.epochZkVersion)
info("Initializing controller context")
initializeControllerContext()
info("Fetching topic deletions in progress")
val (topicsToBeDeleted, topicsIneligibleForDeletion) = fetchTopicDeletionsInProgress()
info("Initializing topic deletion manager")
topicDeletionManager.init(topicsToBeDeleted, topicsIneligibleForDeletion)
// We need to send UpdateMetadataRequest after the controller context is initialized and before the state machines
    // are started. This is because brokers need to receive the list of live brokers from UpdateMetadataRequest before
// they can process the LeaderAndIsrRequests that are generated by replicaStateMachine.startup() and
// partitionStateMachine.startup().
info("Sending update metadata request")
sendUpdateMetadataRequest(controllerContext.liveOrShuttingDownBrokerIds.toSeq, Set.empty)
replicaStateMachine.startup()
partitionStateMachine.startup()
info(s"Ready to serve as the new controller with epoch $epoch")
initializePartitionReassignments()
topicDeletionManager.tryTopicDeletion()
val pendingPreferredReplicaElections = fetchPendingPreferredReplicaElections()
onReplicaElection(pendingPreferredReplicaElections, ElectionType.PREFERRED, ZkTriggered)
info("Starting the controller scheduler")
kafkaScheduler.startup()
if (config.autoLeaderRebalanceEnable) {
scheduleAutoLeaderRebalanceTask(delay = 5, unit = TimeUnit.SECONDS)
}
scheduleUpdateControllerMetricsTask()
if (config.tokenAuthEnabled) {
info("starting the token expiry check scheduler")
tokenCleanScheduler.startup()
tokenCleanScheduler.schedule(name = "delete-expired-tokens",
fun = () => tokenManager.expireTokens,
period = config.delegationTokenExpiryCheckIntervalMs,
unit = TimeUnit.MILLISECONDS)
}
}
```
1. 把事件`BrokerChange`、`TopicChange`、`TopicDeletion`、`LogDirEventNotification`、`IsrChangeNotification`对应的handler处理器都维护在 map类型的`zNodeChildChangeHandlers`中
2. 把事件 `ReplicaLeaderElection`、`ZkPartitionReassignment`对应的handler处理器都维护在 map类型的`zNodeChangeHandlers`中
3. 删除zk中节点`/log_dir_event_notification`下的所有节点
4. 删除zk中节点 `/isr_change_notification`下的所有节点
5. 初始化Controller的上下文对象`initializeControllerContext()`
- 获取`/brokers/ids`节点信息拿到所有的存活的BrokerID; 然后获取每个Broker的信息 `/brokers/ids/对应BrokerId`的信息以及对应的节点的Epoch; 也就是`cZxid`; 然后将数据保存在内存中
- 获取`/brokers/topics`节点信息;拿到所有Topic之后,放到Map `partitionModificationsHandlers`中,key=topicName;value=对应节点的`PartitionModificationsHandler`; 节点是`/brokers/topics/topic名称`;最终相当于是在事件处理队列`queue`中给每个Topic添加了一个`PartitionModifications`事件; 这个事件是怎么处理的,我们下面分析
- 同时又注册一下上面的`PartitionModificationsHandler`,保存在map `zNodeChangeHandlers` 中; key= `/brokers/topics/Topic名称`,Value=`PartitionModificationsHandler`; 我们上面也说到过,这个有个功能就是判断需不需要向zk中注册`watch`; 从下图的代码中可以看出,在获取zk数据(`GetDataRequest`)的时候,会去 `zNodeChangeHandlers`判断一下存不存在对应节点key;存在的话就注册`watch`监视数据![在这里插入图片描述](https://img-blog.csdnimg.cn/20210611120515913.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
- zk中获取`/brokers/topics/topic名称`所有topic的分区数据; 保存在内存中
- 给每个broker注册broker变更处理器`BrokerModificationsHandler`(也是`ZNodeChangeHandler`)它对应的事件是`BrokerModifications`; 同样的`zNodeChangeHandlers`中也保存着对应的`/brokers/ids/对应BrokerId` 同样的`watch`监控并且map `brokerModificationsHandlers`保存对应关系 key=`brokerID` value=`BrokerModificationsHandler`
- 从zk中获取所有的topic-partition 信息; 节点: `/brokers/topics/Topic名称/partitions/分区号/state` ; 然后保存在缓存中`controllerContext.partitionLeadershipInfo`
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210611161631995.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
- `controllerChannelManager.startup()` 这个单独开了一篇文章讲解,请看[【kafka源码】Controller与Brokers之间的网络通信](), 简单来说就是创建一个map来保存于所有Broker的发送请求线程对象`RequestSendThread`;这个对象中有一个 阻塞队列`queue`; 用来排队执行要执行的请求,没有任务时候回阻塞; Controller需要发送请求的时候只需要向这个`queue`中添加任务就行了
6. 初始化删除Topic管理器`topicDeletionManager.init()`
- 读取zk节点`/admin/delete_topics`的子节点数据,表示的是标记为已经删除的Topic
- 将被标记为删除的Topic,做一些开始删除Topic的操作;具体详情情况请看[【kafka源码】TopicCommand之删除Topic源码解析]()
7. `sendUpdateMetadataRequest` 给Brokers们发送`UPDATE_METADATA` 更新元数据的请求,关于更新元数据的详细情况 [【kafka源码】更新元数据`UPDATE_METADATA`请求源码分析]()
8. `replicaStateMachine.startup()` 启动副本状态机,获取所有在线的和不在线的副本;
①. 将在线副本状态变更为`OnlineReplica:`将带有当前领导者和 isr 的 `LeaderAndIsr `请求发送到新副本,并将分区的 `UpdateMetadata `请求发送到每个实时代理
②. 将不在线副本状态变更为`OfflineReplica:` 向副本发送 [StopReplicaRequest]() 从 isr 中删除此副本并将 [LeaderAndIsr]() 请求(带有新的 isr发送到领导副本并将分区的 UpdateMetadata 请求发送到每个实时代理。
详细请看 [【kafka源码】Controller中的状态机](https://shirenchuang.blog.csdn.net/article/details/117848213)
9. `partitionStateMachine.startup()`启动分区状态机,获取所有在线的和不在线(判断Leader是否在线)的分区;
1. 如果分区不存在`LeaderIsr`,则状态是`NewPartition`
2. 如果分区存在`LeaderIsr`,就判断一下Leader是否存活
2.1 如果存活的话,状态是`OnlinePartition`
2.2 否则是`OfflinePartition`
3. 尝试将所有处于 `NewPartition ``OfflinePartition `状态的分区移动到 `OnlinePartition` 状态,但属于要删除的主题的分区除外
PS:如果之前创建Topic的过程中Controller发生了变更,导致Topic创建没有完成,那么这个状态流转的过程会继续把创建流程走完; [【kafka源码】TopicCommand之创建Topic源码解析]()
关于状态机 详细请看 [【kafka源码】Controller中的状态机](https://shirenchuang.blog.csdn.net/article/details/117848213)
11. ` initializePartitionReassignments` 初始化挂起的重新分配。这包括通过 `/admin/reassign_partitions` 发送的重新分配,它将取代任何正在进行的 API 重新分配。[【kafka源码】分区重分配 TODO..]()
12. `topicDeletionManager.tryTopicDeletion()`尝试恢复未完成的Topic删除操作;相关情况 [【kafka源码】TopicCommand之删除Topic源码解析](https://shirenchuang.blog.csdn.net/article/details/117847877)
13. 从`/admin/preferred_replica_election` 获取值,调用`onReplicaElection()` 尝试为每个给定分区选举一个副本作为领导者; 相关内容请看[【kafka源码】Kafka的优先副本选举源码分析]();
14. `kafkaScheduler.startup()`启动一些定时任务线程
15. 如果配置了`auto.leader.rebalance.enable=true`,则启动LeaderRebalance的定时任务,线程名`auto-leader-rebalance-task`(调度方式见本列表后的示意)
16. 如果配置了 `delegation.token.master.key`,则启动一些token的清理线程
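上面第15步的定时任务调度,大致是下面这样(意译示意,非源码原文),本质上也只是往事件队列里投递一个事件:
```scala
// 意译示意: 延迟 delay 时间后往事件队列投递一次优先副本选举检查事件
private def scheduleAutoLeaderRebalanceTask(delay: Long, unit: TimeUnit): Unit =
  kafkaScheduler.schedule("auto-leader-rebalance-task",
    () => eventManager.put(AutoPreferredReplicaLeaderElection),
    delay = delay, unit = unit)
```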
### 7. Controller重新选举
当我们把zk中的节点`/controller`删除之后; 会调用下面接口;进行重新选举
```scala
private def processReelect(): Unit = {
//尝试卸任一下
maybeResign()
//进行选举
elect()
}
```
## 源码总结
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210630195523983.jpg?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70#pic_center)
PS: 可以看到 Broker当选Controller之后,保存了很多zk上的数据到自己的内存中, 也承担了很多责任; 如果这台Broker自身压力就挺大,那么它当选Controller之后压力会更大,所以尽量让比较空闲的Broker当选Controller,那么如何实现这样一个目标呢? 可以指定Broker作为Controller;
这样一个功能可以在 <font color=red size=5>项目地址: [didi/Logi-KafkaManager: 一站式Apache Kafka集群指标监控与运维管控平台](https://github.com/didi/Logi-KafkaManager)</font> 里面可以实现
## Q&A
### 直接删除zk节点`/controller`会怎么样
>Broker之间会立马重新选举Controller;
### 如果修改节点`/controller/`下的数据会成功将Controller转移吗
假如`/controller`节点数据是`{"version":1,"brokerid":3,"timestamp":"1623746563454"}`, 我手动把brokerid改成1,Controller会直接变成Broker-1吗?
>Answer: **不会成功转移,并且当前的集群中Broker是没有Controller角色的;这就是一个非常严重的问题了**
分析源码:
修改`/controller/`数据在Controller执行的代码是
```scala
private def processControllerChange(): Unit = {
maybeResign()
}
private def maybeResign(): Unit = {
val wasActiveBeforeChange = isActive
zkClient.registerZNodeChangeHandlerAndCheckExistence(controllerChangeHandler)
activeControllerId = zkClient.getControllerId.getOrElse(-1)
if (wasActiveBeforeChange && !isActive) {
onControllerResignation()
}
}
```
从代码可以非常清楚地看到, 修改数据之后,如果修改后的BrokerId和当前Controller的BrokerId不一致,就会执行`onControllerResignation`,让当前的Controller卸任这个角色;
### /log_dir_event_notification 是干啥 的
> 当`log.dir`日志文件夹出现访问不了,磁盘损坏等等异常导致读写失败,就会触发一些异常通知事件;
> 流程是->
> 1. Broker检查到`log.dir`异常,做一些清理工作,然后向zk中创建持久序列节点`/log_dir_event_notification/log_dir_event_+序列号`;数据是 BrokerID;例如:
>`/log_dir_event_notification/log_dir_event_0000000003`
>2. Controller 监听到了zk的变更; 将从zk节点 /log_dir_event_notification/log_dir_event_序列号 中获取到的数据的Broker上的所有副本进行一个副本状态流转 ->OnlineReplica
> 2.1 给所有broker 发送`LeaderAndIsrRequest`请求,让brokers们去查询它们副本的状态; 如果副本的logDir已经离线,则返回KAFKA_STORAGE_ERROR异常;
> 2.2 处理完之后会删除`/log_dir_event_notification`下对应的通知节点
### /isr_change_notification 是干啥用的
> 当有isr变更的时候会在这个节点写入数据; Controller监听之后做一些通知
### /admin/preferred_replica_election 是干啥用的
>优先副本选举, 详情请戳[kafka的优先副本选举流程 .]()
>
## 思考
### 有什么办法实现Controller的优先选举
>既然我们知道了Controller承担了这么多的任务,又是Broker又是Controller,身兼数职压力难免会比较大;
>所以我们很希望能够有一个功能,能够指定某台Broker为Controller角色; 这样就可以指定压力比较小的Broker来承担Controller的角色了;
**那么,如何实现呢?**
>Kafka原生目前并不支持这个功能,所以我们想要实现这个功能,就得要改源码了;
>知道了原理, 改源码实现这个功能就很简单了; 有很多种实现方式;
比如说: 在zk里面设置一个节点专门用来存放候选节点; 竞选Controller的时候优先从这里面选择;
然后Broker们启动的时候,可以判断一下自己是不是候选节点, 如果不是的话,那就让它睡个两三秒(相当于让候选者先跑99米);
那么大概率的情况下,候选者肯定就会当选了;

View File

@@ -0,0 +1,26 @@
## Controller优先选举
> 在原生的kafka中,Controller角色的选举,是每个Broker抢占式的去zk写入节点`Controller`
> 任何一个Broker都有可能当选Controller;
> 但是Controller角色除了是一个正常的Broker外,还承担着Controller角色的一些任务;
> 具体情况 [【kafka源码】Controller启动过程以及选举流程源码分析]()
> 当这台Broker本身压力很大的情况下,又当选Controller让Broker压力更大了;
> 所以我们期望让Controller角色落在一些压力较小的Broker上;或者专门用一台机器用来当做Controller角色;
> 基于这么一个需求,我们内部就对引擎做了些改造,用于支持`Controller优先选举`
## 改造原理
> 在`/config`节点下新增了节点`/config/extension/candidates/ `;
> 将所有需要被优先选举的BrokerID存放到该节点下面;
> 例如:
> `/config/extension/candidates/0`
> ![在这里插入图片描述](https://img-blog.csdnimg.cn/20210625145023974.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
当Controller发生重新选举的时候, 每个Broker都去抢占式写入`/controller`节点, 但是会先去`/config/extension/candidates/`节点获取所有子节点; 比如获取到有一个BrokerID=0, 这时会判断一下它是否跟自己的BrokerID相等, 不相等的话就`sleep 3秒`钟; 这样的话BrokerId=0的这个Broker就会大概率当选Controller; 如果这个Broker挂掉了,那么其他Broker就可能会当选
<font color=red>PS: `/config/extension/candidates/` 节点下可以配置多个候选Controller </font>
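下面是这个改造思路的一个简化示意(非真实引擎源码,方法名为假设,仅演示"非候选Broker延迟参选"的逻辑):
```scala
// 改造思路示意(非真实源码): 选举前先看自己是否在候选列表里, 不在就延迟几秒
private def maybeDelayForNonCandidate(): Unit = {
  val candidateIds = zkClient.getChildren("/config/extension/candidates").map(_.toInt).toSet
  if (candidateIds.nonEmpty && !candidateIds.contains(config.brokerId)) {
    Thread.sleep(3000) // 非候选Broker让出3秒, 让候选Broker大概率先创建 /controller 节点
  }
}
```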
## KM管理平台操作
![在这里插入图片描述](https://img-blog.csdnimg.cn/202106251511242.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)

View File

@@ -0,0 +1,614 @@
## 1.脚本的使用
>请看 [【kafka运维】副本扩缩容、数据迁移、分区重分配]()
## 2.源码解析
<font color=red>如果阅读源码太枯燥,可以直接跳转到 源码总结和Q&A部分</font>
### 2.1`--generate ` 生成分配策略分析
配置启动类`--zookeeper xxxx:2181 --topics-to-move-json-file config/move-json-file.json --broker-list "0,1,2,3" --generate`
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210619121453741.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
配置`move-json-file.json`文件
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210619121618298.png)
启动,调试:
`ReassignPartitionsCommand.generateAssignment`
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210619121959199.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
1. 获取入参的数据
2. 校验`--broker-list`传入的BrokerId是否有重复的,重复就报错
3. 开始进行分配
`ReassignPartitionsCommand.generateAssignment`
```scala
def generateAssignment(zkClient: KafkaZkClient, brokerListToReassign: Seq[Int], topicsToMoveJsonString: String, disableRackAware: Boolean): (Map[TopicPartition, Seq[Int]], Map[TopicPartition, Seq[Int]]) = {
    //解析出有哪些Topic
val topicsToReassign = parseTopicsData(topicsToMoveJsonString)
//检查是否有重复的topic
val duplicateTopicsToReassign = CoreUtils.duplicates(topicsToReassign)
if (duplicateTopicsToReassign.nonEmpty)
throw new AdminCommandFailedException("List of topics to reassign contains duplicate entries: %s".format(duplicateTopicsToReassign.mkString(",")))
//获取topic当前的副本分配情况 /brokers/topics/{topicName}
val currentAssignment = zkClient.getReplicaAssignmentForTopics(topicsToReassign.toSet)
val groupedByTopic = currentAssignment.groupBy { case (tp, _) => tp.topic }
//机架感知模式
val rackAwareMode = if (disableRackAware) RackAwareMode.Disabled else RackAwareMode.Enforced
val adminZkClient = new AdminZkClient(zkClient)
val brokerMetadatas = adminZkClient.getBrokerMetadatas(rackAwareMode, Some(brokerListToReassign))
val partitionsToBeReassigned = mutable.Map[TopicPartition, Seq[Int]]()
groupedByTopic.foreach { case (topic, assignment) =>
val (_, replicas) = assignment.head
val assignedReplicas = AdminUtils.assignReplicasToBrokers(brokerMetadatas, assignment.size, replicas.size)
partitionsToBeReassigned ++= assignedReplicas.map { case (partition, replicas) =>
new TopicPartition(topic, partition) -> replicas
}
}
(partitionsToBeReassigned, currentAssignment)
}
```
1. 检查是否有重复的topic,重复则抛出异常
2. 从zk节点` /brokers/topics/{topicName}`获取topic当前的副本分配情况
3. 从zk节点`brokers/ids`中获取所有在线节点,并跟`--broker-list`参数传入的取个交集
4. 获取Brokers元数据,如果机架感知模式`RackAwareMode.Enforced`(默认)&&上面3中获取到的交集列表brokers不是都有机架信息或者都没有机架信息的话就抛出异常; 因为要根据机架信息做分区分配的话,必须要么都有机架信息,要么都没有机架信息; 出现这种情况怎么办呢? 那就将机架感知模式`RackAwareMode`设置为`RackAwareMode.Disabled` ;只需要加上一个参数`--disable-rack-aware`就行了
5. 调用`AdminUtils.assignReplicasToBrokers` 计算分配情况;
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210619125420415.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
我们在[【kafka源码】创建Topic的时候是如何分区和副本的分配规则]()里面分析过就不再赘述了, `AdminUtils.assignReplicasToBrokers(要分配的Broker们的元数据, 分区数, 副本数)`
需要注意的是副本数是通过`assignment.head.replicas.size`获取的,意思是第一个分区的副本数量,正常情况下分区副本都会相同,但是也不一定,也可能被设置为了不同
<font color=red>根据这条信息我们是不是就可以直接调用这个接口来实现其他功能? **比如副本的扩缩容**</font>
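比如想做副本扩容,一个可行的思路示意如下(参数值均为假设,仅演示 `assignReplicasToBrokers` 的入参含义):
```scala
// 示意: 直接复用 AdminUtils.assignReplicasToBrokers 生成新的分配方案
// 入参: (候选Broker的元数据, 分区数, 副本数); 返回: Map[分区号, 副本列表]
val brokerMetadatas = adminZkClient.getBrokerMetadatas(RackAwareMode.Enforced, Some(Seq(0, 1, 2, 3)))
val newAssignment: Map[Int, Seq[Int]] =
  AdminUtils.assignReplicasToBrokers(brokerMetadatas, 3, 3) // 3个分区, 每个分区3副本
```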
### 2.2`--execute ` 执行阶段分析
> 使用脚本执行
> `--zookeeper xxx --reassignment-json-file config/reassignment-json-file.json --execute --throttle 10000`
`ReassignPartitionsCommand.executeAssignment`
```scala
def executeAssignment(zkClient: KafkaZkClient, adminClientOpt: Option[Admin], reassignmentJsonString: String, throttle: Throttle, timeoutMs: Long = 10000L): Unit = {
//对json文件进行校验和解析
val (partitionAssignment, replicaAssignment) = parseAndValidate(zkClient, reassignmentJsonString)
val adminZkClient = new AdminZkClient(zkClient)
val reassignPartitionsCommand = new ReassignPartitionsCommand(zkClient, adminClientOpt, partitionAssignment.toMap, replicaAssignment, adminZkClient)
    //如果已经存在副本重分配任务, 则只尝试更新限流配置
if (zkClient.reassignPartitionsInProgress()) {
reassignPartitionsCommand.maybeLimit(throttle)
} else {
//打印当前的副本分配方式,方便回滚
printCurrentAssignment(zkClient, partitionAssignment.map(_._1.topic))
if (throttle.interBrokerLimit >= 0 || throttle.replicaAlterLogDirsLimit >= 0)
println(String.format("Warning: You must run Verify periodically, until the reassignment completes, to ensure the throttle is removed. You can also alter the throttle by rerunning the Execute command passing a new value."))
//开始进行重分配进程
if (reassignPartitionsCommand.reassignPartitions(throttle, timeoutMs)) {
println("Successfully started reassignment of partitions.")
} else
println("Failed to reassign partitions %s".format(partitionAssignment))
}
}
```
1. 解析json文件并做些校验
 1.1 partition、replica非空校验,partition重复校验
 1.2 校验`partition`是否有不存在的分区(新增分区请用`kafka-topics`脚本)
 1.3 检查配置中的BrokerId是否都存在
2. 如果发现已经存在副本重分配进程(检查是否有节点`/admin/reassign_partitions`),则检查是否需要更改限流; 如果有参数(`--throttle`,`--replica-alter-log-dirs-throttle`) 则设置限流信息; 而后不再执行下一步
3. 如果当前没有执行中的副本重分配任务(检查是否有节点`/admin/reassign_partitions`),则开始进行副本重分配任务;
#### 2.2.1 已有任务,尝试限流
如果zk中有节点`/admin/reassign_partitions`; 则表示当前已有一个任务在进行,那么当前操作就不继续了,如果有参数
`--throttle`
`--replica-alter-log-dirs-throttle`
则进行限制
>限制当前移动副本的流量。请注意,此命令可用于更改限流值,但如果某些Broker已完成重新平衡,则它可能不会更改最初设置的所有限制。所以后面需要通过`--verify`把这个限制移除掉
`maybeLimit`
```scala
def maybeLimit(throttle: Throttle): Unit = {
if (throttle.interBrokerLimit >= 0 || throttle.replicaAlterLogDirsLimit >= 0) {
//当前存在的broker
val existingBrokers = existingAssignment().values.flatten.toSeq
//期望的broker
val proposedBrokers = proposedPartitionAssignment.values.flatten.toSeq ++ proposedReplicaAssignment.keys.toSeq.map(_.brokerId())
//前面broker相加去重
val brokers = (existingBrokers ++ proposedBrokers).distinct
//遍历与之相关的Brokers, 添加限流配置写入到zk节点/config/broker/{brokerId}中
for (id <- brokers) {
//获取broker的配置 /config/broker/{brokerId}
val configs = adminZkClient.fetchEntityConfig(ConfigType.Broker, id.toString)
if (throttle.interBrokerLimit >= 0) {
configs.put(DynamicConfig.Broker.LeaderReplicationThrottledRateProp, throttle.interBrokerLimit.toString)
configs.put(DynamicConfig.Broker.FollowerReplicationThrottledRateProp, throttle.interBrokerLimit.toString)
}
if (throttle.replicaAlterLogDirsLimit >= 0)
configs.put(DynamicConfig.Broker.ReplicaAlterLogDirsIoMaxBytesPerSecondProp, throttle.replicaAlterLogDirsLimit.toString)
adminZkClient.changeBrokerConfig(Seq(id), configs)
}
}
}
```
`/config/brokers/{brokerId}`节点配置是Broker端的动态配置,不需要重启Broker实时生效;
1. 如果传入了参数`--throttle` 则从zk节点`/config/brokers/{BrokerId}`节点获取Broker们的配置信息,然后再加上以下两个配置重新写入到节点`/config/brokers/{BrokerId}`
`leader.replication.throttled.rate` 控制leader副本端处理FETCH请求的速率
`follower.replication.throttled.rate` 控制follower副本发送FETCH请求的速率
2. 如果传入了参数`--replica-alter-log-dirs-throttle` 则将如下配置也写入节点中;
`replica.alter.log.dirs.io.max.bytes.per.second:` broker内部目录之间迁移数据流量限制功能限制数据拷贝从一个目录到另外一个目录带宽上限
例如写入之后的数据
```json
{"version":1,"config":{"leader.replication.throttled.rate":"1","follower.replication.throttled.rate":"1"}}
```
**注意: 这里写入的限流配置,是写入所有与之相关的Broker的限流配置;**
#### 2.2.2 当前未有执行任务,开始执行副本重分配任务
`ReassignPartitionsCommand.reassignPartitions`
```scala
def reassignPartitions(throttle: Throttle = NoThrottle, timeoutMs: Long = 10000L): Boolean = {
//写入一些限流数据
maybeThrottle(throttle)
try {
//验证分区是否存在
val validPartitions = proposedPartitionAssignment.groupBy(_._1.topic())
.flatMap { case (topic, topicPartitionReplicas) =>
validatePartition(zkClient, topic, topicPartitionReplicas)
}
if (validPartitions.isEmpty) false
else {
if (proposedReplicaAssignment.nonEmpty && adminClientOpt.isEmpty)
throw new AdminCommandFailedException("bootstrap-server needs to be provided in order to reassign replica to the specified log directory")
val startTimeMs = System.currentTimeMillis()
// Send AlterReplicaLogDirsRequest to allow broker to create replica in the right log dir later if the replica has not been created yet.
if (proposedReplicaAssignment.nonEmpty)
alterReplicaLogDirsIgnoreReplicaNotAvailable(proposedReplicaAssignment, adminClientOpt.get, timeoutMs)
// Create reassignment znode so that controller will send LeaderAndIsrRequest to create replica in the broker
zkClient.createPartitionReassignment(validPartitions.map({case (key, value) => (new TopicPartition(key.topic, key.partition), value)}).toMap)
// Send AlterReplicaLogDirsRequest again to make sure broker will start to move replica to the specified log directory.
// It may take some time for controller to create replica in the broker. Retry if the replica has not been created.
var remainingTimeMs = startTimeMs + timeoutMs - System.currentTimeMillis()
val replicasAssignedToFutureDir = mutable.Set.empty[TopicPartitionReplica]
while (remainingTimeMs > 0 && replicasAssignedToFutureDir.size < proposedReplicaAssignment.size) {
replicasAssignedToFutureDir ++= alterReplicaLogDirsIgnoreReplicaNotAvailable(
proposedReplicaAssignment.filter { case (replica, _) => !replicasAssignedToFutureDir.contains(replica) },
adminClientOpt.get, remainingTimeMs)
Thread.sleep(100)
remainingTimeMs = startTimeMs + timeoutMs - System.currentTimeMillis()
}
replicasAssignedToFutureDir.size == proposedReplicaAssignment.size
}
} catch {
case _: NodeExistsException =>
val partitionsBeingReassigned = zkClient.getPartitionReassignment()
throw new AdminCommandFailedException("Partition reassignment currently in " +
"progress for %s. Aborting operation".format(partitionsBeingReassigned))
}
}
```
1. `maybeThrottle(throttle)` 设置副本移动时候的限流配置,这个方法只用于任务初始化的时候
```scala
private def maybeThrottle(throttle: Throttle): Unit = {
if (throttle.interBrokerLimit >= 0)
assignThrottledReplicas(existingAssignment(), proposedPartitionAssignment, adminZkClient)
maybeLimit(throttle)
if (throttle.interBrokerLimit >= 0 || throttle.replicaAlterLogDirsLimit >= 0)
throttle.postUpdateAction()
if (throttle.interBrokerLimit >= 0)
println(s"The inter-broker throttle limit was set to ${throttle.interBrokerLimit} B/s")
if (throttle.replicaAlterLogDirsLimit >= 0)
println(s"The replica-alter-dir throttle limit was set to ${throttle.replicaAlterLogDirsLimit} B/s")
}
```
1.1 将一些topic的限流配置写入到节点`/config/topics/{topicName}`中
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210619154313953.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
将计算得到的leader、follower 值写入到`/config/topics/{topicName}`中
leader: 找到 TopicPartition中有新增的副本的 那个分区;数据= 分区号:副本号,分区号:副本号
follower: 遍历 预期 TopicPartition,副本= 预期副本-现有副本;数据= 分区号:副本号,分区号:副本号
`leader.replication.throttled.replicas`: leader
`follower.replication.throttled.replicas`: follower
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210619154858445.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
1.2. 执行 《**2.2.1 已有任务,尝试限流**》流程
2. 从zk中获取`/broker/topics/{topicName}`数据来验证给定的分区是否存在,如果分区不存在则忽略此分区的配置,继续流程
3. 如果尚未创建副本,则发送 `AlterReplicaLogDirsRequest` 以允许代理稍后在正确的日志目录中创建副本。这个跟 `log_dirs` 有关 TODO....
4. 将重分配的数据写入到zk的节点`/admin/reassign_partitions`中;数据内容如:
```
{"version":1,"partitions":[{"topic":"test_create_topic1","partition":0,"replicas":[0,1,2,3]},{"topic":"test_create_topic1","partition":1,"replicas":[1,2,0,3]},{"topic":"test_create_topic1","partition":2,"replicas":[2,1,0,3]}]}
```
5. 再次发送 `AlterReplicaLogDirsRequest `以确保代理将开始将副本移动到指定的日志目录。控制器在代理中创建副本可能需要一些时间。如果尚未创建副本,请重试。
 1. 向Broker发送`alterReplicaLogDirs`请求
#### 2.2.3 Controller监听`/admin/reassign_partitions`节点变化
`KafkaController.processZkPartitionReassignment`
```scala
private def processZkPartitionReassignment(): Set[TopicPartition] = {
// We need to register the watcher if the path doesn't exist in order to detect future
// reassignments and we get the `path exists` check for free
if (isActive && zkClient.registerZNodeChangeHandlerAndCheckExistence(partitionReassignmentHandler)) {
val reassignmentResults = mutable.Map.empty[TopicPartition, ApiError]
val partitionsToReassign = mutable.Map.empty[TopicPartition, ReplicaAssignment]
zkClient.getPartitionReassignment().foreach { case (tp, targetReplicas) =>
maybeBuildReassignment(tp, Some(targetReplicas)) match {
case Some(context) => partitionsToReassign.put(tp, context)
case None => reassignmentResults.put(tp, new ApiError(Errors.NO_REASSIGNMENT_IN_PROGRESS))
}
}
reassignmentResults ++= maybeTriggerPartitionReassignment(partitionsToReassign)
val (partitionsReassigned, partitionsFailed) = reassignmentResults.partition(_._2.error == Errors.NONE)
if (partitionsFailed.nonEmpty) {
warn(s"Failed reassignment through zk with the following errors: $partitionsFailed")
maybeRemoveFromZkReassignment((tp, _) => partitionsFailed.contains(tp))
}
partitionsReassigned.keySet
} else {
Set.empty
}
}
```
1. 判断是否是Controller角色并且是否存在节点`/admin/reassign_partitions`
2. `maybeTriggerPartitionReassignment` 触发重分配; 如果topic已经被标记为删除,则该topic的重分配流程终止;
3. `maybeRemoveFromZkReassignment`将执行失败的一些分区信息从zk中删除;(覆盖信息)
##### onPartitionReassignment
`KafkaController.onPartitionReassignment`
```scala
private def onPartitionReassignment(topicPartition: TopicPartition, reassignment: ReplicaAssignment): Unit = {
// 暂停一些正在删除的Topic操作
topicDeletionManager.markTopicIneligibleForDeletion(Set(topicPartition.topic), reason = "topic reassignment in progress")
//更新当前的分配
updateCurrentReassignment(topicPartition, reassignment)
val addingReplicas = reassignment.addingReplicas
val removingReplicas = reassignment.removingReplicas
if (!isReassignmentComplete(topicPartition, reassignment)) {
// A1. Send LeaderAndIsr request to every replica in ORS + TRS (with the new RS, AR and RR).
updateLeaderEpochAndSendRequest(topicPartition, reassignment)
// A2. replicas in AR -> NewReplica
startNewReplicasForReassignedPartition(topicPartition, addingReplicas)
} else {
// B1. replicas in AR -> OnlineReplica
replicaStateMachine.handleStateChanges(addingReplicas.map(PartitionAndReplica(topicPartition, _)), OnlineReplica)
// B2. Set RS = TRS, AR = [], RR = [] in memory.
val completedReassignment = ReplicaAssignment(reassignment.targetReplicas)
controllerContext.updatePartitionFullReplicaAssignment(topicPartition, completedReassignment)
// B3. Send LeaderAndIsr request with a potential new leader (if current leader not in TRS) and
// a new RS (using TRS) and same isr to every broker in ORS + TRS or TRS
moveReassignedPartitionLeaderIfRequired(topicPartition, completedReassignment)
// B4. replicas in RR -> Offline (force those replicas out of isr)
// B5. replicas in RR -> NonExistentReplica (force those replicas to be deleted)
stopRemovedReplicasOfReassignedPartition(topicPartition, removingReplicas)
// B6. Update ZK with RS = TRS, AR = [], RR = [].
updateReplicaAssignmentForPartition(topicPartition, completedReassignment)
// B7. Remove the ISR reassign listener and maybe update the /admin/reassign_partitions path in ZK to remove this partition from it.
removePartitionFromReassigningPartitions(topicPartition, completedReassignment)
// B8. After electing a leader in B3, the replicas and isr information changes, so resend the update metadata request to every broker
sendUpdateMetadataRequest(controllerContext.liveOrShuttingDownBrokerIds.toSeq, Set(topicPartition))
// signal delete topic thread if reassignment for some partitions belonging to topics being deleted just completed
topicDeletionManager.resumeDeletionForTopics(Set(topicPartition.topic))
}
}
```
1. 暂停一些正在删除的Topic操作
2. 更新 Zk节点`brokers/topics/{topicName}`,和内存中的当前分配状态。如果重新分配已经在进行中,那么新的重新分配将取代它并且一些副本将被关闭。
2.1 更新zk中的topic节点信息`brokers/topics/{topicName}`,这里会标记AR哪些副本是新增的,RR哪些副本是要删除的;例如:![在这里插入图片描述](https://img-blog.csdnimg.cn/20210619174300940.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210619174325110.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
2.2 更新当前内存
2.3 如果**重新分配**已经在进行中,那么一些当前新增加的副本有可能被立即删除,在这种情况下,我们需要停止副本。
2.4 注册一个监听节点`/brokers/topics/{topicName}/partitions/{分区号}/state`变更的处理器`PartitionReassignmentIsrChangeHandler`
3. 如果该分区的重新分配还没有完成(根据`/brokers/topics/{topicName}/partitions/{分区号}/state`里面的isr来判断是否已经包含了新增的BrokerId了);则
以下几个名称说明:
`ORS`: OriginReplicas 原先的副本
`TRS`: targetReplicas 将要变更成的目标副本
`AR`: adding_replicas 正在添加的副本
`RR`:removing_replicas 正在移除的副本
3.1 向 ORS + TRS 中的每个副本发送` LeaderAndIsr `请求(带有新的 RS、AR 和 RR
3.2 给新增加的AR副本 进行状态变更成`NewReplica` ; 这个过程有发送`LeaderAndIsrRequest`详细请看[【kafka源码】Controller中的状态机]()
#### 2.2.4 Controller监听节点`brokers/topics/{topicName}`变化,检查是否有新增分区
这一个流程可以不必在意,因为在这里没有做任何事情;
>上面的 **2.2.3** 的第2小段中不是有将新增的和删掉的副本写入到了 zk中吗
>例如:
>```json
>
>{"version":2,"partitions":{"2":[0,1],"1":[0,1],"0":[0,1]},"adding_replicas":{"2":[1],"1":[1],"0":[1]},"removing_replicas":{}}
>
>```
Controller监听到这个节点之后,执行方法`processPartitionModifications`
`KafkaController.processPartitionModifications`
```scala
private def processPartitionModifications(topic: String): Unit = {
def restorePartitionReplicaAssignment(
topic: String,
newPartitionReplicaAssignment: Map[TopicPartition, ReplicaAssignment]
): Unit = {
info("Restoring the partition replica assignment for topic %s".format(topic))
//从zk节点中获取所有分区
val existingPartitions = zkClient.getChildren(TopicPartitionsZNode.path(topic))
//找到已经存在的分区
val existingPartitionReplicaAssignment = newPartitionReplicaAssignment
.filter(p => existingPartitions.contains(p._1.partition.toString))
.map { case (tp, _) =>
tp -> controllerContext.partitionFullReplicaAssignment(tp)
}.toMap
zkClient.setTopicAssignment(topic,
existingPartitionReplicaAssignment,
controllerContext.epochZkVersion)
}
if (!isActive) return
val partitionReplicaAssignment = zkClient.getFullReplicaAssignmentForTopics(immutable.Set(topic))
val partitionsToBeAdded = partitionReplicaAssignment.filter { case (topicPartition, _) =>
controllerContext.partitionReplicaAssignment(topicPartition).isEmpty
}
if (topicDeletionManager.isTopicQueuedUpForDeletion(topic)) {
if (partitionsToBeAdded.nonEmpty) {
warn("Skipping adding partitions %s for topic %s since it is currently being deleted"
.format(partitionsToBeAdded.map(_._1.partition).mkString(","), topic))
restorePartitionReplicaAssignment(topic, partitionReplicaAssignment)
} else {
// This can happen if existing partition replica assignment are restored to prevent increasing partition count during topic deletion
info("Ignoring partition change during topic deletion as no new partitions are added")
}
} else if (partitionsToBeAdded.nonEmpty) {
info(s"New partitions to be added $partitionsToBeAdded")
partitionsToBeAdded.foreach { case (topicPartition, assignedReplicas) =>
controllerContext.updatePartitionFullReplicaAssignment(topicPartition, assignedReplicas)
}
onNewPartitionCreation(partitionsToBeAdded.keySet)
}
}
```
1. 从`brokers/topics/{topicName}`中获取完整的分配信息,例如
```json
{
"version": 2,
"partitions": {
"2": [0, 1],
"1": [0, 1],
"0": [0, 1]
},
"adding_replicas": {
"2": [1],
"1": [1],
"0": [1]
},
"removing_replicas": {}
}
```
2. 如果有需要新增的分区,如下操作
2.1 如果当前Topic刚好在删除队列中,那么就没有必要进行分区扩容了; 将zk的`brokers/topics/{topicName}`数据恢复回去
2.2 如果不在删除队列中,则开始走新增分区的流程;关于新增分区的流程 在[【kafka源码】TopicCommand之创建Topic源码解析
]()里面已经详细讲过了,跳转后请搜索关键词`onNewPartitionCreation`
3. 如果该Topic正在删除中,则跳过该Topic的处理; 并且如果有AR(adding_replicas),则重写一下zk节点`/broker/topics/{topicName}`的数据, 相当于是还原数据, 移除掉里面的AR;
**这一步完全不用理会,因为 分区副本重分配不会出现新增分区的情况;**
#### 2.2.5 Controller监听zk节点`/brokers/topics/{topicName}/partitions/{分区号}/state`
> 上面2.2.3 里面的 2.4不是有说过注册一个监听节点`/brokers/topics/{topicName}/partitions/{分区号}/state`变更的处理器`PartitionReassignmentIsrChangeHandler`
>
到底什么时候这个节点会有变化呢? 前面我们不是对副本们发送了`LEADERANDISR`的请求么, 当新增的副本去leader fetch数据开始同步, 同步完成跟上了ISR的节奏之后,就会去修改这个节点(节点数据示例见下); 修改之后,下面就开始执行监听流程了
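该state节点里的数据大致如下(示例数据,字段值以实际集群为准),新副本追上进度后,`isr` 数组里就会多出它的BrokerId:
```json
{"controller_epoch":1,"leader":0,"version":1,"leader_epoch":2,"isr":[0,1]}
```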
这里跟 **2.2.3** 中有调用同一个接口; 不过这个时候经过了`LeaderAndIsr`请求
`kafkaController.processPartitionReassignmentIsrChange->onPartitionReassignment`
```scala
private def onPartitionReassignment(topicPartition: TopicPartition, reassignment: ReplicaAssignment): Unit = {
// While a reassignment is in progress, deletion is not allowed
topicDeletionManager.markTopicIneligibleForDeletion(Set(topicPartition.topic), reason = "topic reassignment in progress")
updateCurrentReassignment(topicPartition, reassignment)
val addingReplicas = reassignment.addingReplicas
val removingReplicas = reassignment.removingReplicas
if (!isReassignmentComplete(topicPartition, reassignment)) {
// A1. Send LeaderAndIsr request to every replica in ORS + TRS (with the new RS, AR and RR).
updateLeaderEpochAndSendRequest(topicPartition, reassignment)
// A2. replicas in AR -> NewReplica
startNewReplicasForReassignedPartition(topicPartition, addingReplicas)
} else {
// B1. replicas in AR -> OnlineReplica
replicaStateMachine.handleStateChanges(addingReplicas.map(PartitionAndReplica(topicPartition, _)), OnlineReplica)
// B2. Set RS = TRS, AR = [], RR = [] in memory.
val completedReassignment = ReplicaAssignment(reassignment.targetReplicas)
controllerContext.updatePartitionFullReplicaAssignment(topicPartition, completedReassignment)
// B3. Send LeaderAndIsr request with a potential new leader (if current leader not in TRS) and
// a new RS (using TRS) and same isr to every broker in ORS + TRS or TRS
moveReassignedPartitionLeaderIfRequired(topicPartition, completedReassignment)
// B4. replicas in RR -> Offline (force those replicas out of isr)
// B5. replicas in RR -> NonExistentReplica (force those replicas to be deleted)
stopRemovedReplicasOfReassignedPartition(topicPartition, removingReplicas)
// B6. Update ZK with RS = TRS, AR = [], RR = [].
updateReplicaAssignmentForPartition(topicPartition, completedReassignment)
// B7. Remove the ISR reassign listener and maybe update the /admin/reassign_partitions path in ZK to remove this partition from it.
removePartitionFromReassigningPartitions(topicPartition, completedReassignment)
// B8. After electing a leader in B3, the replicas and isr information changes, so resend the update metadata request to every broker
sendUpdateMetadataRequest(controllerContext.liveOrShuttingDownBrokerIds.toSeq, Set(topicPartition))
// signal delete topic thread if reassignment for some partitions belonging to topics being deleted just completed
topicDeletionManager.resumeDeletionForTopics(Set(topicPartition.topic))
}
}
```
以下几个名称说明:
`ORS`: origin repilicas 原先的副本
`RS`: Replicas 现在的副本
`TRS`: targetReplicas 将要变更成的目标副本
`AR`: adding_replicas 正在添加的副本
`RR`:removing_replicas 正在移除的副本
1. 副本状态变更 -> `OnlineReplica`,将 AR 中的所有副本移动到 OnlineReplica 状态
2. 在内存中设置 RS = TRS, AR = [], RR = []
3. 向 ORS + TRS 或 TRS 中的每个Broker发送带有潜在新Leader(如果当前Leader不在 TRS 中)、新 RS(使用 TRS)和相同 isr 的 `LeaderAndIsr` 请求
4. 将 RR 中的所有副本移动到 `OfflineReplica` 状态(注意 `LeaderAndIsr` 可能会发送到多于 TRS 的副本)。转换的过程中,会把 RR 从 ZooKeeper 的 isr 中删除,并且仅向 Leader 发送一个 `LeaderAndIsr` 以通知它缩小后的 isr; 之后向 RR 中的副本发送 `StopReplica (delete = false)`, 这个时候还没有真正进行删除。
5. 将 RR 中的所有副本移动到 `NonExistentReplica` 状态。这将向 RR 中的副本发送 `StopReplica (delete = true)` 以物理删除磁盘上的副本。这里的流程可以看看文章[【kafka源码】TopicCommand之删除Topic源码解析]()
6. 用 RS=TRS, AR=[], RR=[] 更新 zk `/broker/topics/{topicName}` 节点: 更新partitions并移除AR(adding_replicas)、RR(removing_replicas), 例如
```json
{"version":2,"partitions":{"2":[0,1],"1":[0,1],"0":[0,1]},"adding_replicas":{},"removing_replicas":{}}
```
7. 删除 ISR 重新分配的监听器`/brokers/topics/{topicName}/partitions/{分区号}/state`,并可能更新 ZK 中的 `/admin/reassign_partitions` 路径,从中删除此分区(如果存在)
8. 选举leader后,replicas和isr信息发生了变化, 因此向每个Broker重新发送`UPDATE_METADATA`更新元数据请求。
9. 恢复删除线程`resumeDeletions`; 该操作在[【kafka源码】TopicCommand之删除Topic源码解析]()中分析过, 请移步阅读,并搜索关键字`resumeDeletions`
#### 2.2.6 Controller重新选举恢复 恢复任务
> KafkaController.onControllerFailover() 里面 有调用接口`initializePartitionReassignments` 会恢复未完成的重分配任务
#### alterReplicaLogDirs请求
> 副本跨路径迁移相关
`KafkaApis.handleAlterReplicaLogDirsRequest`
```scala
def handleAlterReplicaLogDirsRequest(request: RequestChannel.Request): Unit = {
val alterReplicaDirsRequest = request.body[AlterReplicaLogDirsRequest]
val responseMap = {
if (authorize(request, ALTER, CLUSTER, CLUSTER_NAME))
replicaManager.alterReplicaLogDirs(alterReplicaDirsRequest.partitionDirs.asScala)
else
alterReplicaDirsRequest.partitionDirs.asScala.keys.map((_, Errors.CLUSTER_AUTHORIZATION_FAILED)).toMap
}
sendResponseMaybeThrottle(request, requestThrottleMs => new AlterReplicaLogDirsResponse(requestThrottleMs, responseMap.asJava))
}
```
### 2.3`--verify ` 验证结果分析
>校验执行情况, 顺便移除之前加过的限流配置
>`--zookeeper xxxxx --reassignment-json-file config/reassignment-json-file.json --verify`
>
>
源码在`ReassignPartitionsCommand.verifyAssignment`,比较简单, 这里就不展开分析了;
主要就是校验重分配是否完成,并把之前写入的限流配置清理掉
### 2.4 Moving replicas across log directories
>Why is disk usage across the disks of an online Kafka broker often very uneven, with one disk far fuller than the others? Kafka only guarantees that the number of partitions is spread evenly across log directories; it has no idea how much space each partition actually occupies, so a few partitions with huge message volumes can easily fill one disk. Before version 1.1 there was nothing users could do about it, because Kafka only supported reassigning partition data between brokers, not between different log directories on the same broker. Version 1.1 added official support for moving replicas between log directories.
**How do you store partitions under multiple directories on a single Broker?**
Simply list several directories in the configuration:
```
############################# Log Basics #############################
# A comma separated list of directories under which to store log files
log.dirs=kafka-logs-5,kafka-logs-6,kafka-logs-7,kafka-logs-8
```
**Note: different log directories on the same Broker only hold different partitions; two replicas of the same partition are never placed on the same Broker, otherwise the replica would be pointless for fault tolerance.**
**How do you trigger a cross-directory move?**
The reassignment json file has a `log_dirs` field; if it is omitted, the default is `"log_dirs": ["any"]` (the size of this array must match the number of replicas).
To move replicas across directories, just fill in absolute paths here, as in the example below.
Example reassignment json file:
```json
{
"version": 1,
"partitions": [{
"topic": "test_create_topic4",
"partition": 2,
"replicas": [0],
"log_dirs": ["/Users/xxxxx/work/IdeaPj/source/kafka/kafka-logs-5"]
}, {
"topic": "test_create_topic4",
"partition": 1,
"replicas": [0],
"log_dirs": ["/Users/xxxxx/work/IdeaPj/source/kafka/kafka-logs-6"]
}]
}
```
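A rough illustration of how such a file might be submitted (file name and addresses are made up); when absolute `log_dirs` paths are given, the tool also needs `--bootstrap-server` so the brokers can be told to move the replicas between directories:
```sh
bin/kafka-reassign-partitions.sh \
  --zookeeper localhost:2181 \
  --bootstrap-server localhost:9092 \
  --reassignment-json-file config/cross-dir-move.json \
  --execute
```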
## 3. Source code summary
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210621210656372.jpg?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70#pic_center)
## 4.Q&A
### Does adding replicas trigger a leader re-election?
>**Question:** If the original assignment was `"replicas": [0,1]` and the reassignment changes it to `"replicas": [0,1,2]` or `"replicas": [2,0,1]`, will the Leader change?
> **Answer:** No. As long as the original Leader is not touched, no re-election is triggered.
### Does removing replicas trigger a leader re-election?
>**Question:** If the original assignment was `"replicas": [0,1,2]` and the reassignment changes it to `"replicas": [0,1]`, `"replicas": [2,0]` or `"replicas": [1,2]`, will the Leader change?
> **Answer:** No. As long as the original Leader is not touched, no re-election is triggered;
> but if the previous Leader is among the removed replicas, a re-election is triggered.
> If an election is triggered, what strategy is used? The strategy is shown in the figure below.
> ![在这里插入图片描述](https://img-blog.csdnimg.cn/20210621210442371.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
### What happens if a delete is issued while a reassignment is in progress?
> The delete operation waits; once the reassignment completes, the deletion continues.
> See the source code summary in [【kafka源码】TopicCommand之删除Topic源码解析]()
> ![在这里插入图片描述](https://img-blog.csdnimg.cn/20210621172839258.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
### At what point are the new replicas actually added?
> ![在这里插入图片描述](https://img-blog.csdnimg.cn/20210621211253921.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
>After a new replica is added it starts syncing with the leader, and the isr information in the node `/brokers/topics/{topicName}/partitions/{partitionId}/state` is updated
### At what point are replicas actually deleted?
>![在这里插入图片描述](https://img-blog.csdnimg.cn/20210621211429939.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
>Replica deletion is a sequence of replica state transitions; see [【kafka源码】Controller中的状态机]() for details
### Can a reassignment be triggered by manually creating the `/admin/reassign_partitions` node in zk?
> It works, but there is no point; you would have to do the pre-checks yourself
### Throttle configuration details
> There are many throttle-related configs; for throttling see [TODO.....]()
### What if the reassignment neither adds nor removes replicas but only changes their order?
> Q: Suppose the partition replicas change from [0,1,2] to [2,1,0]. Are replicas deleted and re-created? Is a leader election triggered?
> A: No. Since the replica set neither grows nor shrinks, there is no add/remove replica flow; in the end only the order stored in the zk node `/broker/topics/{topicName}` changes. The only effect is that the next preferred-replica election will make the first replica the Leader.
### Do manually written throttle configs take effect during a reassignment?
>For throttling see [TODO.....]()
### If the Controller is re-elected, does the reassignment continue?
> `KafkaController.onControllerFailover()` calls `initializePartitionReassignments`, which resumes any unfinished reassignment tasks

View File

@@ -0,0 +1,411 @@
## Script parameters
Run `sh bin/kafka-topics.sh --help` for the full parameter list.
Only the parameters related to `--alter` are listed below.
| Parameter | Description | Example |
|--|--|--|
|`--bootstrap-server `|The kafka cluster to connect to; when this is given, `--zookeeper` is not required|--bootstrap-server localhost:9092 |
|`--replica-assignment `|Replica assignment for the partitions; when altering a topic you can specify the assignment yourself |`--replica-assignment id0:id1:id2,id3:id4:id5,id6:id7:id8`, where "id0:id1:id2,id3:id4:id5,id6:id7:id8" means the Topic has 3 Partitions (separated by ","), each with 3 Replicas (separated by ":"); the mapping between Topic Partition Replicas and Kafka Brokers is shown below: ![在这里插入图片描述](https://img-blog.csdnimg.cn/20210617140207438.png)
## Alter Topic script
## Increasing the partition count
**Via zk (not recommended)**
```sh
bin/kafka-topics.sh --zookeeper localhost:2181 --alter --topic topic1 --partitions 2
```
**Kafka >= 2.2 supports the following form (recommended)**
**Expanding a single Topic**
>`bin/kafka-topics.sh --bootstrap-server broker_host:port --alter --topic test_create_topic1 --partitions 4`
**Batch expansion** (expand every Topic matched by the regular expression to 4 partitions)
>`sh bin/kafka-topics.sh --topic ".*?" --bootstrap-server 172.23.248.85:9092 --alter --partitions 4`
>
The regular expression `".*?"` matches everything; adjust it to match what you need.
**PS:** If a matched Topic already has at least the requested number of partitions, an exception is thrown for that Topic, but the other Topics are still processed normally.
---
Related optional parameters
| Parameter | Description | Example |
|--|--|--|
|`--replica-assignment `|Replica assignment for the partitions; when creating or altering a topic you can specify the assignment yourself |`--replica-assignment` BrokerId-0:BrokerId-1:BrokerId-2,BrokerId-1:BrokerId-2:BrokerId-0,BrokerId-2:BrokerId-1:BrokerId-0 ; this describes three partitions with three replicas each and the Brokers they are assigned to; commas separate partitions, colons separate replicas|
**PS: although the full partition/replica assignment is supplied here, only the newly added partitions actually take effect;**
For example, suppose the previous layout (3 partitions, 1 replica) was:
| Broker-1 |Broker-2 |Broker-3|Broker-4|
|--|--|--|--|
|0 | 1 |2| |
Now we add one partition with `--replica-assignment 2,1,3,4`; at first glance this looks as if it would also swap partitions 0 and 1 between Brokers:
| Broker-1 |Broker-2 |Broker-3|Broker-4|
|--|--|--|--|
|1 | 0 |2|3|
But that is not what actually happens: when the request is processed, the first 3 entries are dropped and only the assignment of the newly added partition is used; the existing partitions stay where they are.
| Broker-1 |Broker-2 |Broker-3|Broker-4|
|--|--|--|--|
|0 | 1 |2|3|
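Putting the example above into a command (topic name and address are made up; a sketch, not real output):
```sh
# Expand a hypothetical 3-partition topic to 4 partitions.
# Only the last entry of --replica-assignment (broker 4) matters: it is used for
# the new partition 3; the first three entries are dropped.
bin/kafka-topics.sh --bootstrap-server localhost:9092 --alter \
  --topic test_expand_topic \
  --partitions 4 \
  --replica-assignment 2,1,3,4
```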
## Source code analysis
> <font color=red>If the source walkthrough feels dry, you can skip straight to the **source code summary and the sections after it**</font>
The request flow is analyzed in detail in [【kafka源码】TopicCommand之创建Topic源码解析](), so this article focuses only on the key points.
### 1. `TopicCommand.alterTopic`
```scala
override def alterTopic(opts: TopicCommandOptions): Unit = {
val topic = new CommandTopicPartition(opts)
val topics = getTopics(opts.topic, opts.excludeInternalTopics)
    //check whether the Topic exists
ensureTopicExists(topics, opts.topic)
    //fetch some basic information about the topic
val topicsInfo = adminClient.describeTopics(topics.asJavaCollection).values()
adminClient.createPartitions(topics.map {topicName =>
      //check whether the replica-assignment parameter was given to specify the assignment
if (topic.hasReplicaAssignment) {
val startPartitionId = topicsInfo.get(topicName).get().partitions().size()
val newAssignment = {
val replicaMap = topic.replicaAssignment.get.drop(startPartitionId)
new util.ArrayList(replicaMap.map(p => p._2.asJava).asJavaCollection).asInstanceOf[util.List[util.List[Integer]]]
}
topicName -> NewPartitions.increaseTo(topic.partitions.get, newAssignment)
} else {
topicName -> NewPartitions.increaseTo(topic.partitions.get)
}}.toMap.asJava).all().get()
}
```
1. Check whether the Topic exists
2. If `--replica-assignment` was set, only the assignment of the newly added partitions is computed; the existing partition assignment is not modified. As the source shows, if the previous assignment was 3,3,3 (3 partitions, one replica each, all on BrokerId-3) and the new argument is `3,3,3,3` (one extra partition), the existing entries are dropped and only the trailing 3 is passed on (i.e. create one new partition on Broker 3).![在这里插入图片描述](https://img-blog.csdnimg.cn/20210617142452499.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
3. If `--replica-assignment` was not given, the default assignment strategy is used later on.
#### The client issues a createPartitions request
`KafkaAdminClient.createPartitions` (parts omitted)
```java
@Override
public CreatePartitionsResult createPartitions(Map<String, NewPartitions> newPartitions,
final CreatePartitionsOptions options) {
final Map<String, KafkaFutureImpl<Void>> futures = new HashMap<>(newPartitions.size());
for (String topic : newPartitions.keySet()) {
futures.put(topic, new KafkaFutureImpl<>());
}
runnable.call(new Call("createPartitions", calcDeadlineMs(now, options.timeoutMs()),
new ControllerNodeProvider()) {
            //parts omitted
@Override
void handleFailure(Throwable throwable) {
completeAllExceptionally(futures.values(), throwable);
}
}, now);
return new CreatePartitionsResult(new HashMap<>(futures));
}
```
1. As the source shows, the `createPartitions` request is sent to the node returned by `ControllerNodeProvider`, i.e. the Controller
### 2. The Controller-side handling of the createPartitions request
>
`KafkaApis.handleCreatePartitionsRequest`
```scala
def handleCreatePartitionsRequest(request: RequestChannel.Request): Unit = {
val createPartitionsRequest = request.body[CreatePartitionsRequest]
    //parts omitted..
    //if this broker is not currently the Controller, return an error right away
if (!controller.isActive) {
val result = createPartitionsRequest.data.topics.asScala.map { topic =>
(topic.name, new ApiError(Errors.NOT_CONTROLLER, null))
}.toMap
sendResponseCallback(result)
} else {
// Special handling to add duplicate topics to the response
val topics = createPartitionsRequest.data.topics.asScala
val dupes = topics.groupBy(_.name)
.filter { _._2.size > 1 }
.keySet
val notDuped = topics.filterNot(topic => dupes.contains(topic.name))
val authorizedTopics = filterAuthorized(request, ALTER, TOPIC, notDuped.map(_.name))
val (authorized, unauthorized) = notDuped.partition { topic => authorizedTopics.contains(topic.name) }
val (queuedForDeletion, valid) = authorized.partition { topic =>
controller.topicDeletionManager.isTopicQueuedUpForDeletion(topic.name)
}
val errors = dupes.map(_ -> new ApiError(Errors.INVALID_REQUEST, "Duplicate topic in request.")) ++
unauthorized.map(_.name -> new ApiError(Errors.TOPIC_AUTHORIZATION_FAILED, "The topic authorization is failed.")) ++
queuedForDeletion.map(_.name -> new ApiError(Errors.INVALID_TOPIC_EXCEPTION, "The topic is queued for deletion."))
adminManager.createPartitions(createPartitionsRequest.data.timeoutMs,
valid,
createPartitionsRequest.data.validateOnly,
request.context.listenerName, result => sendResponseCallback(result ++ errors))
}
}
```
1. Check whether this broker is the Controller; if not, throw an exception and stop
2. Authorization check
3. Call `adminManager.createPartitions`
3.1 Read the Broker list metadata from `/brokers/ids/` in zk
3.2 Read the existing replica assignment from `/brokers/topics/{topicName}` in zk, and check whether a replica reassignment is currently in progress; if so, throw an exception and stop
3.3 If the zk node `/brokers/topics/{topicName}` does not exist, throw `The topic '$topic' does not exist`
3.4 Check that the requested partition count is larger than the current one; if it is smaller or equal, throw an exception and stop
3.5 If `--replica-assignment` references a BrokerId that does not exist, throw `Unknown broker(s) in replica assignment` and stop
3.6 If the `--partitions` count does not match the number of new entries in `--replica-assignment`, throw `Increasing the number of partitions by...` and stop
3.7 Call `adminZkClient.addPartitions`
#### `adminZkClient.addPartitions` adds the partitions
1. Check that `--partitions` is larger than the existing partition count, otherwise throw `The number of partitions for a topic can only be increased`
2. If `--replica-assignment` was given, run some simple validation on it
3. Call `AdminUtils.assignReplicasToBrokers` to assign the replicas; this was analyzed in [【kafka源码】TopicCommand之创建Topic源码解析](); for details see [【kafka源码】创建Topic的时候是如何分区和副本的分配规则](). Since only new partitions are being added here, only the new partitions go through the assignment computation
4. Once the assignment is computed, call `adminZkClient.writeTopicPartitionAssignment` to write it
#### adminZkClient.writeTopicPartitionAssignment writes the partition info to zk
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210617154406685.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
This code was also covered in [【kafka源码】TopicCommand之创建Topic源码解析](), but there the create interface `zkClient.createTopicAssignment` was called.
Here `zkClient.setTopicAssignment` is called instead, which overwrites the existing data, so the current partition info is read first and written back together with the new partitions:
1. Read the Topic's existing partition/replica assignment
2. Combine the existing assignment with the partitions being added into one object and write it to the zk node `/brokers/topics/{topicName}`
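One way to see the result (assuming a local zookeeper and a made-up topic name) is to dump the node with the bundled zookeeper shell; the `partitions` map should now contain the old partitions plus the newly added ones:
```sh
bin/zookeeper-shell.sh localhost:2181 get /brokers/topics/test_expand_topic
# e.g. {"version":2,"partitions":{"3":[4],"2":[3],"1":[2],"0":[1]},"adding_replicas":{},"removing_replicas":{}}
```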
### 3. The Controller watches `/brokers/topics/{topicName}` and the Brokers actually write the partitions to disk
When the node changes, the following handler is invoked;
`KafkaController.processPartitionModifications`
```scala
private def processPartitionModifications(topic: String): Unit = {
def restorePartitionReplicaAssignment(
topic: String,
newPartitionReplicaAssignment: Map[TopicPartition, ReplicaAssignment]
): Unit = {
info("Restoring the partition replica assignment for topic %s".format(topic))
val existingPartitions = zkClient.getChildren(TopicPartitionsZNode.path(topic))
val existingPartitionReplicaAssignment = newPartitionReplicaAssignment
.filter(p => existingPartitions.contains(p._1.partition.toString))
.map { case (tp, _) =>
tp -> controllerContext.partitionFullReplicaAssignment(tp)
}.toMap
zkClient.setTopicAssignment(topic,
existingPartitionReplicaAssignment,
controllerContext.epochZkVersion)
}
if (!isActive) return
val partitionReplicaAssignment = zkClient.getFullReplicaAssignmentForTopics(immutable.Set(topic))
val partitionsToBeAdded = partitionReplicaAssignment.filter { case (topicPartition, _) =>
controllerContext.partitionReplicaAssignment(topicPartition).isEmpty
}
if (topicDeletionManager.isTopicQueuedUpForDeletion(topic)) {
if (partitionsToBeAdded.nonEmpty) {
warn("Skipping adding partitions %s for topic %s since it is currently being deleted"
.format(partitionsToBeAdded.map(_._1.partition).mkString(","), topic))
restorePartitionReplicaAssignment(topic, partitionReplicaAssignment)
} else {
// This can happen if existing partition replica assignment are restored to prevent increasing partition count during topic deletion
info("Ignoring partition change during topic deletion as no new partitions are added")
}
} else if (partitionsToBeAdded.nonEmpty) {
info(s"New partitions to be added $partitionsToBeAdded")
partitionsToBeAdded.foreach { case (topicPartition, assignedReplicas) =>
controllerContext.updatePartitionFullReplicaAssignment(topicPartition, assignedReplicas)
}
onNewPartitionCreation(partitionsToBeAdded.keySet)
}
}
```
1. Check whether this broker is the Controller; if not, return immediately
2. Read the `/brokers/topics/{topicName}` node and compare it with the partition assignment currently held in memory to find the newly added partitions; for a brand-new partition the node `/brokers/topics/{topicName}/partitions/{partitionId}/state` does not exist yet
3. If the Topic is currently being deleted, there is no point in expanding its partitions
4. Load the newly added partition info into memory
5. Call `KafkaController.onNewPartitionCreation`
#### KafkaController.onNewPartitionCreation creates the new partitions
From here on, the flow is the same as the corresponding part of topic creation;
> This method drives the state transitions of the new partitions and replicas; it was also analyzed in [【kafka源码】TopicCommand之创建Topic源码解析]()
```scala
/**
* This callback is invoked by the topic change callback with the list of failed brokers as input.
* It does the following -
* 1. Move the newly created partitions to the NewPartition state
* 2. Move the newly created partitions from NewPartition->OnlinePartition state
*/
private def onNewPartitionCreation(newPartitions: Set[TopicPartition]): Unit = {
info(s"New partition creation callback for ${newPartitions.mkString(",")}")
partitionStateMachine.handleStateChanges(newPartitions.toSeq, NewPartition)
replicaStateMachine.handleStateChanges(controllerContext.replicasForPartition(newPartitions).toSeq, NewReplica)
partitionStateMachine.handleStateChanges(
newPartitions.toSeq,
OnlinePartition,
Some(OfflinePartitionLeaderElectionStrategy(false))
)
replicaStateMachine.handleStateChanges(controllerContext.replicasForPartition(newPartitions).toSeq, OnlineReplica)
}
```
1. Transition the partitions to be created to the `NewPartition` state;
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210616180239988.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
2. Transition the replicas to be created to the `NewReplica` state;
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210616180940961.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
3. Transition the partition state from `NewPartition` to `OnlinePartition`
	1. Build `leaderIsrAndControllerEpochs`; the Leader is the first replica in the list;
	2. Write the persistent zk node `/brokers/topics/{topicName}/partitions/`; no data
	3. Write the persistent zk node `/brokers/topics/{topicName}/partitions/{partitionId}`; no data
	4. Write the persistent zk node `/brokers/topics/{topicName}/partitions/{partitionId}/state`; the data is the `leaderIsrAndControllerEpoch`![在这里插入图片描述](https://img-blog.csdnimg.cn/20210616183747171.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
	5. Send a [`leaderAndIsrRequest`]() to the Brokers that host the replicas
	6. Send an [`UPDATE_METADATA`]() request to all Brokers
4. Transition the replica state from `NewReplica` to `OnlineReplica` and update the in-memory state
For details on the partition and replica state machines see [【kafka源码】Controller中的状态机](TODO)
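For a newly added partition, the resulting state node can be inspected the same way as above (topic name and partition number are placeholders):
```sh
bin/zookeeper-shell.sh localhost:2181 get /brokers/topics/test_expand_topic/partitions/3/state
# e.g. {"controller_epoch":1,"leader":4,"version":1,"leader_epoch":0,"isr":[4]}
```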
### 4. A Broker receives LeaderAndIsrRequest and creates the local Log
>The previous step sends a [`leaderAndIsrRequest`]() to the Brokers that host the replicas. What happens there?
>Mainly, the local Log is created.
>
Since there is a lot of code, we jump straight to the part that matters for creating the partition:
`KafkaApis.handleLeaderAndIsrRequest->replicaManager.becomeLeaderOrFollower->ReplicaManager.makeLeaders...LogManager.getOrCreateLog`
```scala
/**
   * If the log already exists, just return a copy of the existing log. Otherwise, if isNew = true or there is no offline log directory, create the log for the given topic and partition; otherwise throw a KafkaStorageException
*/
def getOrCreateLog(topicPartition: TopicPartition, config: LogConfig, isNew: Boolean = false, isFuture: Boolean = false): Log = {
logCreationOrDeletionLock synchronized {
getLog(topicPartition, isFuture).getOrElse {
// create the log if it has not already been created in another thread
if (!isNew && offlineLogDirs.nonEmpty)
throw new KafkaStorageException(s"Can not create log for $topicPartition because log directories ${offlineLogDirs.mkString(",")} are offline")
val logDirs: List[File] = {
val preferredLogDir = preferredLogDirs.get(topicPartition)
if (isFuture) {
if (preferredLogDir == null)
throw new IllegalStateException(s"Can not create the future log for $topicPartition without having a preferred log directory")
else if (getLog(topicPartition).get.dir.getParent == preferredLogDir)
throw new IllegalStateException(s"Can not create the future log for $topicPartition in the current log directory of this partition")
}
if (preferredLogDir != null)
List(new File(preferredLogDir))
else
nextLogDirs()
}
val logDirName = {
if (isFuture)
Log.logFutureDirName(topicPartition)
else
Log.logDirName(topicPartition)
}
val logDir = logDirs
.toStream // to prevent actually mapping the whole list, lazy map
.map(createLogDirectory(_, logDirName))
.find(_.isSuccess)
.getOrElse(Failure(new KafkaStorageException("No log directories available. Tried " + logDirs.map(_.getAbsolutePath).mkString(", "))))
.get // If Failure, will throw
val log = Log(
dir = logDir,
config = config,
logStartOffset = 0L,
recoveryPoint = 0L,
maxProducerIdExpirationMs = maxPidExpirationMs,
producerIdExpirationCheckIntervalMs = LogManager.ProducerIdExpirationCheckIntervalMs,
scheduler = scheduler,
time = time,
brokerTopicStats = brokerTopicStats,
logDirFailureChannel = logDirFailureChannel)
if (isFuture)
futureLogs.put(topicPartition, log)
else
currentLogs.put(topicPartition, log)
info(s"Created log for partition $topicPartition in $logDir with properties " + s"{${config.originals.asScala.mkString(", ")}}.")
// Remove the preferred log dir since it has already been satisfied
preferredLogDirs.remove(topicPartition)
log
}
}
}
```
1. If the log already exists, just return a copy of the existing log; otherwise, if isNew = true or there is no offline log directory, create the log for the given topic and partition; otherwise throw a `KafkaStorageException`
For details see [【kafka源码】LeaderAndIsrRequest请求]()
## Source code summary
The picture tells the story:
![在这里插入图片描述](https://img-blog.csdnimg.cn/2021061718435568.jpg?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70#pic_center)
## Q&A
### What if a Broker in the custom assignment does not exist?
> The exception `Unknown broker(s) in replica assignment` is thrown, because the tool fetches the list of live Brokers from zk and checks that every referenced Broker is online.
### What if the partition count does not match the number of new entries in `--replica-assignment`?
>The exception `Increasing the number of partitions by..` is thrown and the flow stops.
### What if the Controller crashes right after `/brokers/topics/{topicName}` is written, just as it is about to react to the change?
> A Controller re-election takes place; once elected, the new Controller checks `/brokers/topics/{topicName}`, notices that the corresponding partitions have not been created yet, and automatically carries out the remaining steps.
### What if I manually create the node `/brokers/topics/{topicName}/partitions/{partitionId}/state` in zk?
> The Controller does not watch this node, so nothing happens immediately; but when a Controller re-election occurs,
> **deleted state nodes are re-created,**
>while **manually written nodes are not deleted**; the written node's information ends up in the Controller's memory;
>and this also affects partition expansion, as follows.
>
>
> ----
> Example 🌰:
> The topic has 3 partitions with 1 replica each, and I sloppily add an extra node in zk, as in the picture below:
> ![在这里插入图片描述](https://img-blog.csdnimg.cn/20210617175311911.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
> Now I want to add one partition, so I run the script. Although the data of `/brokers/topics/test_create_topic3` changes, the Broker never creates the local Log file while handling the `LeaderAndIsrRequest`. The reason is that the source compares the number of partition nodes under zk with the number after the expansion; since they are the same, it considers that nothing changed in this request and skips creating the local Log file.
> If it does detect a change, the Log files are created as usual.
> Hand-written N partition nodes in zk + expansion by N partitions = no Log files are created
> Hand-written N partition nodes in zk + expansion by more than N partitions = normal expansion
### What if I edit the data of the /brokers/topics/{topicName} node directly?
>Suppose the node holds `{"version":2,"partitions":{"2":[1],"1":[1],"0":[1]},"adding_replicas":{},"removing_replicas":{}}`, i.e. 3 partitions with 1 replica, all on Broker-1;
>and I change it in zk to `{"version":2,"partitions":{"2":[2],"1":[1],"0":[0]},"adding_replicas":{},"removing_replicas":{}}`
>hoping to spread the partitions over Broker-0, Broker-1 and Broker-2.
>TODO...
---
<font color=red size=5>Tips: if you have questions about this article, leave them in the comments and I will answer them in the **Q&A** section </font>
<font color=red size=2>PS: the source version read for this article is kafka-2.5 </font>

View File

@@ -0,0 +1,597 @@
## Script parameters
Run `sh bin/kafka-topics.sh --help` for the full parameter list.
Only the parameters related to `--create` are listed below.
| Parameter | Description | Example |
|--|--|--|
|`--bootstrap-server `|The kafka cluster to connect to; when this is given, `--zookeeper` is not required|--bootstrap-server localhost:9092 |
|`--zookeeper`|Deprecated; connects to the kafka cluster through zk|--zookeeper localhost:2181 or localhost:2181/kafka|
|`--replication-factor `|Replication factor; it cannot exceed the number of brokers. If omitted, the cluster default is used|--replication-factor 3 |
|`--partitions`|Number of partitions, used when creating or altering a topic. If omitted at creation time, the cluster default is used; note that when altering, a value smaller than the current count is a problem|--partitions 3 |
|`--replica-assignment `|Replica assignment for the partitions; when creating a topic you can specify the assignment yourself |`--replica-assignment` BrokerId-0:BrokerId-1:BrokerId-2,BrokerId-1:BrokerId-2:BrokerId-0,BrokerId-2:BrokerId-1:BrokerId-0 ; this describes three partitions with three replicas each and the Brokers they are assigned to; commas separate partitions, colons separate replicas|
| `--config `<String: name=value> |Sets topic-level configs that override the defaults; **only effective when --create and --bootstrap-server are used together**; the list of accepted configs is in the appendix at the end of this article |e.g. override two configs: `--config retention.bytes=123455 --config retention.ms=600001`|
|`--command-config` <String: command config file path> |Startup configuration for the Admin Client, **only effective together with --bootstrap-server**|e.g. set the request timeout: `--command-config config/producer.properties`, with `request.timeout.ms=300000` in that file|
|`--create`|The command type; indicates that this request creates a Topic|`--create`|
## Create Topic script
**Via zk (not recommended)**
```shell
bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 3 --partitions 3 --topic test
```
<font color="red">Note that --zookeeper must point to kafka's zk path; if you configured something like localhost:2181/kafka with a chroot, do not leave out the chroot </font>
**Kafka >= 2.2 supports the following form (recommended)**
```shell
bin/kafka-topics.sh --create --bootstrap-server localhost:9092 --replication-factor 3 --partitions 3 --topic test
```
The kafka source version analyzed here is `kafka-2.5`
## Create Topic source code analysis
<font color="red">Tip: if reading the source gets dry, you can jump straight to the source code summary and the sections after it</font>
First, the source entry point. Looking at the contents of the `kafka-topics.sh` script:
`exec $(dirname $0)/kafka-run-class.sh kafka.admin.TopicCommand "$@"`
It ultimately runs the class `kafka.admin.TopicCommand`; once you know that, you can set breakpoints and debug the source, launching it from IDEA.
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210608151956926.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
Remember to configure the program arguments,
for example: `--create --bootstrap-server 127.0.0.1:9092 --partitions 3 --topic test_create_topic3`
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210608152149713.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
### 1. Source entry point
![在这里插入图片描述](https://img-blog.csdnimg.cn/2021060815275820.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
The code above mainly does the following:
1. Depending on whether `--zookeeper` was passed, decide which kind of `topicService` object to create:
if `--zookeeper` is given, create a `ZookeeperTopicService`,
otherwise create an `AdminClientTopicService` (the one analyzed here)
2. Decide from the arguments whether this is a create, delete, etc.; the create path is taken when `--create` is present
### 2. Creating the AdminClientTopicService object
> `val topicService = new AdminClientTopicService(createAdminClient(commandConfig, bootstrapServer))`
#### 2.1 First create the Admin client
```scala
object AdminClientTopicService {
def createAdminClient(commandConfig: Properties, bootstrapServer: Option[String]): Admin = {
bootstrapServer match {
case Some(serverList) => commandConfig.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, serverList)
case None =>
}
Admin.create(commandConfig)
}
def apply(commandConfig: Properties, bootstrapServer: Option[String]): AdminClientTopicService =
new AdminClientTopicService(createAdminClient(commandConfig, bootstrapServer))
}
```
1. If `--command-config` was given, every parameter from that file is put into the `commandConfig` map, and the `bootstrap.servers` parameter is added as well; if the file already contains `bootstrap.servers`, it is overwritten
2. `commandConfig` is then passed to `Admin.create(commandConfig)` to create the Admin client. At this point we are in the Client module, which shows that running the `kafka-topics.sh` script really just makes kafka act as a `Client` to create the Topic;
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210608160130820.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
### 3. AdminClientTopicService.createTopic creates the Topic
` topicService.createTopic(opts)`
```scala
case class AdminClientTopicService private (adminClient: Admin) extends TopicService {
override def createTopic(topic: CommandTopicPartition): Unit = {
    //if --replication-factor is set, it must be at least 1
if (topic.replicationFactor.exists(rf => rf > Short.MaxValue || rf < 1))
throw new IllegalArgumentException(s"The replication factor must be between 1 and ${Short.MaxValue} inclusive")
    //if --partitions is set, it must be greater than 0
if (topic.partitions.exists(partitions => partitions < 1))
throw new IllegalArgumentException(s"The partitions must be greater than 0")
    //check whether the Topic already exists
if (!adminClient.listTopics().names().get().contains(topic.name)) {
val newTopic = if (topic.hasReplicaAssignment)
        //if --replica-assignment was given, use the specified replica assignment
new NewTopic(topic.name, asJavaReplicaReassignment(topic.replicaAssignment.get))
else {
new NewTopic(
topic.name,
topic.partitions.asJava,
topic.replicationFactor.map(_.toShort).map(Short.box).asJava)
}
      // parse the --config options into a config map
val configsMap = topic.configsToAdd.stringPropertyNames()
.asScala
.map(name => name -> topic.configsToAdd.getProperty(name))
.toMap.asJava
newTopic.configs(configsMap)
      //call adminClient to create the Topic
val createResult = adminClient.createTopics(Collections.singleton(newTopic))
createResult.all().get()
println(s"Created topic ${topic.name}.")
} else {
throw new IllegalArgumentException(s"Topic ${topic.name} already exists")
}
}
```
1. Validate the input parameters
2. Call `adminClient.listTopics()` and check whether the Topic to be created already exists; if it does, throw an exception;
3. Check whether `--replica-assignment` was given; if so, the Topic's replicas are placed exactly as specified
4. Parse the `--config` options into `configsMap` and hand `configsMap` to the `NewTopic` object
5. Call `adminClient.createTopics` to create the Topic; an example invocation of these options is sketched below, and how the Topic is actually created is analyzed in the following source code
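A hypothetical invocation exercising the options this method handles (topic name and address are made up):
```sh
# Create a topic with an explicit assignment (3 partitions, 2 replicas each)
# plus two topic-level config overrides. With --replica-assignment you do not
# pass --partitions / --replication-factor.
bin/kafka-topics.sh --bootstrap-server localhost:9092 --create \
  --topic test_create_topic_demo \
  --replica-assignment 0:1,1:2,2:0 \
  --config retention.ms=600001 --config retention.bytes=123455
```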
#### 3.1 KafkaAdminClient.createTopics(NewTopic)
```java
@Override
public CreateTopicsResult createTopics(final Collection<NewTopic> newTopics,
final CreateTopicsOptions options) {
        //parts of the source omitted...
Call call = new Call("createTopics", calcDeadlineMs(now, options.timeoutMs()),
new ControllerNodeProvider()) {
@Override
public CreateTopicsRequest.Builder createRequest(int timeoutMs) {
return new CreateTopicsRequest.Builder(
new CreateTopicsRequestData().
setTopics(topics).
setTimeoutMs(timeoutMs).
setValidateOnly(options.shouldValidateOnly()));
}
@Override
public void handleResponse(AbstractResponse abstractResponse) {
                //omitted
}
@Override
void handleFailure(Throwable throwable) {
completeAllExceptionally(topicFutures.values(), throwable);
}
};
}
```
In this code, focus on the callbacks inside the Call object; we ignore the details of how Kafka talks to the server and concentrate on the topic creation logic.
1. `createRequest` builds a `CreateTopicsRequest`, for example as in the figure below
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210609174617186.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
2. The request is sent to the node chosen by `ControllerNodeProvider`
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210609200925505.png)
It is clear from this that creating a Topic is an operation carried out by the Controller;
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210609200938586.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
### 4. Sending the network request
[==>服务端客户端网络模型 ](TODO)
### 5. Server-side handling of the request on the Controller
First find the server-side **entry point** for client requests, `KafkaRequestHandler.run()`.
The interesting part is the `apis.handle(request)` call; the client's request is carried in `request.bodyAndSize()`
![在这里插入图片描述](https://img-blog.csdnimg.cn/2021060917574268.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
#### 5.1 KafkaApis.handle(request) dispatches on the request's Api key
Inside the method you can see that the handler is chosen by `request.header.apiKey`; the client sent `CreateTopics`
![在这里插入图片描述](https://img-blog.csdnimg.cn/2021060918000338.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
#### 5.2 KafkaApis.handleCreateTopicsRequest handles the create-topic request
```scala
def handleCreateTopicsRequest(request: RequestChannel.Request): Unit = {
    // parts omitted
    //if this Broker is not the Controller, return an error
if (!controller.isActive) {
createTopicsRequest.data.topics.asScala.foreach { topic =>
results.add(new CreatableTopicResult().setName(topic.name).
setErrorCode(Errors.NOT_CONTROLLER.code))
}
sendResponseCallback(results)
} else {
      // parts omitted
}
adminManager.createTopics(createTopicsRequest.data.timeoutMs,
createTopicsRequest.data.validateOnly,
toCreate,
authorizedForDescribeConfigs,
handleCreateTopicsResults)
}
}
```
1. Check whether the broker handling the request is the Controller; if not, throw an exception. CreateTopic must be executed by the Controller; this situation can occur if the Controller changed after the client sent the request.
2. Authorization check; see [【Kafka源码】kafka鉴权机制]()
3. Call `adminManager.createTopics()`
#### 5.3 adminManager.createTopics()
> Creates the topics and waits until they are fully created; the callback fires on timeout, on error, or when creation completes
The method is long, so parts are omitted
```scala
def createTopics(timeout: Int,
validateOnly: Boolean,
toCreate: Map[String, CreatableTopic],
includeConfigsAndMetatadata: Map[String, CreatableTopicResult],
responseCallback: Map[String, ApiError] => Unit): Unit = {
// 1. map over topics creating assignment and calling zookeeper
val brokers = metadataCache.getAliveBrokers.map { b => kafka.admin.BrokerMetadata(b.id, b.rack) }
val metadata = toCreate.values.map(topic =>
try {
        //parts omitted
        //check whether the Topic already exists
        //check that --replica-assignment and (--partitions || --replication-factor) are not used together
        // if (--partitions || --replication-factor) are not set, fall back to the Broker's config (this Broker is the Controller)
        // compute the partition/replica assignment
createTopicPolicy match {
case Some(policy) =>
            //parts omitted
adminZkClient.validateTopicCreate(topic.name(), assignments, configs)
if (!validateOnly)
adminZkClient.createTopicWithAssignment(topic.name, configs, assignments)
case None =>
if (validateOnly)
              //validate the topic creation parameters
adminZkClient.validateTopicCreate(topic.name, assignments, configs)
else
              //write the topic data into zk
adminZkClient.createTopicWithAssignment(topic.name, configs, assignments)
}
}
```
1. Run some validation checks:
①. Check whether the Topic already exists
②. Check that `--replica-assignment` and (`--partitions || --replication-factor`) are not used together
③. If (`--partitions || --replication-factor`) are not set, fall back to the Broker's configuration (this Broker is the Controller)
④. Compute the partition/replica assignment
2. `createTopicPolicy`: applied if the Broker configured a custom validation policy for topic creation. You implement the interface `org.apache.kafka.server.policy.CreateTopicPolicy` and set `create.topic.policy.class.name=your class` on the brokers; for example, if you want every created Topic to have more than 10 partitions, this is where you enforce it (a configuration sketch follows this list)
3. `createTopicWithAssignment` writes the topic data into zk; let's look inside
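A sketch of how such a policy would be wired in; the class name is made up, and the class must implement `org.apache.kafka.server.policy.CreateTopicPolicy` and be on every broker's classpath:
```sh
# Append the policy to each broker's server.properties, then restart the brokers.
echo "create.topic.policy.class.name=com.example.MinPartitionsCreateTopicPolicy" >> config/server.properties
```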
#### 5.4 Writing the data to zookeeper
Let's step into `adminZkClient.createTopicWithAssignment(topic.name, configs, assignments)` and see what gets written to zk;
```scala
def createTopicWithAssignment(topic: String,
config: Properties,
partitionReplicaAssignment: Map[Int, Seq[Int]]): Unit = {
validateTopicCreate(topic, partitionReplicaAssignment, config)
    // write the topic-level configs into zk
zkClient.setOrCreateEntityConfigs(ConfigType.Topic, topic, config)
    // write the topic's partition assignment into zk
writeTopicPartitionAssignment(topic, partitionReplicaAssignment.mapValues(ReplicaAssignment(_)).toMap, isUpdate = false)
}
```
We will not go deeper into the source here; in short:
**Writing the Topic config**
1. A `SetDataRequest` is issued against the node `/config/topics/{topicName}`;
usually this returns `NONODE (NoNode)` because the node does not exist yet; if it does exist, the data is simply overwritten
2. If the node does not exist, a `CreateRequest` is issued to write the data as a **persistent node**.
The data written here is the topic config passed via `--config`, which overrides the defaults
**Writing the Topic partition/replica assignment**
1. The computed replica assignment is written to `/brokers/topics/{topicName}` as a **persistent node**
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210610152129161.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
**The actual zk interaction happens in**
`ZookeeperClient.send()`, which wraps most of the communication with zk;
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210610151032490.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
### 6. The Controller watches `/brokers/topics/{topicName}` and tells the Brokers to write the partitions to disk
> The Controller watches several zk nodes; the flow above wrote `/brokers/topics/{topicName}`, so the Controller notices the change and reacts;
`KafkaController.processTopicChange`
```scala
private def processTopicChange(): Unit = {
    //if this broker is not the Controller, return
if (!isActive) return
    //read all Topics under /brokers/topics from zk
val topics = zkClient.getAllTopicsInCluster
    //figure out which Topics are new
val newTopics = topics -- controllerContext.allTopics
    //figure out which Topics were deleted from zk
val deletedTopics = controllerContext.allTopics -- topics
controllerContext.allTopics = topics
registerPartitionModificationsHandlers(newTopics.toSeq)
val addedPartitionReplicaAssignment = zkClient.getFullReplicaAssignmentForTopics(newTopics)
deletedTopics.foreach(controllerContext.removeTopic)
addedPartitionReplicaAssignment.foreach {
case (topicAndPartition, newReplicaAssignment) => controllerContext.updatePartitionFullReplicaAssignment(topicAndPartition, newReplicaAssignment)
}
info(s"New topics: [$newTopics], deleted topics: [$deletedTopics], new partition replica assignment " +
s"[$addedPartitionReplicaAssignment]")
if (addedPartitionReplicaAssignment.nonEmpty)
onNewPartitionCreation(addedPartitionReplicaAssignment.keySet)
}
```
1. Read all Topics under `/brokers/topics` from zk and diff them against the Topics the Controller holds in memory (`controllerContext.allTopics`); this yields the newly added Topics as well as the Topics deleted from zk (which are then removed from memory)
2. Read the replica assignment of the new topics from `/brokers/topics/{TopicName}` in zk and keep it in memory![在这里插入图片描述](https://img-blog.csdnimg.cn/20210616175718504.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
3. Call `onNewPartitionCreation`; the partition state transitions begin
#### 6.1 onNewPartitionCreation state transitions
> For the Controller's state machines see: [【kafka源码】Controller中的状态机](TODO)
```scala
/**
* This callback is invoked by the topic change callback with the list of failed brokers as input.
* It does the following -
* 1. Move the newly created partitions to the NewPartition state
* 2. Move the newly created partitions from NewPartition->OnlinePartition state
*/
private def onNewPartitionCreation(newPartitions: Set[TopicPartition]): Unit = {
info(s"New partition creation callback for ${newPartitions.mkString(",")}")
partitionStateMachine.handleStateChanges(newPartitions.toSeq, NewPartition)
replicaStateMachine.handleStateChanges(controllerContext.replicasForPartition(newPartitions).toSeq, NewReplica)
partitionStateMachine.handleStateChanges(
newPartitions.toSeq,
OnlinePartition,
Some(OfflinePartitionLeaderElectionStrategy(false))
)
replicaStateMachine.handleStateChanges(controllerContext.replicasForPartition(newPartitions).toSeq, OnlineReplica)
}
```
1. Transition the partitions to be created to the `NewPartition` state;
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210616180239988.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
2. Transition the replicas to be created to the `NewReplica` state;
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210616180940961.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
3. Transition the partition state from `NewPartition` to `OnlinePartition`
	1. Build `leaderIsrAndControllerEpochs`; the Leader is the first replica in the list;
	2. Write the persistent zk node `/brokers/topics/{topicName}/partitions/`; no data
	3. Write the persistent zk node `/brokers/topics/{topicName}/partitions/{partitionId}`; no data
	4. Write the persistent zk node `/brokers/topics/{topicName}/partitions/{partitionId}/state`; the data is the `leaderIsrAndControllerEpoch`![在这里插入图片描述](https://img-blog.csdnimg.cn/20210616183747171.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
	5. Send a [`leaderAndIsrRequest`]() to the Brokers that host the replicas
	6. Send an [`UPDATE_METADATA`]() request to all Brokers
4. Transition the replica state from `NewReplica` to `OnlineReplica` and update the in-memory state
For details on the partition and replica state machines see [【kafka源码】Controller中的状态机](TODO)
### 7. A Broker receives LeaderAndIsrRequest and creates the local Log
>The previous step sends a [`leaderAndIsrRequest`]() to the Brokers that host the replicas. What happens there?
>Mainly, the local Log is created.
>
Since there is a lot of code, we jump straight to the part that matters for creating the Topic:
`KafkaApis.handleLeaderAndIsrRequest->replicaManager.becomeLeaderOrFollower->ReplicaManager.makeLeaders...LogManager.getOrCreateLog`
```scala
/**
   * If the log already exists, just return a copy of the existing log. Otherwise, if isNew = true or there is no offline log directory, create the log for the given topic and partition; otherwise throw a KafkaStorageException
*/
def getOrCreateLog(topicPartition: TopicPartition, config: LogConfig, isNew: Boolean = false, isFuture: Boolean = false): Log = {
logCreationOrDeletionLock synchronized {
getLog(topicPartition, isFuture).getOrElse {
// create the log if it has not already been created in another thread
if (!isNew && offlineLogDirs.nonEmpty)
throw new KafkaStorageException(s"Can not create log for $topicPartition because log directories ${offlineLogDirs.mkString(",")} are offline")
val logDirs: List[File] = {
val preferredLogDir = preferredLogDirs.get(topicPartition)
if (isFuture) {
if (preferredLogDir == null)
throw new IllegalStateException(s"Can not create the future log for $topicPartition without having a preferred log directory")
else if (getLog(topicPartition).get.dir.getParent == preferredLogDir)
throw new IllegalStateException(s"Can not create the future log for $topicPartition in the current log directory of this partition")
}
if (preferredLogDir != null)
List(new File(preferredLogDir))
else
nextLogDirs()
}
val logDirName = {
if (isFuture)
Log.logFutureDirName(topicPartition)
else
Log.logDirName(topicPartition)
}
val logDir = logDirs
.toStream // to prevent actually mapping the whole list, lazy map
.map(createLogDirectory(_, logDirName))
.find(_.isSuccess)
.getOrElse(Failure(new KafkaStorageException("No log directories available. Tried " + logDirs.map(_.getAbsolutePath).mkString(", "))))
.get // If Failure, will throw
val log = Log(
dir = logDir,
config = config,
logStartOffset = 0L,
recoveryPoint = 0L,
maxProducerIdExpirationMs = maxPidExpirationMs,
producerIdExpirationCheckIntervalMs = LogManager.ProducerIdExpirationCheckIntervalMs,
scheduler = scheduler,
time = time,
brokerTopicStats = brokerTopicStats,
logDirFailureChannel = logDirFailureChannel)
if (isFuture)
futureLogs.put(topicPartition, log)
else
currentLogs.put(topicPartition, log)
info(s"Created log for partition $topicPartition in $logDir with properties " + s"{${config.originals.asScala.mkString(", ")}}.")
// Remove the preferred log dir since it has already been satisfied
preferredLogDirs.remove(topicPartition)
log
}
}
}
```
1. If the log already exists, just return a copy of the existing log; otherwise, if isNew = true or there is no offline log directory, create the log for the given topic and partition; otherwise throw a `KafkaStorageException`
For details see [【kafka源码】LeaderAndIsrRequest请求]()
## Source code summary
> If you skipped the source analysis above, here is the short version
1. Depending on whether `--zookeeper` was passed, decide which kind of `topicService` object to create:
if `--zookeeper` is given, create a `ZookeeperTopicService`,
otherwise create an `AdminClientTopicService` (the one analyzed here)
2. If `--command-config` was given, every parameter from that file is put into the `commandConfig` map, and the `bootstrap.servers` parameter is added as well; if the file already contains `bootstrap.servers`, it is overwritten
3. `commandConfig` is passed to `Admin.create(commandConfig)` to create the Admin client; at this point we are in the Client module, which shows that running the `kafka-topics.sh` script really just makes kafka act as a Client to create the Topic
4. Some validation:
①. If --replication-factor is set, it must be at least 1
②. If --partitions is set, it must be greater than 0
③. Check against zk whether the Topic already exists
5. If `--replica-assignment` was given, the Topic's replicas are placed exactly as specified
6. The `--config` options are parsed into `configsMap`, which is handed to the NewTopic object
7. **Everything above is packed into a `CreateTopicsRequest`, and the request is sent to the `Controller` node (`ControllerNodeProvider`)**
8. On receiving the request, the server starts creating the Topic from the `CreateTopicsRequest`; but first it checks whether it is still the `Controller`: the Controller may have been re-elected in the meantime, in which case it throws an exception
9. The server validates the request:
①. Check whether the Topic already exists
②. Check that `--replica-assignment` and (`--partitions` || `--replication-factor`) are not used together
10. If (`--partitions` || `--replication-factor`) are not set, the Broker's defaults are used (this Broker is the Controller)
11. Compute the partition/replica assignment: if `--replica-assignment` was passed, the custom assignment is used; otherwise the system computes one automatically; for details see [【kafka源码】创建Topic的时候是如何分区和副本的分配规则 ]()
12. `createTopicPolicy`: applied if the Broker configured a custom validation policy, by implementing `org.apache.kafka.server.policy.CreateTopicPolicy` and setting `create.topic.policy.class.name`=your class on the brokers; e.g. require every created Topic to have more than 10 partitions
13. **Write the Topic config to zk** via a `CreateRequest`; the data is the topic config passed via `--config`, which overrides the defaults; the node is persistent; **path** = `/config/topics/{topicName}`
14. **Write the Topic partition/replica assignment to zk** via a `CreateRequest`, storing the computed assignment under `/brokers/topics/{topicName}` as a persistent node
15. The `Controller` watches the topic info in zk; from the change it works out which Topics were added/deleted, fetches the replica assignment of the new Topics, and drives the state transitions
16. It sends a `leaderAndIsrRequest` to the Brokers hosting the new Topic,
17. and the Brokers, on receiving the `leaderAndIsrRequest`, create the replica Log files;
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210616220350958.jpg?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70#pic_center)
## Q&A
### Which zk nodes are created when a Topic is created?
>While handling the client request:
>1. the topic config: `/config/topics/{topicName}`, persistent node
>2. the topic partition assignment: `/brokers/topics/{topicName}`, persistent node
>
>While the Controller reacts to the change of `/brokers/topics`:
>1. `/brokers/topics/{topicName}/partitions/`, persistent node, no data
>2. `/brokers/topics/{topicName}/partitions/{partitionId}`, persistent node, no data
>3. `/brokers/topics/{topicName}/partitions/{partitionId}/state`, persistent node
### When are the log files created on the Brokers' disks?
>Once the Controller notices the change of `/brokers/topics`, it drives the new Topic's partitions through the state transitions
>`NonExistentPartition`->`NewPartition`->`OnlinePartition`. On the transition to `OnlinePartition` it sends a `leaderAndIsrRequest` to the Brokers the partitions were assigned to; when those Brokers handle the request, they check whether they already have a local Log for each replica and create it if not.
### What happens if I specify neither the partition count nor the replication factor?
>If neither is specified, the Broker's defaults are used. But with many Brokers, what if the defaults accidentally differ between them: whose config wins? Whoever executes the creation uses its own config;
**so who executes it?** The Controller, of course! As the source above showed, the creation is always carried out by the Controller.
### What if I manually delete a node under `/brokers/topics/`?
>The Controller updates the related information in its memory.
>What about the other Brokers? TODO.
### What if I manually add a `/brokers/topics/{TopicName}` node in zk?
>**Short answer:** from the sequence diagram built from the source above, a client create-topic request essentially just writes two pieces of data into zk:
>1. the topic config: `/config/topics/{topicName}`, persistent node
>2. the topic partition assignment: `/brokers/topics/{topicName}`, persistent node
>So skipping that step and writing the data ourselves achieves the same effect, as long as the data is correct,
>because at this point the basic validation has been bypassed. What if a replica BrokerId we write does not exist? As the sequence diagram shows, the `leaderAndIsrRequest` simply cannot be delivered to a non-existent BrokerId, so that machine never creates its Log files.
>
>
>**Let's verify this;**
>create a node `/brokers/topics/create_topic_byhand_zk` with the following data:
>```
>{"version":2,"partitions":{"2":[3],"1":[3],"0":[3]},"adding_replicas":{},"removing_replicas":{}}
>```
>![在这里插入图片描述](https://img-blog.csdnimg.cn/20210617112646965.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
>Here I created it by hand with the `PRETTYZOO` tool; you can also do it from the command line;
>after creating it, check whether a Log file appeared locally
>![在这里插入图片描述](https://img-blog.csdnimg.cn/20210617112806599.png)
>You can see that the Broker we specified has created the Log files for the corresponding partition replicas;
>and the other zk data has been written as well![在这里插入图片描述](https://img-blog.csdnimg.cn/20210617113415168.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
>`The Leader of each partition was already determined when we wrote the zk data: by default the first replica becomes the Leader`
>
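If you prefer the command line over a GUI, the same experiment can be done with the bundled zookeeper shell (a sketch; the address is a placeholder):
```sh
bin/zookeeper-shell.sh localhost:2181 create /brokers/topics/create_topic_byhand_zk \
  '{"version":2,"partitions":{"2":[3],"1":[3],"0":[3]},"adding_replicas":{},"removing_replicas":{}}'
```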
### What if the Controller crashes right after `/brokers/topics/{TopicName}` is written?
> **Short answer:** when the Controller is re-elected it runs some initialization that carries the creation through to the end.
> To simulate this, stop the cluster, write the `/brokers/topics/{TopicName}` node into zk, and then start one Broker;
> **Source:** as analyzed in [Controller的启动过程与选举](), once a Controller is elected it handles this here:
> ```
> replicaStateMachine.startup()
> partitionStateMachine.startup()
> ```
> Starting the state machines is very similar to **6.1 onNewPartitionCreation 状态流转** above: the state ends up at `OnlinePartition`, a `leaderAndIsrRequest` is sent, and the Brokers create their local Log files on receiving it
>
## Appendix
### Configs accepted by --config
The output of `sh bin/kafka-topics.sh --help` is authoritative
```xml
configurations:
cleanup.policy
compression.type
delete.retention.ms
file.delete.delay.ms
flush.messages
flush.ms
	follower.replication.throttled.replicas
index.interval.bytes
leader.replication.throttled.replicas
max.compaction.lag.ms
max.message.bytes
message.downconversion.enable
message.format.version
message.timestamp.difference.max.ms
message.timestamp.type
min.cleanable.dirty.ratio
min.compaction.lag.ms
min.insync.replicas
preallocate
retention.bytes
retention.ms
segment.bytes
segment.index.bytes
segment.jitter.ms
segment.ms
unclean.leader.election.enable
```
---
<font color=red size=5>Tips: if you have questions about this article, leave them in the comments and I will answer them in the **Q&A** section </font>
<font color=red size=2>PS: the source version read for this article is kafka-2.5 </font>

View File

@@ -0,0 +1,420 @@
## Delete Topic command
>bin/kafka-topics.sh --bootstrap-server localhost:9092 --delete --topic test
Topics can also be deleted by regular expression match; just wrap the topic in double quotes.
For example, delete all topics whose name starts with `create_topic_byhand_zk`:
>bin/kafka-topics.sh --bootstrap-server localhost:9092 --delete --topic "create_topic_byhand_zk.*"
> `.` matches any single character except the newline \n. To match a literal . use \. .
`*`: matches the preceding expression zero or more times. To match a literal * use \*
`.*` : any sequence of characters
**Delete every Topic (use with care)**
> bin/kafka-topics.sh --bootstrap-server localhost:9092 --delete --topic ".*?"
>
For more, see the [regular expression reference](https://www.runoob.com/regexp/regexp-syntax.html)
## Source code analysis
<font color="red">If the source walkthrough feels dry, skip straight to the **source code summary and the sections after it**</font>
### 1. The client issues the delete request
The request flow was already analyzed in [【kafka源码】TopicCommand之创建Topic源码解析](), so we go straight to the point:
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210613133230944.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
**A `deleteTopics` request is sent to the Controller**
### 2. The Controller handles the deleteTopics request
`KafkaApis.handle`
`AdminManager.deleteTopics`
```scala
/**
* Delete topics and wait until the topics have been completely deleted.
* The callback function will be triggered either when timeout, error or the topics are deleted.
*/
def deleteTopics(timeout: Int,
topics: Set[String],
responseCallback: Map[String, Errors] => Unit): Unit = {
// 1. map over topics calling the asynchronous delete
val metadata = topics.map { topic =>
try {
        // write the marker node /admin/delete_topics/{topicName} into zk to mark the topic for deletion
adminZkClient.deleteTopic(topic)
DeleteTopicMetadata(topic, Errors.NONE)
} catch {
case _: TopicAlreadyMarkedForDeletionException =>
// swallow the exception, and still track deletion allowing multiple calls to wait for deletion
DeleteTopicMetadata(topic, Errors.NONE)
case e: Throwable =>
error(s"Error processing delete topic request for topic $topic", e)
DeleteTopicMetadata(topic, Errors.forException(e))
}
}
    // 2. if the client-supplied timeout <= 0, or writing to zk failed, return the error directly
if (timeout <= 0 || !metadata.exists(_.error == Errors.NONE)) {
val results = metadata.map { deleteTopicMetadata =>
// ignore topics that already have errors
if (deleteTopicMetadata.error == Errors.NONE) {
(deleteTopicMetadata.topic, Errors.REQUEST_TIMED_OUT)
} else {
(deleteTopicMetadata.topic, deleteTopicMetadata.error)
}
}.toMap
responseCallback(results)
} else {
// 3. else pass the topics and errors to the delayed operation and set the keys
val delayedDelete = new DelayedDeleteTopics(timeout, metadata.toSeq, this, responseCallback)
val delayedDeleteKeys = topics.map(new TopicKey(_)).toSeq
// try to complete the request immediately, otherwise put it into the purgatory
topicPurgatory.tryCompleteElseWatch(delayedDelete, delayedDeleteKeys)
}
}
```
1. The marker node `/admin/delete_topics/{topicName}` is written into zk to mark the Topic for deletion
2. If the client-supplied timeout is <= 0, or writing to zk failed, an error is returned directly
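While a deletion is in flight the marker node can be observed directly (address and topic name are placeholders):
```sh
bin/zookeeper-shell.sh localhost:2181 ls /admin/delete_topics
# e.g. [test_delete_topic]   -- the child node disappears once the deletion completes
```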
### 3. The Controller reacts to the zk change and runs the deletion flow
`KafkaController.processTopicDeletion`
```scala
private def processTopicDeletion(): Unit = {
if (!isActive) return
var topicsToBeDeleted = zkClient.getTopicDeletions.toSet
val nonExistentTopics = topicsToBeDeleted -- controllerContext.allTopics
if (nonExistentTopics.nonEmpty) {
warn(s"Ignoring request to delete non-existing topics ${nonExistentTopics.mkString(",")}")
zkClient.deleteTopicDeletions(nonExistentTopics.toSeq, controllerContext.epochZkVersion)
}
topicsToBeDeleted --= nonExistentTopics
if (config.deleteTopicEnable) {
if (topicsToBeDeleted.nonEmpty) {
info(s"Starting topic deletion for topics ${topicsToBeDeleted.mkString(",")}")
        // mark Topics that cannot be deleted for now
topicsToBeDeleted.foreach { topic =>
val partitionReassignmentInProgress =
controllerContext.partitionsBeingReassigned.map(_.topic).contains(topic)
if (partitionReassignmentInProgress)
topicDeletionManager.markTopicIneligibleForDeletion(Set(topic),
reason = "topic reassignment in progress")
}
// add topic to deletion list
topicDeletionManager.enqueueTopicsForDeletion(topicsToBeDeleted)
}
} else {
// If delete topic is disabled remove entries under zookeeper path : /admin/delete_topics
info(s"Removing $topicsToBeDeleted since delete topic is disabled")
zkClient.deleteTopicDeletions(topicsToBeDeleted.toSeq, controllerContext.epochZkVersion)
}
}
```
1. If `/admin/delete_topics/` contains Topics that do not exist, those entries are cleaned up
2. If `delete.topic.enable=false` (topic deletion disabled), every node under `/admin/delete_topics/` is removed and the flow ends
3. With `delete.topic.enable=true`, Topics that are not eligible yet are marked and put into `topicsIneligibleForDeletion`; a Topic is ineligible when **a partition reassignment is in progress for it**
4. The Topics are added to the deletion list `topicsToBeDeleted`;
5. then `TopicDeletionManager.resumeDeletions()` is called to perform the deletion
#### 3.1 resumeDeletions performs the deletion
`TopicDeletionManager.resumeDeletions()`
```scala
private def resumeDeletions(): Unit = {
val topicsQueuedForDeletion = Set.empty[String] ++ controllerContext.topicsToBeDeleted
val topicsEligibleForRetry = mutable.Set.empty[String]
val topicsEligibleForDeletion = mutable.Set.empty[String]
if (topicsQueuedForDeletion.nonEmpty)
topicsQueuedForDeletion.foreach { topic =>
// if all replicas are marked as deleted successfully, then topic deletion is done
        //if all replicas have been marked as successfully deleted, finish the topic deletion
if (controllerContext.areAllReplicasInState(topic, ReplicaDeletionSuccessful)) {
// clear up all state for this topic from controller cache and zookeeper
          //perform the post-deletion cleanup for this topic;
completeDeleteTopic(topic)
info(s"Deletion of topic $topic successfully completed")
} else if (!controllerContext.isAnyReplicaInState(topic, ReplicaDeletionStarted)) {
// if you come here, then no replica is in TopicDeletionStarted and all replicas are not in
// TopicDeletionSuccessful. That means, that either given topic haven't initiated deletion
// or there is at least one failed replica (which means topic deletion should be retried).
if (controllerContext.isAnyReplicaInState(topic, ReplicaDeletionIneligible)) {
topicsEligibleForRetry += topic
}
}
// Add topic to the eligible set if it is eligible for deletion.
if (isTopicEligibleForDeletion(topic)) {
info(s"Deletion of topic $topic (re)started")
topicsEligibleForDeletion += topic
}
}
// topic deletion retry will be kicked off
if (topicsEligibleForRetry.nonEmpty) {
retryDeletionForIneligibleReplicas(topicsEligibleForRetry)
}
// topic deletion will be kicked off
if (topicsEligibleForDeletion.nonEmpty) {
        //delete the Topics and send the UpdateMetadata request
onTopicDeletion(topicsEligibleForDeletion)
}
}
}
```
1. The key method is `onTopicDeletion`: it marks all partitions to be deleted and sends an `updateMetadataRequest` to the Brokers, telling them the topic is being deleted and setting its Leader to `LeaderAndIsrLeaderDuringDelete`
	1. All partitions of the Topic go through the partition state machine: current state -> `OfflinePartition` -> `NonExistentPartition`; these two transitions only update state in the Controller's memory; for the state machines see [【kafka源码】Controller中的状态机TODO....]();
	2. `client.sendMetadataUpdate(topics.flatMap(controllerContext.partitionsForTopic))` sends an `UpdateMetadata` request for the partitions being deleted; what data is updated at this point? ![在这里插入图片描述](https://img-blog.csdnimg.cn/20210615213621790.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
	As the source in the figure shows, the `UpdateMetadata` request sets the partition Leader to -2, which means the partition is being deleted; that is how all partitions under deletion are identified. With those partitions in hand:
	1. Update the throttle-related information
	2. Call `groupCoordinator.handleDeletedPartitions(deletedPartitions)`: clear the group offsets of the given `deletedPartitions` and run the offset deletion; in other words the partition no longer serves and can no longer be consumed
	For details see [Kafka的元数据更新UpdateMetadata]()
2. Then `TopicDeletionManager.onPartitionDeletion` is called, as follows;
### 4. The Brokers receive the StopReplica request
What ultimately gets called is
`ReplicaManager.stopReplica` ==> `LogManager.asyncDelete`
>It renames the directory of the given topic partition from "logdir" to "logdir.uuid.delete" and adds it to the deletion queue,
>for example:
>![在这里插入图片描述](https://img-blog.csdnimg.cn/20210615124118290.png)
```scala
def asyncDelete(topicPartition: TopicPartition, isFuture: Boolean = false): Log = {
val removedLog: Log = logCreationOrDeletionLock synchronized {
      //remove the partition to be deleted from the Logs map
if (isFuture)
futureLogs.remove(topicPartition)
else
currentLogs.remove(topicPartition)
}
if (removedLog != null) {
      //we must wait until there are no more cleaning tasks on the log before it can really be deleted.
if (cleaner != null && !isFuture) {
cleaner.abortCleaning(topicPartition)
cleaner.updateCheckpoints(removedLog.dir.getParentFile)
}
      //rename the replica directory, using the pattern topic-partition.uuid-delete
removedLog.renameDir(Log.logDeleteDirName(topicPartition))
checkpointRecoveryOffsetsAndCleanSnapshot(removedLog.dir.getParentFile, ArrayBuffer.empty)
checkpointLogStartOffsetsInDir(removedLog.dir.getParentFile)
      //add the Log to the to-be-deleted queue and wait for deletion
addLogToBeDeleted(removedLog)
} else if (offlineLogDirs.nonEmpty) {
throw new KafkaStorageException(s"Failed to delete log for ${if (isFuture) "future" else ""} $topicPartition because it may be in one of the offline directories ${offlineLogDirs.mkString(",")}")
}
removedLog
}
```
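On disk this shows up as the partition directory being renamed with a `-delete` suffix until the background task removes it; a hypothetical listing (log dir and topic name are made up):
```sh
ls /data/kafka-logs | grep delete
# e.g. test_delete_topic-0.58d7e8a7a8ab4c9c9f1d25b6a9d8e111-delete
```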
#### 4.1 The log-deletion background task
>As seen above, the Log to be deleted ends up in the `logsToBeDeleted` queue; a dedicated task, `kafka-delete-logs`, works through this queue. Let's see how it runs.
`LogManager.startup` schedules a recurring task when it starts:
```scala
scheduler.schedule("kafka-delete-logs", // will be rescheduled after each delete logs with a dynamic period
deleteLogs _,
delay = InitialTaskDelayMs,
unit = TimeUnit.MILLISECONDS)
```
**The log-deletion task**
```scala
/**
* Delete logs marked for deletion. Delete all logs for which `currentDefaultConfig.fileDeleteDelayMs`
* has elapsed after the delete was scheduled. Logs for which this interval has not yet elapsed will be
* considered for deletion in the next iteration of `deleteLogs`. The next iteration will be executed
* after the remaining time for the first log that is not deleted. If there are no more `logsToBeDeleted`,
* `deleteLogs` will be executed after `currentDefaultConfig.fileDeleteDelayMs`.
   * Deletes the log files marked for deletion;
   * file.delete.delay.ms is the deletion delay, 60000 ms by default
*
*/
private def deleteLogs(): Unit = {
var nextDelayMs = 0L
try {
def nextDeleteDelayMs: Long = {
if (!logsToBeDeleted.isEmpty) {
val (_, scheduleTimeMs) = logsToBeDeleted.peek()
scheduleTimeMs + currentDefaultConfig.fileDeleteDelayMs - time.milliseconds()
} else
currentDefaultConfig.fileDeleteDelayMs
}
while ({nextDelayMs = nextDeleteDelayMs; nextDelayMs <= 0}) {
val (removedLog, _) = logsToBeDeleted.take()
if (removedLog != null) {
try {
            //immediately and completely delete this log directory and everything in it on the file system
removedLog.delete()
info(s"Deleted log for partition ${removedLog.topicPartition} in ${removedLog.dir.getAbsolutePath}.")
} catch {
case e: KafkaStorageException =>
error(s"Exception while deleting $removedLog in dir ${removedLog.dir.getParent}.", e)
}
}
}
} catch {
case e: Throwable =>
error(s"Exception in kafka-delete-logs thread.", e)
} finally {
try {
scheduler.schedule("kafka-delete-logs",
deleteLogs _,
delay = nextDelayMs,
unit = TimeUnit.MILLISECONDS)
} catch {
case e: Throwable =>
if (scheduler.isStarted) {
// No errors should occur unless scheduler has been shutdown
error(s"Failed to schedule next delete in kafka-delete-logs thread", e)
}
}
}
}
```
`file.delete.delay.ms` determines how long the deletion is delayed
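For reference, this delay can be overridden per topic via the configs tool (address and topic name are placeholders):
```sh
bin/kafka-configs.sh --zookeeper localhost:2181 --alter \
  --entity-type topics --entity-name test_delete_topic \
  --add-config file.delete.delay.ms=10000
```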
### 5. StopReplica succeeds and the callback runs
> The Topic deletion is finished; clean up the related state
This callback, `TopicDeletionStopReplicaResponseReceived`, is triggered after each Broker successfully handles the `StopReplica` request; the caller is the Controller, so the callback also runs on the Controller.
Where the callback is registered:
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210615122649613.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
The callback is `KafkaController.processTopicDeletionStopReplicaResponseReceived`:
1. If the response carries an error, the deletion failed, so the replica state transitions to `ReplicaDeletionIneligible` and `resumeDeletions` runs again;
2. If the response is fine, the state changes `ReplicaDeletionStarted` ==> `ReplicaDeletionSuccessful` and `resumeDeletions` runs again;
3. `resumeDeletions` checks whether all replicas have been deleted; if so, it runs the `completeDeleteTopic` code below, otherwise it keeps deleting the replicas that have not been removed yet
```scala
private def completeDeleteTopic(topic: String): Unit = {
// deregister partition change listener on the deleted topic. This is to prevent the partition change listener
// firing before the new topic listener when a deleted topic gets auto created
client.mutePartitionModifications(topic)
val replicasForDeletedTopic = controllerContext.replicasInState(topic, ReplicaDeletionSuccessful)
// controller will remove this replica from the state machine as well as its partition assignment cache
replicaStateMachine.handleStateChanges(replicasForDeletedTopic.toSeq, NonExistentReplica)
controllerContext.topicsToBeDeleted -= topic
controllerContext.topicsWithDeletionStarted -= topic
client.deleteTopic(topic, controllerContext.epochZkVersion)
controllerContext.removeTopic(topic)
}
```
1. Clean up the related in-memory state
2. Unregister the listeners on the deleted Topic's node `/brokers/topics/{topicName}`
3. Delete the zk data: `/brokers/topics/{topicName}`, `/config/topics/{topicName}` and `/admin/delete_topics/{topicName}`
### 6. On startup the Controller tries to continue pending deletions
As we saw when analyzing the Controller coming online,
`KafkaController.onControllerFailover`
parts of the code below are omitted
```scala
private def onControllerFailover(): Unit = {
    // figure out which Topics need to be deleted and which cannot be deleted yet
val (topicsToBeDeleted, topicsIneligibleForDeletion) = fetchTopicDeletionsInProgress()
info("Initializing topic deletion manager")
    //initialize the topic deletion manager
topicDeletionManager.init(topicsToBeDeleted, topicsIneligibleForDeletion)
    //the topic deletion manager tries to start deleting the Topics
    topicDeletionManager.tryTopicDeletion()
```
```
#### 6.1 获取需要被删除的Topic和暂时不能删除的Topic
` fetchTopicDeletionsInProgress`
1. `topicsToBeDeleted`所有需要被删除的Topic从zk中`/admin/delete_topics` 获取
2. `topicsIneligibleForDeletion`有一部分Topic还暂时不能被删除:
①. Topic任意分区正在进行副本重分配
②. Topic任意分区副本存在不在线的情况(只有topic有一个副本所在的Broker异常就不能能删除)
3. 将得到的数据存在在`controllerContext`内存中
#### 6.2 topicDeletionManager.init初始化删除管理器
1. 如果服务器配置`delete.topic.enable=false`不允许删除topic的话,则删除`/admin/delete_topics` 中的节点; 这个节点下面的数据是标记topic需要被删除的意思;
#### 6.3 topicDeletionManager.tryTopicDeletion尝试恢复删除
这里又回到了上面分析过的`resumeDeletions`啦;恢复删除操作
```scala
def tryTopicDeletion(): Unit = {
if (isDeleteTopicEnabled) {
resumeDeletions()
}
}
```
## Source code summary
The whole topic deletion flow is shown in the figure below:
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210616114403991.jpg?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70#pic_center)
A few points to note:
1. The Controller is itself a Broker.
2. When the Controller issues the deletion requests, it only sends them to the brokers that host replicas of the topic.
3. If a broker is offline or a deletion fails, the Controller keeps retrying, or resumes the deletion once the broker comes back online.
## Q&A
<font color="red">Common questions on this topic are listed here; if you have other questions, leave a comment and the author will update the list from time to time.</font>
### When is the node under /admin/delete_topics written?
>When a client issues a deleteTopics request, the Controller handles it, and at that point it writes the topic to be deleted into the ZooKeeper node `/admin/delete_topics/{topicName}`.
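As an illustration (a sketch, not code from the Kafka source; the address and topic name are placeholders), such a client-side delete looks roughly like this with the Java `AdminClient`:
```java
import java.util.Collections;
import java.util.Properties;
import org.apache.kafka.clients.admin.AdminClient;
import org.apache.kafka.clients.admin.AdminClientConfig;

public class DeleteTopicExample {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        props.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); // placeholder
        try (AdminClient admin = AdminClient.create(props)) {
            // sends a DeleteTopics request; per the flow above, the Controller then writes
            // /admin/delete_topics/{topicName} and drives the deletion
            admin.deleteTopics(Collections.singletonList("test")).all().get();
        }
    }
}
```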
### When are the topic's on-disk logs actually deleted?
>After the Controller observes the `/admin/delete_topics` node in ZooKeeper, it sends the delete request to the live brokers involved; a broker marks the replica's log directories with a `--delete` suffix, and a dedicated log-cleanup thread performs the real deletion later. The delay is controlled by `file.delete.delay.ms`, which defaults to 60000 ms (one minute).
### Why can't a topic that is being reassigned be deleted?
> While a topic is being reassigned you do not yet know where its replicas will end up, so you cannot know when it would be safe to delete them.
> Once the reassignment finishes, the deletion flow continues.
### What happens if a node is written manually under `/admin/delete_topics/`?
> If the written node is not a real topic, it is simply removed.
> Note that if `delete.topic.enable=false` is configured (topic deletion disabled), all nodes under `/admin/delete_topics/` are removed and the flow ends there.
> If the written node is a real topic, the topic deletion flow runs; essentially it is no different from deleting the topic with a Kafka client.
### What happens if the ZooKeeper node `/brokers/topics/{topicName}` is deleted directly?
>TODO...
### When the Controller tells brokers to execute StopReplica, does it notify all brokers or only those related to the deleted topic?
> **Only the brokers that hold replicas of the deleted topic.**
> See the source code in the figure below: all replicas that need a `StopReplica` are filtered to collect the broker IDs they live on, and the final call is `sendRequest(brokerId, stopReplicaRequest)`, i.e. the request is sent per collected broker ID.
> ![在这里插入图片描述](https://img-blog.csdnimg.cn/20210615141430911.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
### What if a broker is offline or the deletion fails during the process?
>The Controller keeps retrying the deletion, or continues once the broker comes back online; it only cleans up the ZooKeeper data after all partitions have been deleted (marked with `--delete`).
### ReplicaStateMachine (the replica state machine)
> See [Kafka source: the state machines in the Controller (TODO)]()
### What happens if a delete is issued while a reassignment is in progress?
> The deletion waits; once the reassignment completes, the deletion continues.
> ![在这里插入图片描述](https://img-blog.csdnimg.cn/20210621172944227.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
Finally: the source code version read in this article is `Kafka-2.5`.

View File

@@ -0,0 +1,149 @@
We analyzed this before in [TopicCommand: topic creation source-code analysis]();
because that article was already long, partition assignment gets its own article here.
## Source code analysis
**The entry point for topic creation is `AdminManager.createTopics()`.**
Only the code related to partition assignment is listed below; the rest is omitted.
```scala
def createTopics(timeout: Int,
validateOnly: Boolean,
toCreate: Map[String, CreatableTopic],
includeConfigsAndMetatadata: Map[String, CreatableTopicResult],
responseCallback: Map[String, ApiError] => Unit): Unit = {
// 1. map over topics creating assignment and calling zookeeper
val brokers = metadataCache.getAliveBrokers.map { b => kafka.admin.BrokerMetadata(b.id, b.rack) }
val metadata = toCreate.values.map(topic =>
try {
val assignments = if (topic.assignments().isEmpty) {
AdminUtils.assignReplicasToBrokers(
brokers, resolvedNumPartitions, resolvedReplicationFactor)
} else {
val assignments = new mutable.HashMap[Int, Seq[Int]]
// Note: we don't check that replicaAssignment contains unknown brokers - unlike in add-partitions case,
// this follows the existing logic in TopicCommand
topic.assignments.asScala.foreach {
case assignment => assignments(assignment.partitionIndex()) =
assignment.brokerIds().asScala.map(a => a: Int)
}
assignments
}
trace(s"Assignments for topic $topic are $assignments ")
}
```
1. There are two cases above: one where we did not specify an assignment (i.e. the `--replica-assignment` argument was not used), and one where we specified the assignment ourselves.
### 1. A user-specified assignment
As the source shows, the rules we specify are simply wrapped up as-is; **note that it does not check whether the brokers you specified actually exist** (a client-side sketch follows below).
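As an aside, a hedged sketch (not from the source above; broker ids 1-3 and the topic name are placeholders) of what a user-specified assignment looks like through the Java `AdminClient`:
```java
import java.util.*;
import org.apache.kafka.clients.admin.AdminClient;
import org.apache.kafka.clients.admin.AdminClientConfig;
import org.apache.kafka.clients.admin.NewTopic;

public class ManualAssignmentExample {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        props.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); // placeholder
        try (AdminClient admin = AdminClient.create(props)) {
            // partition -> ordered replica list; the first broker in each list becomes the preferred leader
            Map<Integer, List<Integer>> assignment = new HashMap<>();
            assignment.put(0, Arrays.asList(1, 2));
            assignment.put(1, Arrays.asList(2, 3));
            assignment.put(2, Arrays.asList(3, 1));
            admin.createTopics(Collections.singletonList(new NewTopic("test", assignment))).all().get();
        }
    }
}
```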
### 2. Automatic assignment: AdminUtils.assignReplicasToBrokers
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210609202822549.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTA2MzQwNjY=,size_16,color_FFFFFF,t_70)
1. Parameter checks: partition count > 0; replication factor > 0; replication factor <= number of brokers (values not supplied by the user fall back to the broker-side config).
2. Depending on whether rack information is available, a different assignment method is used.
3. Either every broker in the cluster has rack information or none of them does; otherwise an exception is thrown.
#### Rack-unaware assignment
`AdminUtils.assignReplicasToBrokersRackUnaware`
```scala
  /**
   * When assigning replicas there are three goals:
   * 1. spread the replicas evenly across all brokers;
   * 2. the replicas of a partition should live on different brokers;
   * 3. if all brokers have rack information, the replicas of a partition should be spread across different racks.
   *
   * To achieve this, in the rack-unaware case replicas are assigned by two rules:
   * 1. pick a random broker from broker.list and assign the first replica of each partition round-robin starting there;
   * 2. for the remaining replicas of a partition, choose brokers by incrementally shifting the broker index.
   */
private def assignReplicasToBrokersRackUnaware(nPartitions: Int,
replicationFactor: Int,
brokerList: Seq[Int],
fixedStartIndex: Int,
startPartitionId: Int): Map[Int, Seq[Int]] = {
val ret = mutable.Map[Int, Seq[Int]]()
// IDs of all the live brokers passed in from the caller
val brokerArray = brokerList.toArray
//by default, pick a random start index
val startIndex = if (fixedStartIndex >= 0) fixedStartIndex else rand.nextInt(brokerArray.length)
//by default, start from partition 0
var currentPartitionId = math.max(0, startPartitionId)
var nextReplicaShift = if (fixedStartIndex >= 0) fixedStartIndex else rand.nextInt(brokerArray.length)
for (_ <- 0 until nPartitions) {
if (currentPartitionId > 0 && (currentPartitionId % brokerArray.length == 0))
nextReplicaShift += 1
val firstReplicaIndex = (currentPartitionId + startIndex) % brokerArray.length
val replicaBuffer = mutable.ArrayBuffer(brokerArray(firstReplicaIndex))
for (j <- 0 until replicationFactor - 1)
replicaBuffer += brokerArray(replicaIndex(firstReplicaIndex, nextReplicaShift, j, brokerArray.length))
ret.put(currentPartitionId, replicaBuffer)
currentPartitionId += 1
}
ret
}
```
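To make the round-robin-plus-shift idea concrete, here is a rough standalone Java re-implementation for illustration only; it mirrors the logic above (including a `replicaIndex`-style shift helper) but is not the Kafka code itself:
```java
import java.util.*;

public class RackUnawareAssignment {
    // spread the 2nd, 3rd, ... replicas by an increasing shift so they land on different brokers
    static int replicaIndex(int firstReplicaIndex, int shiftSeed, int replicaNumber, int nBrokers) {
        int shift = 1 + (shiftSeed + replicaNumber) % (nBrokers - 1);
        return (firstReplicaIndex + shift) % nBrokers;
    }

    static Map<Integer, List<Integer>> assign(List<Integer> brokers, int nPartitions, int replicationFactor) {
        Random rand = new Random();
        Map<Integer, List<Integer>> result = new HashMap<>();
        int startIndex = rand.nextInt(brokers.size());        // random starting broker
        int nextReplicaShift = rand.nextInt(brokers.size());  // random starting shift
        for (int p = 0; p < nPartitions; p++) {
            if (p > 0 && p % brokers.size() == 0)
                nextReplicaShift += 1;                         // new "round": change the shift
            int firstReplicaIndex = (p + startIndex) % brokers.size();
            List<Integer> replicas = new ArrayList<>();
            replicas.add(brokers.get(firstReplicaIndex));      // first (leader) replica, round-robin
            for (int j = 0; j < replicationFactor - 1; j++)
                replicas.add(brokers.get(replicaIndex(firstReplicaIndex, nextReplicaShift, j, brokers.size())));
            result.put(p, replicas);
        }
        return result;
    }

    public static void main(String[] args) {
        System.out.println(assign(Arrays.asList(0, 1, 2, 3, 4), 10, 3));
    }
}
```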
#### Rack-aware assignment
```scala
private def assignReplicasToBrokersRackAware(nPartitions: Int,
replicationFactor: Int,
brokerMetadatas: Seq[BrokerMetadata],
fixedStartIndex: Int,
startPartitionId: Int): Map[Int, Seq[Int]] = {
val brokerRackMap = brokerMetadatas.collect { case BrokerMetadata(id, Some(rack)) =>
id -> rack
}.toMap
val numRacks = brokerRackMap.values.toSet.size
val arrangedBrokerList = getRackAlternatedBrokerList(brokerRackMap)
val numBrokers = arrangedBrokerList.size
val ret = mutable.Map[Int, Seq[Int]]()
val startIndex = if (fixedStartIndex >= 0) fixedStartIndex else rand.nextInt(arrangedBrokerList.size)
var currentPartitionId = math.max(0, startPartitionId)
var nextReplicaShift = if (fixedStartIndex >= 0) fixedStartIndex else rand.nextInt(arrangedBrokerList.size)
for (_ <- 0 until nPartitions) {
if (currentPartitionId > 0 && (currentPartitionId % arrangedBrokerList.size == 0))
nextReplicaShift += 1
val firstReplicaIndex = (currentPartitionId + startIndex) % arrangedBrokerList.size
val leader = arrangedBrokerList(firstReplicaIndex)
val replicaBuffer = mutable.ArrayBuffer(leader)
val racksWithReplicas = mutable.Set(brokerRackMap(leader))
val brokersWithReplicas = mutable.Set(leader)
var k = 0
for (_ <- 0 until replicationFactor - 1) {
var done = false
while (!done) {
val broker = arrangedBrokerList(replicaIndex(firstReplicaIndex, nextReplicaShift * numRacks, k, arrangedBrokerList.size))
val rack = brokerRackMap(broker)
// Skip this broker if
// 1. there is already a broker in the same rack that has assigned a replica AND there is one or more racks
// that do not have any replica, or
// 2. the broker has already assigned a replica AND there is one or more brokers that do not have replica assigned
if ((!racksWithReplicas.contains(rack) || racksWithReplicas.size == numRacks)
&& (!brokersWithReplicas.contains(broker) || brokersWithReplicas.size == numBrokers)) {
replicaBuffer += broker
racksWithReplicas += rack
brokersWithReplicas += broker
done = true
}
k += 1
}
}
ret.put(currentPartitionId, replicaBuffer)
currentPartitionId += 1
}
ret
}
```
## Source code summary

View File

@@ -0,0 +1,51 @@
# `Logi-Kafka` Cloud Platform: Overview
[TOC]
## 1. Product architecture
![Product architecture](https://img-ys011.didistatic.com/static/dicloudpub/do1_xgDHNDLj2ChKxctSuf72)
- Resource layer: the bottom layer of the `Logi-Kafka` cloud platform is the resource layer, consisting of MySQL, Zookeeper, and a number of containers and physical machines;
- Engine layer: above the resource layer is the engine layer, a `Kafka` messaging service built on community `Kafka` with 40+ optimizations and improvements such as disk overload protection and metric instrumentation;
- Gateway layer: above the engine layer is the gateway layer, which mainly provides permission control and traffic control for producing to and consuming from Kafka topics, plus service discovery and degradation when clients connect;
- Service layer: above the gateway layer is the service layer; distilled from the experience of running Kafka inside Didi, it provides a fairly complete set of monitoring and management capabilities such as topic management and cluster management;
- Platform layer: at the top is the platform layer; based on the service layer's capabilities and role-based access control, it exposes a user console, an operations console and a set of open APIs to different kinds of users.
## 2. Module functions
![Module interactions](./assets/kafka_cloud_arch.jpg)
- Kafka cluster (`Kafka-Brokers`): Kafka with features such as disk overload protection, a refined metrics system and performance optimizations added on top of `Apache-Kafka`.
&nbsp;
- Kafka gateway (`Kafka-Gateway`): Didi's in-house Kafka cluster gateway with service discovery, traffic control, service degradation and security control capabilities. Note: part of the gateway's capabilities is embedded inside the `Kafka-Broker`.
&nbsp;
- Kafka management platform (`Kafka-Manager`): Didi's in-house one-stop `Kafka` cluster monitoring & operations platform for ordinary users, developers and operators.
&nbsp;
Having introduced the overall architecture of the cloud platform, let's briefly describe the functions of each module and how they interact.
- Kafka cluster (`Kafka-Brokers`)
1. Receives and handles produce and consume requests from `Kafka` clients.
**2. The gateway's traffic-control and security capabilities are embedded in it.**
3. Periodically syncs permission and user information from `Kafka-Manager`.
4. `POST`s topic connection information to `Kafka-Manager`.
5. The Kafka gateway's service-discovery module syncs metadata from the `Kafka` cluster.
6. Strongly depends on `Zookeeper`.
&nbsp;
- Kafka gateway (`Kafka-Gateway`): service discovery
1. The unified externally-facing service address for `Kafka`.
2. When a `Kafka` client starts, it first queries service discovery to obtain the topic's metadata.
3. Periodically syncs each cluster's actual service addresses and traffic-control/degradation information from `Kafka-Manager`.
&nbsp;
- Kafka management platform (`Kafka-Manager`)
1. The console for users.
2. Periodically syncs the clusters' actual service addresses, user information and permission information with service discovery and the `Kafka` clusters.
3. Fetches cluster metadata and metric information from the `Kafka` cluster.
## 3. Summary
This section gave a brief overview of the overall architecture of Didi's `Logi-Kafka` cloud platform, the rough functions of its modules, and how they interact.


View File

@@ -0,0 +1,112 @@
Conclusion: for the time-based retention policy, the timestamp recorded in the last entry of the time index file is used to decide whether a segment has expired. That raises another question: how is that timestamp written? Read on.
```scala
/**
* If topic deletion is enabled, delete any log segments that have either expired due to time based retention
* or because the log size is > retentionSize.
* Whether or not deletion is enabled, delete any log segments that are before the log start offset
*/
// deletion policies
def deleteOldSegments(): Int = {
if (config.delete) {
// policies: retention time, retention size, log start offset
deleteRetentionMsBreachedSegments() + deleteRetentionSizeBreachedSegments() + deleteLogStartOffsetBreachedSegments()
} else {
deleteLogStartOffsetBreachedSegments()
}
}
// the time-based retention policy
private def deleteRetentionMsBreachedSegments(): Int = {
if (config.retentionMs < 0) return 0
val startMs = time.milliseconds
// a segment is deleted when startMs - segment.largestTimestamp > config.retentionMs.
// next, let's see where segment.largestTimestamp comes from
deleteOldSegments((segment, _) => startMs - segment.largestTimestamp > config.retentionMs,
reason = s"retention time ${config.retentionMs}ms breach")
}
/**
* The largest timestamp this segment contains.
*/
// from LogSegment: if maxTimestampSoFar >= 0 it is used, otherwise the last-modified time is used
// so where does maxTimestampSoFar come from?
def largestTimestamp = if (maxTimestampSoFar >= 0) maxTimestampSoFar else lastModified
// maxTimestampSoFar is effectively the timestamp of the last entry in the time index; next, what does timeIndex.lastEntry hold?
def maxTimestampSoFar: Long = {
if (_maxTimestampSoFar.isEmpty)
_maxTimestampSoFar = Some(timeIndex.lastEntry.timestamp)
_maxTimestampSoFar.get
}
// read the timestamp & offset from the last entry of the time index
// within one time index file the timestamp field is monotonically increasing, so this is the largest timestamp in the index.
// so how is this timestamp written? keep reading to see how the time index is written
private def lastEntryFromIndexFile: TimestampOffset = {
inLock(lock) {
_entries match {
case 0 => TimestampOffset(RecordBatch.NO_TIMESTAMP, baseOffset)
case s => parseEntry(mmap, s - 1)
}
}
}
```
How is this timestamp written into the time index file?
Conclusion: if LOG_APPEND_TIME is configured, it is the time the record was written on the broker; if CREATE_TIME is configured, it is the largest timestamp among the records.
```java
// tracing upward from TimeIndex.maybeAppend to see where the timestamp data is produced, we find
// that it is generated in LogValidator.validateMessagesAndAssignOffsets
// iterate over the record batches
for (batch <- records.batches.asScala) {
validateBatch(topicPartition, firstBatch, batch, origin, toMagicValue, brokerTopicStats)
val recordErrors = new ArrayBuffer[ApiRecordError](0)
for ((record, batchIndex) <- batch.asScala.view.zipWithIndex) {
validateRecord(batch, topicPartition, record, batchIndex, now, timestampType,
timestampDiffMaxMs, compactedTopic, brokerTopicStats).foreach(recordError => recordErrors += recordError)
// we fail the batch if any record fails, so we stop appending if any record fails
if (recordErrors.isEmpty)
// append with the assigned offset; the timestamp is also computed here
builder.appendWithOffset(offsetCounter.getAndIncrement(), record)
}
processRecordErrors(recordErrors)
}
//
private long appendLegacyRecord(long offset, long timestamp, ByteBuffer key, ByteBuffer value, byte magic) throws IOException {
ensureOpenForRecordAppend();
if (compressionType == CompressionType.NONE && timestampType == TimestampType.LOG_APPEND_TIME)
// if LOG_APPEND_TIME is configured, use logAppendTime
timestamp = logAppendTime;
int size = LegacyRecord.recordSize(magic, key, value);
AbstractLegacyRecordBatch.writeHeader(appendStream, toInnerOffset(offset), size);
if (timestampType == TimestampType.LOG_APPEND_TIME)
timestamp = logAppendTime;
long crc = LegacyRecord.write(appendStream, magic, timestamp, key, value, CompressionType.NONE, timestampType);
// timestamp bookkeeping
recordWritten(offset, timestamp, size + Records.LOG_OVERHEAD);
return crc;
}
// tracking the maximum timestamp
private void recordWritten(long offset, long timestamp, int size) {
if (numRecords == Integer.MAX_VALUE)
throw new IllegalArgumentException("Maximum number of records per batch exceeded, max records: " + Integer.MAX_VALUE);
if (offset - baseOffset > Integer.MAX_VALUE)
throw new IllegalArgumentException("Maximum offset delta exceeded, base offset: " + baseOffset +
", last offset: " + offset);
numRecords += 1;
uncompressedRecordsSizeInBytes += size;
lastOffset = offset;
if (magic > RecordBatch.MAGIC_VALUE_V0 && timestamp > maxTimestamp) {
// update the maximum; the last time-index entry records this maxTimestamp field
maxTimestamp = timestamp;
offsetOfMaxTimestamp = offset;
}
}
```
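Both behaviors discussed above are driven by topic-level configs; a minimal sketch (placeholder bootstrap address and topic name) of adjusting `retention.ms` and `message.timestamp.type` with the Java `AdminClient`:
```java
import java.util.*;
import org.apache.kafka.clients.admin.*;
import org.apache.kafka.common.config.ConfigResource;

public class RetentionConfigExample {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        props.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); // placeholder
        try (AdminClient admin = AdminClient.create(props)) {
            ConfigResource topic = new ConfigResource(ConfigResource.Type.TOPIC, "test"); // placeholder
            List<AlterConfigOp> ops = Arrays.asList(
                // expire segments whose last time-index entry is older than 3 days
                new AlterConfigOp(new ConfigEntry("retention.ms", "259200000"), AlterConfigOp.OpType.SET),
                // use broker append time instead of the producer-supplied CreateTime
                new AlterConfigOp(new ConfigEntry("message.timestamp.type", "LogAppendTime"), AlterConfigOp.OpType.SET));
            admin.incrementalAlterConfigs(Collections.singletonMap(topic, ops)).all().get();
        }
    }
}
```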

View File

@@ -0,0 +1,42 @@
# Kafka Server: Access Control
[TOC]
Resource types:
- UNKNOWN: unknown
- ANY: any resource
- TOPIC: a topic
- GROUP: a consumer group
- CLUSTER: the whole cluster
- TRANSACTIONAL_ID: a transactional ID
- DELEGATION_TOKEN: a delegation token
Operations:
- UNKNOWN: unknown
- ANY: any operation
- ALL: all operations
- READ: read
- WRITE: write
- CREATE: create
- DELETE: delete
- ALTER: alter
- DESCRIBE: describe/view
- CLUSTER_ACTION: cluster actions
- DESCRIBE_CONFIGS: view configs
- ALTER_CONFIGS: alter configs
- IDEMPOTENT_WRITE: idempotent write
Resource pattern types:
- UNKNOWN: unknown
- ANY: any
- MATCH: matches if any one of LITERAL, PREFIXED or * matches
- LITERAL: exact match against the literal name
- PREFIXED: prefix match
Authorization results:
- ALLOWED: allowed
- DENIED: denied
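These enums correspond directly to the ACL types in the Java `AdminClient` API; below is a hedged sketch (the principal `User:kafka`, the group and topic names, and the address are placeholders) that grants a consumer read access and also shows the LITERAL and PREFIXED pattern types:
```java
import java.util.Arrays;
import java.util.Properties;
import org.apache.kafka.clients.admin.AdminClient;
import org.apache.kafka.clients.admin.AdminClientConfig;
import org.apache.kafka.common.acl.*;
import org.apache.kafka.common.resource.PatternType;
import org.apache.kafka.common.resource.ResourcePattern;
import org.apache.kafka.common.resource.ResourceType;

public class CreateAclsExample {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        props.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); // placeholder
        try (AdminClient admin = AdminClient.create(props)) {
            // LITERAL pattern on the topic: ALLOW + READ for the consumer principal
            AclBinding readTopic = new AclBinding(
                    new ResourcePattern(ResourceType.TOPIC, "test", PatternType.LITERAL),
                    new AccessControlEntry("User:kafka", "*", AclOperation.READ, AclPermissionType.ALLOW));
            // the consumer also needs READ on its group; PREFIXED demonstrates the prefix pattern type
            AclBinding readGroup = new AclBinding(
                    new ResourcePattern(ResourceType.GROUP, "DemoConsumer", PatternType.PREFIXED),
                    new AccessControlEntry("User:kafka", "*", AclOperation.READ, AclPermissionType.ALLOW));
            admin.createAcls(Arrays.asList(readTopic, readGroup)).all().get();
        }
    }
}
```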

View File

@@ -0,0 +1,180 @@
# Smooth Rolling Restart of a Kafka Cluster in Practice
[TOC]
## 0. Preface
A rolling restart of a Kafka cluster is a dangerous operation: done wrong, it can make the cluster unavailable. Even when every step is executed correctly, the business may still be hurt because a downstream service is very sensitive, lacks robustness, or uses a client with bugs.
For these reasons, and based on our past experience, we have written down what needs to be done and what to watch out for when rolling-restarting a Kafka cluster.
## 1. Notifying users
### 1.1 What to announce
Tell users in advance:
- what we are going to do;
- why we are doing it;
- the possible impact and simple mitigations (for example, because leaders will move between nodes, clients may stop consuming and need a restart);
- the contact person;
- the operation window; **when choosing the window, prefer the business teams' working hours, so that problems can be handled together promptly.**
Example announcement:
```
Subject:
[2021-11-11] Upgrade of the XXX-Kafka cluster to kafka_2.12-xxxx
Reason for the change:
1. Performance optimization
Change details:
1. The per-broker connection limit is adjusted to 1200
Impact:
Leaders will switch during the upgrade; in theory there is no impact. If you see problems, contact the Kafka service account promptly.
Contact:
xxxxx@xxxx.com
Planned window:
2021-11-11T10:00:00+08:00 to 2021-11-11T16:30:00+08:00
```
### 1.2 Recommendations
- Add monitoring for your own service, e.g. traffic of the topics it depends on and consumer lag, so that problems can be detected in time.
---
## 2. Rolling restart
**Before doing it for real, rehearse the procedure.**
---
### 2.1 Overall procedure
- 1. Notify users again that we are about to start the restart and that they should contact us at any time if there are problems;
- 2. Restart one broker that is **not the Controller**;
- 3. Observe whether the metrics look normal after the restart; handle any anomalies accordingly;
- 4. Tell users that one broker has been restarted and that the remaining machines will be handled xxx minutes later; ask them to watch their own services and report problems at any time;
- 5. xxx minutes later, restart the remaining machines one by one; **restart the Kafka Controller last**;
- 6. After everything is done, tell users the operation is complete and ask them to keep watching their services and report problems at any time.
---
### 2.2 Per-broker procedure
Operating on a single broker has two parts: first the restart itself, then observing whether the service is healthy afterwards.
#### 2.2.1 Restart
**Step 1: stop the service**
```bash
# Stop the Kafka service with kill.
# Emphasis: do NOT stop the service with kill -9.
```
**Step 2: modify configuration**
```bash
# Apply the configuration changes planned for this restart.
# It is strongly required to list the concrete steps for every config change in advance.
```
**Step 3: broker throttling**
```bash
# If the gap between stopping and starting the service is long, the broker will need to sync a lot of data after startup. In that case, set a replication throttle before starting it, otherwise it may saturate the network bandwidth or even bring down other brokers.
# How much data counts as "a lot"? There is no precise number; anything that could saturate the leaders' bandwidth or drag down other brokers counts as a lot.
```
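One possible way to apply such a throttle (a sketch under assumptions, not a mandated procedure: the broker id, the 50 MB/s rate, the topic name and the use of the `*` wildcard are all placeholders) is via dynamic configs with the Java `AdminClient`:
```java
import java.util.*;
import org.apache.kafka.clients.admin.*;
import org.apache.kafka.common.config.ConfigResource;

public class ReplicationThrottleExample {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        props.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); // placeholder
        try (AdminClient admin = AdminClient.create(props)) {
            Map<ConfigResource, Collection<AlterConfigOp>> changes = new HashMap<>();
            // cap replication traffic on a live broker that will serve the catching-up follower (id "1" is a placeholder)
            changes.put(new ConfigResource(ConfigResource.Type.BROKER, "1"), Arrays.asList(
                new AlterConfigOp(new ConfigEntry("leader.replication.throttled.rate", "52428800"), AlterConfigOp.OpType.SET),
                new AlterConfigOp(new ConfigEntry("follower.replication.throttled.rate", "52428800"), AlterConfigOp.OpType.SET)));
            // the rate only applies to replicas listed in the topic-level throttled-replicas configs;
            // "*" throttles all replicas of this topic (topic name is a placeholder)
            changes.put(new ConfigResource(ConfigResource.Type.TOPIC, "test"), Arrays.asList(
                new AlterConfigOp(new ConfigEntry("leader.replication.throttled.replicas", "*"), AlterConfigOp.OpType.SET),
                new AlterConfigOp(new ConfigEntry("follower.replication.throttled.replicas", "*"), AlterConfigOp.OpType.SET)));
            admin.incrementalAlterConfigs(changes).all().get();
            // remember to remove these configs (OpType.DELETE) once the restarted broker has caught up
        }
    }
}
```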
**Step 4: start the service**
```bash
# Start the Kafka service, then observe whether it is healthy.
```
---
#### 2.2.2 Observation
**Step 1: check the startup log**
```bash
# Check server.log; the following line indicates the Kafka server has finished starting:
[2021-11-17 14:07:22,459][INFO][main]: [KafkaServer id=2] started
# Also check server.log for ERROR and FATAL entries; if any appear, pause the upgrade and analyze their impact.
```
**Step 2: check the service monitoring**
If the following metrics can be monitored, it is recommended to configure all of them in the monitoring system.
Normally, once they are configured, the monitoring system will alert us proactively when something goes wrong, so although the list is long we do not need to actively watch every metric during the operation.
```bash
# service liveness;
# error logs;
# GC;
# unclean leader elections;
# ISR shrink rate;
# partitions with leader=-1;
# network processor thread load;
# request handler thread load;
# under-replicated partitions;
# system load (CPU, disk IO, disk capacity, network bandwidth, packet loss, TCP connection count, TCP connection growth rate, file handles)
```
**Step 3: verify the change took effect**
Nothing special here: just check that the change has taken effect.
**Step 4: check that traffic is normal**
```
Check 1: if the cluster uses broker groups, compare the total traffic of the restarted broker's group with the traffic before the restart; they should be roughly the same.
Check 2: pick a few topics on the restarted broker and check whether their traffic looks abnormal, e.g. inbound or outbound traffic suddenly dropping to zero.
```
**Step 5: wait for replica synchronization to finish**
```bash
# Check the replica sync status of the whole cluster and make sure everything is in sync. This can be viewed in LogiKM.
# Strictly speaking, only the partitions on the restarted broker need to be in sync, but that is hard to check precisely, so the simple, blunt approach is to wait until the whole cluster is in sync.
```
### 2.3 Other important notes
- If a very large amount of data needs to be synced during the restart and the broker itself is under high load, schedule the restart so that it avoids the leader rebalance window;
- Leaders switch during the restart; after the last broker has been handled, a leader rebalance needs to be performed.
---
## 3. Record keeping
Problems can hardly be avoided entirely during the operation; when they occur we need to keep good records, for example:
- 1. Important businesses and their topics;
- 2. Sensitive businesses and their topics;
- 3. Businesses using special clients and their topics;
- 4. Businesses with unreasonable usage patterns and their topics.
Afterwards we can give these topics extra protection, and the next time we operate we can reach the affected users more precisely.

View File

@@ -0,0 +1,173 @@
# Setting Up a Local `Kafka` Development Environment
[TOC]
## 1. Prerequisites
Before setting up the local development environment, prepare the following:
- JDK 11
- IDEA 2020.x
- Gradle 5.5.1
- Zookeeper
## 2. Development environment configuration
With the prerequisites ready, we can start configuring the development environment.
**Step 1: open the project and adjust the IDEA settings**
Change the Gradle settings:
![dev_change_gradle_config](./assets/dev_change_gradle_config.jpg)
Change the Java Compiler settings as shown below, adding `--add-exports=java.base/sun.nio.ch=ALL-UNNAMED`:
![dev_change_java_compile_config](./assets/dev_change_java_compile_config.jpg)
**Step 2: compile to generate the message protocol files**
```
./gradlew assemble
```
![dev_start_build](./assets/dev_start_build.jpg)
**Step 3: modify the build.gradle file**
Change the `artifactory` settings and enable `1.8` compatibility:
![dev_change_artifactory](./assets/dev_change_artifactory.jpg)
![dev_compatible_8](./assets/dev_compatible_8.jpg)
**Step 4: modify the run configuration**
```java
// supplementary notes for the parts that are hard to read in the screenshot
// VM options
// log output dir, log4j config file location, JAAS auth file location
-Dkafka.logs.dir=logs -Dlog4j.configuration=file:config/log4j.properties -Djava.security.auth.login.config=config/kafka_server_jaas.conf
// program arguments
config/server.properties
```
![dev_app_debug_config](./assets/dev_app_debug_config.jpg)
**Step 5: start compiling**
Click the green hammer-like button at the top of `IDEA` to start compiling.
Compiling:
![kb_kafka_engine_dev_env](./assets/dev_start_compile.jpg)
Compilation finished:
![dev_finished_compile](./assets/dev_finished_compile.jpg)
**Step 6: configure the Kafka configuration files**
In step 4 we pointed the run configuration at the `server.properties` and `log4j.properties` files used for the local startup; the one that mainly needs changes is `server.properties`:
```java
// the main configs to change in server.properties
zookeeper.connect=xxxx
gateway.url=xxxx
cluster.id=xxxx
// other related configs can be adjusted as needed
```
server.properties configuration:
![dev_server_properties](./assets/dev_server_properties.jpg)
log4j.properties configuration:
![dev_log4j_properties](./assets/dev_log4j_properties.jpg)
**Step 7: start Kafka**
![dev_start_kafka](./assets/dev_start_kafka.jpg)
At this point the local Kafka development environment is ready.
## 3. Everyday commands
```java
// compile
./gradlew assemble
// package; after packaging, the .tgz file is generated under core/build/distributions
./gradlew clean releaseTarGz
// see the cmd.txt file in the 2.5 source package for more commands
```
## 4. Kafka project code structure
The main code lives in `clients` and `core`. `clients` is mainly the Java client code; `core` is the Kafka server-side code and the most important part.
This article mainly introduces the `core` module; the `clients` module will be covered later. The `core` module has two parts: the original community code, and the code we at Didi have added.
### 4.1 Kafka-Core
This part of the `core` module is mainly the original `kafka scala` code.
First, a picture:
![kafka-core](./assets/kafka_core_module.png)
&nbsp;
Module descriptions:
| Module | Description
| :-------- |:--------:|
| admin | Administrative / operations-related module
| api | Assembles the data exchanged between client and server; encodes/decodes request and response data
| cluster | A few entity classes such as Cluster and Broker
| common | Common module, mainly exception classes and error validation
| controller | Controller-related module
| coordinator | The consumer-group Coordinator and the transaction Coordinator
| log | Kafka file storage module
| metrics | Metrics module
| network | Network event handling module
| security | Security module
| server | Main server module, the entry point for request handling
| tools/utils | Tooling-related modules
| zk/zookeeper | ZooKeeper-related modules
&nbsp;
### 4.2 Kafka-Core-DiDi
This part of the `core` module is mainly the `kafka java` code added by us at Didi.
First, a picture:
![kafka-core-didi](./assets/kafka_core_didi_module.png)
&nbsp;
Module descriptions:
| Module | Description
| :-------- |:--------:|
| cache | Cache module; mainly caches permission and user information and keeps it in sync
| config | Configuration module
| jmx | JMX-related module
| metrics | Metrics specific to Didi Kafka
| partition | Legacy partition-disabling module; the code is essentially deprecated
| report | Reporting module; mainly reports topic connection information
| security | Security control module
| server | Server capability enhancements, including disk overload protection
| util | Utility classes
## 5. Environment setup issues
1. The program fails to start with:
Error: scalac: jvm-11 is not a valid choice for -target, and scalac: bad option: -target:jvm-11
Fix:
1. In the project root directory, locate the .idea folder;
2. In that folder, find the scala_compiler.xml file;
3. Comment out the `<parameter value="-target:jvm-11"` entry in it;
4. Finally, restart IDEA.
## 6. Summary
This article covered how to set up a local Kafka development environment and gave an overview of the related modules. Interested readers can try setting up a local environment themselves; it will help with later study, day-to-day development and troubleshooting.


View File

@@ -0,0 +1,301 @@
# Kafka Controller: Handling Broker Online/Offline Events
[TOC]
## 1. Preface
Besides a broker starting and stopping on its own, the Controller also has to synchronize metadata and so on when brokers come online or go offline.
When the Controller detects a broker coming online or going offline, it mainly:
1. updates the locally cached metadata;
2. closes the connection to a broker that went offline, and opens a connection to a newly added one;
3. adjusts the state of the replicas that went offline;
4. adjusts the partitions that need a new leader election;
5. propagates the metadata.
That is what the Controller does; below we look at the concrete flows in more detail.
## 2. Handling online/offline events
### 2.1 How the Controller finds out
1. Broker comes online normally: the Controller notices the change of the ZK node.
2. Broker shuts down normally: the broker first sends a ControlledShutdown request to the Controller and exits after it is processed; after the exit, the Controller notices the ZK node change and processes it again.
3. Broker goes down abnormally: the Controller notices the broker is gone via the ZK node change.
In short there are two flows: handling online/offline events via ZK, and handling the ControlledShutdown request when a broker shuts down gracefully.
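As an illustration of the ZK-based detection (a sketch only: it uses the plain ZooKeeper Java client rather than Kafka's own `KafkaZkClient`, and the connect string is a placeholder), watching the `/brokers/ids` children looks roughly like this:
```java
import java.util.List;
import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher;
import org.apache.zookeeper.ZooKeeper;

public class BrokerChangeWatchExample {
    public static void main(String[] args) throws Exception {
        ZooKeeper zk = new ZooKeeper("localhost:2181", 30000, event -> { }); // placeholder connect string
        watchBrokers(zk);
        Thread.sleep(Long.MAX_VALUE); // keep the process alive so watch events can arrive
    }

    private static void watchBrokers(ZooKeeper zk) throws Exception {
        Watcher watcher = (WatchedEvent event) -> {
            if (event.getType() == Watcher.Event.EventType.NodeChildrenChanged) {
                try {
                    // a broker registered, or its ephemeral node disappeared; re-read and re-register the watch
                    watchBrokers(zk);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        };
        List<String> brokerIds = zk.getChildren("/brokers/ids", watcher);
        System.out.println("live brokers: " + brokerIds);
    }
}
```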
### 2.2 Detecting broker online/offline events via ZK
#### 2.2.1 Overall flow
![Broker online/offline: overall flow](./assets/controller_aware_broker_updown.jpg)
#### 2.2.2 AddBroker & RemoveBroker
This flow is very simple, so there is no diagram for it; let's look directly at the code.
**AddBroker**
```scala
def addBroker(broker: Broker): Unit = {
// be careful here. Maybe the startup() API has already started the request send thread
brokerLock synchronized {
if (!brokerStateInfo.contains(broker.id)) {
addNewBroker(broker)
startRequestSendThread(broker.id)
}
}
}
private def addNewBroker(broker: Broker): Unit = {
// logging, configuration, etc.
// create the NetworkClient
val (networkClient, reconfigurableChannelBuilder) = {
val channelBuilder = ChannelBuilders.clientChannelBuilder(。。。。。。)
val reconfigurableChannelBuilder = channelBuilder match {。。。。。。。}
val selector = new Selector(。。。。。。)
val networkClient = new NetworkClient(。。。。。。)
(networkClient, reconfigurableChannelBuilder)
}
val threadName = threadNamePrefix match {
case None => s"Controller-${config.brokerId}-to-broker-${broker.id}-send-thread"
case Some(name) => s"$name:Controller-${config.brokerId}-to-broker-${broker.id}-send-thread"
}
// metrics
// create the request network-IO send thread
val requestThread = new RequestSendThread(config.brokerId, controllerContext, messageQueue, networkClient,
brokerNode, config, time, requestRateAndQueueTimeMetrics, stateChangeLogger, threadName)
requestThread.setDaemon(false)
// metrics
// cache the created objects
brokerStateInfo.put(broker.id, ControllerBrokerStateInfo(networkClient, brokerNode, messageQueue,
requestThread, queueSizeGauge, requestRateAndQueueTimeMetrics, reconfigurableChannelBuilder))
}
```
**RemoveBroker**
```scala
def removeBroker(brokerId: Int): Unit = {
brokerLock synchronized {
removeExistingBroker(brokerStateInfo(brokerId))
}
}
private def removeExistingBroker(brokerState: ControllerBrokerStateInfo): Unit = {
try {
// close the objects that were created for this broker
brokerState.reconfigurableChannelBuilder.foreach(config.removeReconfigurable)
brokerState.requestSendThread.shutdown()
brokerState.networkClient.close()
brokerState.messageQueue.clear()
removeMetric(QueueSizeMetricName, brokerMetricTags(brokerState.brokerNode.id))
removeMetric(RequestRateAndQueueTimeMetricName, brokerMetricTags(brokerState.brokerNode.id))
brokerStateInfo.remove(brokerState.brokerNode.id)
} catch {
case e: Throwable => error("Error while removing broker by the controller", e)
}
}
```
#### 2.2.3 Handling broker startup (onBrokerStartup)
##### 2.2.3.1 Overall flow
![Broker startup](./assets/controller_on_broker_startup.jpg)
##### 2.2.3.2 Related code
```scala
private def onBrokerStartup(newBrokers: Seq[Int]): Unit = {
info(s"New broker startup callback for ${newBrokers.mkString(",")}")
newBrokers.foreach(controllerContext.replicasOnOfflineDirs.remove)
val newBrokersSet = newBrokers.toSet
val existingBrokers = controllerContext.liveOrShuttingDownBrokerIds -- newBrokers
// send an empty metadata update to the existing brokers
sendUpdateMetadataRequest(existingBrokers.toSeq, Set.empty)
// send the full metadata to the newly added brokers
sendUpdateMetadataRequest(newBrokers, controllerContext.partitionLeadershipInfo.keySet)
// get all the replicas that live on the newly added brokers
val allReplicasOnNewBrokers = controllerContext.replicasOnBrokers(newBrokersSet)
// transition replica states
replicaStateMachine.handleStateChanges(allReplicasOnNewBrokers.toSeq, OnlineReplica)
// transition partition states
partitionStateMachine.triggerOnlinePartitionStateChange()
// resume reassignments
maybeResumeReassignments { (_, assignment) =>
assignment.targetReplicas.exists(newBrokersSet.contains)
}
// resume topic deletion
val replicasForTopicsToBeDeleted = allReplicasOnNewBrokers.filter(p => topicDeletionManager.isTopicQueuedUpForDeletion(p.topic))
if (replicasForTopicsToBeDeleted.nonEmpty) {
// logging
topicDeletionManager.resumeDeletionForTopics(replicasForTopicsToBeDeleted.map(_.topic))
}
// register listeners
registerBrokerModificationsHandler(newBrokers)
}
```
#### 2.2.4 Handling broker failure (onBrokerFailure)
##### 2.2.4.1 Overall flow
![Broker failure](./assets/controller_on_broker_shutdown.jpg)
##### 2.2.4.2 Related code
```scala
private def onBrokerFailure(deadBrokers: Seq[Int]): Unit = {
info(s"Broker failure callback for ${deadBrokers.mkString(",")}")
// remove the dead brokers from the cache
// get the replicas located on the dead brokers
val allReplicasOnDeadBrokers = controllerContext.replicasOnBrokers(deadBrokers.toSet)
// handle the state of the affected replicas
onReplicasBecomeOffline(allReplicasOnDeadBrokers)
// unregister the listener for broker-node modification events
unregisterBrokerModificationsHandler(deadBrokers)
}
private def onReplicasBecomeOffline(newOfflineReplicas: Set[PartitionAndReplica]): Unit = {
// among the affected replicas, separate those to be deleted from those not to be deleted
val (newOfflineReplicasForDeletion, newOfflineReplicasNotForDeletion) =
newOfflineReplicas.partition(p => topicDeletionManager.isTopicQueuedUpForDeletion(p.topic))
// find the partitions that will have no leader after the brokers go offline
val partitionsWithoutLeader = controllerContext.partitionLeadershipInfo.filter(partitionAndLeader =>
!controllerContext.isReplicaOnline(partitionAndLeader._2.leaderAndIsr.leader, partitionAndLeader._1) &&
!topicDeletionManager.isTopicQueuedUpForDeletion(partitionAndLeader._1.topic)).keySet
// transition the leaderless partitions and trigger leader election
partitionStateMachine.handleStateChanges(partitionsWithoutLeader.toSeq, OfflinePartition)
partitionStateMachine.triggerOnlinePartitionStateChange()
// transition the state of the replicas that are not being deleted
replicaStateMachine.handleStateChanges(newOfflineReplicasNotForDeletion.toSeq, OfflineReplica)
if (newOfflineReplicasForDeletion.nonEmpty) {
// mark replica deletion as failed for the topics whose replicas were to be deleted
topicDeletionManager.failReplicaDeletion(newOfflineReplicasForDeletion)
}
// if no partition's leader changed, send an empty metadata update to all brokers
if (partitionsWithoutLeader.isEmpty) {
sendUpdateMetadataRequest(controllerContext.liveOrShuttingDownBrokerIds.toSeq, Set.empty)
}
}
```
### 2.3 Graceful broker shutdown: handling the ControlledShutdown request
#### 2.3.1 Overall flow
![Graceful broker shutdown](./assets/controller_handle_control_shutdown.jpg)
#### 2.3.2 Related code
```scala
// called layer by layer
def handleControlledShutdownRequest(request: RequestChannel.Request): Unit = {
//////
}
private def processControlledShutdown(id: Int, brokerEpoch: Long, controlledShutdownCallback: Try[Set[TopicPartition]] => Unit): Unit = {
//////
}
// actually handle the controlled-shutdown request
private def doControlledShutdown(id: Int, brokerEpoch: Long): Set[TopicPartition] = {
if (!isActive) {
throw new ControllerMovedException("Controller moved to another broker. Aborting controlled shutdown")
}
// throw if the epoch is wrong, if the broker does not exist, etc.
// add the broker to the shutting-down set
controllerContext.shuttingDownBrokerIds.add(id)
// get the partitions affected by this broker shutdown
val partitionsToActOn = controllerContext.partitionsOnBroker(id).filter { partition =>
controllerContext.partitionReplicaAssignment(partition).size > 1 &&
controllerContext.partitionLeadershipInfo.contains(partition) &&
!topicDeletionManager.isTopicQueuedUpForDeletion(partition.topic)
}
// split the partitions into those led by this broker and those where it is a follower
val (partitionsLedByBroker, partitionsFollowedByBroker) = partitionsToActOn.partition { partition =>
controllerContext.partitionLeadershipInfo(partition).leaderAndIsr.leader == id
}
// re-elect leaders for the partitions this broker leads
partitionStateMachine.handleStateChanges(partitionsLedByBroker.toSeq, OnlinePartition, Some(ControlledShutdownPartitionLeaderElectionStrategy))
try {
brokerRequestBatch.newBatch()
partitionsFollowedByBroker.foreach { partition =>
brokerRequestBatch.addStopReplicaRequestForBrokers(Seq(id), partition, deletePartition = false)
}
brokerRequestBatch.sendRequestsToBrokers(epoch)
} catch {
case e: IllegalStateException =>
handleIllegalState(e)
}
// transition the follower replicas to OfflineReplica
replicaStateMachine.handleStateChanges(partitionsFollowedByBroker.map(partition =>
PartitionAndReplica(partition, id)).toSeq, OfflineReplica)
def replicatedPartitionsBrokerLeads() = {
// return the leader partitions still on this broker
}
replicatedPartitionsBrokerLeads().toSet
}
```
## 3. Common questions
### 3.1 The scope of metadata propagation
The basic principles followed:
1. When a topic's leader and follower information has not changed, usually only an UpdateMetadata request is needed, and it is sent to all brokers.
2. When a topic's leader or follower information changes, a LeaderAndIsr request is sent to the brokers involved to update replica synchronization; in addition, an UpdateMetadata request is sent to every broker in the cluster so that the metadata cached on each broker stays consistent.
3. When replica fetching needs to be paused, a StopReplica request is sent to the brokers involved.
Besides that, we also saw in the code that an empty UpdateMetadata request is sometimes sent to brokers.
The main reason is this:
besides topic metadata, the UpdateMetadata request also synchronizes the cluster's broker information. Hence the last principle:
- even when no topic has changed, an UpdateMetadata request is still sent when the set of brokers changes.
### 3.2 Metadata propagation performance
Apart from some possible network IO with ZK, the main flow above involves no direct network IO with the other brokers in the cluster.
So it normally finishes within a second or less.
## 4. Summary
This share covered what the Controller does when brokers come online or go offline, and then discussed some common questions. That is all for this share, thank you.


View File

@@ -0,0 +1,251 @@
# KIP-429: The Incremental Rebalance Protocol
[TOC]
## 1. Background
To keep consumption as balanced as possible across the cluster, Kafka designed the client-side rebalance mechanism, which helps Kafka consumer clients spread load evenly.
However, before Kafka 2.3 all rebalance assignment strategies (including RangeAssignor, RoundRobinAssignor, etc.) were based on the Eager protocol, i.e. the familiar old rebalance. The old rebalance has long been criticized because it triggers a stop-the-world (STW): the partitions of the affected topics are unavailable while it runs. For a small deployment that is tolerable, but for large ones, say consumer groups with hundreds of instances, a rebalance is a disaster.
In the 2.x era the community realized the existing rebalance had to change. Kafka 2.3 first applied the Cooperative protocol to Kafka Connect, and Kafka 2.4 then added support for the protocol in the Kafka consumer client.
This share gives a brief introduction to this feature.
## 2. The incremental rebalance protocol
### 2.1 The rebalance process under the Eager and Cooperative protocols
**Eager protocol**
![Eager protocol](./assets/rebalance_eager_protocal.jpg)
A nicer diagram borrowed from the internet:
![Eager protocol](./assets/rebalance_eager_bchat.jpg)
**Cooperative protocol**
![Cooperative protocol](./assets/rebalance_cooperative_protocal.jpg)
A nicer diagram borrowed from the internet:
![Cooperative protocol](./assets/rebalance_co_bchat.jpg)
### 2.2 Code implementation
On the client side the implementation is largely the same as under the Eager protocol, with changes in only a few places; see below.
#### 2.2.1 Before JoinGroup
```Java
@Override
protected void onJoinPrepare(int generation, String memberId) {
// logging, etc.
final Set<TopicPartition> revokedPartitions;
if (generation == Generation.NO_GENERATION.generationId && memberId.equals(Generation.NO_GENERATION.memberId)) {
// ... error case
} else {
switch (protocol) {
case EAGER:
// EAGER protocol: revoke all owned partitions
revokedPartitions = new HashSet<>(subscriptions.assignedPartitions());
exception = invokePartitionsRevoked(revokedPartitions);
subscriptions.assignFromSubscribed(Collections.emptySet());
break;
case COOPERATIVE:
// COOPERATIVE protocol: only revoke partitions that are no longer in the subscription
// partitions that are not revoked remain usable (still in FETCHING state)
Set<TopicPartition> ownedPartitions = new HashSet<>(subscriptions.assignedPartitions());
revokedPartitions = ownedPartitions.stream()
.filter(tp -> !subscriptions.subscription().contains(tp.topic()))
.collect(Collectors.toSet());
if (!revokedPartitions.isEmpty()) {
exception = invokePartitionsRevoked(revokedPartitions);
ownedPartitions.removeAll(revokedPartitions);
subscriptions.assignFromSubscribed(ownedPartitions);
}
break;
}
}
isLeader = false;
subscriptions.resetGroupSubscription();
if (exception != null) {
throw new KafkaException("User rebalance callback throws an error", exception);
}
}
```
#### 2.2.2 Before SyncGroup
Before SyncGroup, a Cooperative-capable assignor assigns the partitions. In 2.5, CooperativeStickyAssignor supports the Cooperative protocol; see the CooperativeStickyAssignor class for the details, which we will not expand on here.
#### 2.2.3 After SyncGroup
After a rebalance round finishes, the assignment state is reset at the end.
```Java
@Override
protected void onJoinComplete(int generation,
String memberId,
String assignmentStrategy,
ByteBuffer assignmentBuffer) {
// common part
final AtomicReference<Exception> firstException = new AtomicReference<>(null);
Set<TopicPartition> addedPartitions = new HashSet<>(assignedPartitions);
addedPartitions.removeAll(ownedPartitions);
if (protocol == RebalanceProtocol.COOPERATIVE) {// extra handling specific to the COOPERATIVE protocol
// revokedPartitions: partitions to give up; ownedPartitions: partitions owned previously; assignedPartitions: partitions assigned this round
Set<TopicPartition> revokedPartitions = new HashSet<>(ownedPartitions);
revokedPartitions.removeAll(assignedPartitions);
log.info("Updating assignment with\n" +
"now assigned partitions: {}\n" +
"compare with previously owned partitions: {}\n" +
"newly added partitions: {}\n" +
"revoked partitions: {}\n",
Utils.join(assignedPartitions, ", "),
Utils.join(ownedPartitions, ", "),
Utils.join(addedPartitions, ", "),
Utils.join(revokedPartitions, ", ")
);
if (!revokedPartitions.isEmpty()) {
// if there are partitions to revoke, trigger a re-join
firstException.compareAndSet(null, invokePartitionsRevoked(revokedPartitions));
// if revoked any partitions, need to re-join the group afterwards
log.debug("Need to revoke partitions {} and re-join the group", revokedPartitions);
requestRejoin();
}
}
// other common calls
```
### 2.3 Usage example
Provided the Kafka cluster supports the protocol, you only need to add this setting to the consumer client configuration:
```Java
props.put(ConsumerConfig.PARTITION_ASSIGNMENT_STRATEGY_CONFIG, Collections.singletonList(CooperativeStickyAssignor.class));
```
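A slightly fuller, hedged sketch of a consumer using the cooperative assignor (bootstrap servers, group id and topic name are placeholders); the rebalance listener makes it visible that under the Cooperative protocol `onPartitionsRevoked` only receives the partitions actually taken away:
```java
import java.time.Duration;
import java.util.Collection;
import java.util.Collections;
import java.util.Properties;
import org.apache.kafka.clients.consumer.*;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.serialization.StringDeserializer;

public class CooperativeConsumerExample {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");       // placeholder
        props.put(ConsumerConfig.GROUP_ID_CONFIG, "cg_logi_kafka_test_1");          // placeholder
        props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
        props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
        // switch the assignment strategy to the cooperative (incremental) protocol
        props.put(ConsumerConfig.PARTITION_ASSIGNMENT_STRATEGY_CONFIG,
                  Collections.singletonList(CooperativeStickyAssignor.class));
        try (KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props)) {
            consumer.subscribe(Collections.singletonList("kmo_community"), new ConsumerRebalanceListener() {
                @Override
                public void onPartitionsRevoked(Collection<TopicPartition> partitions) {
                    // cooperative: only the partitions actually moved away, not everything we own
                    System.out.println("revoked: " + partitions);
                }
                @Override
                public void onPartitionsAssigned(Collection<TopicPartition> partitions) {
                    System.out.println("newly assigned: " + partitions);
                }
            });
            while (true) {
                for (ConsumerRecord<String, String> record : consumer.poll(Duration.ofMillis(500)))
                    System.out.println(record.topic() + "-" + record.partition() + "@" + record.offset());
            }
        }
    }
}
```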
### 2.4 Client logs
**Client 1**
```Java
// Round 1:
// with only one client, all partitions are assigned to it
2021-06-08 20:17:50.252 [main] INFO o.a.k.c.consumer.internals.ConsumerCoordinator - [Consumer clientId=consumer-cg_logi_kafka_test_1-1, groupId=cg_logi_kafka_test_1] Finished assignment for group at generation 9: {consumer-cg_logi_kafka_test_1-1-56a695ad-68c2-4e09-88a2-759e3854e366=Assignment(partitions=[kmo_community-0, kmo_community-1, kmo_community-2])}
// round 1: with only one client, all partitions go to this client
2021-06-08 20:17:50.288 [main] DEBUG o.a.k.c.consumer.internals.ConsumerCoordinator - [Consumer clientId=consumer-cg_logi_kafka_test_1-1, groupId=cg_logi_kafka_test_1] Executing onJoinComplete with generation 9 and memberId consumer-cg_logi_kafka_test_1-1-56a695ad-68c2-4e09-88a2-759e3854e366
2021-06-08 20:17:50.288 [main] INFO o.a.k.c.consumer.internals.ConsumerCoordinator - [Consumer clientId=consumer-cg_logi_kafka_test_1-1, groupId=cg_logi_kafka_test_1] Updating assignment with
now assigned partitions: kmo_community-0, kmo_community-1, kmo_community-2
compare with previously owned partitions:
newly added partitions: kmo_community-0, kmo_community-1, kmo_community-2
revoked partitions:
// Round 2:
// with two clients, one partition is not assigned to any client
2021-06-08 20:18:26.431 [main] INFO o.a.k.c.consumer.internals.ConsumerCoordinator - [Consumer clientId=consumer-cg_logi_kafka_test_1-1, groupId=cg_logi_kafka_test_1] Finished assignment for group at generation 10: {consumer-cg_logi_kafka_test_1-1-56a695ad-68c2-4e09-88a2-759e3854e366=Assignment(partitions=[kmo_community-1, kmo_community-2]), consumer-cg_logi_kafka_test_1-1-6ea3c93c-d878-4451-81f7-fc6c41d12963=Assignment(partitions=[])}
// partition kmo_community-0 is revoked, but partitions 1 and 2 keep being consumed
2021-06-08 20:18:26.465 [main] DEBUG o.a.k.c.consumer.internals.ConsumerCoordinator - [Consumer clientId=consumer-cg_logi_kafka_test_1-1, groupId=cg_logi_kafka_test_1] Executing onJoinComplete with generation 10 and memberId consumer-cg_logi_kafka_test_1-1-56a695ad-68c2-4e09-88a2-759e3854e366
2021-06-08 20:18:26.465 [main] INFO o.a.k.c.consumer.internals.ConsumerCoordinator - [Consumer clientId=consumer-cg_logi_kafka_test_1-1, groupId=cg_logi_kafka_test_1] Updating assignment with
now assigned partitions: kmo_community-1, kmo_community-2
compare with previously owned partitions: kmo_community-0, kmo_community-1, kmo_community-2
newly added partitions:
revoked partitions: kmo_community-0
// Round 3:
// with two clients, the previously unassigned partition is now assigned to the new consumer
2021-06-08 20:18:29.548 [main] INFO o.a.k.c.consumer.internals.ConsumerCoordinator - [Consumer clientId=consumer-cg_logi_kafka_test_1-1, groupId=cg_logi_kafka_test_1] Finished assignment for group at generation 11: {consumer-cg_logi_kafka_test_1-1-56a695ad-68c2-4e09-88a2-759e3854e366=Assignment(partitions=[kmo_community-1, kmo_community-2]), consumer-cg_logi_kafka_test_1-1-6ea3c93c-d878-4451-81f7-fc6c41d12963=Assignment(partitions=[kmo_community-0])}
// in round 3, nothing changes for this client
2021-06-08 20:18:29.583 [main] INFO o.a.k.c.consumer.internals.ConsumerCoordinator - [Consumer clientId=consumer-cg_logi_kafka_test_1-1, groupId=cg_logi_kafka_test_1] Updating assignment with
now assigned partitions: kmo_community-1, kmo_community-2
compare with previously owned partitions: kmo_community-1, kmo_community-2
newly added partitions:
revoked partitions:
```
**Client 2**
Client 2 came online after client 1 was already running steadily.
```Java
// Round 2:
// in round 2 this client is assigned no partitions
2021-06-08 20:18:26.467 [main] DEBUG o.a.k.c.consumer.internals.ConsumerCoordinator - [Consumer clientId=consumer-cg_logi_kafka_test_1-1, groupId=cg_logi_kafka_test_1] Executing onJoinComplete with generation 10 and memberId consumer-cg_logi_kafka_test_1-1-6ea3c93c-d878-4451-81f7-fc6c41d12963
2021-06-08 20:18:26.468 [main] INFO o.a.k.c.consumer.internals.ConsumerCoordinator - [Consumer clientId=consumer-cg_logi_kafka_test_1-1, groupId=cg_logi_kafka_test_1] Updating assignment with
now assigned partitions:
compare with previously owned partitions:
newly added partitions:
revoked partitions:
// Round 3:
// in round 3 it is assigned kmo_community-0
2021-06-08 20:18:29.584 [main] DEBUG o.a.k.c.consumer.internals.ConsumerCoordinator - [Consumer clientId=consumer-cg_logi_kafka_test_1-1, groupId=cg_logi_kafka_test_1] Executing onJoinComplete with generation 11 and memberId consumer-cg_logi_kafka_test_1-1-6ea3c93c-d878-4451-81f7-fc6c41d12963
2021-06-08 20:18:29.584 [main] INFO o.a.k.c.consumer.internals.ConsumerCoordinator - [Consumer clientId=consumer-cg_logi_kafka_test_1-1, groupId=cg_logi_kafka_test_1] Updating assignment with
now assigned partitions: kmo_community-0
compare with previously owned partitions:
newly added partitions: kmo_community-0
revoked partitions:
```
## 3. Common questions
### 3.1 Why is another re-join needed after SyncGroup when revokedPartitions is non-empty?
The current approach:
- During assignment, if partition X is taken away from client 1, it is not immediately given to client 2. As a result, after this rebalance round finishes, another rebalance round is needed whenever revokedPartitions is non-empty.
Why not change it to:
- during assignment, when partition X is taken from client 1, immediately assign it to client 2? Then even if revokedPartitions exists after the round, no extra rebalance would be needed.
The problem with that change is that partition X might still be in use by client 1 when it is assigned to client 2, so the client would have to handle partition X being consumed by client 1 and client 2 at the same time. Handling that correctly **may not be easy**, so this approach was not taken.
The incremental rebalance serializes giving up and assigning partitions, and its overall flow is basically the same as the Eager rebalance protocol, so the implementation is relatively simple, the race described above does not need to be considered, and the benefit is still decent.
## 4. Summary
This share gave a brief introduction to KIP-429: Kafka Consumer Incremental Rebalance Protocol. The feature is quite attractive; if you run into problems while consuming with the incremental rebalance protocol, you are welcome to discuss them with us.
## 5. References
[KIP-429: Kafka Consumer Incremental Rebalance Protocol](https://cwiki.apache.org/confluence/display/KAFKA/KIP-429%3A+Kafka+Consumer+Incremental+Rebalance+Protocol)


View File

@@ -0,0 +1,10 @@
# Kafka Server API Requests: the Fetch Request
[TOC]
## 1. Preface
## 2. The Fetch request

View File

@@ -0,0 +1,114 @@
# Kafka Broker: Handling Metadata-Change Requests
[TOC]
## 1. Preface
The Kafka Controller notifies brokers of metadata changes mainly through three kinds of requests: LEADER_AND_ISR, STOP_REPLICA and UPDATE_METADATA.
Accordingly, a Kafka broker adjusts its own state mainly by handling these three requests sent by the Kafka Controller.
This share explains how a Kafka broker handles these three requests, so that later shares on Controller state transitions, topic create/delete/alter operations and so on can build on a clear, quick picture of what the broker does.
## 2. Implementation overview
All three request classes extend AbstractControlRequest; the relationships between the request classes are shown below:
<img src="./assets/abstract_control_request_class_related_entry.jpg" width="738px" height="400px">
- AbstractControlRequest: three fields convey the controllerId together with version information, ensuring that only requests issued by the latest controller are processed.
&nbsp;
- UpdateMetadataRequest: the request that synchronizes cluster metadata, including partition information and the live brokers. Every broker stores a full copy of the cluster metadata, so a client can ask any broker for a topic's metadata.
&nbsp;
- StopReplicaRequest: the request that tells a broker to stop replica fetching, with an extra flag saying whether the replica should also be deleted. The Kafka Controller typically sends it for broker shutdown, topic deletion, shrinking a topic's replication factor, topic reassignment, and so on.
&nbsp;
- LeaderAndIsrRequest: the request that communicates partition state (leader, AR, ISR, etc.).
**Question 1: UPDATE_METADATA and LEADER_AND_ISR carry essentially the same data; why are they designed as two separate requests instead of one?**
**Question 2: what is the difference between Node, Broker and EndPoint here?**
---
## 3. UPDATE_METADATA
### 3.1 UPDATE_METADATA overview
Distinguish the METADATA request from UPDATE_METADATA:
- METADATA: mostly issued by clients to fetch topic metadata.
- UPDATE_METADATA: mostly issued by the Controller to update the metadata cached on brokers.
### 3.2 UPDATE_METADATA overall flow
<img src="./assets/update_metadata_summary_flow_chat.jpg" width="552px" height="500px">
### 3.3 UPDATE_METADATA code walkthrough
#### 3.3.1 The cached metadata
```scala
// MetadataCache in package kafka.server stores the following:
// partition state (UpdateMetadataPartitionState)
// controllerId
// live broker information
// live nodes
case class MetadataSnapshot(partitionStates: mutable.AnyRefMap[String, mutable.LongMap[UpdateMetadataPartitionState]],
controllerId: Option[Int],
aliveBrokers: mutable.LongMap[Broker],
aliveNodes: mutable.LongMap[collection.Map[ListenerName, Node]])
```
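Because every broker caches a full copy of this metadata, a client can ask any broker for it; a small hedged sketch (placeholder address and topic name) using the Java `AdminClient`:
```java
import java.util.Collections;
import java.util.Properties;
import org.apache.kafka.clients.admin.*;

public class MetadataLookupExample {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        // any live broker works as the bootstrap address, precisely because of the cached metadata
        props.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); // placeholder
        try (AdminClient admin = AdminClient.create(props)) {
            DescribeClusterResult cluster = admin.describeCluster();
            System.out.println("controller: " + cluster.controller().get());
            System.out.println("live nodes: " + cluster.nodes().get());
            TopicDescription td = admin.describeTopics(Collections.singletonList("test"))
                                       .all().get().get("test");
            td.partitions().forEach(p ->
                System.out.println("partition " + p.partition() + " leader=" + p.leader() + " isr=" + p.isr()));
        }
    }
}
```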
#### 3.3.2 Quota distribution strategy
Quota is distributed according to where the leaders sit, in proportion to the number of leaders on each broker. The downside is that if a topic's partition traffic is unbalanced, the topic may appear throttled even though its overall traffic has not reached the quota.
<img src="./assets/update_metadata_change_quota.jpg" width="680px" height="400px">
---
## 4. STOP_REPLICA
### 4.1 STOP_REPLICA overview
As the name suggests, the main purpose of this request is to stop replicas on a broker.
### 4.2 STOP_REPLICA overall flow
<img src="./assets/stop_replica_summary_flow_chat.jpg" width="653px" height="400px">
## 5. LEADER_AND_ISR
### 5.1 LEADER_AND_ISR overview
The main purpose of the LeaderAndIsr request is to tell brokers about partition leader and follower changes, after which the brokers switch their replicas between the leader and follower roles.
### 5.2 LEADER_AND_ISR overall flow
<img src="./assets/leader_and_isr_summary_flow_chat.jpg" width="700px" height="600px">
### 5.3 makeLeader in detail
<img src="./assets/leader_and_isr_make_leader_flow_chat.jpg" width="799px" height="600px">
### 5.4 makeFollower in detail
<img src="./assets/leader_and_isr_make_follower_flow_chat.jpg" width="799px" height="600px">
---
## 6. Common questions
### 6.1 Question 1: the difference between UPDATE_METADATA and LEADER_AND_ISR
From the Kafka broker's perspective the two requests do carry essentially the same data.
### 6.2 Question 2: what is the difference between Node, Broker and EndPoint?


Some files were not shown because too many files have changed in this diff.