2021年6月1日 星期二

MySQL NDB Cluster 實務規劃

Data Node 記憶體需求計算


官方的 Data Node 記憶體需求計算方式

(SizeofDatabase × NumberOfReplicas × 1.1 ) / NumberOfDataNodes

範例:(10G * 2 *1.1)/ 2 =11G

目前線上狀況

Data node 記憶體設定

DataMemory=200G
IndexMemory=20G

Data node 記憶體實際使用狀況


上圖記憶體量計算方式
總共 6553600 pages,每個 page 32k
總 bytes = 6553600*32*1024=214748364800
總 G 數  = 214748364800 /1024/1024/1024 =200 G

//實際使用狀況
Data memory=6098653 pages
Bytes = 6098653*32*1024=199840661504
G 數  = 199840661504/1024/1024/1024=186.11G

Index memory = 359457 pages
Bytes = 359457*8*1024=2944671744
G 數  = 2.74 G

規劃目標4倍

Data node 記憶體設定

//目前 DataMemory 使用 186 G,取 192 G
//目前 IndexMemory 使用 2.74 G,取 4 G
DataMemroy 需求  = (192*4*2*1.1)/2= 844 G
IndexMemory 需求 = (4*4*2*1.1)/2= 17.6 G

#實際設定 1  規劃目標4倍
1 實體記憶體 1 T
2 系統保留 96G
3 DataMemory = 896G
4 IndexMemory= 32G

#實際設定 2 規劃目標3.5倍
1 實體記憶體 768 G
2 系統保留 48G
3 DataMemory = 688G
4 IndexMemory= 32G

設定檔


config.ini

[tcp default]
#The default size of the send buffer is 2MB, 
#which is the size recommended in most situations. 
#The minimum size is 64 KB; 
#the theoretical maximum is 4 GB
SendBufferMemory=4M

#Specifies the size of the buffer used when receiving data from the TCP/IP socket.
#The default value of this parameter is 2MB. 
#The minimum possible value is 16KB; the theoretical maximum is 4GB.
ReceiveBufferMemory=2M

#通过网络传输消息ID(取值: Y/N或1/0)
#This feature is disabled by default in production builds
#SendSignalId=0
 
#启用该参数将在所有消息置于发送缓冲之前,为所有参数计算校验和.
#默认禁止该特性.(取值: Y/N或1/0)
#This feature is disabled by default in production builds
#Checksum=0
 
[ndb_mgmd]
#Management Node 1
HostName=10.10.100.157
DataDir=/var/lib/mysql-cluster

[ndb_mgmd]
#Management Node 2
HostName=10.10.100.199
DataDir=/var/lib/mysql-cluster

[ndbd default]
NoOfReplicas=2 		# Number of replicas
DataMemory=928G 	# Memory allocate for data storage

#In NDB 7.5 and earlier, this parameter controls the amount of storage used for hash indexes in NDB Cluster
#size  = ( (fragments * 32K) + (rows * 18) )
#          * fragment_replicas
#fragments is the number of fragments, 
#fragment_replicas is the number of fragment replicas (normally two), 
#and rows is the number of rows. #If a table has one million rows, 
#eight fragments, and two fragment replicas, the expected index memory usage is calculated as shown here:
#((8 * 32K) + (1000000 * 18)) * 2 = ((8 * 32768) + (1000000 * 18)) * 2
#  = (262144 + 18000000) * 2
#  = 18262144 * 2 = 36524288 bytes = ~35MB
IndexMemory=32G

#This parameter determines how much memory is allocated for strings such as table names
#A value greater than 100 is interpreted as a number of bytes.
#The default value is 25—that is, 25 percent of the default maximum.
#StringMemory=25

DataDir=/var/lib/mysql-cluster

//cluster 總表上限
//The default value of this parameter is 128. 
//The minimum is 8 and the maximum is 20320.
//Each table object consumes approximately 20KB per node.
MaxNoOfTables = 2048

//cluster 總共可定義多少屬性(總欄位上限)
//The default value is 1000, with the minimum possible value being 32. 
//The maximum is 4294967039.
//每個attrubute消耗200bytes per node
MaxNoOfAttributes = 51200

//隨時保留多少ordered indexes 供使用
//Each unique index and primary key has both an ordered index and a hash index
//MaxNoOfOrderedIndexes sets the total number of ordered indexes that can be in use in the system at any one time.
//The default value of this parameter is 128. 
//Each index object consumes approximately 10KB of data per node.
MaxNoOfOrderedIndexes = 2048

//For each unique index that is not a primary key, 
//a special table is allocated that maps the unique key to the primary key of the indexed table. 
//By default, an ordered index is also defined for each unique index. 
//To prevent this, you must specify the USING HASH option when defining the unique index.
//The default value is 64. Each index consumes approximately 15KB per node.
MaxNoOfUniqueHashIndexes = 2048

#每個 unique hash index 會產生 3個 triggers
#每個 ordered index 會產生 1個 triggers
#Internal update, insert, and delete triggers are allocated for each unique hash index. 
#(This means that three triggers are created for each unique hash index.) 
#However, an ordered index requires only a single trigger object. 
#Backups also use three trigger objects for each normal table in the cluster.
#The default value is 768.
MaxNoOfTriggers = 7168

#把 pages 鎖定在實體記憶體中
#這個不要打開,如果記憶體不夠,會錯誤
#LockPagesInMainMemory=0

//每個transaction 在每個data node都需要transaction record
#TotalNoOfConcurrentTransactions =
#   (maximum number of tables accessed in any single transaction + 1)
#   * number of SQL nodes
#The default value is 4096.
MaxNoOfConcurrentTransactions = 409600


#要大於 MaxNoOfConcurrentTransactions
#The default value is 32768.
MaxNoOfConcurrentOperations = 3276800
#MaxNoOfLocalOperations 預設是 MaxNoOfConcurrentOperations*1.1

#For queries using a unique hash index, 
#another temporary set of operation records is used during a query's execution phase. 
#This parameter sets the size of that pool of records. 
#Only in rare cases of extremely high parallelism 
#using unique hash indexes should it be necessary to increase this value.
#The default value of this parameter is 8192. 
MaxNoOfConcurrentIndexOperations=16384



#The memory affected by this parameter is used for 
#tracking operations fired when updating index tables and reading unique indexes. 
#This memory is used to store the key and column information for these operations. 
#It is only very rarely that the value for this parameter needs to be altered from the default
#The default value for TransactionBufferMemory is 1MB.
#TransactionBufferMemory=1M

#The default value of MaxNoOfFiredTriggers is 4000, 
#which is sufficient for most situations. 
#In some cases it can even be decreased if the DBA feels certain the need for parallelism in the cluster is not high.
#MaxNoOfFiredTriggers=4000


#This parameter is used to control the number of parallel scans that can be performed in the cluster. 
#Each transaction coordinator can handle the number of parallel scans defined for this parameter.
#The default value of MaxNoOfConcurrentScans is 256. The maximum value is 500.
MaxNoOfConcurrentScans=500


#This parameter specifies whether a data node process should exit or perform an automatic restart when an error condition is encountered.
#This parameter's default value is 1; this means that, by default, an error causes the data node process to halt.
#When an error is encountered and StopOnError is 0, the data node process is restarted.
#StopOnError=1

#設定redo log 檔 size
#The default value for this parameter is 16M
FragmentLogFileSize=256M

#The default parameter value is 16,
NoOfFragmentLogFiles=16

#The default value for this parameter is 1MB.
BackupMaxWriteSize = 2M

#The default value for this parameter is 16MB.
BackupDataBufferSize = 32M

#The default value for this parameter is 16MB
BackupLogBufferSize = 32M


[ndbd]
HostName=10.10.100.157
NodeId=3

[ndbd]
HostName=10.10.100.199
NodeId=4

[mysqld]
#Node 5
HostName=10.10.100.165

[mysqld]
#Node 6
HostName=10.10.100.166

[mysqld]
#Node 7
HostName=10.10.100.200

[mysqld]
#Node 8
HostName=10.10.100.201

Data Node設定檔

/etc/my.cnf
[mysqld]
ndbcluster
ndb-connectstring=10.10.100.157
ndb-connectstring=10.10.100.199

[MYSQL_CLUSTER]
ndb-connectstring=10.10.100.157
ndb-connectstring=10.10.100.199

SQL Node設定檔

/etc/my.cnf
[mysqld]
ndbcluster
ndb-connectstring=10.10.100.157
ndb-connectstring=10.10.100.199

default-storage-engine=NDBCLUSTER

validate_password_length=6
validate_password_policy=LOW

datadir=/var/lib/mysql
socket=/var/lib/mysql/mysql.sock
symbolic-links=0

log-error=/var/log/mysqld.log
pid-file=/var/run/mysqld/mysqld.pid

max_connections=5000
max_connect_errors=1844674407370954751
connect_timeout=15
default-time-zone='+07:00'
max_user_connections=4900

#slow query
slow_query_log=1
slow_query_log_file=/var/lib/mysql/slow-query.log
long_query_time=1

max_heap_table_size=1024M
tmp_table_size=1024M

[MYSQL_CLUSTER]
ndb-connectstring=10.10.100.157
ndb-connectstring=10.10.100.199

Linux kernel 變數

 ipv4 tcp 先參考

Linux Command sysctl

 

sysctl

設定 linux kernel 變數

sysctl 變數

//顯示某個變數
shell> sysctl kernel.ostype

//列出全部變數
shell> sysctl -a  

//grep 變數
shell> sysctl -a | grep tcp_timeout

//改變數
shell> sysctl -w variable = value  
	sysctl -w net.ipv4.ip_forward=1        

修改設定檔 sysctl.conf

shell> vi /etc/sysct.conf

//加入變數、值
net.ipv4.ip_forward=1

載入設定檔,不需重開機

//載入 /etc/sysct.conf
shell> sysctl -p


//重新載入以下所有檔案
shell> sysctl --system

/run/sysctl.d/*.conf
/etc/sysctl.d/*.conf
/usr/local/lib/sysctl.d/*.conf
/usr/lib/sysctl.d/*.conf
/lib/sysctl.d/*.conf
/etc/sysctl.conf

ipv4 tcp 變數

2021年5月31日 星期一

MySQL NDB cluster 狀況處理

各種狀況教學

各狀況教學

Error 233 MaxNoOfConcurrentOperations

Got temporary error 233 'Out of operation records in transaction coordinator (increase MaxNoOfConcurrentOperations)' from NDBCLUSTER

參考說明
  • 1 編輯每個 ndb_mgmd 設定:/var/lib/mysql-cluster/config.ini
    [NDBD Default]
    MaxNoOfConcurrentOperations=100000 //加大同步處理的上限
    MaxNoOfLocalOperations=110000//加大local 處理上限,一般多 10%
  • 2 kill mgm, restart ndb_mgmd 每次一個
  • 3 從 mgm 裡面 stop ndbd, process停了,要從外部啟動 sudo ndbd 每次一個
  • 4 sql node 基本不需要重啟


  • Error 1297 Got temporary error 410

    Error 1297 Got temporary error 410 'REDO log files overloaded (decrease TimeBetweenLocalCheckpoints or increase NoOfFragmentLogFiles)' from NDBCLUSTER

    參考說明
    local check point需要一段時間才完成
    在這期間 REDO Log 會增長,如果用完 REDO Log 的容量就會出錯。
    1 增加容量
       data node number x NoOfFragmentLogFiles x NoOfFragmentLogFiles
       
    2 加快 LCP 完成速度。
      MinDiskWriteSpeed
      MaxDiskWriteSpeed
      MaxDiskWriteSpeedOwnRestart
      MaxDiskWriteSpeedOtherRestart
    
    
    The TimeBetweenLocalCheckPoints is plenty short. 
    Probably The REDO log needs to be increased such that it is 
    long enough to accommodate all transactions that occur over 
    the course of 2-3x LCPs. 
    
    Reducing the TimeBetweenLocalCheckPoints only decreases the 
    amount of time ndbd sits idle between LCPs. With this volume of 
    inserts there should be essentially no time waited between LCPs. 
    
    The time it takes to complete the LCP is determined by used 
    DataMemory/DiskCheckpointSpeed.
    
    If you know the size and number of inserts being written to the 
    cluster per second you can predict the minimal size of the log.
    
    For example if you have 3.5G of DataMemory used and default 
    DiskCheckpontSpeed of 10MB/s, 
    Each LCP will take about 6 minutes (360s.) to complete. 
    
    The default REDO log is 4 x NoOfFragmentLogFiles x FragmentLogFileSize = 1G long. 
    
    This would be overloaded performing 200x5k writes/second.
    
    方案:
    Increasing NoOfFragmentLogFiles and/or FragmentLogFileSize will 
    increase the total available length of the REDO log, 
    allowing for longer LCPs and/or higher volume of inserts/updates 
    per second. This requires more disk space.
    
    方案:
    Increasing DiskCheckPointSpeed 
    (改用 MinDiskWriteSpeed / MaxDiskWriteSpeed) leaves the length 
    of the log the same but causes the the LCP to complete faster. 
    Thus reducing the length of the REDO log required to hold all 
    incoming updates for two complete LCP. 
    
    This requires ndbd to use more disk IO bandwidth and CPU. 
    So if dimensioned incorrectly this risks reducing available disk bandwidth for 
    global checkpoint and disk data operations to the point 
    there is GCP Stop errors or a loss in performance.
    
    

    MySQL NDB Cluster 設定/資源/啟動

    Data node 記憶體需求計算

    Ndb cluster是把資料、索引全部放記憶體,所以記憶體需求很高。

    (SizeofDatabase × NumberOfReplicas × 1.1 ) / NumberOfDataNodes
    
    範例:
    (200G * 2 *1.1)/ 2 =220G
    

    Node Group 數量計算

    每個節點組,要有相同的 data node 數量

    Node group = Data nodes / NoOfReplicas
    假設有4個 data node
    config.ini中
    NoOfReplicas=1,4/1=4,有4個節點組,每個節點組1個副本
    NoOfReplicas=2,4/2=2,有2個節點組,每個節點組2個副本
    NoOfReplicas=4,4/4=1,有1個節點組,有4個副本
    
    NDB cluster 的重點是 data node,
    以 data node 為中心。
    
    

    SQL 語法不同

    CREATE TABLE 要加上 ENGING=NDBCluster

    CREATE TABLE tbl_name (col_name column_definitions) ENGINE=NDBCLUSTER;
    
    每个NDBCLUSTERtable 都有一个主键。
    如果在创建 table 时用户未定义主键,
    则NDBCLUSTER存储引擎会自动生成一个隐藏的主键
    

    ALTER TABLE 要加上 ENGING=NDBCluster

    ALTER TABLE tbl_name ENGINE=NDBCLUSTER;

    Cluster Online 開關測試

    ndb_mgm> 1 stop 	// stop data node id 4
    shell> ndbd 	//在 data node id 4 啟動 ndbd
    //可正常關閉、啟動
      
    ndb_mgm> 1 stop
    shell> ndbd --initial //用 initial option 啟動
    //可正常、關閉啟動,資料無影響
      
    ndb_mgm> 1 stop
    shell> ndbd
    ndb_mgm> exit 	//ndb 啟動中, mgm exit
    //可正常、關閉啟動
      
    ndb_mgm> 1 stop
    shell> ndbd 
    ndb_mgm> exit     //ndb 啟動中, mgm exit
    //可正常、關閉啟動
      
    ndb_mgm> 1 stop
    shell> ndbd --initial
    ndb_mgm> exit 	//ndb initial 啟動中, mgm exit
    //可正常、關閉啟動

    關全部 ndb_mgmd

    shell> kill 全部 ndb_mgmd
    shell> 啟動第一台 ndb_mgmd -f /var/lib/mysql-cluster/config.ini --reload
    shell> 啟動第二台 ndb_mgmd -f /var/lib/mysql-cluster/config.ini
    ndb_mgm> 1 stop //依序關閉,啟動 ndbd

    先關data,再關全部 ndb_mgmd

    ndb_mgm> 1 stop
    shell> kill 全部 ndb_mgmdpid
    shell> 啟動第一台 ndb_mgmd -f /var/lib/mysql-cluster/config.ini --reload
    shell> 啟動第二台 ndb_mgmd -f /var/lib/mysql-cluster/config.ini
    shell> ndbd (node 1)
    shell> ndbd --initial (node 2) //第二台需要initial

    啟動各node

    首次啟動

     啟動順序:
    1 shell>sudo ndb_mgmd --initial -f /var/lib/mysql-cluster/config.ini --reload
    有更改 config.ini ,重新啟動 mgmd 要加 --reload,沒更改不需加。
    
    2 shell>sudo ndbd(data node全部) 
    3 shell>sudo systemctl start mysqld(sql node全部)
    

    一般啟動

     啟動順序:
    1 shell>sudo ndb_mgmd  -f /var/lib/mysql-cluster/config.ini --reload
    2 shell>sudo ndbd(data node全部) 
    帶c參數  shell> sudo ndbd -c 10.10.100.100  //指定 mgm 位址
    帶initial參數 shell>sudo ndbd --initial  ////會重整這台ndbd的文件,只有第一次需要,每次都下initial 也沒關係
    --initial 會清掉這台ndb 的file system,並且從其他node 載入data 及 metadata.
    
    3 shell>sudo systemctl start mysqld(sql node全部)
    
    

    一般關閉

    關閉順序:
    1 shell>sudo systemctl stop mysqld
    2 ndb_mgm> node_id stop
    3 kill ndb_mgm_pid(kill mgm process)
    

    關閉重啟某個 data node

    關閉順序:
    1 ndb_mgm> node_id stop
    2 等關閉完成
    3 shell > ndbd / ndbd --initial
    

    Rolling Restart (依序重啟cluster mgm/data node)

    有幾種情況需要重啟

    1 改 config.ini
    2 增加 data node
    3 改 node host
    4 cluster重啟

    流程:
    1 關掉全部 ndb_mgmd,改全部 config.ini,全部重啟,reload config。
    shell>ndb_mgmd -f /var/lib/mysql-cluster/config.ini --reload //第一個 ndb_mgmd
    shell>ndb_mgmd	//其餘的 ndb_mgmd
      如果有某個 ndb_mgmd 在執行,那麼啟動的ndb_mgmd,會使用該程式的設定,
      所以要全部關閉才會生效。只有第一個ndb_mgmd --reload,後續的不需要 reload。
      
    2 依序關閉 ndbd,改config, 重啟。可依序 node 處理。
    3 依序關閉 mysql,改 config,重啟。可依序處理。
    

    實務開關經驗

    備份某個200G的線上 data node,要5小時

    啟動某個200G的線上 data node,要3.5小時

    關閉某個200G的線上 data node,要 ? 小時

    設定檔

    ndb_mgmd 設定檔 config.ini

    /var/lib/mysql-cluster/confing.ini
    [ndb_mgmd]
    #Management Node 1
    HostName=10.10.100.157
    DataDir=/var/lib/mysql-cluster
    
    [ndb_mgmd]
    #Management Node 2
    HostName=10.10.100.199
    DataDir=/var/lib/mysql-cluster
    
    [ndbd default]
    NoOfReplicas=2 	# Number of replicas
    DataMemory=912G # Memory allocate for data storage
    IndexMemory = 48G
    DataDir=/var/lib/mysql-cluster
    MaxNoOfTables = 1024
    MaxNoOfAttributes = 5000000
    MaxNoOfOrderedIndexes = 100000
    FragmentLogFileSize=256M
    MaxNoOfConcurrentOperations=400000
    MaxNoOfLocalOperations=440000
    
    //配置多少記憶體給string,例如table name
    //defalut 25 = 25% 最大 25
    //超過 100,ndbd會解釋為設定多少 bytes
    #StringMemory=25	
    
    
    
    [ndbd]
    HostName=10.10.100.157
    NodeId=3
    
    [ndbd]
    HostName=10.10.100.199
    NodeId=4
    
    [mysqld]
    #Node 5
    HostName=10.10.100.165
    
    [mysqld]
    #Node 6
    HostName=10.10.100.166
    
    [mysqld]
    #Node 7
    HostName=10.10.100.200
    
    [mysqld]
    #Node 8
    HostName=10.10.100.201
    
    

    ndbd 設定檔

    /etc/my.cnf
    [mysqld]
    ndbcluster
    ndb-connectstring=10.10.100.157
    ndb-connectstring=10.10.100.199
    
    [MYSQL_CLUSTER]
    ndb-connectstring=10.10.100.157
    ndb-connectstring=10.10.100.199
    

    mysql node設定檔

    /etc/my.cnf
    [mysqld]
    ndbcluster
    ndb-connectstring=10.10.100.157
    ndb-connectstring=10.10.100.199
    
    default-storage-engine=NDBCLUSTER
    
    validate_password_length=6
    validate_password_policy=LOW
    
    datadir=/var/lib/mysql
    socket=/var/lib/mysql/mysql.sock
    symbolic-links=0
    
    log-error=/var/log/mysqld.log
    pid-file=/var/run/mysqld/mysqld.pid
    
    max_connections=5000
    max_connect_errors=1844674407370954751
    connect_timeout=15
    default-time-zone='+07:00'
    max_user_connections=4900
    
    #slow query
    slow_query_log=1
    slow_query_log_file=/var/lib/mysql/slow-query.log
    long_query_time=1
    
    max_heap_table_size=1024M
    tmp_table_size=1024M
    
    [MYSQL_CLUSTER]
    ndb-connectstring=10.10.100.157
    ndb-connectstring=10.10.100.199
    

    2021年5月27日 星期四

    MySQL NDB Cluster 概念

    Node Group 節點組

    [# of node groups] = [# of data nodes] / NoOfReplicas
    

    Partition 分區

    Partition 數量取決於 data node 數量,及 data node 使用的 LDM process 數。

    運行 ndbmtd 時,LDM數量由 MaxNoOfExecutionThreads 變數控制

    運行 ndbd 時,LDM 數量只有1個,所以 partion數量就是 data node 數量。

    每個 data node 至少保留一份指定給他的分區(partition, 也就是一個分區副本 fragment replica)供系統使用

    通常對 ndbcluster table 自動分區,也可以使用者定義分區

    [# of partitions] = [# of data nodes] * [# of LDM threads]
    

    Fragment replica 副本

    副本是分區的副本(partition),副本數是節點組中的節點數。

    group 中有兩個節點,那某個 partion 的副本數就是2。

    副本數的圖