Continued from the previous entry.
The environment: 1 Gbps network with iSCSI multipath.
Two XenServer 6.2 hosts, live-migrating a VM between them.
During the migration, I pinged the target VM once per second from a CentOS machine on separate hardware in the same segment.
For reference, VMware live migration was seamless in this environment (under one second of interruption).
The VM is CentOS 5.
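The measurement itself is trivial; for completeness, here is a minimal sketch of a probe loop like the one used (the target address 192.0.2.10 is a placeholder, and the real test just ran ping from the CentOS box):

(* Minimal sketch of the downtime measurement: one ping per second at the
   VM, timestamping every failure. 192.0.2.10 is a placeholder address. *)
let target = "192.0.2.10"

let timestamp () =
  let tm = Unix.localtime (Unix.time ()) in
  Printf.sprintf "%02d:%02d:%02d" tm.Unix.tm_hour tm.Unix.tm_min tm.Unix.tm_sec

let () =
  while true do
    (* -c 1: single probe; -W 1: one-second timeout so the loop never stalls *)
    let rc = Sys.command (Printf.sprintf "ping -c 1 -W 1 %s > /dev/null" target) in
    if rc <> 0 then Printf.printf "%s ping fail\n%!" (timestamp ());
    Unix.sleep 1
  done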
To start, I compared the source code against the logs.
The relevant source file is:
xapi-0.2/ocaml/xenops/xenops_server.ml
The side receiving the VM in XenMotion
####################### Receive ######################
1288   | VM_receive_memory (id, memory_limit, s) ->
1289       debug "VM.receive_memory %s" id;
1290       let open Xenops_migrate in
1291       (* let state = B.VM.get_state (VM_DB.read_exn id) in
1292          debug "VM.receive_memory %s power_state = %s" id (state.Vm.power_state |> rpc_of_power_state |> Jsonrpc.to_string); *)
1293
1294       (try
1295           Handshake.send s Handshake.Success
1296       with e ->
1297           Handshake.send s (Handshake.Error (Printexc.to_string e));
1298           raise e
1299       );
1300       debug "Synchronisation point 1";
LOG> 22:22:30 xs62-1 xenopsd: [debug|xs62-1|7|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Synchronisation point 1
1301
1302       debug "VM.receive_memory calling create";
LOG> 22:22:30 xs62-1 xenopsd: [debug|xs62-1|7|Async.VM.pool_migrate R:07d6b289f1a4|xenops] VM.receive_memory calling create
1303       perform_atomics [
1304           VM_create (id, Some memory_limit);
LOG> 22:22:30 xs62-1 xenopsd: [debug|xs62-1|7|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Performing: ["VM_create", ["188bbe5c-547c-0367-b848-2be29fc2275b", [2147483648]]]
1305           VM_restore (id, FD s);
LOG> 22:22:30 xs62-1 xenopsd: [debug|xs62-1|7|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Performing: ["VM_restore", ["188bbe5c-547c-0367-b848-2be29fc2275b", ["FD", 16]]]
1306       ] t;
1307       debug "VM.receive_memory restore complete";
##### Ping fail: 22:22:49 - 22:23:00 #####
LOG> 22:22:49 xs62-1 xenopsd: [debug|xs62-1|7|Async.VM.pool_migrate R:07d6b289f1a4|xenops] VM.receive_memory restore complete
1308       debug "Synchronisation point 2";
LOG> 22:22:49 xs62-1 xenopsd: [debug|xs62-1|7|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Synchronisation point 2
1309
1310       begin try
1311           (* Receive the all-clear to unpause *)
1312           Handshake.recv_success ~verbose:true s;
LOG> 22:22:51 xs62-1 xenopsd: [debug|xs62-1|7|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Handshake.recv: finished reading result code from remote.
1313           debug "Synchronisation point 3";
LOG> 22:22:51 xs62-1 xenopsd: [debug|xs62-1|7|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Synchronisation point 3
1314
1315           perform_atomics ([
1316           ] @ (atomics_of_operation (VM_restore_devices id)) @ [
1317               VM_unpause id;
LOG> 22:22:51 xs62-1 xenopsd: [debug|xs62-1|7|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Performing: ["VBD_set_active", ["188bbe5c-547c-0367-b848-2be29fc2275b", "xvda"], true]
LOG> 22:22:51 xs62-1 xenopsd: [debug|xs62-1|7|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Performing: ["VBD_plug", ["188bbe5c-547c-0367-b848-2be29fc2275b", "xvda"]]
LOG> 22:22:53 xs62-1 xenopsd: [debug|xs62-1|7|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Performing: ["VIF_set_active", ["188bbe5c-547c-0367-b848-2be29fc2275b", "0"], true]
LOG> 22:22:53 xs62-1 xenopsd: [debug|xs62-1|7|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Performing: ["VIF_set_active", ["188bbe5c-547c-0367-b848-2be29fc2275b", "1"], true]
LOG> 22:22:53 xs62-1 xenopsd: [debug|xs62-1|7|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Performing: ["VIF_plug", ["188bbe5c-547c-0367-b848-2be29fc2275b", "0"]]
LOG> 22:22:54 xs62-1 xenopsd: [debug|xs62-1|7|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Performing: ["VIF_plug", ["188bbe5c-547c-0367-b848-2be29fc2275b", "1"]]
LOG> 22:22:54 xs62-1 xenopsd: [debug|xs62-1|7|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Performing: ["VM_create_device_model", ["188bbe5c-547c-0367-b848-2be29fc2275b", true]]
LOG> 22:22:54 xs62-1 xenopsd: [debug|xs62-1|7|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Performing: ["VM_unpause", "188bbe5c-547c-0367-b848-2be29fc2275b"]
1318               VM_set_domain_action_request(id, None)
LOG> 22:22:54 xs62-1 xenopsd: [debug|xs62-1|7|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Performing: ["VM_set_domain_action_request", ["188bbe5c-547c-0367-b848-2be29fc2275b", []]]
1319           ]) t;
1320
1321           Handshake.send s Handshake.Success;
1322           debug "Synchronisation point 4";
LOG> 22:22:54 xs62-1 xenopsd: [debug|xs62-1|7|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Synchronisation point 4
1323       with e ->
1324           debug "Caught %s: cleaning up VM state" (Printexc.to_string e);
1325           perform_atomics (atomics_of_operation (VM_shutdown (id, None)) @ [
1326               VM_remove id
1327           ]) t;
1328           Handshake.send s (Handshake.Error (Printexc.to_string e))
1329       end
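Both sides block on these Handshake calls, which is what keeps the four synchronisation points in lock-step across the two hosts. As a rough illustration of the idea (this is not the actual Xenops_migrate.Handshake implementation; the wire format below is invented), it boils down to a blocking rendezvous over the migration connection:

(* Illustration only: a blocking rendezvous in the spirit of
   Xenops_migrate.Handshake. The wire format here is invented; the real
   module in xapi differs in detail. *)
type result = Success | Error of string

(* The sender blocks until its result byte is written... *)
let send fd = function
  | Success -> ignore (Unix.write fd (Bytes.of_string "+") 0 1)
  | Error m ->
      let payload = Bytes.of_string ("-" ^ m) in
      ignore (Unix.write fd payload 0 (Bytes.length payload))

(* ...and the receiver blocks until it arrives, so neither side can run
   ahead of the other. That is why "Synchronisation point N" appears at
   matching times in both hosts' logs. *)
let recv fd =
  let buf = Bytes.create 256 in
  let n = Unix.read fd buf 0 256 in
  if n > 0 && Bytes.get buf 0 = '+' then Success
  else Error (Bytes.sub_string buf 1 (max 0 (n - 1)))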
The side sending the VM in XenMotion
####################### Send ######################
1220   | VM_migrate (id, vdi_map, vif_map, url') ->
1221       debug "VM.migrate %s -> %s" id url';
1222       let vm = VM_DB.read_exn id in
1223       let open Xmlrpc_client in
1224       let open Xenops_client in
1225       let url = url' |> Http.Url.of_string in
1226       (* We need to perform version exchange here *)
1227       let is_localhost =
1228           try
1229               let q = query t.Xenops_task.dbg url in
1230               debug "Remote system is: %s" (q |> Query.rpc_of_t |> Jsonrpc.to_string);
1231               q.Query.instance_id = instance_id
1232           with e ->
1233               debug "Failed to contact remote system on %s: is it running? (%s)" url' (Printexc.to_string e);
1234               raise (Failed_to_contact_remote_service (url |> transport_of_url |> string_of_transport)) in
1235       if is_localhost
1236       then debug "This is a localhost migration.";
1237       Xenops_hooks.vm_pre_migrate ~reason:Xenops_hooks.reason__migrate_source ~id;
1238
1239       let module Remote = Xenops_interface.Client(struct let rpc = xml_http_rpc ~srcstr:"xenops" ~dststr:"dst_xenops" url end) in
1240       let id = Remote.VM.import_metadata t.Xenops_task.dbg (export_metadata vdi_map vif_map id) in
1241       debug "Received id = %s" id;
LOG> 22:22:30 xs62-3 xenopsd: [debug|xs62-3|9|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Received id = 188bbe5c-547c-0367-b848-2be29fc2275b
1242       let suffix = Printf.sprintf "/memory/%s" id in
1243       let memory_url = Http.Url.(set_uri url (get_uri url ^ suffix)) in
1244
1245       (* CA-78365: set the memory dynamic range to a single value to stop ballooning. *)
1246       let atomic = VM_set_memory_dynamic_range(id, vm.Vm.memory_dynamic_min, vm.Vm.memory_dynamic_min) in
1247       let (_: unit) = perform_atomic ~subtask:(string_of_atomic atomic) ~progress_callback:(fun _ -> ()) atomic t in
1248
1249       (* Find out the VM's current memory_limit: this will be used to allocate memory on the receiver *)
1250       let state = B.VM.get_state vm in
1251       info "VM %s has memory_limit = %Ld" id state.Vm.memory_limit;
LOG> 22:22:30 xs62-3 xenopsd: [ info|xs62-3|9|Async.VM.pool_migrate R:07d6b289f1a4|xenops] VM 188bbe5c-547c-0367-b848-2be29fc2275b has memory_limit = 2147483648
1253       with_transport (transport_of_url memory_url)
1254           (fun mfd ->
1255               let open Xenops_migrate in
1256               let request = http_put memory_url ~cookie:[
1257                   "instance_id", instance_id;
1258                   "dbg", t.Xenops_task.dbg;
1259                   "memory_limit", Int64.to_string state.Vm.memory_limit;
1260               ] in
1261               request |> Http.Request.to_wire_string |> Unixext.really_write_string mfd;
1262
1263               begin match Handshake.recv mfd with
1264               | Handshake.Success -> ()
1265               | Handshake.Error msg ->
1266                   error "cannot transmit vm to host: %s" msg;
1267               end;
1268               debug "Synchronisation point 1";
LOG> 22:22:30 xs62-3 xenopsd: [debug|xs62-3|9|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Synchronisation point 1
1269
1270               perform_atomics [
1271                   VM_save(id, [ Live ], FD mfd)
LOG> 22:22:30 xs62-3 xenopsd: [debug|xs62-3|9|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Performing: ["VM_save", ["188bbe5c-547c-0367-b848-2be29fc2275b", ["Live"], ["FD", 10]]]
1272               ] t;
1273               debug "Synchronisation point 2";
##### Ping fail: 22:22:49 - 22:23:00 #####
LOG> 22:22:51 xs62-3 xenopsd: [debug|xs62-3|9|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Synchronisation point 2
1274
1275               Handshake.send ~verbose:true mfd Handshake.Success;
LOG> 22:22:51 xs62-3 xenopsd: [debug|xs62-3|9|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Handshake.send: about to write result to remote.
LOG> 22:22:51 xs62-3 xenopsd: [debug|xs62-3|9|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Handshake.send: finished writing result to remote.
1276               debug "Synchronisation point 3";
LOG> 22:22:51 xs62-3 xenopsd: [debug|xs62-3|9|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Synchronisation point 3
1277
1278               Handshake.recv_success mfd;
1279               debug "Synchronisation point 4";
LOG> 22:22:54 xs62-3 xenopsd: [debug|xs62-3|9|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Synchronisation point 4
1280           );
1281       let atomics = [
1282           VM_hook_script(id, Xenops_hooks.VM_pre_destroy, Xenops_hooks.reason__suspend);
1283       ] @ (atomics_of_operation (VM_shutdown (id, None))) @ [
1284           VM_hook_script(id, Xenops_hooks.VM_post_destroy, Xenops_hooks.reason__suspend);
LOG> 22:22:54 xs62-3 xenopsd: [debug|xs62-3|9|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Performing: ["VM_hook_script", ["188bbe5c-547c-0367-b848-2be29fc2275b", "VM_pre_destroy", "suspend"]]
LOG> 22:22:54 xs62-3 xenopsd: [debug|xs62-3|9|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Performing: ["VM_destroy_device_model", "188bbe5c-547c-0367-b848-2be29fc2275b"]
LOG> 22:22:54 xs62-3 xenopsd: [debug|xs62-3|9|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Performing: ["VBD_unplug", ["188bbe5c-547c-0367-b848-2be29fc2275b", "xvda"], true]
LOG> 22:22:55 xs62-3 xenopsd: [debug|xs62-3|9|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Performing: ["VIF_unplug", ["188bbe5c-547c-0367-b848-2be29fc2275b", "1"], true]
LOG> 22:22:56 xs62-3 xenopsd: [debug|xs62-3|9|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Performing: ["VIF_unplug", ["188bbe5c-547c-0367-b848-2be29fc2275b", "0"], true]
LOG> 22:22:57 xs62-3 xenopsd: [debug|xs62-3|9|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Performing: ["VM_destroy", "188bbe5c-547c-0367-b848-2be29fc2275b"]
LOG> 22:22:58 xs62-3 xenopsd: [debug|xs62-3|9|Async.VM.pool_migrate R:07d6b289f1a4|xenops] Performing: ["VM_hook_script", ["188bbe5c-547c-0367-b848-2be29fc2275b", "VM_post_destroy", "suspend"]]
LOG> 22:22:58 xs62-3 xenopsd: [debug|xs62-3|10|events|xenops] Performing: ["VM_destroy_device_model", "188bbe5c-547c-0367-b848-2be29fc2275b"]
LOG> 22:22:58 xs62-3 xenopsd: [debug|xs62-3|10|events|xenops] Performing: ["VBD_unplug", ["188bbe5c-547c-0367-b848-2be29fc2275b", "xvda"], true]
LOG> 22:22:58 xs62-3 xenopsd: [debug|xs62-3|10|events|xenops] Performing: ["VIF_unplug", ["188bbe5c-547c-0367-b848-2be29fc2275b", "1"], true]
LOG> 22:22:58 xs62-3 xenopsd: [debug|xs62-3|10|events|xenops] Performing: ["VIF_unplug", ["188bbe5c-547c-0367-b848-2be29fc2275b", "0"], true]
1285       ] in
1286       perform_atomics atomics t;
1287       VM_DB.signal id
Looking at the result
The result: an 11-second outage.
Before getting into the breakdown: when you run XenMotion, "Synchronisation point" markers 1 through 4 are written to the log, and these make a reasonable yardstick for what is happening when.
Synchronisation point 1
Memory is copied, and the VM is created and restored on the migration destination.
No packet loss has occurred at this point.
Synchronisation point 2
This seems to be where the loss begins.
The processing here is the Handshake:
1275               Handshake.send ~verbose:true mfd Handshake.Success;
What is this for? Most likely it pairs with this, on the receiving side:
1311           (* Receive the all-clear to unpause *)
Synchronisation point 3
Two seconds have passed since the packet loss began.
At this point the destination is activating the VM's devices (VBDs, VIFs, and so on).
You would expect VM_unpause to let the VM answer packets again, but the loss continues.
Synchronisation point 4
Five seconds since the packet loss began.
The source deactivates the VM's devices (VBDs and VIFs) and then destroys the source VM.
Only after that does traffic resume.
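Lining the two logs up against the ping gap gives roughly this breakdown (offsets are seconds after the first lost ping at 22:22:49, read straight off the timestamps above):

(* Offsets in seconds from the first lost ping (22:22:49), taken from the
   xenopsd logs quoted above. *)
let timeline = [
  0,  "VM_save / VM_restore complete; first lost ping";
  2,  "Synchronisation points 2 and 3; destination plugs VBDs and VIFs";
  5,  "VM_unpause and Synchronisation point 4; source starts teardown";
  8,  "VM_destroy on the source";
  11, "ping answered again";
]

let () =
  List.iter (fun (off, ev) -> Printf.printf "+%2ds  %s\n" off ev) timeline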
Summary
Doesn't this feel completely unreasonable?
Points 2 and 3 I could let slide, but point 4?
Just let the new VM start serving already.
This behavior is most likely a safeguard so that, if the live migration fails, the VM can be recovered on the source host.
XenServer really is odd.
It's the same story as self-fencing: a feature meant to protect you misbehaves, and in the end everyone gets burned.
Yes, that is Citrix XenServer.
That said, recent XenServer releases are fairly solid, so these days I find myself hoping this area improves as well, or at least that we get to choose which risk to take when running XenMotion.
<Addendum, 2014/06/17>
XenServer 5.5 could live-migrate without any packet loss.
That was simply because the operation was nothing more than:
Migration source: VDI close, VIF unplug
and then
Migration destination: VIF plug, VDI open
(see the sketch below).
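In other words, something like the following ordering (illustrative stubs only; these names are made up and are not the actual XenServer 5.5 code):

(* Illustrative stubs for the simple 5.5-era ordering described above;
   each step just prints what would happen. *)
let step host action = Printf.printf "%-6s %s\n" host action

let migrate_5_5 ~src ~dst =
  step src "copy memory (live)";
  (* The source releases its resources first... *)
  step src "VDI close";
  step src "VIF unplug";
  (* ...and the destination takes over immediately, with no extra
     handshakes or rollback window in between. *)
  step dst "VIF plug";
  step dst "VDI open"

let () = migrate_5_5 ~src:"xs55-1" ~dst:"xs55-2"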
This is the basis for my claim that with Citrix XenServer, as with self-fencing, a feature meant to protect you misbehaves and everyone ends up getting burned.