summaryrefslogtreecommitdiffstats
BranchCommit messageAuthorAge
bug-deprecatedIgnore all deprecated warnings for OPENSSL on DARWINDennis Schafroth11 years
dht-stale-layout-fixeschange branch name.Raghavendra G11 years
expencryption: Move all xlators to experimentalVijay Bellur9 years
experimentalrfc.sh: fixes for faster submitAmar Tumballi7 years
heal-infoglusterd: Add warning and abort in case of failures in migration during remov...Vishal Pandey6 years
masterContributing: update about who can trigger the buildAmar Tumballi5 years
release-2.0s/Patchwork/Gerrit/Anand Avati14 years
release-3.0Merge branch 'release-3.0' of ssh://git.gluster.com/glusterfs into release-3.0Vijay Bellur14 years
release-3.1mount/fuse: Inherit direct-io-mode values from fds alreadyRaghavendra G14 years
release-3.10glusterd/ganesha : Skip non-ganesha nodes properly for ganesha HA set upJiffin Tony Thottan8 years
release-3.11doc: Minor edit to release notes for 3.11.3ShyamsundarR8 years
release-3.12Release notes for Gluster 3.12.15Jiffin Tony Thottan7 years
release-3.13afr: capture the correct errno in post-op quorum checkRavishankar N8 years
release-3.2features/marker: Replacing -1 with GF_CLIENT_PID_GSYNCD as part of code cleanup.Mohammed Junaid13 years
release-3.3build: really disable fusermount if you say soNiels de Vos12 years
release-3.43.4.7, this time for realKaleb S. KEITHLEY11 years
release-3.5doc: release notes for GlusterFS 3.5.9Niels de Vos10 years
release-3.6gfapi: return EINVAL for unsupported lseek() operationsNiels de Vos10 years
release-3.7debug/trace: Print entries' iatts in readdirp cbkKrutika Dhananjay9 years
release-3.8doc: release-notes for GlusterFS-3.8.15Niels de Vos8 years
release-3.8-fbscripts: Updated FB branch diff script to note SKIPsShyamsundarR8 years
release-3.9features/shard: Fix EIO error on add-brickKrutika Dhananjay9 years
release-4.0core: FreeBSD has pthread_set_name_np() (versus pthread_setname_np())Kaleb S. KEITHLEY8 years
release-4.1doc: Added release notes for 4.1.10hari gowtham6 years
release-5doc: Added release notes for 5.13Hari Gowtham6 years
release-6Adding release notes for release-6.10Rinku Kothiya5 years
release-7features/bit-rot: invalid snprintf() buffer sizeDmitry Antipov5 years
release-8geo-rep: Fix string comparisonKotresh HR5 years
round-robin2io-threads: distribute work fairly among clientsJeff Darcy9 years
testing-regression-job[DO NOT MERGE]Deepshikha khandelwal7 years
 
TagDownloadAuthorAge
v7.8commit b4f19c7b1c...Rinku Kothiya5 years
v8.2commit 895183d5a2...Rinku Kothiya5 years
v8.1commit f9b8462ba2...Rinku Kothiya5 years
v6.10commit 48fc076676...Rinku Kothiya5 years
v7.7commit 95f167483e...Rinku Kothiya5 years
v8.0commit 2e1e4168ab...Rinku Kothiya6 years
v8.0rc0commit 18bd1bdaa6...Rinku Kothiya6 years
v7.6commit bef7c8e54e...Rinku Kothiya6 years
v6.9commit 57b48f2802...Hari Gowtham6 years
v9devcommit 0e94dbb811...Rinku Kothiya6 years
v8.0alphacommit f949c3e13e...Rinku Kothiya6 years
v7.5commit df0a3c99dc...Rinku Kothiya6 years
v5.13commit 98eb9c95fb...Hari Gowtham6 years
v7.4commit 87742bdeeb...Rinku Kothiya6 years
v6.8commit 1b43afb1fd...Hari Gowtham6 years
v5.12commit e46e5721ac...Hari Gowtham6 years
v7.3commit 1d59dc3b37...Rinku Kothiya6 years
v7.2commit 7587318650...Rinku Kothiya6 years
v6.7commit 366d0c6c12...Hari Gowtham6 years
v7.1commit 385a37bb45...Rinku Kothiya6 years
v5.11commit 7c0ef8f5c0...Hari Gowtham6 years
v6.6commit 84487199af...Hari Gowtham6 years
v7.0commit a92e9e8e8a...Rinku Kothiya6 years
v5.10commit 3e12379fba...Hari Gowtham6 years
v7.0rc3commit bac5d7d60d...Rinku Kothiya6 years
v7.0rc2commit 6b3fec3793...Rinku Kothiya6 years
v7.0rc1commit a2201d804d...Rinku Kothiya6 years
v7.0rc0commit b3e0671a4e...Rinku Kothiya6 years
v6.5commit f571e3b7bf...Hari Gowtham6 years
v5.9commit 0386ca3969...Hari Gowtham6 years
v5.8commit 05138aedc3...Hari Gowtham6 years
v6.4commit 8761be7ba7...Hari Gowtham6 years
v4.1.10commit 05efb3cafd...Hari Gowtham6 years
v5.7commit d24b4605fe...Hari Gowtham7 years
v8devcommit 24a3204d70...Rinku Kothiya7 years
v7.0alphacommit 96702cfea3...Rinku Kothiya7 years
v6.3commit 9acd89087b...Hari Gowtham7 years
v4.1.9commit 735237b0ee...Hari Gowtham7 years
v6.2commit 630b896166...Hari Gowtham7 years
v6.1commit 5c521d403f...ShyamsundarR7 years
v5.6commit 34a2347780...ShyamsundarR7 years
v4.1.8commit 5c25484564...ShyamsundarR7 years
v6.0commit 3fadf5cc41...ShyamsundarR7 years
v5.5commit 0901b43f84...ShyamsundarR7 years
v6.0rc1commit 3bcf210a5a...ShyamsundarR7 years
v5.4commit c691b4a373...ShyamsundarR7 years
v6.0rc0commit e6fabf8c33...ShyamsundarR7 years
v7devcommit 90922d20f5...ShyamsundarR7 years
v6.0alphacommit c396ba1826...ShyamsundarR7 years
v4.1.7commit af2b99a04f...ShyamsundarR7 years
v5.3commit dc85835297...ShyamsundarR7 years
v5.2commit ae7f60235a...ShyamsundarR7 years
v4.1.6commit d6383f22f0...ShyamsundarR7 years
v5.1commit c12ded9ebf...ShyamsundarR7 years
v5.0commit ab12861f40...ShyamsundarR7 years
v3.12.15commit a1cf0f2c8b...Jiffin Tony Thottan7 years
v5.0rc1commit a94a67efde...ShyamsundarR7 years
v4.1.5commit 957a5e6b4a...ShyamsundarR7 years
v5.0rc0commit f4594a3c88...ShyamsundarR7 years
v6devcommit f15e948da3...ShyamsundarR7 years
v5.0alphacommit 41388f15fc...ShyamsundarR7 years
v3.12.14commit fe5b6bc852...Jiffin Tony Thottan7 years
v4.1.4commit deafd5a4f8...Jiffin Tony Thottan7 years
v4.1.3commit 7ddcf035b8...ShyamsundarR7 years
v3.12.13commit 8727292061...Jiffin Tony Thottan7 years
v4.1.2commit 044f9df659...ShyamsundarR7 years
v3.12.12commit f98d86f2a3...Jiffin Tony Thottan7 years
v4.1.1commit b17acca6d0...ShyamsundarR8 years
v3.12.11commit 15a5367925...ShyamsundarR8 years
v3.12.10commit abe2e42666...Jiffin Tony Thottan8 years
v4.1.0commit b4f4480094...ShyamsundarR8 years
v4.1.0rc0commit 4f591e873c...ShyamsundarR8 years
v4.2devcommit a6490fe021...ShyamsundarR8 years
v4.1.0alphacommit 548902d1a7...ShyamsundarR8 years
v3.10.12commit c420ef1251...ShyamsundarR8 years
v4.0.2-1commit 07d8cc04c4...ShyamsundarR8 years
v3.12.9commit 9d883f31db...ShyamsundarR8 years
v4.0.2commit 92cc124298...ShyamsundarR8 years
v3.12.8commit 318d2c833f...Jiffin Tony Thottan8 years
v4.0.1commit 17b43ec3d2...ShyamsundarR8 years
v3.12.7commit 5a1fcd2636...Jiffin Tony Thottan8 years
v4.0.0-2commit 2628a91eaa...ShyamsundarR8 years
v4.0.0commit c1c43f8601...ShyamsundarR8 years
v3.10.11commit 39716f601d...ShyamsundarR8 years
v4.0.0rc1commit 8b85778185...ShyamsundarR8 years
v3.12.6commit c5146ab5ad...Jiffin Tony Thottan8 years
v4.0.0rc0commit 4c74d7ffb6...ShyamsundarR8 years
v3.10.10commit dd038cc99a...ShyamsundarR8 years
v4.1devcommit 93f21655e1...ShyamsundarR8 years
v4.0.0alphacommit 7379f89de9...ShyamsundarR8 years
v3.13.2commit ee067308c3...ShyamsundarR8 years
v3.12.5commit 0e59d1556f...Jiffin Tony Thottan8 years
v3.10.9commit 1a7c263ac6...ShyamsundarR8 years
v3.13.1commit c1052342b2...ShyamsundarR8 years
v3.12.4commit 7b8d2f1241...Jiffin Tony Thottan8 years
v3.13.0commit aee30521e0...ShyamsundarR8 years
v3.10.8commit fb39306a14...ShyamsundarR8 years
v3.13.0rc0commit 2960af47b8...ShyamsundarR8 years
v3.12.3commit e37862dd3f...Jiffin Tony Thottan8 years
v4.0dev1commit 5424c63fe1...ShyamsundarR8 years
v3.13.0betacommit aa2c149e05...ShyamsundarR8 years
v3.10.7commit 8cc68838b0...ShyamsundarR8 years
v3.12.2commit 3c0b53018a...Jiffin Tony Thottan8 years
v3.10.6commit 23e6587cad...ShyamsundarR8 years
v3.12.1commit 02d9b26502...ShyamsundarR8 years
v3.12.0commit c8c56b9a53...ShyamsundarR8 years
v3.11.3commit dd281c7c15...ShyamsundarR8 years
v3.8.15commit d174f021a4...Niels de Vos8 years
v3.10.5commit 013291cd46...ShyamsundarR8 years
v3.12.0rc0commit aad7c1c196...ShyamsundarR8 years
v4.0devcommit 4a85a221c9...Shyam8 years
v3.12.0alpha1commit deed1ca53c...Shyam8 years
v3.11.2commit b8659b3f2c...Shyam8 years
v3.8.14commit c4057aa93e...Niels de Vos8 years
v3.10.4commit d393ff6209...Raghavendra Talur9 years
v3.11.1commit 8b7aee4f54...Shyam9 years
v3.8.13commit cfba812b3e...Niels de Vos9 years
v3.10.3commit 4e18189145...Raghavendra Talur9 years
v3.11.0commit f886a26964...Shyam9 years
v3.11.0rc1commit c674cf28ab...Shyam9 years
v3.10.2commit c84b179be9...Raghavendra Talur9 years
v3.8.12commit f453425865...Niels de Vos9 years
v3.11.0rc0commit dc5a26bbb4...Shyam9 years
v3.12devcommit 7a9d6aa599...Shyam9 years
v3.11.0beta1commit 3d2ed27812...Shyam9 years
v3.8.11commit 104d075557...Niels de Vos9 years
v3.10.1commit dc4aa17e61...Shyam9 years
v3.8.10commit 68d5c0ef24...Niels de Vos9 years
v3.10.0commit d2a3a8f520...Shyam9 years
v3.10.0rc1commit a8e8bd57e2...Shyam9 years
v3.8.9commit c5649e7b8d...Niels de Vos9 years
v3.10.0rc0commit 3d4192cb6c...Shyam9 years
v3.7.20commit d2bd103b38...Kaushal M9 years
v3.11devcommit 6f4811ca93...Shyam9 years
v3.10.0alpha1commit 3bed8373a9...Shyam9 years
v3.9.1commit 1e3b03c401...Kaleb S. KEITHLEY9 years
v3.8.8commit e5f3a990cd...Niels de Vos9 years
v3.7.19commit c2aa8b1e44...Kaushal M9 years
v3.8.7commit 7638c2fef0...Niels de Vos9 years
v3.7.18commit eada3bf2bb...Kaushal M9 years
v3.8.6commit c485ea191d...Niels de Vos9 years
v3.9.0commit 96770bc3c2...Pranith Kumar K9 years
v3.7.17commit c11131fcdf...Kaushal M9 years
v3.9.0rc2commit 915ae56a65...Aravinda VK9 years
v3.8.5commit 5ebbd95256...Niels de Vos9 years
v3.7.16commit d66a37aa20...Kaushal M9 years
v3.9rc1commit f5b283baa7...Pranith Kumar K9 years
v3.8.4commit 266ca69d01...Niels de Vos9 years
v3.10devcommit ee0d8ca53f...Pranith Kumar K9 years
v3.9rc0commit c5440058ea...Pranith Kumar K9 years
v3.7.15commit 50a29757c2...Kaushal M9 years
v3.8.3commit 8519efc5b3...Niels de Vos9 years
v3.8.2commit 8220b0002f...Niels de Vos9 years
v3.7.14commit 04fa81f1c4...Kaushal M9 years
v3.8.1commit 754e2658ba...Niels de Vos10 years
v3.7.13commit 4dce874a77...Kaushal M10 years
v3.7.12commit 5cf8e28988...Kaushal M10 years
v3.8.0commit c6d9e23b54...Niels de Vos10 years
v3.7.12rc1commit bac2781a2d...Vijay Bellur10 years
v3.8rc2commit d53bfc6f04...Niels de Vos10 years
v3.8rc1commit 74ab2d6f10...Niels de Vos10 years
v3.8rc0commit 0fadd46c2c...Niels de Vos10 years
v3.9devcommit 60e340481a...Niels de Vos10 years
v3.7.11commit b0bfc70048...Kaushal M10 years
v3.7.10commit 889e92c722...Kaushal M10 years
v3.7.9commit 1eece065f1...Vijay Bellur10 years
v3.5.9commit ab52c1e696...Niels de Vos10 years
v3.6.9commit 52f6c664a2...Raghavendra Bhat10 years
v3.5.8commit 721b15a74d...Niels de Vos10 years
v3.7.8commit 3db831788c...Pranith Kumar K10 years
v3.7.7commit ccdfa49c90...Pranith Kumar K10 years
v3.6.8commit b19a1d0798...Kaushal M10 years
v3.5.7commit 3efd1b8997...Niels de Vos10 years
v3.6.7commit 5d264dbcb7...Raghavendra Bhat10 years
v3.7.6commit a3289b1d06...Raghavendra Talur10 years
v3.7.5commit 25e581d42e...Pranith Kumar K10 years
v3.6.6commit 69b5471d13...Raghavendra Bhat10 years
v3.5.6commit 6a0faa1ff1...Niels de Vos10 years
v3.7.4commit 300a69669a...Kaushal M10 years
v3.6.5commit dfa2bfb289...Raghavendra Bhat10 years
v3.7.3commit 918e430294...Kaushal M10 years
v3.6.4commit 50b0c7baad...Raghavendra Bhat10 years
v3.5.5commit 191725abf5...Niels de Vos11 years
v3.7.2commit 568d7f4dda...Atin Mukherjee11 years
v3.6.4beta2commit 83778a592f...Raghavendra Bhat11 years
v3.5.4commit 55bd875f0b...Niels de Vos11 years
v3.7.1commit f266dc6be5...Krishnan Parthasarathi11 years
v3.6.4beta1commit 24b3db0768...Raghavendra Bhat11 years
v3.7.0commit 5538051a64...Vijay Bellur11 years
v3.7.0beta2commit a492b22f57...Niels de Vos11 years
v3.7.0beta1commit 7e2baf9acd...Vijay Bellur11 years
v3.6.3commit e7640557a6...Raghavendra Bhat11 years
v3.8devcommit 08a1041ca9...Vijay Bellur11 years
v3.7.0alpha0commit 8b987be340...Vijay Bellur11 years
v3.4.7commit bcb906a9e2...Kaleb S. KEITHLEY11 years
v3.5.4beta1commit 077185afe3...Niels de Vos11 years
v3.6.3beta2commit 3d76e803e7...Raghavendra Bhat11 years
v3.4.7beta4commit bcb906a9e2...Kaleb S. KEITHLEY11 years
v3.4.7beta3commit fcd46d8e8f...Kaleb S. KEITHLEY11 years
v3.4.7beta2commit 982f0fac53...Kaleb S. KEITHLEY11 years
v3.4.7beta1commit b037cfe399...Kaleb S. KEITHLEY11 years
v3.6.3beta1commit e312b0807b...Raghavendra Bhat11 years
v3.6.2commit 709d471294...Raghavendra Bhat11 years
v3.6.2beta2commit 443cda365c...Raghavendra Bhat11 years
v3.6.2beta1commit 6e423ca790...Raghavendra Bhat11 years
v3.5.3commit be560c6f41...Niels de Vos11 years
v3.4.6commit ceaba0fc43...Kaleb S. KEITHLEY11 years
v3.6.1commit 1ffdf112f7...Vijay Bellur11 years
v3.6.0commit 3867bdb496...Vijay Bellur11 years
v3.4.6beta2commit ceaba0fc43...Kaleb S. KEITHLEY11 years
v3.5.3beta2commit a970fc0cd7...Niels de Vos11 years
v3.5.3beta1commit db7d578da0...Niels de Vos11 years
v3.6.0beta3commit 912eec63f4...Vijay Bellur11 years
v3.6.0beta2commit dd80d06145...Vijay Bellur11 years
v3.6.0beta1commit 936cf82d93...Vijay Bellur11 years
v3.4.6beta1commit 1d4ef0b891...Kaleb S. KEITHLEY11 years
v3.5.2commit 9bb7cb9ff1...Niels de Vos11 years
v3.4.5commit 7b564b8800...Kaleb S. KEITHLEY11 years
v3.5.2beta1commit d5f72dc496...Niels de Vos11 years
v3.6.0alpha1commit 6aafb3a64a...Vijay Bellur11 years
v3.7devcommit 52da727e75...Vijay Bellur11 years
v3.4.5beta2commit e311014d94...Kaleb S. KEITHLEY12 years
v3.4.5beta1commit ecc21940ea...Kaleb S. KEITHLEY12 years
v3.5.1commit b8f6798c17...Niels de Vos12 years
v3.5.1beta2commit b167a0ca7f...Niels de Vos12 years
v3.4.4commit 8005d56f08...Kaleb S. KEITHLEY12 years
v3.4.4beta1commit 8005d56f08...Kaleb S. KEITHLEY12 years
v3.5.1betacommit d74024b470...Niels de Vos12 years
v3.5.0commit 2e767af207...Vijay Bellur12 years
v3.5.0beta5commit 4e0659a779...Vijay Bellur12 years
v3.4.3commit b0d6d20ab2...Kaleb S. KEITHLEY12 years
v3.4.3beta2commit 33cc417e64...Kaleb S. KEITHLEY12 years
v3.4.3beta1commit 010a9a7867...Kaleb S. KEITHLEY12 years
v3.5.0beta4commit e779cc8c32...Vijay Bellur12 years
v3.4.3alpha1commit 945c6de4e6...Kaleb S. KEITHLEY12 years
v3.5.0beta3commit b319f01ecd...Vijay Bellur12 years
v3.5.0beta2commit a338c4fbc4...Vijay Bellur12 years
v3.5beta1commit 1350c7193e...Vijay Bellur12 years
v3.4.2commit 098fd71353...Vijay Bellur12 years
v3.4.2qa5commit b2ee85b3e4...Vijay Bellur12 years
v3.4.2qa4commit 1832dbf0ba...Vijay Bellur12 years
v3.4.2qa3commit 790c2813ef...Vijay Bellur12 years
v3.4.2qa2commit 1e40a57d16...Vijay Bellur12 years
v3.5.0qa3commit b58810f5df...Anand Avati12 years
v3.5qa2commit a25d321bad...Vijay Bellur12 years
v3.4.2qa1commit 88dc9d8899...Anand Avati12 years
v3.5.0qa1commit f21cefed29...Vijay Bellur12 years
v3.4.1commit 56769c4db9...Vijay Bellur12 years
v3.4.1rc1commit 56769c4db9...Anand Avati12 years
v3.4.1qa3commit 04163fc4ba...Anand Avati12 years
v3.4.1qa2commit 536eccde0b...Anand Avati12 years
v3.4.1qa1commit 8565d383a1...Vijay Bellur12 years
v3.3.2commit 34bca063cd...Vijay Bellur12 years
v3.4.0commit b92b98ef9a...Vijay Bellur12 years
v3.3.2qa4commit 34bca063cd...Vijay Bellur13 years
v3.4.0beta4commit 505f57e07c...Vijay Bellur13 years
v3.4.0beta3commit 2fda7a9de2...Vijay Bellur13 years
v3.4.0beta2commit df83bc05ff...Vijay Bellur13 years
v3.3.2qa3commit 1a7e6053d3...Vijay Bellur13 years
v3.3.2qa2commit 0ab16bb29a...Vijay Bellur13 years
v3.4.0beta1commit 5ac55756cd...Anand Avati13 years
v3.4.0alpha3commit 92729add67...Vijay Bellur13 years
v3.3.2qa1commit d836002fce...Vijay Bellur13 years
v3.4.0alpha2commit c37546cf11...Anand Avati13 years
v3.4.0alphacommit 765fdd0809...Vijay Bellur13 years
v3.4.0qa8commit 315ee9c4e0...Vijay Bellur13 years
v3.4.0qa7commit 6fd654dc94...Vijay Bellur13 years
v3.3.1commit e7f14ad073...Vijay Bellur13 years
v3.4.0qa6commit e8c75fd929...Vijay Bellur13 years
v3.3.0.5rhs-40commit 6e3efac008...Vijay Bellur13 years
v3.3.0.5rhs-39commit 6e3efac008...Vijay Bellur13 years
v3.4.0qa5commit fef94c2acf...Vijay Bellur13 years
v3.4.0qa4commit 48d749dda3...Vijay Bellur13 years
v3.4.0qa3commit c85a3eee54...Vijay Bellur13 years
v3.3.1qa3commit 517a9d2450...Vijay Bellur13 years
v3.3.1qa2commit ace4cae71c...Vijay Bellur13 years
v3.3.1qa1commit 753f8c1324...Vijay Bellur13 years
v3.2.7commit 092dc2676b...Vijay Bellur14 years
v3.2.7qa2commit 2533d2b56b...Vijay Bellur14 years
v3.3.0commit 1b79849119...Vijay Bellur14 years
v3.3.0qa45commit 493ef71222...Anand Avati14 years
v3.3.0qa44commit 647f561f6a...Vijay Bellur14 years
v3.3.0qa43commit 9d4c8b3909...Vijay Bellur14 years
v3.3.0qa42commit d54d9e9412...Vijay Bellur14 years
v3.3.0beta4commit bdd240eca1...Vijay Bellur14 years
v3.3.0qa41commit 8852f95869...Vijay Bellur14 years
v3.3.0qa40commit 9189ff9739...Vijay Bellur14 years
v3.3.0qa39commit 81df001b3e...Vijay Bellur14 years
v3.3.0qa38commit fdcbf065a9...Vijay Bellur14 years
v3.3.0qa37commit 66fddb979d...Vijay Bellur14 years
v3.3.0qa36commit 857ba84a23...Vijay Bellur14 years
v3.3.0qa35commit 80eeaab2be...Vijay Bellur14 years
v3.3.0beta3commit df8e2f53b7...Vijay Bellur14 years
v3.2.7qa1commit deea482def...Vijay Bellur14 years
v3.3.0qa34commit 4bb82b2c77...Vijay Bellur14 years
v3.3.0qa33commit 1043dedfb5...Vijay Bellur14 years
v3.3.0qa32commit af0eb165f6...Vijay Bellur14 years
v3.3.0qa31commit c40b9975d0...Vijay Bellur14 years
v3.3.0qa30commit d98c3e1934...Vijay Bellur14 years
v3.3.0qa29commit 65c6e3706f...Anand Avati14 years
v3.3.0qa28commit 212d739886...Vijay Bellur14 years
v3.2.6p3commit 410b1092e6...Vijay Bellur14 years
v3.2.6p2commit 5ce988633d...Vijay Bellur14 years
v3.3.0qa27commit 152a0194e7...Vijay Bellur14 years
v3.2.6commit fafd5c17c0...Vijay Bellur14 years
v3.2.6qa6commit fafd5c17c0...Vijay Bellur14 years
v3.2.6qa5commit e657569da2...Vijay Bellur14 years
v3.3.0qa26commit f6a779ffc5...Vijay Bellur14 years
v3.2.6qa4commit 8127a6f35e...Vijay Bellur14 years
v3.3.0qa25commit 468768d280...Vijay Bellur14 years
v3.3.0qa24commit 88c6c11813...Vijay Bellur14 years
v3.3.0qa23commit 42cc043875...Vijay Bellur14 years
v3.3.0qa22commit c8d47f056e...Vijay Bellur14 years
v3.2.6qa3commit cd3ad588f2...Anand Avati14 years
v3.2.6qa2commit fa580e9299...Anand Avati14 years
v3.3.0qa21commit 83a3daf7c2...Vijay Bellur14 years
v3.3.0qa20commit 0694749c3e...Vijay Bellur14 years
v3.2.6qa1commit 1020a3dfe9...Anand Avati14 years
v3.3.0qa19commit be003fbb3a...Vijay Bellur14 years
v3.3.0qa18commit d7d9f3d400...Vijay Bellur14 years
v3.3.0qa17commit 0074f20844...Vijay Bellur14 years
v3.3.0qa16commit 7235e5b1af...Vijay Bellur14 years
v3.3.0qa15commit 289c2902d6...Vijay Bellur14 years
v3.2.5commit edf9551b38...Vijay Bellur14 years
v3.2.5qa9commit edf9551b38...Vijay Bellur14 years
v3.2.5qa8commit 252c9e5cf2...Vijay Bellur14 years
v3.2.5qa7commit d2a05724a6...Vijay Bellur14 years
v3.2.5qa6commit 51601b2bff...Vijay Bellur14 years
v3.2.5qa5commit 8668da9744...Vijay Bellur14 years
v3.2.5qa4commit bca358604d...Vijay Bellur14 years
v3.2.5qa3commit 3b0eecb53f...Vijay Bellur14 years
v3.2.5qa2commit 7dcc94cf1f...Vijay Bellur14 years
v3.2.5qa1commit 449f31c8ae...Vijay Bellur14 years
v3.3.0qa14commit 4235f7a74e...Vijay Bellur14 years
v3.2.4commit da73b31942...Vijay Bellur14 years
v3.3.0qa13commit 795c8996c1...Vijay Bellur14 years
v3.2.4qa5commit 6c5d3e40a6...Vijay Bellur14 years
v3.3.0qa12commit 16b7e3bf20...Vijay Bellur14 years
v3.2.4qa4commit edd9461647...Vijay Bellur14 years
v3.3.0qa11commit 7658047903...Vijay Bellur14 years
v3.3.0qa10commit 4765dd1a1c...Vijay Bellur14 years
v3.2.4qa3commit 9564e09e53...Vijay Bellur14 years
v3.2.4qa2commit 0f9502d5eb...Vijay Bellur14 years
v3.2.4qa1commit 6fe790ee35...Vijay Bellur14 years
v3.3.0qa9commit b827cdb230...Vijay Bellur14 years
v3.1.7commit a2739b842b...Vijay Bellur14 years
v3.1.7qa4commit a2739b842b...Vijay Bellur14 years
v3.1.7qa3commit f9fa468090...Vijay Bellur14 years
v3.1.7qa2commit d120020fd5...Vijay Bellur14 years
v3.1.7qa1commit 561bba7ae4...Vijay Bellur14 years
v3.2.3commit 1acef91232...Vijay Bellur14 years
v3.3beta2commit b827cdb230...Vijay Bellur14 years
v3.3.0qa8commit b827cdb230...Vijay Bellur14 years
v3.3.0qa7commit 601f5725a0...Vijay Bellur14 years
v3.2.3qa6commit 1acef91232...Vijay Bellur14 years
v3.3.0qa6commit b6e3e9c480...Vijay Bellur14 years
v3.3.0qa5commit 5ace31ac21...Vijay Bellur14 years
v3.2.3qa5commit 10f69943c4...Vijay Bellur14 years
v3.3.0qa4commit 350ae611ca...Vijay Bellur14 years
v3.2.3qa4commit 0564d1198b...Vijay Bellur14 years
v3.2.3qa3commit 2f53b7857c...Vijay Bellur14 years
v3.3.0qa3commit 6073fc29bf...Vijay Bellur14 years
v3.3.0qa2commit a0071bdf2a...Vijay Bellur14 years
v3.1.6commit 98a487f842...Vijay Bellur14 years
v3.1.6qa8commit ef517191c5...Vijay Bellur14 years
v3.3.0qa1commit 1b5a860f15...Vijay Bellur14 years
v3.1.6qa7commit 05e3dcc9b1...Vijay Bellur14 years
v3.2.3qa1commit 62adb4d1c2...Vijay Bellur14 years
v3.1.6qa6commit c92f45c742...Anand Avati14 years
v3.1.6qa5commit 0c01d96a06...Vijay Bellur14 years
v3.1.6qa4commit dfc317a77f...Anand Avati14 years
v3.1.6qa3commit 967199adb1...Anand Avati14 years
v3.1.6qa2commit 7382534ac1...Anand Avati14 years
v3.3beta1commit fd60df8798...Anand Avati14 years
v3.2.2commit c82a9d438b...Anand Avati14 years
v3.2.2qa8commit c82a9d438b...Anand Avati14 years
v3.1.6qa1commit 0c9648c1a0...Anand Avati14 years
v3.2.2qa7commit 972c4a3c34...Anand Avati14 years
v3.2.2qa5commit 7685cec583...Anand Avati14 years
v3.2.2qa4commit 817bda650c...Anand Avati15 years
v3.2.2qa3commit 1b01b64894...Anand Avati15 years
v3.2.2qa2commit 5c20eb3bbf...Vijay Bellur15 years
v3.2.2qa1commit 6ca8604204...Anand Avati15 years
v3.1.5commit a64d1a8157...Anand Avati15 years
v3.1.5qa4commit a64d1a8157...Vijay Bellur15 years
v3.1.5qa3commit 5bcb4ddca3...Anand Avati15 years
v3.1.5qa2commit 25da481bc5...Anand Avati15 years
v3.2.1commit c5321286e5...Anand Avati15 years
v3.2.1qa5commit c5321286e5...Anand Avati15 years
v3.2.1qa4commit 8dee45b3a7...Anand Avati15 years
v3.2.1qa3commit c51b2f7c6c...Anand Avati15 years
v3.2.1qa2commit 05c4dced82...Anand Avati15 years
v3.2.1qa1commit ef39bf9d23...Anand Avati15 years
v3.1.5qa1commit 5f1efbc32d...Vijay Bellur15 years
v3.0.8commit ee744e0908...Vijay Bellur15 years
v3.0.8qa1commit ee744e0908...Vijay Bellur15 years
v3.2.0commit 77f485dc30...Anand Avati15 years
branchpoint-3.2commit 1f06da6875...Anand Avati15 years
v3.2.0qa16commit 625f779dba...Anand Avati15 years
v3.2.0qa15commit b5848ed21b...Anand Avati15 years
v3.2.0qa14commit 72b57e311f...Anand Avati15 years
v3.2.0qa13commit da66edbe92...Vijay Bellur15 years
v3.2.0qa12commit 1c5706c43d...Anand Avati15 years
v3.2.0qa11commit 902478bf9e...Anand Avati15 years
v3.1.4commit 7b368061ea...Anand Vishweshwaran Avati15 years
v3.2.0qa10commit 6db2b422f0...Vijay Bellur15 years
v3.1.4qa3commit 7b368061ea...Vijay Bellur15 years
v3.2.0qa9commit 56814fefa0...Vijay Bellur15 years
v3.2.0qa8commit 35dea20e40...Vijay Bellur15 years
v3.1.4qa2commit 2b55a49045...Vijay Bellur15 years
v3.2.0qa7commit f338193a70...Vijay Bellur15 years
v3.2.0qa6commit 498dbbc506...Vijay Bellur15 years
v3.2.0qa5commit 408a2b0298...Vijay Bellur15 years
v3.1.3solariscommit 9c0d73d37b...Anand V. Avati15 years
v3.2.0qa4commit bd132d8e41...Vijay Bellur15 years
v3.1.3commit 1641d8bb4c...Vijay Bellur15 years
v3.1.3qa8commit c549807c23...Vijay Bellur15 years
v3.1.3qa7commit 5017098718...Vijay Bellur15 years
v3.1.3qa6commit 93845ea7cc...Vijay Bellur15 years
v3.1.3qa5commit cad088fe3a...Vijay Bellur15 years
v3.1.3qa4commit 135aca330b...Pranith K15 years
v3.1.3qa3commit 5b909c83de...Vijay Bellur15 years
v3.1.3qa2commit 77d82df9d5...Rahul15 years
v3.1.3qa1commit b99e0e0678...Vijay Bellur15 years
v3.1.2gsyncqa6commit 3bad56d0d3...Amar Tumballi15 years
v3.1.2gsyncqa5commit a139e43f48...Mohammed Junaid Ahmed15 years
v3.1.2gsyncqa4commit cbd61752ff...Raghavendra G15 years
v3.1.2commit f2a067c4fe...Vijay Bellur15 years
v3.1.2qa4commit 5368b898fa...Raghavendra G15 years
v3.1.2qa3commit cbba1c3f55...Shehjar Tikoo15 years
v3.1.2qa2commit df5f71b401...Amar Tumballi15 years
v3.1.2qa1commit 147b20c4a4...Anand Avati15 years
v3.0.7commit 6da4cc87ff...Anand V. Avati15 years
v3.0.7qa2commit 6da4cc87ff...Raghavendra Bhat15 years
v3.0.7qa1commit e602c69bed...Vijay Bellur15 years
v3.1.1commit 69a62d2a6d...Anand V. Avati15 years
v3.1.1qa11commit c0be54cfcd...Anand Avati15 years
v3.1.1qa10commit b605865986...Shehjar Tikoo15 years
v3.1.1qa9commit f6785d2b49...Anand Avati15 years
v3.1.1qa8commit ce9f328aa9...Anand Avati15 years
v3.1.1qa7commit 961fc917e8...shishir gowda15 years
v3.1.1qa6commit d6f1f04ef0...Raghavendra G15 years
v3.1.1qa5commit eaf0618e47...Anand Avati15 years
v3.1.1qa4commit 8ca96737a9...shishir gowda15 years
v3.1.1qa3commit 1b4613936e...Raghavendra Bhat15 years
v3.1.1qa2commit c65be2d304...Shehjar Tikoo15 years
v3.1.1qa1commit b2f195720b...Vijay Bellur15 years
v3.0.6commit 5cbc81a8d3...Vijay Bellur15 years
v3.0.6rc2commit 5cbc81a8d3...Pavan Sondur15 years
v3.0.6rc1commit ef4005be3a...Vijay Bellur15 years
v3.1.0commit 6e6b4b4fd0...Vijay Bellur15 years
v3.1.0qa46commit f182151cf3...Vijay Bellur15 years
v3.1.0qa45commit 27c8b7a369...Vijay Bellur15 years
v3.1.0qa44commit 2eb9861cbc...Kaushik BV15 years
v3.1.0qa43commit 13f1fff6da...Kaushik BV15 years
v3.1.0qa42commit cd5c9df4b6...Pavan Sondur15 years
v3.1.0qa41commit 4c7ca7ec15...Pranith K15 years
v3.1.0qa40commit ca8615173f...Pranith K15 years
v3.1.0qa39commit 609a89ceac...Kaushik BV15 years
v3.1.0qa38commit 365c814f7b...Pranith K15 years
v3.1.0qa37commit 17295c37f9...Amar Tumballi15 years
v3.1.0qa36commit 760daf2889...Amar Tumballi15 years
v3.1.0qa35commit 6686ddc227...Vijay Bellur15 years
v3.1.0qa34commit dbbec1261e...Amar Tumballi15 years
v3.1.0qa33commit 336e2df7b7...Shehjar Tikoo15 years
v3.1.0qa32commit 0b68f788a8...Vijay Bellur15 years
v3.1.0qa31commit 6e952607f1...Raghavendra G15 years
v3.1.0betacommit c5a5fea9e6...Pavan Sondur15 years
v3.1.0qa30commit c5a5fea9e6...Pavan Sondur15 years
v3.1.0qa29commit 7f645c3ac3...Amar Tumballi15 years
v3.1.0qa28commit 435603caeb...Amar Tumballi15 years
v3.1.0qa27commit 6dbd618548...Raghavendra G15 years
v3.1.0qa26commit 4e6fb304ce...Shehjar Tikoo15 years
v3.1.0qa25commit 47bc630dca...Shehjar Tikoo15 years
v3.1.0qa24commit 0e2c2f46dd...Raghavendra Bhat15 years
v3.1.0qa23commit e7535ad313...Pranith Kumar K15 years
v3.1.0qa22commit a9cbdd2916...Amar Tumballi15 years
v3.1.0qa21commit 993edcc972...Balamurugan Arumugam15 years
v3.1.0alphacommit 288040196c...Vijay Bellur15 years
v3.1.0qa20commit c1f4f9ba17...Raghavendra Bhat15 years
v3.1.0qa19commit 9b226cc588...Vijay Bellur15 years
v3.1.0qa18commit 440ffb55f0...Pavan Sondur15 years
v3.1.0qa17commit 37f01b2714...Raghavendra G15 years
v3.1.0qa16commit 1e99540dc0...Pranith Kumar K15 years
v3.1.0qa15commit b3a4a0e885...Vijay Bellur15 years
v3.1.0qa14commit c02661a69d...Vijay Bellur15 years
v3.1.0qa13commit 780023f5e5...Vijay Bellur15 years
v3.1.0qa12commit e1afe36eb3...Amar Tumballi15 years
v3.1.0qa11commit fb3cb751f1...Amar Tumballi15 years
v3.1.0qa10commit 4a62b116ef...Vijay Bellur15 years
v3.1.0qa9commit d13ddaf872...Anand V. Avati15 years
v3.1.0qa8commit df4a7d7576...Anand V. Avati15 years
v3.1.0prealpha4commit 12e997d863...Anand V. Avati15 years
v3.1.0prealpha3commit f51252fa0d...Anand V. Avati15 years
v3.1.0prealpha2commit 03df087149...Anand V. Avati15 years
v3.1.0prealpha1commit 7e6b5454ad...Anand V. Avati15 years
v3.1.0qa7commit ab72e06f7b...Anand V. Avati15 years
v3.1.0qa6commit 0ec245abd6...Anand V. Avati15 years
v3.1.0qa5commit 9349f559dc...Anand V. Avati15 years
v3.1.0qa4commit 4f4dcb98a7...Pavan Sondur15 years
v3.1.0qa3commit 543f9ef575...Anand V. Avati15 years
v3.1.0qa2commit 931a59e2b9...Anand V. Avati15 years
v3.0.5commit 002d35bfb1...Anand V. Avati16 years
v3.0.5rc9commit 2e35a3eef6...Anand Avati16 years
v3.0.5rc8commit e5d4a9bac5...Pavan Sondur16 years
v3.0.5rc7commit da1123b9d8...Pavan Sondur16 years
v3.0.5rc6commit 4437568045...Vijay Bellur16 years
v3.0.5rc5commit c9676d181d...Anand V. Avati16 years
v3.0.5rc4commit e338603747...Raghavendra G16 years
v3.0.5rc3commit af5ac6eb2e...Anand V. Avati16 years
v3.0.5rc2commit 6d9b11dba6...Raghavendra G16 years
v2.0.10rc3commit b8f058432a...Pavan Sondur16 years
v3.0.5rc1commit f55b20076b...Raghavendra Bhat16 years
v2.0.10rc2commit 6607f92f57...Vijay Bellur16 years
v3.0.4commit aaeddc5084...Anand V. Avati16 years
v3.0.4rc5commit aaeddc5084...Anand Avati16 years
v3.0.4rc4commit 6f67027d78...Vijay Bellur16 years
v3.0.4rc3commit 391023ddc5...Raghavendra G16 years
v3.0.4rc2commit 4cb614047e...Amar Tumballi16 years
v3.0.4rc1commit 9aed760471...Vikas Gorur16 years
v3.0.3commit 029062c103...Anand V. Avati16 years
v3.0.3rc2commit 029062c103...Pavan Sondur16 years
v3.0.3rc1commit 789a2aa227...Raghavendra G16 years
v3.0.2commit 15043b6d97...Anand V. Avati16 years
v2.0.10rc1commit 2d85ef645f...Raghavendra G16 years
v3.0.2rc1commit c15449aaae...Harshavardhana Ranganath16 years
v3.0.1commit 4c20b5377e...Anand V. Avati16 years
v3.0.1rc5commit 899b89a8c2...Raghavendra G16 years
v3.0.1rc4commit 546168723f...Anand Avati16 years
v3.0.1rc3commit 375f08e1b5...Anand Avati16 years
v3.0.1rc2commit 84fe79c086...Vikas Gorur16 years
v3.0.1rc1commit e6f074f931...Amar Tumballi16 years
v2.0.9commit 7e1ba386dd...Anand V. Avati16 years
v3.0.0commit 8379edd978...Anand V. Avati16 years
v2.0.8commit 1a53a5a4bf...Anand V. Avati16 years
v3.0.0pre1commit f8a56c6322...Vijay Bellur16 years
v2.0.7commit 7ba890140f...Anand V. Avati16 years
v2.0.6commit 8dfdde57b3...Anand V. Avati16 years
v2.0.5commit 683fda4bf0...Anand V. Avati16 years
v2.0.4commit 55f476455c...Anand V. Avati17 years
v2.0.3commit b470684cbf...Anand V. Avati17 years
v2.0.2commit 01b9e59055...Vikas Gorur17 years
tag-release-2.0commit 4d4cfc6e45...Anand V. Avati17 years
2.0.0commit 7b2e459db6...Anand V. Avati17 years
2.0.1commit 5c1d9108c1...Anand V. Avati17 years
2.0.0rc9commit 689347f278...Vikas Gorur17 years
2.0.0rc8commit 82394d4848...Vikas Gorur17 years
2.0.0rc7commit 4e5c297d7c...Raghavendra G17 years
2.0.0rc6commit 270621b34a...Amar Tumballi17 years
2.0.0rc5commit c20359b5b2...Amar Tumballi17 years
2.0.0rc3commit b6bf3b8d6e...Harshavardhana17 years
2.0.0rc2commit d47eb5d681...Anand V. Avati17 years
100%'> -rw-r--r--rpc/xdr/src/glusterd1-xdr.x91
-rw-r--r--rpc/xdr/src/glusterfs3-xdr.c197
-rw-r--r--rpc/xdr/src/glusterfs3-xdr.h152
-rw-r--r--rpc/xdr/src/glusterfs3-xdr.x82
-rw-r--r--rpc/xdr/src/msg-nfs3.c29
-rw-r--r--rpc/xdr/src/msg-nfs3.h13
-rw-r--r--rpc/xdr/src/xdr-nfs3.h2
-rwxr-xr-xrun-tests.sh30
-rwxr-xr-xsmoke.sh83
-rw-r--r--swift/1.4.8/README22
-rw-r--r--swift/1.4.8/gluster-swift-plugin.spec60
-rw-r--r--swift/1.4.8/gluster-swift.spec396
-rw-r--r--swift/1.4.8/plugins/DiskDir.py484
-rw-r--r--swift/1.4.8/plugins/DiskFile.py316
-rw-r--r--swift/1.4.8/plugins/Glusterfs.py131
-rw-r--r--swift/1.4.8/plugins/__init__.py16
-rw-r--r--swift/1.4.8/plugins/conf/account-server/1.conf22
-rw-r--r--swift/1.4.8/plugins/conf/account.builderbin786843 -> 0 bytes-rw-r--r--swift/1.4.8/plugins/conf/account.ring.gzbin739 -> 0 bytes-rw-r--r--swift/1.4.8/plugins/conf/container-server/1.conf24
-rw-r--r--swift/1.4.8/plugins/conf/container.builderbin786843 -> 0 bytes-rw-r--r--swift/1.4.8/plugins/conf/container.ring.gzbin741 -> 0 bytes-rw-r--r--swift/1.4.8/plugins/conf/db_file.db0
-rw-r--r--swift/1.4.8/plugins/conf/fs.conf9
-rw-r--r--swift/1.4.8/plugins/conf/object-server/1.conf22
-rw-r--r--swift/1.4.8/plugins/conf/object.builderbin786843 -> 0 bytes-rw-r--r--swift/1.4.8/plugins/conf/object.ring.gzbin738 -> 0 bytes-rw-r--r--swift/1.4.8/plugins/conf/proxy-server.conf21
-rw-r--r--swift/1.4.8/plugins/conf/swift.conf7
-rw-r--r--swift/1.4.8/plugins/constraints.py97
-rw-r--r--swift/1.4.8/plugins/utils.py679
-rw-r--r--swift/1.4.8/swift.diff797
-rw-r--r--tests/README.md27
-rw-r--r--tests/afr.rc15
-rwxr-xr-xtests/basic/bd.t131
-rwxr-xr-xtests/basic/cdc.t135
-rwxr-xr-xtests/basic/file-snapshot.t56
-rwxr-xr-xtests/basic/mgmt_v3-locks.t121
-rwxr-xr-xtests/basic/mount.t78
-rw-r--r--tests/basic/nufa.t32
-rwxr-xr-xtests/basic/posixonly.t30
-rw-r--r--tests/basic/pump.t44
-rwxr-xr-xtests/basic/quota.t51
-rwxr-xr-xtests/basic/rpm.t109
-rw-r--r--tests/basic/self-heald.t48
-rwxr-xr-xtests/basic/volume-snapshot.t83
-rw-r--r--tests/basic/volume-status.t66
-rwxr-xr-xtests/basic/volume.t34
-rwxr-xr-xtests/bugs/859927/repl.t69
-rw-r--r--tests/bugs/886998/strict-readdir.t52
-rw-r--r--tests/bugs/949327.t23
-rwxr-xr-xtests/bugs/bug-000000.t9
-rw-r--r--tests/bugs/bug-1002207.t54
-rwxr-xr-xtests/bugs/bug-1002556.t25
-rw-r--r--tests/bugs/bug-1004218.t26
-rw-r--r--tests/bugs/bug-1004744.t48
-rwxr-xr-xtests/bugs/bug-1015990-rep.t81
-rwxr-xr-xtests/bugs/bug-1015990.t95
-rwxr-xr-xtests/bugs/bug-1022055.t26
-rw-r--r--tests/bugs/bug-1022905.t39
-rw-r--r--tests/bugs/bug-1030208.t35
-rw-r--r--tests/bugs/bug-1040934.t37
-rw-r--r--tests/bugs/bug-1045333.t48
-rwxr-xr-xtests/bugs/bug-1049834.t40
-rwxr-xr-xtests/bugs/bug-1064768.t20
-rwxr-xr-xtests/bugs/bug-762989.t32
-rw-r--r--tests/bugs/bug-764638.t13
-rwxr-xr-xtests/bugs/bug-765230.t60
-rw-r--r--tests/bugs/bug-765380.t39
-rwxr-xr-xtests/bugs/bug-765473.t33
-rw-r--r--tests/bugs/bug-765564.t83
-rwxr-xr-xtests/bugs/bug-767095.t51
-rwxr-xr-xtests/bugs/bug-767585-gfid.t43
-rwxr-xr-xtests/bugs/bug-770655.t168
-rwxr-xr-xtests/bugs/bug-782095.t48
-rwxr-xr-xtests/bugs/bug-797171.t43
-rwxr-xr-xtests/bugs/bug-802417.t108
-rwxr-xr-xtests/bugs/bug-808400-dist.t31
-rw-r--r--tests/bugs/bug-808400-fcntl.c113
-rw-r--r--tests/bugs/bug-808400-flock.c92
-rwxr-xr-xtests/bugs/bug-808400-repl.t30
-rwxr-xr-xtests/bugs/bug-808400-stripe.t31
-rwxr-xr-xtests/bugs/bug-808400.t34
-rwxr-xr-xtests/bugs/bug-811493.t18
-rw-r--r--tests/bugs/bug-821056.t52
-rwxr-xr-xtests/bugs/bug-822830.t44
-rwxr-xr-xtests/bugs/bug-823081.t40
-rw-r--r--tests/bugs/bug-824753-file-locker.c42
-rwxr-xr-xtests/bugs/bug-824753.t45
-rwxr-xr-xtests/bugs/bug-830665.t106
-rw-r--r--tests/bugs/bug-834465.c61
-rwxr-xr-xtests/bugs/bug-834465.t44
-rw-r--r--tests/bugs/bug-839595.t31
-rwxr-xr-xtests/bugs/bug-844688.t37
-rw-r--r--tests/bugs/bug-845213.t19
-rw-r--r--tests/bugs/bug-846240.t58
-rwxr-xr-xtests/bugs/bug-847622.t25
-rwxr-xr-xtests/bugs/bug-847624.t23
-rw-r--r--tests/bugs/bug-848251.t50
-rwxr-xr-xtests/bugs/bug-852147.t85
-rwxr-xr-xtests/bugs/bug-853258.t45
-rwxr-xr-xtests/bugs/bug-853680.t52
-rwxr-xr-xtests/bugs/bug-853690.t94
-rw-r--r--tests/bugs/bug-856455.t42
-rw-r--r--tests/bugs/bug-857330/common.rc55
-rwxr-xr-xtests/bugs/bug-857330/normal.t78
-rwxr-xr-xtests/bugs/bug-857330/xml.t101
-rwxr-xr-xtests/bugs/bug-858215.t81
-rw-r--r--tests/bugs/bug-858242.c77
-rwxr-xr-xtests/bugs/bug-858242.t28
-rw-r--r--tests/bugs/bug-858488-min-free-disk.t114
-rwxr-xr-xtests/bugs/bug-859927.t70
-rw-r--r--tests/bugs/bug-860297.t13
-rw-r--r--tests/bugs/bug-860663.t51
-rw-r--r--tests/bugs/bug-861015-index.t36
-rw-r--r--tests/bugs/bug-861015-log.t29
-rwxr-xr-xtests/bugs/bug-861542.t51
-rwxr-xr-xtests/bugs/bug-862834.t46
-rw-r--r--tests/bugs/bug-862967.t59
-rw-r--r--tests/bugs/bug-863068.t76
-rwxr-xr-xtests/bugs/bug-864222.t26
-rwxr-xr-xtests/bugs/bug-865825.t76
-rw-r--r--tests/bugs/bug-866459.t44
-rw-r--r--tests/bugs/bug-867252.t41
-rw-r--r--tests/bugs/bug-867253.t59
-rw-r--r--tests/bugs/bug-869724.t37
-rwxr-xr-xtests/bugs/bug-872923.t57
-rwxr-xr-xtests/bugs/bug-873367.t41
-rw-r--r--tests/bugs/bug-873549.t17
-rw-r--r--tests/bugs/bug-873962-spb.t39
-rwxr-xr-xtests/bugs/bug-873962.t108
-rw-r--r--tests/bugs/bug-874498.t61
-rwxr-xr-xtests/bugs/bug-877293.t41
-rwxr-xr-xtests/bugs/bug-877885.t35
-rwxr-xr-xtests/bugs/bug-877992.t61
-rw-r--r--tests/bugs/bug-878004.t29
-rwxr-xr-xtests/bugs/bug-879490.t37
-rwxr-xr-xtests/bugs/bug-879494.t37
-rw-r--r--tests/bugs/bug-880898.t23
-rwxr-xr-xtests/bugs/bug-882278.t72
-rw-r--r--tests/bugs/bug-884328.t12
-rw-r--r--tests/bugs/bug-884452.t46
-rwxr-xr-xtests/bugs/bug-884455.t84
-rwxr-xr-xtests/bugs/bug-884597.t152
-rw-r--r--tests/bugs/bug-886998.t52
-rw-r--r--tests/bugs/bug-887098-gmount-crash.t48
-rwxr-xr-xtests/bugs/bug-887145.t89
-rw-r--r--tests/bugs/bug-888174.t65
-rw-r--r--tests/bugs/bug-888752.t24
-rwxr-xr-xtests/bugs/bug-889630.t56
-rw-r--r--tests/bugs/bug-889996.t19
-rwxr-xr-xtests/bugs/bug-892730.t76
-rw-r--r--tests/bugs/bug-893338.t34
-rwxr-xr-xtests/bugs/bug-893378.t73
-rw-r--r--tests/bugs/bug-895235.t23
-rwxr-xr-xtests/bugs/bug-896431.t124
-rwxr-xr-xtests/bugs/bug-902610.t59
-rw-r--r--tests/bugs/bug-903336.t13
-rwxr-xr-xtests/bugs/bug-904065.t90
-rwxr-xr-xtests/bugs/bug-904300.t61
-rw-r--r--tests/bugs/bug-905307.t36
-rw-r--r--tests/bugs/bug-905864.c82
-rw-r--r--tests/bugs/bug-905864.t32
-rw-r--r--tests/bugs/bug-906646.t93
-rwxr-xr-xtests/bugs/bug-907072.t46
-rwxr-xr-xtests/bugs/bug-908146.t39
-rwxr-xr-xtests/bugs/bug-912297.t44
-rwxr-xr-xtests/bugs/bug-912564.t92
-rw-r--r--tests/bugs/bug-913051.t65
-rw-r--r--tests/bugs/bug-913487.t14
-rw-r--r--tests/bugs/bug-913544.t24
-rwxr-xr-xtests/bugs/bug-913555.t54
-rwxr-xr-xtests/bugs/bug-915280.t51
-rwxr-xr-xtests/bugs/bug-915554.t75
-rw-r--r--tests/bugs/bug-916226.t26
-rwxr-xr-xtests/bugs/bug-916549.t19
-rw-r--r--tests/bugs/bug-918437-sh-mtime.t52
-rwxr-xr-xtests/bugs/bug-921072.t118
-rw-r--r--tests/bugs/bug-921231.t31
-rwxr-xr-xtests/bugs/bug-921408.t89
-rwxr-xr-xtests/bugs/bug-924075.t23
-rwxr-xr-xtests/bugs/bug-924265.t35
-rwxr-xr-xtests/bugs/bug-927616.t61
-rw-r--r--tests/bugs/bug-948686.t46
-rw-r--r--tests/bugs/bug-948729/bug-948729-force.t84
-rw-r--r--tests/bugs/bug-948729/bug-948729-mode-script.t85
-rw-r--r--tests/bugs/bug-948729/bug-948729.t67
-rw-r--r--tests/bugs/bug-949242.t54
-rw-r--r--tests/bugs/bug-949298.t12
-rw-r--r--tests/bugs/bug-949930.t27
-rwxr-xr-xtests/bugs/bug-955588.t27
-rw-r--r--tests/bugs/bug-957877.t31
-rw-r--r--tests/bugs/bug-958691.t50
-rw-r--r--tests/bugs/bug-958790.t21
-rw-r--r--tests/bugs/bug-961307.t32
-rw-r--r--tests/bugs/bug-961615.t34
-rw-r--r--tests/bugs/bug-961669.t48
-rwxr-xr-xtests/bugs/bug-963541.t33
-rw-r--r--tests/bugs/bug-963678.t56
-rwxr-xr-xtests/bugs/bug-964059.t30
-rw-r--r--tests/bugs/bug-966018.t34
-rwxr-xr-xtests/bugs/bug-969193.t13
-rwxr-xr-xtests/bugs/bug-970070.t14
-rwxr-xr-xtests/bugs/bug-973073.t48
-rw-r--r--tests/bugs/bug-974007.t52
-rwxr-xr-xtests/bugs/bug-974972.t36
-rw-r--r--tests/bugs/bug-976800.t28
-rw-r--r--tests/bugs/bug-977246.t21
-rwxr-xr-xtests/bugs/bug-977797.t114
-rw-r--r--tests/bugs/bug-978794.t29
-rwxr-xr-xtests/bugs/bug-979365.t47
-rw-r--r--tests/bugs/bug-982174.t36
-rwxr-xr-xtests/bugs/bug-983477.t52
-rw-r--r--tests/bugs/bug-985074.t55
-rw-r--r--tests/bugs/bug-986429.t19
-rwxr-xr-xtests/bugs/bug-986905.t27
-rw-r--r--tests/bugs/bug-991622.t35
-rw-r--r--tests/bugs/getlk_owner.c120
-rwxr-xr-xtests/bugs/overlap.py59
-rwxr-xr-xtests/cluster.rc112
-rw-r--r--tests/dht.rc79
-rw-r--r--tests/fallocate.rc19
-rwxr-xr-xtests/features/glupy.t29
-rwxr-xr-xtests/features/readdir-ahead.t44
-rw-r--r--tests/fileio.rc61
-rw-r--r--tests/include.rc248
-rw-r--r--tests/nfs.rc21
-rwxr-xr-xtests/performance/open-behind.t63
-rw-r--r--tests/performance/quick-read.t55
-rwxr-xr-xtests/snapshot.rc251
-rwxr-xr-xtests/utils/create-files.py207
-rw-r--r--tests/volume.rc325
-rw-r--r--xlators/Makefile.am3
-rw-r--r--xlators/cluster/afr/src/Makefile.am4
-rw-r--r--xlators/cluster/afr/src/afr-common.c882
-rw-r--r--xlators/cluster/afr/src/afr-dir-read.c276
-rw-r--r--xlators/cluster/afr/src/afr-dir-write.c343
-rw-r--r--xlators/cluster/afr/src/afr-inode-read.c645
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.c1474
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.h11
-rw-r--r--xlators/cluster/afr/src/afr-lk-common.c1009
-rw-r--r--xlators/cluster/afr/src/afr-mem-types.h4
-rw-r--r--xlators/cluster/afr/src/afr-open.c196
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-algorithm.c66
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.c744
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.h32
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-data.c546
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-entry.c122
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-metadata.c290
-rw-r--r--xlators/cluster/afr/src/afr-self-heal.h7
-rw-r--r--xlators/cluster/afr/src/afr-self-heald.c718
-rw-r--r--xlators/cluster/afr/src/afr-self-heald.h13
-rw-r--r--xlators/cluster/afr/src/afr-transaction.c822
-rw-r--r--xlators/cluster/afr/src/afr-transaction.h15
-rw-r--r--xlators/cluster/afr/src/afr.c146
-rw-r--r--xlators/cluster/afr/src/afr.h295
-rw-r--r--xlators/cluster/afr/src/pump.c14
-rw-r--r--xlators/cluster/dht/src/Makefile.am8
-rw-r--r--xlators/cluster/dht/src/dht-common.c946
-rw-r--r--xlators/cluster/dht/src/dht-common.h147
-rw-r--r--xlators/cluster/dht/src/dht-diskusage.c179
-rw-r--r--xlators/cluster/dht/src/dht-hashfn.c76
-rw-r--r--xlators/cluster/dht/src/dht-helper.c411
-rw-r--r--xlators/cluster/dht/src/dht-inode-read.c84
-rw-r--r--xlators/cluster/dht/src/dht-inode-write.c445
-rw-r--r--xlators/cluster/dht/src/dht-layout.c151
-rw-r--r--xlators/cluster/dht/src/dht-linkfile.c144
-rw-r--r--xlators/cluster/dht/src/dht-mem-types.h2
-rw-r--r--xlators/cluster/dht/src/dht-rebalance.c277
-rw-r--r--xlators/cluster/dht/src/dht-rename.c112
-rw-r--r--xlators/cluster/dht/src/dht-selfheal.c291
-rw-r--r--xlators/cluster/dht/src/dht-shared.c758
-rw-r--r--xlators/cluster/dht/src/dht.c555
-rw-r--r--xlators/cluster/dht/src/nufa.c336
-rw-r--r--xlators/cluster/dht/src/switch.c214
-rw-r--r--xlators/cluster/ha/src/Makefile.am2
-rw-r--r--xlators/cluster/map/src/Makefile.am2
-rw-r--r--xlators/cluster/stripe/src/Makefile.am2
-rw-r--r--xlators/cluster/stripe/src/stripe-helpers.c87
-rw-r--r--xlators/cluster/stripe/src/stripe-mem-types.h1
-rw-r--r--xlators/cluster/stripe/src/stripe.c867
-rw-r--r--xlators/cluster/stripe/src/stripe.h11
-rw-r--r--xlators/debug/error-gen/src/Makefile.am2
-rw-r--r--xlators/debug/error-gen/src/error-gen.c236
-rw-r--r--xlators/debug/error-gen/src/error-gen.h12
-rw-r--r--xlators/debug/io-stats/src/Makefile.am2
-rw-r--r--xlators/debug/io-stats/src/io-stats.c79
-rw-r--r--xlators/debug/trace/src/Makefile.am3
-rw-r--r--xlators/debug/trace/src/trace-mem-types.h21
-rw-r--r--xlators/debug/trace/src/trace.c2848
-rw-r--r--xlators/debug/trace/src/trace.h98
-rw-r--r--xlators/encryption/Makefile.am2
-rw-r--r--xlators/encryption/crypt/Makefile.am3
-rw-r--r--xlators/encryption/crypt/src/Makefile.am24
-rw-r--r--xlators/encryption/crypt/src/atom.c962
-rw-r--r--xlators/encryption/crypt/src/crypt-common.h141
-rw-r--r--xlators/encryption/crypt/src/crypt-mem-types.h43
-rw-r--r--xlators/encryption/crypt/src/crypt.c4498
-rw-r--r--xlators/encryption/crypt/src/crypt.h899
-rw-r--r--xlators/encryption/crypt/src/data.c769
-rw-r--r--xlators/encryption/crypt/src/keys.c302
-rw-r--r--xlators/encryption/crypt/src/metadata.c605
-rw-r--r--xlators/encryption/crypt/src/metadata.h74
-rw-r--r--xlators/encryption/rot-13/src/Makefile.am2
-rw-r--r--xlators/encryption/rot-13/src/rot-13.c19
-rw-r--r--xlators/features/Makefile.am3
-rw-r--r--xlators/features/changelog/Makefile.am3
-rw-r--r--xlators/features/changelog/lib/Makefile.am3
-rw-r--r--xlators/features/changelog/lib/examples/c/get-changes.c87
-rw-r--r--xlators/features/changelog/lib/examples/python/changes.py32
-rw-r--r--xlators/features/changelog/lib/examples/python/libgfchangelog.py64
-rw-r--r--xlators/features/changelog/lib/src/Makefile.am37
-rw-r--r--xlators/features/changelog/lib/src/changelog.h31
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-helpers.c180
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-helpers.h97
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-process.c571
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog.c515
-rw-r--r--xlators/features/changelog/src/Makefile.am19
-rw-r--r--xlators/features/changelog/src/changelog-encoders.c176
-rw-r--r--xlators/features/changelog/src/changelog-encoders.h46
-rw-r--r--xlators/features/changelog/src/changelog-helpers.c693
-rw-r--r--xlators/features/changelog/src/changelog-helpers.h395
-rw-r--r--xlators/features/changelog/src/changelog-mem-types.h29
-rw-r--r--xlators/features/changelog/src/changelog-misc.h101
-rw-r--r--xlators/features/changelog/src/changelog-notifier.c314
-rw-r--r--xlators/features/changelog/src/changelog-notifier.h19
-rw-r--r--xlators/features/changelog/src/changelog-rt.c72
-rw-r--r--xlators/features/changelog/src/changelog-rt.h33
-rw-r--r--xlators/features/changelog/src/changelog.c1477
-rw-r--r--xlators/features/compress/Makefile.am3
-rw-r--r--xlators/features/compress/src/Makefile.am17
-rw-r--r--xlators/features/compress/src/cdc-helper.c547
-rw-r--r--xlators/features/compress/src/cdc-mem-types.h22
-rw-r--r--xlators/features/compress/src/cdc.c342
-rw-r--r--xlators/features/compress/src/cdc.h107
-rw-r--r--xlators/features/filter/src/Makefile.am2
-rw-r--r--xlators/features/gfid-access/Makefile.am1
-rw-r--r--xlators/features/gfid-access/src/Makefile.am15
-rw-r--r--xlators/features/gfid-access/src/gfid-access-mem-types.h23
-rw-r--r--xlators/features/gfid-access/src/gfid-access.c1172
-rw-r--r--xlators/features/gfid-access/src/gfid-access.h128
-rw-r--r--xlators/features/glupy/Makefile.am3
-rw-r--r--xlators/features/glupy/doc/README.md44
-rw-r--r--xlators/features/glupy/doc/TESTING9
-rw-r--r--xlators/features/glupy/doc/test.vol10
-rw-r--r--xlators/features/glupy/src/Makefile.am20
-rw-r--r--xlators/features/glupy/src/debug-trace.py774
-rw-r--r--xlators/features/glupy/src/glupy.c2470
-rw-r--r--xlators/features/glupy/src/glupy.h69
-rw-r--r--xlators/features/glupy/src/gluster.py841
-rw-r--r--xlators/features/glupy/src/helloworld.py19
-rw-r--r--xlators/features/glupy/src/negative.py92
-rw-r--r--xlators/features/index/src/Makefile.am2
-rw-r--r--xlators/features/index/src/index.c346
-rw-r--r--xlators/features/index/src/index.h14
-rw-r--r--xlators/features/locks/src/Makefile.am3
-rw-r--r--xlators/features/locks/src/clear.c5
-rw-r--r--xlators/features/locks/src/common.c186
-rw-r--r--xlators/features/locks/src/common.h55
-rw-r--r--xlators/features/locks/src/entrylk.c118
-rw-r--r--xlators/features/locks/src/inodelk.c195
-rw-r--r--xlators/features/locks/src/locks.h32
-rw-r--r--xlators/features/locks/src/posix.c653
-rw-r--r--xlators/features/mac-compat/src/Makefile.am2
-rw-r--r--xlators/features/mac-compat/src/mac-compat.c3
-rw-r--r--xlators/features/marker/Makefile.am2
-rw-r--r--xlators/features/marker/src/Makefile.am2
-rw-r--r--xlators/features/marker/src/marker.c257
-rw-r--r--xlators/features/marker/src/marker.h6
-rw-r--r--xlators/features/marker/utils/syncdaemon/Makefile.am6
-rw-r--r--xlators/features/marker/utils/syncdaemon/master.py913
-rw-r--r--xlators/features/marker/utils/syncdaemon/monitor.py129
-rw-r--r--xlators/features/path-convertor/src/Makefile.am2
-rw-r--r--xlators/features/protect/Makefile.am3
-rw-r--r--xlators/features/protect/src/Makefile.am21
-rw-r--r--xlators/features/protect/src/prot_client.c215
-rw-r--r--xlators/features/protect/src/prot_dht.c168
-rw-r--r--xlators/features/protect/src/prot_server.c51
-rw-r--r--xlators/features/qemu-block/Makefile.am1
-rw-r--r--xlators/features/qemu-block/src/Makefile.am155
-rw-r--r--xlators/features/qemu-block/src/bdrv-xlator.c397
-rw-r--r--xlators/features/qemu-block/src/bh-syncop.c48
-rw-r--r--xlators/features/qemu-block/src/clock-timer.c60
-rw-r--r--xlators/features/qemu-block/src/coroutine-synctask.c116
-rw-r--r--xlators/features/qemu-block/src/monitor-logging.c50
-rw-r--r--xlators/features/qemu-block/src/qb-coroutines.c662
-rw-r--r--xlators/features/qemu-block/src/qb-coroutines.h30
-rw-r--r--xlators/features/qemu-block/src/qemu-block-memory-types.h25
-rw-r--r--xlators/features/qemu-block/src/qemu-block.c1140
-rw-r--r--xlators/features/qemu-block/src/qemu-block.h109
-rw-r--r--xlators/features/quiesce/src/Makefile.am2
-rw-r--r--xlators/features/quiesce/src/quiesce.c14
-rw-r--r--xlators/features/quota/src/Makefile.am2
-rw-r--r--xlators/features/quota/src/quota.c204
-rw-r--r--xlators/features/quota/src/quota.h1
-rw-r--r--xlators/features/read-only/src/Makefile.am4
-rw-r--r--xlators/features/read-only/src/worm.c3
-rw-r--r--xlators/features/trash/src/Makefile.am2
-rw-r--r--xlators/lib/src/libxlator.c167
-rw-r--r--xlators/lib/src/libxlator.h87
-rw-r--r--xlators/mgmt/glusterd/src/Makefile.am16
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-brick-ops.c713
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-geo-rep.c3124
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-handler.c2054
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-handshake.c969
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-hooks.c14
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-locks.c637
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-locks.h51
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-log-ops.c43
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mem-types.h6
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mgmt-handler.c924
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mgmt.c1893
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mgmt.h45
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-op-sm.c2975
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-op-sm.h43
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-pmap.c100
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-quota.c66
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-rebalance.c312
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-replace-brick.c404
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-rpc-ops.c628
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-sm.c61
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-sm.h35
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapshot.c5590
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-store.c2536
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-store.h133
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-syncop.c1588
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-syncop.h73
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-utils.c3886
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-utils.h204
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volgen.c972
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volgen.h89
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-ops.c951
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-set.c1452
-rw-r--r--xlators/mgmt/glusterd/src/glusterd.c608
-rw-r--r--xlators/mgmt/glusterd/src/glusterd.h545
-rw-r--r--xlators/mount/fuse/src/Makefile.am2
-rw-r--r--xlators/mount/fuse/src/fuse-bridge.c1422
-rw-r--r--xlators/mount/fuse/src/fuse-bridge.h117
-rw-r--r--xlators/mount/fuse/src/fuse-helpers.c31
-rw-r--r--xlators/mount/fuse/src/fuse-resolve.c31
-rwxr-xr-xxlators/mount/fuse/utils/mount.glusterfs.in200
-rw-r--r--xlators/nfs/server/src/Makefile.am9
-rw-r--r--xlators/nfs/server/src/acl3.c708
-rw-r--r--xlators/nfs/server/src/acl3.h31
-rw-r--r--xlators/nfs/server/src/mount3.c991
-rw-r--r--xlators/nfs/server/src/mount3.h39
-rw-r--r--xlators/nfs/server/src/mount3udp_svc.c17
-rw-r--r--xlators/nfs/server/src/nfs-common.c59
-rw-r--r--xlators/nfs/server/src/nfs-common.h20
-rw-r--r--xlators/nfs/server/src/nfs-fops.c141
-rw-r--r--xlators/nfs/server/src/nfs-fops.h28
-rw-r--r--xlators/nfs/server/src/nfs-generics.c33
-rw-r--r--xlators/nfs/server/src/nfs-generics.h27
-rw-r--r--xlators/nfs/server/src/nfs-inodes.c27
-rw-r--r--xlators/nfs/server/src/nfs-inodes.h17
-rw-r--r--xlators/nfs/server/src/nfs-mem-types.h20
-rw-r--r--xlators/nfs/server/src/nfs.c728
-rw-r--r--xlators/nfs/server/src/nfs.h28
-rw-r--r--xlators/nfs/server/src/nfs3-fh.c39
-rw-r--r--xlators/nfs/server/src/nfs3-fh.h32
-rw-r--r--xlators/nfs/server/src/nfs3-helpers.c175
-rw-r--r--xlators/nfs/server/src/nfs3-helpers.h23
-rw-r--r--xlators/nfs/server/src/nfs3.c558
-rw-r--r--xlators/nfs/server/src/nfs3.h90
-rw-r--r--xlators/nfs/server/src/nlm4.c340
-rw-r--r--xlators/nfs/server/src/nlm4.h22
-rw-r--r--xlators/nfs/server/src/nlmcbk_svc.c27
-rw-r--r--xlators/performance/Makefile.am2
-rw-r--r--xlators/performance/io-cache/src/Makefile.am2
-rw-r--r--xlators/performance/io-cache/src/io-cache.c96
-rw-r--r--xlators/performance/io-cache/src/io-cache.h2
-rw-r--r--xlators/performance/io-cache/src/page.c21
-rw-r--r--xlators/performance/io-threads/src/Makefile.am2
-rw-r--r--xlators/performance/io-threads/src/io-threads.c249
-rw-r--r--xlators/performance/io-threads/src/io-threads.h10
-rw-r--r--xlators/performance/md-cache/src/Makefile.am2
-rw-r--r--xlators/performance/md-cache/src/md-cache.c382
-rw-r--r--xlators/performance/open-behind/Makefile.am1
-rw-r--r--xlators/performance/open-behind/src/Makefile.am15
-rw-r--r--xlators/performance/open-behind/src/open-behind-mem-types.h21
-rw-r--r--xlators/performance/open-behind/src/open-behind.c1001
-rw-r--r--xlators/performance/quick-read/src/Makefile.am2
-rw-r--r--xlators/performance/quick-read/src/quick-read-mem-types.h1
-rw-r--r--xlators/performance/quick-read/src/quick-read.c3718
-rw-r--r--xlators/performance/quick-read/src/quick-read.h57
-rw-r--r--xlators/performance/read-ahead/src/Makefile.am2
-rw-r--r--xlators/performance/read-ahead/src/read-ahead.c115
-rw-r--r--xlators/performance/readdir-ahead/Makefile.am3
-rw-r--r--xlators/performance/readdir-ahead/src/Makefile.am15
-rw-r--r--xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h24
-rw-r--r--xlators/performance/readdir-ahead/src/readdir-ahead.c560
-rw-r--r--xlators/performance/readdir-ahead/src/readdir-ahead.h46
-rw-r--r--xlators/performance/symlink-cache/src/Makefile.am2
-rw-r--r--xlators/performance/write-behind/src/Makefile.am2
-rw-r--r--xlators/performance/write-behind/src/write-behind.c402
-rw-r--r--xlators/playground/Makefile.am2
-rw-r--r--xlators/playground/template/Makefile.am2
-rw-r--r--xlators/playground/template/src/Makefile.am16
-rw-r--r--xlators/playground/template/src/template.c49
-rw-r--r--xlators/playground/template/src/template.h24
-rw-r--r--xlators/protocol/auth/addr/src/Makefile.am2
-rw-r--r--xlators/protocol/auth/addr/src/addr.c17
-rw-r--r--xlators/protocol/auth/login/src/Makefile.am2
-rw-r--r--xlators/protocol/auth/login/src/login.c17
-rw-r--r--xlators/protocol/client/src/Makefile.am2
-rw-r--r--xlators/protocol/client/src/client-handshake.c256
-rw-r--r--xlators/protocol/client/src/client-helpers.c73
-rw-r--r--xlators/protocol/client/src/client-lk.c360
-rw-r--r--xlators/protocol/client/src/client-rpc-fops.c530
-rw-r--r--xlators/protocol/client/src/client.c180
-rw-r--r--xlators/protocol/client/src/client.h51
-rw-r--r--xlators/protocol/server/src/Makefile.am11
-rw-r--r--xlators/protocol/server/src/authenticate.c64
-rw-r--r--xlators/protocol/server/src/authenticate.h19
-rw-r--r--xlators/protocol/server/src/server-handshake.c149
-rw-r--r--xlators/protocol/server/src/server-helpers.c1143
-rw-r--r--xlators/protocol/server/src/server-helpers.h74
-rw-r--r--xlators/protocol/server/src/server-mem-types.h19
-rw-r--r--xlators/protocol/server/src/server-resolve.c54
-rw-r--r--xlators/protocol/server/src/server-rpc-fops.c2239
-rw-r--r--xlators/protocol/server/src/server.c796
-rw-r--r--xlators/protocol/server/src/server.h126
-rw-r--r--xlators/storage/Makefile.am6
-rw-r--r--xlators/storage/bd/Makefile.am3
-rw-r--r--xlators/storage/bd/src/Makefile.am20
-rw-r--r--xlators/storage/bd/src/bd-aio.c527
-rw-r--r--xlators/storage/bd/src/bd-aio.h41
-rw-r--r--xlators/storage/bd/src/bd-helper.c783
-rw-r--r--xlators/storage/bd/src/bd.c2404
-rw-r--r--xlators/storage/bd/src/bd.h178
-rw-r--r--xlators/storage/posix/src/Makefile.am2
-rw-r--r--xlators/storage/posix/src/posix-aio.c20
-rw-r--r--xlators/storage/posix/src/posix-handle.c7
-rw-r--r--xlators/storage/posix/src/posix-helpers.c409
-rw-r--r--xlators/storage/posix/src/posix.c1155
-rw-r--r--xlators/storage/posix/src/posix.h50
-rw-r--r--xlators/system/posix-acl/src/Makefile.am2
-rw-r--r--xlators/system/posix-acl/src/posix-acl-xattr.c4
-rw-r--r--xlators/system/posix-acl/src/posix-acl-xattr.h16
-rw-r--r--xlators/system/posix-acl/src/posix-acl.c67
-rw-r--r--xlators/system/posix-acl/src/posix-acl.h49
969 files changed, 173226 insertions, 40726 deletions
diff --git a/.gitignore b/.gitignore
index c5371b264..ff253c1da 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,26 +8,41 @@ install-sh
ltmain.sh
Makefile.in
missing
+py-compile
*.sw?
*~
-*lo
-*la
-*o
+*.lo
+*.la
+*.o
+*.tar.gz
+*.rpm
.libs
+.deps
Makefile
stamp-h1
# Generated files
-extras/init.d/glusterfs-server.plist
-extras/init.d/glusterfsd-Debian
-extras/init.d/glusterfsd-Redhat
-extras/init.d/glusterfsd-SuSE
-glusterfs.spec
-libtool
-xlators/mount/fuse/utils/mount.glusterfs
-xlators/mount/fuse/utils/mount_glusterfs
+api/examples/__init__.py*
+api/examples/setup.py
argp-standalone/libargp.a
+contrib/uuid/uuid_types.h
+extras/init.d/glusterd-Debian
+extras/init.d/glusterd-Redhat
+extras/init.d/glusterd-SuSE
+extras/init.d/glusterd.plist
+extras/ocf/glusterd
+extras/ocf/volume
+extras/who-wrote-glusterfs/gitdm
+glusterfs-api.pc
+glusterfs.spec
glusterfsd/src/glusterfsd
+libgfchangelog.pc
libglusterfs/src/spec.lex.c
libglusterfs/src/y.tab.c
libglusterfs/src/y.tab.h
+libtool
+run-tests.sh
+ufo/.tox
+ufo/test/unit/.coverage
+xlators/mount/fuse/utils/mount.glusterfs
+xlators/mount/fuse/utils/mount_glusterfs
diff --git a/.mailmap b/.mailmap
new file mode 100644
index 000000000..6bcd95dea
--- /dev/null
+++ b/.mailmap
@@ -0,0 +1,31 @@
+# .mailmap, see 'git short-log --help' for details
+#
+# Listing of contributors that filed patches with different email addresses.
+# Format: <name> <main-email> <alias> [<alias> ...]
+#
+
+Amar Tumballi <amarts@redhat.com> <amar@gluster.com> <amar@del.gluster.com>
+Anand Avati <avati@redhat.com> <avati@gluster.com> <avati@dev.gluster.com> <avati@amp.gluster.com> <avati@blackhole.gluster.com>
+Anush Shetty <ashetty@redhat.com> <anush@gluster.com>
+Csaba Henk <csaba@redhat.com> <csaba@gluster.com> <csaba@lowlife.hu> <csaba@zresearch.com>
+Harshavardhana <fharshav@redhat.com> <harsha@gluster.com> <harsha@zresearch.com> <harsha@dev.gluster.com> <harsha@harshavardhana.net>
+Kaleb S. KEITHLEY <kkeithle@redhat.com> <kkeithle@f16node1.kkeithle.usersys.redhat.com>
+Kaushal M <kaushal@redhat.com> <kaushal@gluster.com>
+Kaushik BV <kbudiger@redhat.com> <kaushikbv@gluster.com>
+Krishna Srinivas <ksriniva@redhat.com> <krishna@gluster.com> <krishna@zresearch.com> <krishna@guest-laptop>
+Krishnan Parthasarathi <kparthas@redhat.com> <kp@gluster.com>
+Louis Zuckerman <louiszuckerman@gmail.com> <me@louiszuckerman.com>
+M S Vishwanath Bhat <vbhat@redhat.com> <msvbhat@gmail.com> <vishwanath@gluster.com>
+Pavan Sondur <pavan@gluster.com> <pavan@dev.gluster.com>
+Pete Zaitcev <zaitcev@kotori.zaitcev.us> <zaitcev@yahoo.com>
+Pranith Kumar K <pkarampu@redhat.com> <pranithk@gluster.com>
+Raghavendra Bhat <raghavendra@redhat.com> <raghavendrabhat@gluster.com>
+Raghavendra G <rgowdapp@redhat.com> <raghavendra@gluster.com> <raghavendra@zresearch.com>
+Rahul C S <rahulcs@redhat.com> <rahulcssjce@gmail.com>
+Rajesh Amaravathi <rajesh@redhat.com> <rajesh@gluster.com> <rajesh.amaravathi@gmail.com>
+Shehjar Tikoo <shehjart@gluster.com> <shehjart@zresearch.com>
+Venky Shankar <vshankar@redhat.com> <venky@gluster.com>
+Vijay Bellur <vbellur@redhat.com> <vijay@gluster.com> <vijay@dev.gluster.com>
+Vijaykumar Koppad <vkoppad@redhat.com> <vijaykumar.koppad@gmail.com>
+Vikas Gorur <vikas@gluster.com> <vikas@zresearch.com>
+shishir gowda <sgowda@redhat.com> <shishirng@gluster.com>
diff --git a/Makefile.am b/Makefile.am
index 836f17efc..598ebb410 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,13 +1,16 @@
EXTRA_DIST = autogen.sh \
COPYING-GPLV2 COPYING-LGPLV3 \
INSTALL README AUTHORS THANKS NEWS \
- glusterfs.spec glusterfs-api.pc.in
+ glusterfs.spec glusterfs-api.pc.in libgfchangelog.pc.in \
+ error-codes.json gf-error-codes.h.template \
+ gen-headers.py run-tests.sh \
+ $(shell find $(top_srcdir)/tests -type f -print)
SUBDIRS = argp-standalone libglusterfs rpc api xlators glusterfsd \
- $(FUSERMOUNT_SUBDIR) doc extras cli
+ $(FUSERMOUNT_SUBDIR) doc extras cli @SYNCDAEMON_SUBDIR@
pkgconfigdir = @pkgconfigdir@
-pkgconfig_DATA = glusterfs-api.pc
+pkgconfig_DATA = glusterfs-api.pc libgfchangelog.pc
CLEANFILES =
@@ -18,4 +21,9 @@ gitclean: distclean
rm -fr autom4te.cache
rm -f missing aclocal.m4 config.h.in config.guess config.sub ltmain.sh install-sh configure depcomp
rm -fr argp-standalone/autom4te.cache
- rm -f argp-standalone/aclocal.m4 argp-standalone/config.h.in argp-standalone/configure argp-standalone/depcomp argp-standalone/install-sh argp-standalone/missing
+ rm -f argp-standalone/aclocal.m4 argp-standalone/config.h.in
+ rm -f argp-standalone/configure argp-standalone/depcomp
+ rm -f argp-standalone/install-sh argp-standalone/missing
+
+dist-hook:
+ (cd $(srcdir) && git diff && echo ===== git log ==== && git log) > $(distdir)/ChangeLog
diff --git a/api/Makefile.am b/api/Makefile.am
index af437a64d..f0ad1ee97 100644
--- a/api/Makefile.am
+++ b/api/Makefile.am
@@ -1 +1 @@
-SUBDIRS = src
+SUBDIRS = src examples
diff --git a/api/examples/Makefile.am b/api/examples/Makefile.am
index 6048bb1c8..05f40ff53 100644
--- a/api/examples/Makefile.am
+++ b/api/examples/Makefile.am
@@ -1,4 +1,6 @@
-noinst_PROGRAMS = glfsxmp
+EXTRA_PROGRAMS = glfsxmp
glfsxmp_SOURCES = glfsxmp.c
glfsxmp_CFLAGS = $(GLFS_CFLAGS) -Wall
-glfsxmp_LDADD = $(GLFS_LIBS) \ No newline at end of file
+glfsxmp_LDADD = $(GLFS_LIBS) -lrt
+
+EXTRA_DIST = gfapi.py
diff --git a/api/examples/README b/api/examples/README
index 8f0bab697..4d2b521f7 100644
--- a/api/examples/README
+++ b/api/examples/README
@@ -17,3 +17,20 @@ install glusterfs-api RPM.
Make sure your LDFLAGS includes -L/path/to/lib where libgfapi.so is
installed and -I/path/to/include/glusterfs where the 'api' directory
containing the headers are available.
+
+glfsxmp.c
+=========
+
+glfsxmp.c is an example application which uses libgfapi
+
+Compilation Steps For glfsxmp.c
+===============================
+
+1. $./autogen.sh
+2. $./configure
+
+Note: Before running ./configure , as mentioned above, you need to
+ take care of #1 or #2 i.e. pkg-config path or LDFLAGS and
+ -I/<path> with correct values.
+
+3. $make glfsxmp
diff --git a/api/examples/gfapi.py b/api/examples/gfapi.py
index e71a9866e..3ac67f4d5 100644..100755
--- a/api/examples/gfapi.py
+++ b/api/examples/gfapi.py
@@ -1,29 +1,422 @@
-import ctypes
+#!/usr/bin/python
+
+from ctypes import *
+from ctypes.util import find_library
import os
import sys
+import time
+import types
# Looks like ctypes is having trouble with dependencies, so just force them to
# load with RTLD_GLOBAL until I figure that out.
-glfs = ctypes.CDLL("libglusterfs.so",ctypes.RTLD_GLOBAL)
-xdr = ctypes.CDLL("libgfxdr.so",ctypes.RTLD_GLOBAL)
-api = ctypes.CDLL("api/libgfapi.so",ctypes.RTLD_GLOBAL)
-
-fs = api.glfs_new(sys.argv[1])
-api.glfs_set_logging(fs,"/dev/stderr",7)
-api.glfs_set_volfile_server(fs,"socket","localhost",24007)
-api.glfs_init(fs)
-print "Initialized volume"
-
-fd = api.glfs_creat(fs,sys.argv[2],os.O_RDWR,0644)
-print "Created file"
-
-# Read anything that's there from before.
-rbuf = ctypes.create_string_buffer(32)
-if api.glfs_read(fd,rbuf,32,0) > 0:
- print "old data = %s" % rbuf.value
-
-# Write some new data.
-api.glfs_lseek(fd,0,os.SEEK_SET)
-wrote = api.glfs_write(fd,sys.argv[3],len(sys.argv[3]),0)
-if wrote > 0:
- print "wrote %d bytes" % wrote
+glfs = CDLL(find_library("glusterfs"),RTLD_GLOBAL)
+xdr = CDLL(find_library("gfxdr"),RTLD_GLOBAL)
+api = CDLL(find_library("gfapi"),RTLD_GLOBAL)
+
+# Wow, the Linux kernel folks really play nasty games with this structure. If
+# you look at the man page for stat(2) and then at this definition you'll note
+# two discrepancies. First, we seem to have st_nlink and st_mode reversed. In
+# fact that's exactly how they're defined *for 64-bit systems*; for 32-bit
+# they're in the man-page order. Even uglier, the man page makes no mention of
+# the *nsec fields, but they are very much present and if they're not included
+# then we get memory corruption because libgfapi has a structure definition
+# that's longer than ours and they overwrite some random bit of memory after
+# the space we allocated. Yes, that's all very disgusting, and I'm still not
+# sure this will really work on 32-bit because all of the field types are so
+# obfuscated behind macros and feature checks.
+class Stat (Structure):
+ _fields_ = [
+ ("st_dev", c_ulong),
+ ("st_ino", c_ulong),
+ ("st_nlink", c_ulong),
+ ("st_mode", c_uint),
+ ("st_uid", c_uint),
+ ("st_gid", c_uint),
+ ("st_rdev", c_ulong),
+ ("st_size", c_ulong),
+ ("st_blksize", c_ulong),
+ ("st_blocks", c_ulong),
+ ("st_atime", c_ulong),
+ ("st_atimensec", c_ulong),
+ ("st_mtime", c_ulong),
+ ("st_mtimensec", c_ulong),
+ ("st_ctime", c_ulong),
+ ("st_ctimensec", c_ulong),
+ ]
+api.glfs_creat.restype = c_void_p
+api.glfs_open.restype = c_void_p
+api.glfs_lstat.restype = c_int
+api.glfs_lstat.argtypes = [c_void_p, c_char_p, POINTER(Stat)]
+
+class Dirent (Structure):
+ _fields_ = [
+ ("d_ino", c_ulong),
+ ("d_off", c_ulong),
+ ("d_reclen", c_ushort),
+ ("d_type", c_char),
+ ("d_name", c_char * 256),
+ ]
+api.glfs_opendir.restype = c_void_p
+api.glfs_readdir_r.restype = c_int
+api.glfs_readdir_r.argtypes = [c_void_p, POINTER(Dirent),
+ POINTER(POINTER(Dirent))]
+
+# There's a bit of ctypes glitchiness around __del__ functions and module-level
+# variables. If we unload the module while we still have references to File or
+# Volume objects, the module-level variables might have disappeared by the time
+# __del__ gets called. Therefore the objects hold references which they
+# release when __del__ is done. We only actually use the object-local values
+# in __del__; for clarity, we just use the simpler module-level form elsewhere.
+
+class File(object):
+
+ def __init__ (self, fd):
+ # Add a reference so the module-level variable "api" doesn't
+ # get yanked out from under us (see comment above File def'n).
+ self._api = api
+ self.fd = fd
+
+ def __del__ (self):
+ self._api.glfs_close(self.fd)
+ self._api = None
+
+ # File operations, in alphabetical order.
+
+ def fsync (self):
+ return api.glfs_fsync(self.fd)
+
+ def read (self, buflen, flags=0):
+ rbuf = create_string_buffer(buflen)
+ rc = api.glfs_read(self.fd,rbuf,buflen,flags)
+ if rc > 0:
+ return rbuf.value[:rc]
+ else:
+ return rc
+
+ def read_buffer (self, buf, flags=0):
+ return api.glfs_read(self.fd,buf,len(buf),flags)
+
+ def write (self, data, flags=0):
+ return api.glfs_write(self.fd,data,len(data),flags)
+
+ def fallocate (self, mode, offset, len):
+ return api.glfs_fallocate(self.fd, mode, offset, len)
+
+ def discard (self, offset, len):
+ return api.glfs_discard(self.fd, offset, len)
+
+
+class Dir(object):
+
+ def __init__ (self, fd):
+ # Add a reference so the module-level variable "api" doesn't
+ # get yanked out from under us (see comment above File def'n).
+ self._api = api
+ self.fd = fd
+ self.cursor = POINTER(Dirent)()
+
+ def __del__ (self):
+ self._api.glfs_closedir(self.fd)
+ self._api = None
+
+ def next (self):
+ entry = Dirent()
+ entry.d_reclen = 256
+ rc = api.glfs_readdir_r(self.fd,byref(entry),byref(self.cursor))
+ if (rc < 0) or (not self.cursor) or (not self.cursor.contents):
+ return rc
+ return entry
+
+class Volume(object):
+
+ # Housekeeping functions.
+
+ def __init__ (self, host, volid, proto="tcp", port=24007):
+ # Add a reference so the module-level variable "api" doesn't
+ # get yanked out from under us (see comment above File def'n).
+ self._api = api
+ self.fs = api.glfs_new(volid)
+ api.glfs_set_volfile_server(self.fs,proto,host,port)
+
+ def __del__ (self):
+ self._api.glfs_fini(self.fs)
+ self._api = None
+
+ def set_logging (self, path, level):
+ api.glfs_set_logging(self.fs,path,level)
+
+ def mount (self):
+ api.glfs_init(self.fs)
+
+ # File operations, in alphabetical order.
+
+ def creat (self, path, flags, mode):
+ fd = api.glfs_creat(self.fs,path,flags,mode)
+ if not fd:
+ return fd
+ return File(fd)
+
+ def getxattr (self, path, key, maxlen):
+ buf = create_string_buffer(maxlen)
+ rc = api.glfs_getxattr(self.fs,path,key,buf,maxlen)
+ if rc < 0:
+ return rc
+ return buf.value[:rc]
+
+ def listxattr (self, path):
+ buf = create_string_buffer(512)
+ rc = api.glfs_listxattr(self.fs,path,buf,512)
+ if rc < 0:
+ return rc
+ xattrs = []
+ # Parsing character by character is ugly, but it seems like the
+ # easiest way to deal with the "strings separated by NUL in one
+ # buffer" format.
+ i = 0
+ while i < rc:
+ new_xa = buf.raw[i]
+ i += 1
+ while i < rc:
+ next_char = buf.raw[i]
+ i += 1
+ if next_char == '\0':
+ xattrs.append(new_xa)
+ break
+ new_xa += next_char
+ xattrs.sort()
+ return xattrs
+
+ def lstat (self, path):
+ x = Stat()
+ rc = api.glfs_lstat(self.fs,path,byref(x))
+ if rc >= 0:
+ return x
+ else:
+ return rc
+
+ def mkdir (self, path):
+ return api.glfs_mkdir(self.fs,path)
+
+ def open (self, path, flags):
+ fd = api.glfs_open(self.fs,path,flags)
+ if not fd:
+ return fd
+ return File(fd)
+
+ def opendir (self, path):
+ fd = api.glfs_opendir(self.fs,path)
+ if not fd:
+ return fd
+ return Dir(fd)
+
+ def rename (self, opath, npath):
+ return api.glfs_rename(self.fs,opath,npath)
+
+ def rmdir (self, path):
+ return api.glfs_rmdir(self.fs,path)
+
+ def setxattr (self, path, key, value, vlen):
+ return api.glfs_setxattr(self.fs,path,key,value,vlen,0)
+
+ def unlink (self, path):
+ return api.glfs_unlink(self.fs,path)
+
+if __name__ == "__main__":
+ def test_create_write (vol, path, data):
+ mypath = path + ".io"
+ fd = vol.creat(mypath,os.O_WRONLY|os.O_EXCL,0644)
+ if not fd:
+ return False, "creat error"
+ rc = fd.write(data)
+ if rc != len(data):
+ return False, "wrote %d/%d bytes" % (rc, len(data))
+ return True, "wrote %d bytes" % rc
+
+ # TBD: this test fails if we do create, open, write, read
+ def test_open_read (vol, path, data):
+ mypath = path + ".io"
+ fd = vol.open(mypath,os.O_RDONLY)
+ if not fd:
+ return False, "open error"
+ dlen = len(data) * 2
+ buf = fd.read(dlen)
+ if type(buf) == types.IntType:
+ return False, "read error %d" % buf
+ if len(buf) != len(data):
+ return False, "read %d/%d bytes" % (len(buf), len(data))
+ return True, "read '%s'" % buf
+
+ def test_lstat (vol, path, data):
+ mypath = path + ".io"
+ sb = vol.lstat(mypath)
+ if type(sb) == types.IntType:
+ return False, "lstat error %d" % sb
+ if sb.st_size != len(data):
+ return False, "lstat size is %d, expected %d" % (
+ sb.st_size, len(data))
+ return True, "lstat got correct size %d" % sb.st_size
+
+ def test_rename (vol, path, data):
+ opath = path + ".io"
+ npath = path + ".tmp"
+ rc = vol.rename(opath,npath)
+ if rc < 0:
+ return False, "rename error %d" % rc
+ ofd = vol.open(opath,os.O_RDWR)
+ if isinstance(ofd,File):
+ return False, "old path working after rename"
+ nfd = vol.open(npath,os.O_RDWR)
+ if isinstance(nfd,File):
+ return False, "new path not working after rename"
+ return True, "rename worked"
+
+ def test_unlink (vol, path, data):
+ mypath = path + ".tmp"
+ rc = vol.unlink(mypath)
+ if rc < 0:
+ return False, "unlink error %d" % fd
+ fd = vol.open(mypath,os.O_RDWR)
+ if isinstance(fd,File):
+ return False, "path still usable after unlink"
+ return True, "unlink worked"
+
+ def test_mkdir (vol, path, data):
+ mypath = path + ".dir"
+ rc = vol.mkdir(mypath)
+ if rc < 0:
+ return False, "mkdir error %d" % rc
+ return True, "mkdir worked"
+
+ def test_create_in_dir (vol, path, data):
+ mypath = path + ".dir/probe"
+ fd = vol.creat(mypath,os.O_RDWR,0644)
+ if not isinstance(fd,File):
+ return False, "create (in dir) error"
+ return True, "create (in dir) worked"
+
+ def test_dir_listing (vol, path, data):
+ mypath = path + ".dir"
+ fd = vol.opendir(mypath)
+ if not isinstance(fd,Dir):
+ return False, "opendir error %d" % fd
+ files = []
+ while True:
+ ent = fd.next()
+ if not isinstance(ent,Dirent):
+ break
+ name = ent.d_name[:ent.d_reclen]
+ files.append(name)
+ if files != [".", "..", "probe"]:
+ return False, "wrong directory contents"
+ return True, "directory listing worked"
+
+ def test_unlink_in_dir (vol, path, data):
+ mypath = path + ".dir/probe"
+ rc = vol.unlink(mypath)
+ if rc < 0:
+ return False, "unlink (in dir) error %d" % rc
+ return True, "unlink (in dir) worked"
+
+ def test_rmdir (vol, path, data):
+ mypath = path + ".dir"
+ rc = vol.rmdir(mypath)
+ if rc < 0:
+ return False, "rmdir error %d" % rc
+ sb = vol.lstat(mypath)
+ if not isinstance(sb,Stat):
+ return False, "dir still there after rmdir"
+ return True, "rmdir worked"
+
+ def test_setxattr (vol, path, data):
+ mypath = path + ".xa"
+ fd = vol.creat(mypath,os.O_RDWR|os.O_EXCL,0644)
+ if not fd:
+ return False, "creat (xattr test) error"
+ key1, key2 = "hello", "goodbye"
+ if vol.setxattr(mypath,"trusted.key1",key1,len(key1)) < 0:
+ return False, "setxattr (key1) error"
+ if vol.setxattr(mypath,"trusted.key2",key2,len(key2)) < 0:
+ return False, "setxattr (key2) error"
+ return True, "setxattr worked"
+
+ def test_getxattr (vol, path, data):
+ mypath = path + ".xa"
+ buf = vol.getxattr(mypath,"trusted.key1",32)
+ if type(buf) == types.IntType:
+ return False, "getxattr error"
+ if buf != "hello":
+ return False, "wrong getxattr value %s" % buf
+ return True, "getxattr worked"
+
+ def test_listxattr (vol, path, data):
+ mypath = path + ".xa"
+ xattrs = vol.listxattr(mypath)
+ if type(xattrs) == types.IntType:
+ return False, "listxattr error"
+ if xattrs != ["trusted.key1","trusted.key2"]:
+ return False, "wrong listxattr value %s" % repr(xattrs)
+ return True, "listxattr worked"
+
+ def test_fallocate (vol, path, data):
+ mypath = path + ".io"
+ fd = vol.creat(mypath,os.O_WRONLY|os.O_EXCL,0644)
+ if not fd:
+ return False, "creat error"
+ rc = fd.fallocate(0, 0, 1024*1024)
+ if rc != 0:
+ return False, "fallocate error"
+ rc = fd.discard(4096, 4096)
+ if rc != 0:
+ return False, "discard error"
+ return True, "fallocate/discard worked"
+
+ test_list = (
+ test_create_write,
+ test_open_read,
+ test_lstat,
+ test_rename,
+ test_unlink,
+ test_mkdir,
+ test_create_in_dir,
+ test_dir_listing,
+ test_unlink_in_dir,
+ test_rmdir,
+ test_setxattr,
+ test_getxattr,
+ test_listxattr,
+ test_fallocate,
+ )
+
+ ok_to_fail = (
+ # TBD: this fails opening the new file, even though the file
+ # did get renamed. Looks like a gfapi bug, not ours.
+ (test_rename, "new path not working after rename"),
+ # TBD: similar, call returns error even though it worked
+ (test_rmdir, "dir still there after rmdir"),
+ )
+
+ volid, path = sys.argv[1:3]
+ data = "fubar"
+ vol = Volume("localhost",volid)
+ vol.set_logging("/dev/null",7)
+ #vol.set_logging("/dev/stderr",7)
+ vol.mount()
+
+ failures = 0
+ expected = 0
+ for t in test_list:
+ rc, msg = t(vol,path,data)
+ if rc:
+ print "PASS: %s" % msg
+ else:
+ print "FAIL: %s" % msg
+ failures += 1
+ for otf in ok_to_fail:
+ if (t == otf[0]) and (msg == otf[1]):
+ print " (skipping known failure)"
+ expected += 1
+ break # from the *inner* for loop
+ else:
+ break # from the *outer* for loop
+
+ print "%d failures (%d expected)" % (failures, expected)
diff --git a/api/examples/glfsxmp.c b/api/examples/glfsxmp.c
index 9497128af..600d72fb5 100644
--- a/api/examples/glfsxmp.c
+++ b/api/examples/glfsxmp.c
@@ -1,83 +1,1598 @@
#include <stdio.h>
+#include <stdlib.h>
#include <errno.h>
#include "api/glfs.h"
+#include "api/glfs-handles.h"
#include <string.h>
#include <time.h>
+
+int
+test_dirops (glfs_t *fs)
+{
+ glfs_fd_t *fd = NULL;
+ char buf[512];
+ struct dirent *entry = NULL;
+
+ fd = glfs_opendir (fs, "/");
+ if (!fd) {
+ fprintf (stderr, "/: %s\n", strerror (errno));
+ return -1;
+ }
+
+ fprintf (stderr, "Entries:\n");
+ while (glfs_readdir_r (fd, (struct dirent *)buf, &entry), entry) {
+ fprintf (stderr, "%s: %lu\n", entry->d_name, glfs_telldir (fd));
+ }
+
+ glfs_closedir (fd);
+ return 0;
+}
+
+
+int
+test_xattr (glfs_t *fs)
+{
+ char *filename = "/filename2";
+ char buf[512];
+ char *ptr;
+ int ret;
+
+ ret = glfs_setxattr (fs, filename, "user.testkey", "testval", 8, 0);
+ fprintf (stderr, "setxattr(%s): %d (%s)\n", filename, ret,
+ strerror (errno));
+
+ ret = glfs_setxattr (fs, filename, "user.testkey2", "testval", 8, 0);
+ fprintf (stderr, "setxattr(%s): %d (%s)\n", filename, ret,
+ strerror (errno));
+
+ ret = glfs_listxattr (fs, filename, buf, 512);
+ fprintf (stderr, "listxattr(%s): %d (%s)\n", filename, ret,
+ strerror (errno));
+ if (ret < 0)
+ return -1;
+
+ for (ptr = buf; ptr < buf + ret; ptr++) {
+ printf ("key=%s\n", ptr);
+ ptr += strlen (ptr);
+ }
+
+ return 0;
+}
+
+
+int
+test_chdir (glfs_t *fs)
+{
+ int ret = -1;
+ char *topdir = "/topdir";
+ char *linkdir = "/linkdir";
+ char *subdir = "./subdir";
+ char *respath = NULL;
+ char pathbuf[4096];
+
+ ret = glfs_mkdir (fs, topdir, 0755);
+ if (ret) {
+ fprintf (stderr, "mkdir(%s): %s\n", topdir, strerror (errno));
+ return -1;
+ }
+
+ respath = glfs_getcwd (fs, pathbuf, 4096);
+ fprintf (stdout, "getcwd() = %s\n", respath);
+
+ ret = glfs_symlink (fs, topdir, linkdir);
+ if (ret) {
+ fprintf (stderr, "symlink(%s, %s): %s\n", topdir, linkdir, strerror (errno));
+ return -1;
+ }
+
+ ret = glfs_chdir (fs, linkdir);
+ if (ret) {
+ fprintf (stderr, "chdir(%s): %s\n", linkdir, strerror (errno));
+ return -1;
+ }
+
+ respath = glfs_getcwd (fs, pathbuf, 4096);
+ fprintf (stdout, "getcwd() = %s\n", respath);
+
+ respath = glfs_realpath (fs, subdir, pathbuf);
+ if (respath) {
+ fprintf (stderr, "realpath(%s) worked unexpectedly: %s\n", subdir, respath);
+ return -1;
+ }
+
+ ret = glfs_mkdir (fs, subdir, 0755);
+ if (ret) {
+ fprintf (stderr, "mkdir(%s): %s\n", subdir, strerror (errno));
+ return -1;
+ }
+
+ respath = glfs_realpath (fs, subdir, pathbuf);
+ if (!respath) {
+ fprintf (stderr, "realpath(%s): %s\n", subdir, strerror (errno));
+ } else {
+ fprintf (stdout, "realpath(%s) = %s\n", subdir, respath);
+ }
+
+ ret = glfs_chdir (fs, subdir);
+ if (ret) {
+ fprintf (stderr, "chdir(%s): %s\n", subdir, strerror (errno));
+ return -1;
+ }
+
+ respath = glfs_getcwd (fs, pathbuf, 4096);
+ fprintf (stdout, "getcwd() = %s\n", respath);
+
+ respath = glfs_realpath (fs, "/linkdir/subdir", pathbuf);
+ if (!respath) {
+ fprintf (stderr, "realpath(/linkdir/subdir): %s\n", strerror (errno));
+ } else {
+ fprintf (stdout, "realpath(/linkdir/subdir) = %s\n", respath);
+ }
+
+ return 0;
+}
+
+#ifdef DEBUG
+static void
+peek_stat (struct stat *sb)
+{
+ printf ("Dumping stat information:\n");
+ printf ("File type: ");
+
+ switch (sb->st_mode & S_IFMT) {
+ case S_IFBLK: printf ("block device\n"); break;
+ case S_IFCHR: printf ("character device\n"); break;
+ case S_IFDIR: printf ("directory\n"); break;
+ case S_IFIFO: printf ("FIFO/pipe\n"); break;
+ case S_IFLNK: printf ("symlink\n"); break;
+ case S_IFREG: printf ("regular file\n"); break;
+ case S_IFSOCK: printf ("socket\n"); break;
+ default: printf ("unknown?\n"); break;
+ }
+
+ printf ("I-node number: %ld\n", (long) sb->st_ino);
+
+ printf ("Mode: %lo (octal)\n",
+ (unsigned long) sb->st_mode);
+
+ printf ("Link count: %ld\n", (long) sb->st_nlink);
+ printf ("Ownership: UID=%ld GID=%ld\n",
+ (long) sb->st_uid, (long) sb->st_gid);
+
+ printf ("Preferred I/O block size: %ld bytes\n",
+ (long) sb->st_blksize);
+ printf ("File size: %lld bytes\n",
+ (long long) sb->st_size);
+ printf ("Blocks allocated: %lld\n",
+ (long long) sb->st_blocks);
+
+ printf ("Last status change: %s", ctime(&sb->st_ctime));
+ printf ("Last file access: %s", ctime(&sb->st_atime));
+ printf ("Last file modification: %s", ctime(&sb->st_mtime));
+
+ return;
+}
+
+static void
+peek_handle (unsigned char *glid)
+{
+ int i;
+
+ for (i = 0; i < GFAPI_HANDLE_LENGTH; i++)
+ {
+ printf (":%02x:", glid[i]);
+ }
+ printf ("\n");
+}
+#else /* DEBUG */
+static void
+peek_stat (struct stat *sb)
+{
+ return;
+}
+
+static void
+peek_handle (unsigned char *id)
+{
+ return;
+}
+#endif /* DEBUG */
+
+glfs_t *fs = NULL;
+char *full_parent_name = "/testdir", *parent_name = "testdir";
+
+void
+test_h_unlink (void)
+{
+ char *my_dir = "unlinkdir";
+ char *my_file = "file.txt";
+ char *my_subdir = "dir1";
+ struct glfs_object *parent = NULL, *leaf = NULL, *dir = NULL,
+ *subdir = NULL, *subleaf = NULL;
+ struct stat sb;
+ int ret;
+
+ printf ("glfs_h_unlink tests: In Progress\n");
+
+ /* Prepare tests */
+ parent = glfs_h_lookupat (fs, NULL, full_parent_name, &sb);
+ if (parent == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ full_parent_name, NULL, strerror (errno));
+ printf ("glfs_h_lookupat tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ dir = glfs_h_mkdir (fs, parent, my_dir, 0644, &sb);
+ if (dir == NULL) {
+ fprintf (stderr, "glfs_h_mkdir: error creating %s: from (%p),%s\n",
+ my_dir, parent, strerror (errno));
+ printf ("glfs_h_unlink tests: FAILED\n");
+ goto out;
+ }
+
+ leaf = glfs_h_creat (fs, dir, my_file, O_CREAT, 0644, &sb);
+ if (leaf == NULL) {
+ fprintf (stderr, "glfs_h_creat: error creating %s: from (%p),%s\n",
+ my_file, dir, strerror (errno));
+ printf ("glfs_h_unlink tests: FAILED\n");
+ goto out;
+ }
+
+ subdir = glfs_h_mkdir (fs, dir, my_subdir, 0644, &sb);
+ if (subdir == NULL) {
+ fprintf (stderr, "glfs_h_mkdir: error creating %s: from (%p),%s\n",
+ my_subdir, dir, strerror (errno));
+ printf ("glfs_h_unlink tests: FAILED\n");
+ goto out;
+ }
+
+ subleaf = glfs_h_creat (fs, subdir, my_file, O_CREAT, 0644, &sb);
+ if (subleaf == NULL) {
+ fprintf (stderr, "glfs_h_creat: error creating %s: from (%p),%s\n",
+ my_file, subdir, strerror (errno));
+ printf ("glfs_h_unlink tests: FAILED\n");
+ goto out;
+ }
+
+ /* unlink non empty directory */
+ ret = glfs_h_unlink (fs, dir, my_subdir);
+ if ((ret && errno != ENOTEMPTY) || (ret == 0)) {
+ fprintf (stderr, "glfs_h_unlink: error unlinking %s: it is non empty: %s\n",
+ my_subdir, strerror (errno));
+ printf ("glfs_h_unlink tests: FAILED\n");
+ goto out;
+ }
+
+ /* unlink regular file */
+ ret = glfs_h_unlink (fs, subdir, my_file);
+ if (ret) {
+ fprintf (stderr, "glfs_h_unlink: error unlinking %s: from (%p),%s\n",
+ my_file, subdir, strerror (errno));
+ printf ("glfs_h_unlink tests: FAILED\n");
+ goto out;
+ }
+
+ /* unlink directory */
+ ret = glfs_h_unlink (fs, dir, my_subdir);
+ if (ret) {
+ fprintf (stderr, "glfs_h_unlink: error unlinking %s: from (%p),%s\n",
+ my_subdir, dir, strerror (errno));
+ printf ("glfs_h_unlink tests: FAILED\n");
+ goto out;
+ }
+
+ /* unlink regular file */
+ ret = glfs_h_unlink (fs, dir, my_file);
+ if (ret) {
+ fprintf (stderr, "glfs_h_unlink: error unlinking %s: from (%p),%s\n",
+ my_file, dir, strerror (errno));
+ printf ("glfs_h_unlink tests: FAILED\n");
+ goto out;
+ }
+
+ /* unlink non-existant regular file */
+ ret = glfs_h_unlink (fs, dir, my_file);
+ if ((ret && errno != ENOENT) || (ret == 0)) {
+ fprintf (stderr, "glfs_h_unlink: error unlinking non-existant %s: invalid errno ,%d, %s\n",
+ my_file, ret, strerror (errno));
+ printf ("glfs_h_unlink tests: FAILED\n");
+ goto out;
+ }
+
+ /* unlink non-existant directory */
+ ret = glfs_h_unlink (fs, dir, my_subdir);
+ if ((ret && errno != ENOENT) || (ret == 0)) {
+ fprintf (stderr, "glfs_h_unlink: error unlinking non-existant %s: invalid errno ,%d, %s\n",
+ my_subdir, ret, strerror (errno));
+ printf ("glfs_h_unlink tests: FAILED\n");
+ goto out;
+ }
+
+ /* unlink directory */
+ ret = glfs_h_unlink (fs, parent, my_dir);
+ if (ret) {
+ fprintf (stderr, "glfs_h_unlink: error unlinking %s: from (%p),%s\n",
+ my_dir, dir, strerror (errno));
+ printf ("glfs_h_unlink tests: FAILED\n");
+ goto out;
+ }
+
+ printf ("glfs_h_unlink tests: PASSED\n");
+
+out:
+ if (dir)
+ glfs_h_close (dir);
+ if (leaf)
+ glfs_h_close (leaf);
+ if (subdir)
+ glfs_h_close (subdir);
+ if (subleaf)
+ glfs_h_close (subleaf);
+ if (parent)
+ glfs_h_close (parent);
+
+ return;
+}
+
+void
+test_h_getsetattrs (void)
+{
+ char *my_dir = "attrdir";
+ char *my_file = "attrfile.txt";
+ struct glfs_object *parent = NULL, *leaf = NULL, *dir = NULL;
+ struct stat sb, retsb;
+ int ret, valid;
+ struct timespec timestamp;
+
+ printf("glfs_h_getattrs and setattrs tests: In Progress\n");
+
+ /* Prepare tests */
+ parent = glfs_h_lookupat (fs, NULL, full_parent_name, &sb);
+ if (parent == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ full_parent_name, NULL, strerror (errno));
+ printf ("glfs_h_lookupat tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ dir = glfs_h_mkdir (fs, parent, my_dir, 0644, &sb);
+ if (dir == NULL) {
+ fprintf (stderr, "glfs_h_mkdir: error creating %s: from (%p),%s\n",
+ my_dir, parent, strerror (errno));
+ printf ("glfs_h_unlink tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ leaf = glfs_h_creat (fs, dir, my_file, O_CREAT, 0644, &sb);
+ if (leaf == NULL) {
+ fprintf (stderr, "glfs_h_creat: error creating %s: from (%p),%s\n",
+ my_file, dir, strerror (errno));
+ printf ("glfs_h_unlink tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ ret = glfs_h_getattrs (fs, dir, &retsb);
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_getattrs: error %s: from (%p),%s\n",
+ my_dir, dir, strerror (errno));
+ printf ("glfs_h_getattrs and setattrs tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&retsb);
+ /* TODO: Compare stat information */
+
+ retsb.st_mode = 00666;
+ retsb.st_uid = 1000;
+ retsb.st_gid = 1001;
+ ret = clock_gettime (CLOCK_REALTIME, &timestamp);
+ if(ret != 0) {
+ fprintf (stderr, "clock_gettime: error %s\n", strerror (errno));
+ printf ("glfs_h_getattrs and setattrs tests: FAILED\n");
+ goto out;
+ }
+ retsb.st_atim = timestamp;
+ retsb.st_mtim = timestamp;
+ valid = GFAPI_SET_ATTR_MODE | GFAPI_SET_ATTR_UID | GFAPI_SET_ATTR_GID |
+ GFAPI_SET_ATTR_ATIME | GFAPI_SET_ATTR_MTIME;
+ peek_stat (&retsb);
+
+ ret = glfs_h_setattrs (fs, dir, &retsb, valid);
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_setattrs: error %s: from (%p),%s\n",
+ my_dir, dir, strerror (errno));
+ printf ("glfs_h_getattrs and setattrs tests: FAILED\n");
+ goto out;
+ }
+
+ memset(&retsb, 0, sizeof (struct stat));
+ ret = glfs_h_stat (fs, dir, &retsb);
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_stat: error %s: from (%p),%s\n",
+ my_dir, dir, strerror (errno));
+ printf ("glfs_h_getattrs and setattrs tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&retsb);
+
+ printf ("glfs_h_getattrs and setattrs tests: PASSED\n");
+out:
+ if (parent)
+ glfs_h_close (parent);
+ if (leaf)
+ glfs_h_close (leaf);
+ if (dir)
+ glfs_h_close (dir);
+
+ return;
+}
+
+void
+test_h_truncate (void)
+{
+ char *my_dir = "truncatedir";
+ char *my_file = "file.txt";
+ struct glfs_object *root = NULL, *parent = NULL, *leaf = NULL;
+ struct stat sb;
+ glfs_fd_t *fd = NULL;
+ char buf[32];
+ off_t offset = 0;
+ int ret = 0;
+
+ printf("glfs_h_truncate tests: In Progress\n");
+
+ /* Prepare tests */
+ root = glfs_h_lookupat (fs, NULL, full_parent_name, &sb);
+ if (root == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ full_parent_name, NULL, strerror (errno));
+ printf ("glfs_h_truncate tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ parent = glfs_h_mkdir (fs, root, my_dir, 0644, &sb);
+ if (parent == NULL) {
+ fprintf (stderr, "glfs_h_mkdir: error creating %s: from (%p),%s\n",
+ my_dir, root, strerror (errno));
+ printf ("glfs_h_truncate tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ leaf = glfs_h_creat (fs, parent, my_file, O_CREAT, 0644, &sb);
+ if (leaf == NULL) {
+ fprintf (stderr, "glfs_h_creat: error creating %s: from (%p),%s\n",
+ my_file, parent, strerror (errno));
+ printf ("glfs_h_truncate tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ fd = glfs_h_open (fs, leaf, O_RDWR);
+ if (fd == NULL) {
+ fprintf (stderr, "glfs_h_open: error on open of %s: %s\n",
+ my_file, strerror (errno));
+ printf ("glfs_h_truncate tests: FAILED\n");
+ goto out;
+ }
+
+ memcpy (buf, "abcdefghijklmnopqrstuvwxyz012345", 32);
+ ret = glfs_write (fd, buf, 32, 0);
+
+ /* run tests */
+ /* truncate lower */
+ offset = 30;
+ ret = glfs_h_truncate (fs, leaf, offset);
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_truncate: error creating %s: from (%p),%s\n",
+ my_file, parent, strerror (errno));
+ printf ("glfs_h_truncate tests: FAILED\n");
+ goto out;
+ }
+ ret = glfs_h_getattrs (fs, leaf, &sb);
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_getattrs: error for %s (%p),%s\n",
+ my_file, leaf, strerror (errno));
+ printf ("glfs_h_truncate tests: FAILED\n");
+ goto out;
+ }
+ if (sb.st_size != offset) {
+ fprintf (stderr, "glfs_h_truncate: post size mismatch\n");
+ printf ("glfs_h_truncate tests: FAILED\n");
+ goto out;
+ }
+
+ /* truncate higher */
+ offset = 32;
+ ret = glfs_h_truncate (fs, leaf, offset);
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_truncate: error creating %s: from (%p),%s\n",
+ my_file, parent, strerror (errno));
+ printf ("glfs_h_truncate tests: FAILED\n");
+ goto out;
+ }
+ ret = glfs_h_getattrs (fs, leaf, &sb);
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_getattrs: error for %s (%p),%s\n",
+ my_file, leaf, strerror (errno));
+ printf ("glfs_h_truncate tests: FAILED\n");
+ goto out;
+ }
+ if (sb.st_size != offset) {
+ fprintf (stderr, "glfs_h_truncate: post size mismatch\n");
+ printf ("glfs_h_truncate tests: FAILED\n");
+ goto out;
+ }
+
+ /* truncate equal */
+ offset = 30;
+ ret = glfs_h_truncate (fs, leaf, offset);
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_truncate: error creating %s: from (%p),%s\n",
+ my_file, parent, strerror (errno));
+ printf ("glfs_h_truncate tests: FAILED\n");
+ goto out;
+ }
+ ret = glfs_h_getattrs (fs, leaf, &sb);
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_getattrs: error for %s (%p),%s\n",
+ my_file, leaf, strerror (errno));
+ printf ("glfs_h_truncate tests: FAILED\n");
+ goto out;
+ }
+ if (sb.st_size != offset) {
+ fprintf (stderr, "glfs_h_truncate: post size mismatch\n");
+ printf ("glfs_h_truncate tests: FAILED\n");
+ goto out;
+ }
+
+ printf ("glfs_h_truncate tests: PASSED\n");
+out:
+ if (fd)
+ glfs_close (fd);
+ if (root)
+ glfs_h_close (root);
+ if (parent)
+ glfs_h_close (parent);
+ if (leaf)
+ glfs_h_close (leaf);
+
+ return;
+}
+
+void
+test_h_links (void)
+{
+ char *my_dir = "linkdir";
+ char *my_file = "file.txt";
+ char *my_symlnk = "slnk.txt";
+ char *my_lnk = "lnk.txt";
+ char *linksrc_dir = "dir1";
+ char *linktgt_dir = "dir2";
+ struct glfs_object *root = NULL, *parent = NULL, *leaf = NULL,
+ *dirsrc = NULL, *dirtgt = NULL, *dleaf = NULL;
+ struct glfs_object *ln1 = NULL;
+ struct stat sb;
+ int ret;
+ char *buf = NULL;
+
+ printf("glfs_h_link(s) tests: In Progress\n");
+
+ /* Prepare tests */
+ root = glfs_h_lookupat (fs, NULL, full_parent_name, &sb);
+ if (root == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ full_parent_name, NULL, strerror (errno));
+ printf ("glfs_h_link(s) tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ parent = glfs_h_mkdir (fs, root, my_dir, 0644, &sb);
+ if (parent == NULL) {
+ fprintf (stderr, "glfs_h_mkdir: error creating %s: from (%p),%s\n",
+ my_dir, root, strerror (errno));
+ printf ("glfs_h_link(s) tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ leaf = glfs_h_creat (fs, parent, my_file, O_CREAT, 0644, &sb);
+ if (leaf == NULL) {
+ fprintf (stderr, "glfs_h_creat: error creating %s: from (%p),%s\n",
+ my_file, parent, strerror (errno));
+ printf ("glfs_h_link(s) tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ dirsrc = glfs_h_mkdir (fs, parent, linksrc_dir, 0644, &sb);
+ if (dirsrc == NULL) {
+ fprintf (stderr, "glfs_h_mkdir: error creating %s: from (%p),%s\n",
+ linksrc_dir, parent, strerror (errno));
+ printf ("glfs_h_link(s) tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ dirtgt = glfs_h_mkdir (fs, parent, linktgt_dir, 0644, &sb);
+ if (dirtgt == NULL) {
+ fprintf (stderr, "glfs_h_mkdir: error creating %s: from (%p),%s\n",
+ linktgt_dir, parent, strerror (errno));
+ printf ("glfs_h_link(s) tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ dleaf = glfs_h_creat (fs, dirsrc, my_file, O_CREAT, 0644, &sb);
+ if (dleaf == NULL) {
+ fprintf (stderr, "glfs_h_creat: error creating %s: from (%p),%s\n",
+ my_file, dirsrc, strerror (errno));
+ printf ("glfs_h_link(s) tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ /* run tests */
+ /* sym link: /testdir/linkdir/file.txt to ./slnk.txt */
+ ln1 = glfs_h_symlink (fs, parent, my_symlnk, "./file.txt", &sb);
+ if (ln1 == NULL) {
+ fprintf (stderr, "glfs_h_symlink: error creating %s: from (%p),%s\n",
+ my_symlnk, parent, strerror (errno));
+ printf ("glfs_h_link(s) tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ buf = calloc (1024, sizeof(char));
+ if (buf == NULL) {
+ fprintf (stderr, "Error allocating memory\n");
+ printf ("glfs_h_link(s) tests: FAILED\n");
+ goto out;
+ }
+
+ ret = glfs_h_readlink (fs, ln1, buf, 1024);
+ if (ret <= 0) {
+ fprintf (stderr, "glfs_h_readlink: error reading %s: from (%p),%s\n",
+ my_symlnk, ln1, strerror (errno));
+ printf ("glfs_h_link(s) tests: FAILED\n");
+ goto out;
+ }
+ if (!(strncmp (buf, my_symlnk, strlen (my_symlnk)))) {
+ fprintf (stderr, "glfs_h_readlink: error mismatch in link name: actual %s: retrieved %s\n",
+ my_symlnk, buf);
+ printf ("glfs_h_link(s) tests: FAILED\n");
+ goto out;
+ }
+
+ /* link: /testdir/linkdir/file.txt to ./lnk.txt */
+ ret = glfs_h_link (fs, leaf, parent, my_lnk);
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_link: error creating %s: from (%p),%s\n",
+ my_lnk, parent, strerror (errno));
+ printf ("glfs_h_link(s) tests: FAILED\n");
+ goto out;
+ }
+ /* TODO: Should write content to a file and read from the link */
+
+ /* link: /testdir/linkdir/dir1/file.txt to ../dir2/slnk.txt */
+ ret = glfs_h_link (fs, dleaf, dirtgt, my_lnk);
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_link: error creating %s: from (%p),%s\n",
+ my_lnk, dirtgt, strerror (errno));
+ printf ("glfs_h_link(s) tests: FAILED\n");
+ goto out;
+ }
+ /* TODO: Should write content to a file and read from the link */
+
+ printf ("glfs_h_link(s) tests: PASSED\n");
+
+out:
+ if (root)
+ glfs_h_close (root);
+ if (parent)
+ glfs_h_close (parent);
+ if (leaf)
+ glfs_h_close (leaf);
+ if (dirsrc)
+ glfs_h_close (dirsrc);
+ if (dirtgt)
+ glfs_h_close (dirtgt);
+ if (dleaf)
+ glfs_h_close (dleaf);
+ if (ln1)
+ glfs_h_close (ln1);
+ if (buf)
+ free (buf);
+
+ return;
+}
+
+void
+test_h_rename (void)
+{
+ char *my_dir = "renamedir";
+ char *my_file = "file.txt";
+ char *src_dir = "dir1";
+ char *tgt_dir = "dir2";
+ struct glfs_object *root = NULL, *parent = NULL, *leaf = NULL,
+ *dirsrc = NULL, *dirtgt = NULL, *dleaf = NULL;
+ struct stat sb;
+ int ret;
+
+ printf("glfs_h_rename tests: In Progress\n");
+
+ /* Prepare tests */
+ root = glfs_h_lookupat (fs, NULL, full_parent_name, &sb);
+ if (root == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ full_parent_name, NULL, strerror (errno));
+ printf ("glfs_h_rename tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ parent = glfs_h_mkdir (fs, root, my_dir, 0644, &sb);
+ if (parent == NULL) {
+ fprintf (stderr, "glfs_h_mkdir: error creating %s: from (%p),%s\n",
+ my_dir, root, strerror (errno));
+ printf ("glfs_h_rename tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ leaf = glfs_h_creat (fs, parent, my_file, O_CREAT, 0644, &sb);
+ if (leaf == NULL) {
+ fprintf (stderr, "glfs_h_creat: error creating %s: from (%p),%s\n",
+ my_file, parent, strerror (errno));
+ printf ("glfs_h_rename tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ dirsrc = glfs_h_mkdir (fs, parent, src_dir, 0644, &sb);
+ if (dirsrc == NULL) {
+ fprintf (stderr, "glfs_h_mkdir: error creating %s: from (%p),%s\n",
+ src_dir, parent, strerror (errno));
+ printf ("glfs_h_rename tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ dirtgt = glfs_h_mkdir (fs, parent, tgt_dir, 0644, &sb);
+ if (dirtgt == NULL) {
+ fprintf (stderr, "glfs_h_mkdir: error creating %s: from (%p),%s\n",
+ tgt_dir, parent, strerror (errno));
+ printf ("glfs_h_rename tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ dleaf = glfs_h_creat (fs, dirsrc, my_file, O_CREAT, 0644, &sb);
+ if (dleaf == NULL) {
+ fprintf (stderr, "glfs_h_creat: error creating %s: from (%p),%s\n",
+ my_file, dirsrc, strerror (errno));
+ printf ("glfs_h_rename tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ /* run tests */
+ /* Rename file.txt -> file1.txt */
+ ret = glfs_h_rename (fs, parent, "file.txt", parent, "file1.txt");
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_rename: error renaming %s to %s (%s)\n",
+ "file.txt", "file1.txt", strerror (errno));
+ printf ("glfs_h_rename tests: FAILED\n");
+ goto out;
+ }
+
+ /* rename dir1/file.txt -> file.txt */
+ ret = glfs_h_rename (fs, dirsrc, "file.txt", parent, "file.txt");
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_rename: error renaming %s/%s to %s (%s)\n",
+ src_dir, "file.txt", "file.txt", strerror (errno));
+ printf ("glfs_h_rename tests: FAILED\n");
+ goto out;
+ }
+
+ /* rename file1.txt -> file.txt (exists) */
+ ret = glfs_h_rename (fs, parent, "file1.txt", parent, "file.txt");
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_rename: error renaming %s to %s (%s)\n",
+ "file.txt", "file.txt", strerror (errno));
+ printf ("glfs_h_rename tests: FAILED\n");
+ goto out;
+ }
+
+ /* rename dir1 -> dir3 */
+ ret = glfs_h_rename (fs, parent, "dir1", parent, "dir3");
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_rename: error renaming %s to %s (%s)\n",
+ "dir1", "dir3", strerror (errno));
+ printf ("glfs_h_rename tests: FAILED\n");
+ goto out;
+ }
+
+ /* rename dir2 ->dir3 (exists) */
+ ret = glfs_h_rename (fs, parent, "dir2", parent, "dir3");
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_rename: error renaming %s to %s (%s)\n",
+ "dir2", "dir3", strerror (errno));
+ printf ("glfs_h_rename tests: FAILED\n");
+ goto out;
+ }
+
+ /* rename file.txt -> dir3 (fail) */
+ ret = glfs_h_rename (fs, parent, "file.txt", parent, "dir3");
+ if (ret == 0) {
+ fprintf (stderr, "glfs_h_rename: NO error renaming %s to %s (%s)\n",
+ "file.txt", "dir3", strerror (errno));
+ printf ("glfs_h_rename tests: FAILED\n");
+ goto out;
+ }
+
+ /* rename dir3 -> file.txt (fail) */
+ ret = glfs_h_rename (fs, parent, "dir3", parent, "file.txt");
+ if (ret == 0) {
+ fprintf (stderr, "glfs_h_rename: NO error renaming %s to %s (%s)\n",
+ "dir3", "file.txt", strerror (errno));
+ printf ("glfs_h_rename tests: FAILED\n");
+ goto out;
+ }
+
+ printf ("glfs_h_rename tests: PASSED\n");
+
+out:
+ if (root)
+ glfs_h_close (root);
+ if (parent)
+ glfs_h_close (parent);
+ if (leaf)
+ glfs_h_close (leaf);
+ if (dirsrc)
+ glfs_h_close (dirsrc);
+ if (dirtgt)
+ glfs_h_close (dirtgt);
+ if (dleaf)
+ glfs_h_close (dleaf);
+
+ return;
+}
+
+void
+assimilatetime (struct timespec *ts, struct timespec ts_st,
+ struct timespec ts_ed)
+{
+ if ((ts_ed.tv_nsec - ts_st.tv_nsec) < 0) {
+ ts->tv_sec += ts_ed.tv_sec - ts_st.tv_sec - 1;
+ ts->tv_nsec += 1000000000 + ts_ed.tv_nsec - ts_st.tv_nsec;
+ } else {
+ ts->tv_sec += ts_ed.tv_sec - ts_st.tv_sec;
+ ts->tv_nsec += ts_ed.tv_nsec - ts_st.tv_nsec;
+ }
+
+ if (ts->tv_nsec > 1000000000) {
+ ts->tv_nsec = ts->tv_nsec - 1000000000;
+ ts->tv_sec += 1;
+ }
+
+ return;
+}
+
+#define MAX_FILES_CREATE 10
+#define MAXPATHNAME 512
+void
+test_h_performance (void)
+{
+ char *my_dir = "perftest",
+ *full_dir_path="/testdir/perftest";
+ char *my_file = "file_", my_file_name[MAXPATHNAME];
+ struct glfs_object *parent = NULL, *leaf = NULL, *dir = NULL;
+ struct stat sb;
+ int ret, i;
+ struct glfs_fd *fd;
+ struct timespec c_ts = {0, 0}, c_ts_st, c_ts_ed;
+ struct timespec o_ts = {0, 0}, o_ts_st, o_ts_ed;
+
+ printf("glfs_h_performance tests: In Progress\n");
+
+ /* Prepare tests */
+ parent = glfs_h_lookupat (fs, NULL, full_parent_name, &sb);
+ if (parent == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ full_parent_name, NULL, strerror (errno));
+ printf ("glfs_h_performance tests: FAILED\n");
+ goto out;
+ }
+
+ dir = glfs_h_mkdir (fs, parent, my_dir, 0644, &sb);
+ if (dir == NULL) {
+ fprintf (stderr, "glfs_h_mkdir: error creating %s: from (%p),%s\n",
+ my_dir, parent, strerror (errno));
+ printf ("glfs_h_performance tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ /* create performance */
+ ret = clock_gettime (CLOCK_REALTIME, &o_ts_st);
+ if(ret != 0) {
+ fprintf (stderr, "clock_gettime: error %s\n", strerror (errno));
+ printf ("glfs_h_getattrs and setattrs tests: FAILED\n");
+ goto out;
+ }
+
+ for (i = 0; i < MAX_FILES_CREATE; i++) {
+ sprintf (my_file_name, "%s%d", my_file, i);
+
+ ret = clock_gettime (CLOCK_REALTIME, &c_ts_st);
+ if(ret != 0) {
+ fprintf (stderr, "clock_gettime: error %s\n",
+ strerror (errno));
+ printf ("glfs_h_getattrs and setattrs tests: FAILED\n");
+ goto out;
+ }
+
+ leaf = glfs_h_lookupat (fs, dir, my_file_name, &sb);
+ if (leaf != NULL) {
+ fprintf (stderr, "glfs_h_lookup: exists %s\n",
+ my_file_name);
+ printf ("glfs_h_performance tests: FAILED\n");
+ goto out;
+ }
+
+ leaf = glfs_h_creat (fs, dir, my_file_name, O_CREAT, 0644, &sb);
+ if (leaf == NULL) {
+ fprintf (stderr, "glfs_h_creat: error creating %s: from (%p),%s\n",
+ my_file, dir, strerror (errno));
+ printf ("glfs_h_performance tests: FAILED\n");
+ goto out;
+ }
+
+ ret = clock_gettime (CLOCK_REALTIME, &c_ts_ed);
+ if(ret != 0) {
+ fprintf (stderr, "clock_gettime: error %s\n",
+ strerror (errno));
+ printf ("glfs_h_getattrs and setattrs tests: FAILED\n");
+ goto out;
+ }
+
+ assimilatetime (&c_ts, c_ts_st, c_ts_ed);
+ glfs_h_close (leaf); leaf = NULL;
+ }
+
+ ret = clock_gettime (CLOCK_REALTIME, &o_ts_ed);
+ if(ret != 0) {
+ fprintf (stderr, "clock_gettime: error %s\n", strerror (errno));
+ printf ("glfs_h_getattrs and setattrs tests: FAILED\n");
+ goto out;
+ }
+
+ assimilatetime (&o_ts, o_ts_st, o_ts_ed);
+
+ printf ("Creation performance (handle based):\n\t# empty files:%d\n",
+ MAX_FILES_CREATE);
+ printf ("\tOverall time:\n\t\tSecs:%ld\n\t\tnSecs:%ld\n",
+ o_ts.tv_sec, o_ts.tv_nsec);
+ printf ("\tcreate call time time:\n\t\tSecs:%ld\n\t\tnSecs:%ld\n",
+ c_ts.tv_sec, c_ts.tv_nsec);
+
+ /* create using path */
+ c_ts.tv_sec = o_ts.tv_sec = 0;
+ c_ts.tv_nsec = o_ts.tv_nsec = 0;
+
+ sprintf (my_file_name, "%s1", full_dir_path);
+ ret = glfs_mkdir (fs, my_file_name, 0644);
+ if (ret != 0) {
+ fprintf (stderr, "glfs_mkdir: error creating %s: from (%p),%s\n",
+ my_dir, parent, strerror (errno));
+ printf ("glfs_h_performance tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ ret = clock_gettime (CLOCK_REALTIME, &o_ts_st);
+ if(ret != 0) {
+ fprintf (stderr, "clock_gettime: error %s\n", strerror (errno));
+ printf ("glfs_h_getattrs and setattrs tests: FAILED\n");
+ goto out;
+ }
+
+ for (i = 0; i < MAX_FILES_CREATE; i++) {
+ sprintf (my_file_name, "%s1/%sn%d", full_dir_path, my_file, i);
+
+ ret = clock_gettime (CLOCK_REALTIME, &c_ts_st);
+ if(ret != 0) {
+ fprintf (stderr, "clock_gettime: error %s\n",
+ strerror (errno));
+ printf ("glfs_h_getattrs and setattrs tests: FAILED\n");
+ goto out;
+ }
+
+ ret = glfs_stat (fs, my_file_name, &sb);
+ if (ret == 0) {
+ fprintf (stderr, "glfs_stat: exists %s\n",
+ my_file_name);
+ printf ("glfs_h_performance tests: FAILED\n");
+ goto out;
+ }
+
+ fd = glfs_creat (fs, my_file_name, O_CREAT, 0644);
+ if (fd == NULL) {
+ fprintf (stderr, "glfs_creat: error creating %s: from (%p),%s\n",
+ my_file, dir, strerror (errno));
+ printf ("glfs_h_performance tests: FAILED\n");
+ goto out;
+ }
+
+ ret = clock_gettime (CLOCK_REALTIME, &c_ts_ed);
+ if(ret != 0) {
+ fprintf (stderr, "clock_gettime: error %s\n",
+ strerror (errno));
+ printf ("glfs_h_getattrs and setattrs tests: FAILED\n");
+ goto out;
+ }
+
+ assimilatetime (&c_ts, c_ts_st, c_ts_ed);
+ glfs_close (fd);
+ }
+
+ ret = clock_gettime (CLOCK_REALTIME, &o_ts_ed);
+ if(ret != 0) {
+ fprintf (stderr, "clock_gettime: error %s\n", strerror (errno));
+ printf ("glfs_h_getattrs and setattrs tests: FAILED\n");
+ goto out;
+ }
+
+ assimilatetime (&o_ts, o_ts_st, o_ts_ed);
+
+ printf ("Creation performance (path based):\n\t# empty files:%d\n",
+ MAX_FILES_CREATE);
+ printf ("\tOverall time:\n\t\tSecs:%ld\n\t\tnSecs:%ld\n",
+ o_ts.tv_sec, o_ts.tv_nsec);
+ printf ("\tcreate call time time:\n\t\tSecs:%ld\n\t\tnSecs:%ld\n",
+ c_ts.tv_sec, c_ts.tv_nsec);
+out:
+ return;
+}
+
+int
+test_handleops (int argc, char *argv[])
+{
+ int ret = 0;
+ glfs_fd_t *fd = NULL;
+ struct stat sb = {0, };
+ struct glfs_object *root = NULL, *parent = NULL, *leaf = NULL,
+ *tmp = NULL;
+ char readbuf[32], writebuf[32];
+ unsigned char leaf_handle[GFAPI_HANDLE_LENGTH];
+
+ char *full_leaf_name = "/testdir/testfile.txt",
+ *leaf_name = "testfile.txt",
+ *relative_leaf_name = "testdir/testfile.txt";
+ char *leaf_name1 = "testfile1.txt";
+ char *full_newparent_name = "/testdir/dir1",
+ *newparent_name = "dir1";
+ char *full_newnod_name = "/testdir/nod1",
+ *newnod_name = "nod1";
+
+ /* Initialize test area */
+ ret = glfs_mkdir (fs, full_parent_name, 0644);
+ if (ret != 0 && errno != EEXIST) {
+ fprintf (stderr, "%s: (%p) %s\n", full_parent_name, fd,
+ strerror (errno));
+ printf ("Test initialization failed on volume %s\n", argv[1]);
+ goto out;
+ }
+ else if (ret != 0) {
+ printf ("Found test directory %s to be existing\n",
+ full_parent_name);
+ printf ("Cleanup test directory and restart tests\n");
+ goto out;
+ }
+
+ fd = glfs_creat (fs, full_leaf_name, O_CREAT, 0644);
+ if (fd == NULL) {
+ fprintf (stderr, "%s: (%p) %s\n", full_leaf_name, fd,
+ strerror (errno));
+ printf ("Test initialization failed on volume %s\n", argv[1]);
+ goto out;
+ }
+ glfs_close (fd);
+
+ printf ("Initialized the test area, within volume %s\n", argv[1]);
+
+ /* Handle based APIs test area */
+
+ /* glfs_lookupat test */
+ printf ("glfs_h_lookupat tests: In Progress\n");
+ /* start at root of the volume */
+ root = glfs_h_lookupat (fs, NULL, "/", &sb);
+ if (root == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ "/", NULL, strerror (errno));
+ printf ("glfs_h_lookupat tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ /* lookup a parent within root */
+ parent = glfs_h_lookupat (fs, root, parent_name, &sb);
+ if (parent == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ parent_name, root, strerror (errno));
+ printf ("glfs_h_lookupat tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ /* lookup a leaf/child within the parent */
+ leaf = glfs_h_lookupat (fs, parent, leaf_name, &sb);
+ if (leaf == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ leaf_name, parent, strerror (errno));
+ printf ("glfs_h_lookupat tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ /* reset */
+ glfs_h_close (root); root = NULL;
+ glfs_h_close (leaf); leaf = NULL;
+ glfs_h_close (parent); parent = NULL;
+
+ /* check absolute paths */
+ root = glfs_h_lookupat (fs, NULL, "/", &sb);
+ if (root == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ "/", NULL, strerror (errno));
+ printf ("glfs_h_lookupat tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ parent = glfs_h_lookupat (fs, NULL, full_parent_name, &sb);
+ if (parent == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ full_parent_name, root, strerror (errno));
+ printf ("glfs_h_lookupat tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ leaf = glfs_h_lookupat (fs, NULL, full_leaf_name, &sb);
+ if (leaf == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ full_leaf_name, parent, strerror (errno));
+ printf ("glfs_h_lookupat tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ /* reset */
+ glfs_h_close (leaf); leaf = NULL;
+
+ /* check multiple component paths */
+ leaf = glfs_h_lookupat (fs, root, relative_leaf_name, &sb);
+ if (leaf == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ relative_leaf_name, parent, strerror (errno));
+ goto out;
+ }
+ peek_stat (&sb);
+
+ /* reset */
+ glfs_h_close (root); root = NULL;
+ glfs_h_close (parent); parent = NULL;
+
+ /* check symlinks in path */
+
+ /* TODO: -ve test cases */
+ /* parent invalid
+ * path invalid
+ * path does not exist after some components
+ * no parent, but relative path
+ * parent and full path? -ve?
+ */
+
+ printf ("glfs_h_lookupat tests: PASSED\n");
+
+ /* glfs_openat test */
+ printf ("glfs_h_open tests: In Progress\n");
+ fd = glfs_h_open (fs, leaf, O_RDWR);
+ if (fd == NULL) {
+ fprintf (stderr, "glfs_h_open: error on open of %s: %s\n",
+ full_leaf_name, strerror (errno));
+ printf ("glfs_h_open tests: FAILED\n");
+ goto out;
+ }
+
+ /* test read/write based on fd */
+ memcpy (writebuf, "abcdefghijklmnopqrstuvwxyz012345", 32);
+ ret = glfs_write (fd, writebuf, 32, 0);
+
+ glfs_lseek (fd, 0, SEEK_SET);
+
+ ret = glfs_read (fd, readbuf, 32, 0);
+ if (memcmp (readbuf, writebuf, 32)) {
+ printf ("Failed to read what I wrote: %s %s\n", readbuf,
+ writebuf);
+ glfs_close (fd);
+ printf ("glfs_h_open tests: FAILED\n");
+ goto out;
+ }
+
+ glfs_h_close (leaf); leaf = NULL;
+ glfs_close (fd);
+
+ printf ("glfs_h_open tests: PASSED\n");
+
+ /* Create tests */
+ printf ("glfs_h_creat tests: In Progress\n");
+ parent = glfs_h_lookupat (fs, NULL, full_parent_name, &sb);
+ if (parent == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ full_parent_name, root, strerror (errno));
+ printf ("glfs_h_creat tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ leaf = glfs_h_creat (fs, parent, leaf_name1, O_CREAT, 0644, &sb);
+ if (leaf == NULL) {
+ fprintf (stderr, "glfs_h_creat: error on create of %s: from (%p),%s\n",
+ leaf_name1, parent, strerror (errno));
+ printf ("glfs_h_creat tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ glfs_h_close (leaf); leaf = NULL;
+
+ leaf = glfs_h_creat (fs, parent, leaf_name1, O_CREAT | O_EXCL, 0644,
+ &sb);
+ if (leaf != NULL || errno != EEXIST) {
+ fprintf (stderr, "glfs_h_creat: existing file, leaf = (%p), errno = %s\n",
+ leaf, strerror (errno));
+ printf ("glfs_h_creat tests: FAILED\n");
+ if (leaf != NULL) {
+ glfs_h_close (leaf); leaf = NULL;
+ }
+ }
+
+ tmp = glfs_h_creat (fs, root, parent_name, O_CREAT, 0644, &sb);
+ if (tmp != NULL || !(errno == EISDIR || errno == EINVAL)) {
+ fprintf (stderr, "glfs_h_creat: dir create, tmp = (%p), errno = %s\n",
+ leaf, strerror (errno));
+ printf ("glfs_h_creat tests: FAILED\n");
+ if (tmp != NULL) {
+ glfs_h_close (tmp); tmp = NULL;
+ }
+ }
+
+ /* TODO: Other combinations and -ve cases as applicable */
+ printf ("glfs_h_creat tests: PASSED\n");
+
+ /* extract handle and create from handle test */
+ printf ("glfs_h_extract_handle and glfs_h_create_from_handle tests: In Progress\n");
+ /* TODO: Change the lookup to creat below for a GIFD recovery falure,
+ * that needs to be fixed */
+ leaf = glfs_h_lookupat (fs, parent, leaf_name1, &sb);
+ if (leaf == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ leaf_name1, parent, strerror (errno));
+ printf ("glfs_h_extract_handle tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ ret = glfs_h_extract_handle (leaf, leaf_handle,
+ GFAPI_HANDLE_LENGTH);
+ if (ret < 0) {
+ fprintf (stderr, "glfs_h_extract_handle: error extracting handle of %s: %s\n",
+ full_leaf_name, strerror (errno));
+ printf ("glfs_h_extract_handle tests: FAILED\n");
+ goto out;
+ }
+ peek_handle (leaf_handle);
+
+ glfs_h_close (leaf); leaf = NULL;
+
+ leaf = glfs_h_create_from_handle (fs, leaf_handle, GFAPI_HANDLE_LENGTH,
+ &sb);
+ if (leaf == NULL) {
+ fprintf (stderr, "glfs_h_create_from_handle: error on create of %s: from (%p),%s\n",
+ leaf_name1, leaf_handle, strerror (errno));
+ printf ("glfs_h_create_from_handle tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ fd = glfs_h_open (fs, leaf, O_RDWR);
+ if (fd == NULL) {
+ fprintf (stderr, "glfs_h_open: error on open of %s: %s\n",
+ full_leaf_name, strerror (errno));
+ printf ("glfs_h_create_from_handle tests: FAILED\n");
+ goto out;
+ }
+
+ /* test read/write based on fd */
+ memcpy (writebuf, "abcdefghijklmnopqrstuvwxyz012345", 32);
+ ret = glfs_write (fd, writebuf, 32, 0);
+
+ glfs_lseek (fd, 0, SEEK_SET);
+
+ ret = glfs_read (fd, readbuf, 32, 0);
+ if (memcmp (readbuf, writebuf, 32)) {
+ printf ("Failed to read what I wrote: %s %s\n", writebuf,
+ writebuf);
+ printf ("glfs_h_create_from_handle tests: FAILED\n");
+ glfs_close (fd);
+ goto out;
+ }
+
+ glfs_close (fd);
+ glfs_h_close (leaf); leaf = NULL;
+ glfs_h_close (parent); parent = NULL;
+
+ printf ("glfs_h_extract_handle and glfs_h_create_from_handle tests: PASSED\n");
+
+ /* Mkdir tests */
+ printf ("glfs_h_mkdir tests: In Progress\n");
+
+ ret = glfs_rmdir (fs, full_newparent_name);
+ if (ret && errno != ENOENT) {
+ fprintf (stderr, "glfs_rmdir: Failed for %s: %s\n",
+ full_newparent_name, strerror (errno));
+ printf ("glfs_h_mkdir tests: FAILED\n");
+ goto out;
+ }
+
+ parent = glfs_h_lookupat (fs, NULL, full_parent_name, &sb);
+ if (parent == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ full_parent_name, root, strerror (errno));
+ printf ("glfs_h_mkdir tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ leaf = glfs_h_mkdir (fs, parent, newparent_name, 0644, &sb);
+ if (leaf == NULL) {
+ fprintf (stderr, "glfs_h_mkdir: error on mkdir of %s: from (%p),%s\n",
+ newparent_name, parent, strerror (errno));
+ printf ("glfs_h_mkdir tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ glfs_h_close (leaf); leaf = NULL;
+
+ leaf = glfs_h_mkdir (fs, parent, newparent_name, 0644, &sb);
+ if (leaf != NULL || errno != EEXIST) {
+ fprintf (stderr, "glfs_h_mkdir: existing directory, leaf = (%p), errno = %s\n",
+ leaf, strerror (errno));
+ printf ("glfs_h_mkdir tests: FAILED\n");
+ if (leaf != NULL) {
+ glfs_h_close (leaf); leaf = NULL;
+ }
+ }
+
+ glfs_h_close (parent); parent = NULL;
+
+ printf ("glfs_h_mkdir tests: PASSED\n");
+
+ /* Mknod tests */
+ printf ("glfs_h_mknod tests: In Progress\n");
+ ret = glfs_unlink (fs, full_newnod_name);
+ if (ret && errno != ENOENT) {
+ fprintf (stderr, "glfs_unlink: Failed for %s: %s\n",
+ full_newnod_name, strerror (errno));
+ printf ("glfs_h_mknod tests: FAILED\n");
+ goto out;
+ }
+
+ parent = glfs_h_lookupat (fs, NULL, full_parent_name, &sb);
+ if (parent == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ full_parent_name, root, strerror (errno));
+ printf ("glfs_h_mknod tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ leaf = glfs_h_mknod (fs, parent, newnod_name, S_IFIFO, 0, &sb);
+ if (leaf == NULL) {
+ fprintf (stderr, "glfs_h_mkdir: error on mkdir of %s: from (%p),%s\n",
+ newnod_name, parent, strerror (errno));
+ printf ("glfs_h_mknod tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ /* TODO: creat op on a FIFO node hangs, need to check and fix
+ tmp = glfs_h_creat (fs, parent, newnod_name, O_CREAT, 0644, &sb);
+ if (tmp != NULL || errno != EINVAL) {
+ fprintf (stderr, "glfs_h_creat: node create, tmp = (%p), errno = %s\n",
+ tmp, strerror (errno));
+ printf ("glfs_h_creat/mknod tests: FAILED\n");
+ if (tmp != NULL) {
+ glfs_h_close(tmp); tmp = NULL;
+ }
+ } */
+
+ glfs_h_close (leaf); leaf = NULL;
+
+ leaf = glfs_h_mknod (fs, parent, newnod_name, 0644, 0, &sb);
+ if (leaf != NULL || errno != EEXIST) {
+ fprintf (stderr, "glfs_h_mknod: existing node, leaf = (%p), errno = %s\n",
+ leaf, strerror (errno));
+ printf ("glfs_h_mknod tests: FAILED\n");
+ if (leaf != NULL) {
+ glfs_h_close (leaf); leaf = NULL;
+ }
+ }
+
+ glfs_h_close (parent); parent = NULL;
+
+ printf ("glfs_h_mknod tests: PASSED\n");
+
+ /* unlink tests */
+ test_h_unlink ();
+
+ /* TODO: opendir tests */
+
+ /* getattr tests */
+ test_h_getsetattrs ();
+
+ /* TODO: setattr tests */
+
+ /* truncate tests */
+ test_h_truncate();
+
+ /* link tests */
+ test_h_links ();
+
+ /* rename tests */
+ test_h_rename ();
+
+ /* performance tests */
+ test_h_performance ();
+
+ /* END: New APIs test area */
+
+out:
+ /* Cleanup glfs handles */
+ if (root)
+ glfs_h_close (root);
+ if (parent)
+ glfs_h_close (parent);
+ if (leaf)
+ glfs_h_close (leaf);
+
+ return ret;
+}
+
int
main (int argc, char *argv[])
{
- glfs_t *fs = NULL;
- glfs_t *fs2 = NULL;
- int ret = 0;
- glfs_fd_t *fd = NULL;
- glfs_fd_t *fd2 = NULL;
- struct stat sb = {0, };
- char readbuf[32];
- char writebuf[32];
+ glfs_t *fs2 = NULL;
+ int ret = 0;
+ glfs_fd_t *fd = NULL;
+ glfs_fd_t *fd2 = NULL;
+ struct stat sb = {0, };
+ char readbuf[32];
+ char writebuf[32];
+
+ char *filename = "/filename2";
+
+ if (argc != 3) {
+ printf ("Expect following args\n\t%s <volname> <hostname>\n", argv[0]);
+ return -1;
+ }
+
+ fs = glfs_new (argv[1]);
+ if (!fs) {
+ fprintf (stderr, "glfs_new: returned NULL\n");
+ return 1;
+ }
+
+// ret = glfs_set_volfile (fs, "/tmp/posix.vol");
+
+ ret = glfs_set_volfile_server (fs, "tcp", argv[2], 24007);
+
+// ret = glfs_set_volfile_server (fs, "unix", "/tmp/gluster.sock", 0);
+
+ ret = glfs_set_logging (fs, "/dev/stderr", 7);
+
+ ret = glfs_init (fs);
+
+ fprintf (stderr, "glfs_init: returned %d\n", ret);
+
+ sleep (2);
+
+ fs2 = glfs_new (argv[1]);
+ if (!fs2) {
+ fprintf (stderr, "glfs_new: returned NULL\n");
+ return 1;
+ }
+
+
+// ret = glfs_set_volfile (fs2, "/tmp/posix.vol");
- char *filename = "/filename2";
+ ret = glfs_set_volfile_server (fs2, "tcp", argv[2], 24007);
- fs = glfs_new ("iops");
- if (!fs) {
- fprintf (stderr, "glfs_new: returned NULL\n");
- return 1;
- }
+ ret = glfs_set_logging (fs2, "/dev/stderr", 7);
-// ret = glfs_set_volfile (fs, "/tmp/posix.vol");
+ ret = glfs_init (fs2);
- ret = glfs_set_volfile_server (fs, "socket", "localhost", 24007);
+ fprintf (stderr, "glfs_init: returned %d\n", ret);
-// ret = glfs_set_volfile_server (fs, "unix", "/tmp/gluster.sock", 0);
+ ret = glfs_lstat (fs, filename, &sb);
+ fprintf (stderr, "%s: (%d) %s\n", filename, ret, strerror (errno));
- ret = glfs_set_logging (fs, "/dev/stderr", 7);
+ fd = glfs_creat (fs, filename, O_RDWR, 0644);
+ fprintf (stderr, "%s: (%p) %s\n", filename, fd, strerror (errno));
- ret = glfs_init (fs);
+ fd2 = glfs_open (fs2, filename, O_RDWR);
+ fprintf (stderr, "%s: (%p) %s\n", filename, fd, strerror (errno));
- fprintf (stderr, "glfs_init: returned %d\n", ret);
+ sprintf (writebuf, "hi there\n");
+ ret = glfs_write (fd, writebuf, 32, 0);
- sleep (2);
+ glfs_lseek (fd2, 0, SEEK_SET);
- fs2 = glfs_new ("iops");
- if (!fs2) {
- fprintf (stderr, "glfs_new: returned NULL\n");
- return 1;
- }
+ ret = glfs_read (fd2, readbuf, 32, 0);
+ printf ("read %d, %s", ret, readbuf);
-// ret = glfs_set_volfile (fs2, "/tmp/posix.vol");
+ glfs_close (fd);
+ glfs_close (fd2);
- ret = glfs_set_volfile_server (fs2, "socket", "localhost", 24007);
+ filename = "/filename3";
+ ret = glfs_mknod (fs, filename, S_IFIFO, 0);
+ fprintf (stderr, "%s: (%d) %s\n", filename, ret, strerror (errno));
- ret = glfs_set_logging (fs2, "/dev/stderr", 7);
+ ret = glfs_lstat (fs, filename, &sb);
+ fprintf (stderr, "%s: (%d) %s\n", filename, ret, strerror (errno));
- ret = glfs_init (fs2);
- fprintf (stderr, "glfs_init: returned %d\n", ret);
+ ret = glfs_rename (fs, filename, "/filename4");
+ fprintf (stderr, "rename(%s): (%d) %s\n", filename, ret,
+ strerror (errno));
- ret = glfs_lstat (fs, filename, &sb);
- fprintf (stderr, "%s: (%d) %s\n", filename, ret, strerror (errno));
+ ret = glfs_unlink (fs, "/filename4");
+ fprintf (stderr, "unlink(%s): (%d) %s\n", "/filename4", ret,
+ strerror (errno));
- fd = glfs_creat (fs, filename, O_RDWR, 0644);
- fprintf (stderr, "%s: (%p) %s\n", filename, fd, strerror (errno));
+ filename = "/dirname2";
+ ret = glfs_mkdir (fs, filename, 0);
+ fprintf (stderr, "%s: (%d) %s\n", filename, ret, strerror (errno));
- fd2 = glfs_open (fs2, filename, O_RDWR);
- fprintf (stderr, "%s: (%p) %s\n", filename, fd, strerror (errno));
+ ret = glfs_lstat (fs, filename, &sb);
+ fprintf (stderr, "lstat(%s): (%d) %s\n", filename, ret, strerror (errno));
- sprintf (writebuf, "hi there\n");
- ret = glfs_write (fd, writebuf, 32, 0);
+ ret = glfs_rmdir (fs, filename);
+ fprintf (stderr, "rmdir(%s): (%d) %s\n", filename, ret, strerror (errno));
- glfs_lseek (fd2, 0, SEEK_SET);
+ test_dirops (fs);
- ret = glfs_read (fd2, readbuf, 32, 0);
+ test_xattr (fs);
- printf ("read %d, %s", ret, readbuf);
+ test_chdir (fs);
- glfs_close (fd);
- glfs_close (fd2);
+ test_handleops (argc, argv);
+ // done
- glfs_fini (fs);
- glfs_fini (fs2);
+ glfs_fini (fs);
+ glfs_fini (fs2);
- return ret;
+ return ret;
}
diff --git a/api/examples/setup.py.in b/api/examples/setup.py.in
new file mode 100644
index 000000000..44b738094
--- /dev/null
+++ b/api/examples/setup.py.in
@@ -0,0 +1,29 @@
+from distutils.core import setup
+
+# generate a __init__.py for the package namespace
+fo = open('__init__.py', 'w')
+fo.write('__version__ = "@PACKAGE_VERSION@"\n')
+fo.close()
+
+DESC = """GlusterFS is a clustered file-system capable of scaling to
+several petabytes. It aggregates various storage bricks over Infiniband
+RDMA or TCP/IP interconnect into one large parallel network file system.
+GlusterFS is one of the most sophisticated file systems in terms of
+features and extensibility. It borrows a powerful concept called
+Translators from GNU Hurd kernel. Much of the code in GlusterFS is in
+user space and easily manageable.
+
+This package contains the Python interface to the libgfapi library."""
+
+setup(
+ name='glusterfs-api',
+ version='@PACKAGE_VERSION@',
+ description='Python client library for the GlusterFS libgfapi',
+ long_description=DESC,
+ author='Gluster Community',
+ author_email='gluster-devel@nongnu.org',
+ license='LGPLv3',
+ url='http://gluster.org/',
+ package_dir={'gluster':''},
+ packages=['gluster']
+)
diff --git a/api/src/Makefile.am b/api/src/Makefile.am
index 954a71f76..7c5df3e20 100644
--- a/api/src/Makefile.am
+++ b/api/src/Makefile.am
@@ -1,9 +1,10 @@
lib_LTLIBRARIES = libgfapi.la
noinst_HEADERS = glfs-mem-types.h glfs-internal.h
-libgfapi_HEADERS = glfs.h
+libgfapi_HEADERS = glfs.h glfs-handles.h
libgfapidir = $(includedir)/glusterfs/api
-libgfapi_la_SOURCES = glfs.c glfs-mgmt.c glfs-fops.c glfs-resolve.c
+libgfapi_la_SOURCES = glfs.c glfs-mgmt.c glfs-fops.c glfs-resolve.c \
+ glfs-handleops.c
libgfapi_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
$(top_builddir)/rpc/rpc-lib/src/libgfrpc.la \
$(top_builddir)/rpc/xdr/src/libgfxdr.la \
@@ -14,13 +15,21 @@ libgfapi_la_CPPFLAGS = $(GF_CPPFLAGS) -D__USE_FILE_OFFSET64 \
-I$(top_srcdir)/rpc/rpc-lib/src \
-I$(top_srcdir)/rpc/xdr/src
+
xlator_LTLIBRARIES = api.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/mount
+# workaround for broken parallel install support in automake with LTLIBRARIES
+# http://debbugs.gnu.org/cgi/bugreport.cgi?bug=7328
+install_xlatorLTLIBRARIES = install-xlatorLTLIBRARIES
+$(install_xlatorLTLIBRARIES): install-libLTLIBRARIES
api_la_SOURCES = glfs-master.c
-
-api_la_LDFLAGS = -module -avoidversion
-api_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+api_la_DEPENDENCIES = libgfapi.la
+api_la_LDFLAGS = -module -avoid-version
+api_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
+ $(top_builddir)/rpc/rpc-lib/src/libgfrpc.la \
+ $(top_builddir)/rpc/xdr/src/libgfxdr.la \
+ $(top_builddir)/api/src/libgfapi.la
AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
diff --git a/api/src/glfs-fops.c b/api/src/glfs-fops.c
index c9897243e..10bb7d38b 100644
--- a/api/src/glfs-fops.c
+++ b/api/src/glfs-fops.c
@@ -13,6 +13,56 @@
#include "glfs-mem-types.h"
#include "syncop.h"
#include "glfs.h"
+#include <limits.h>
+
+#ifdef NAME_MAX
+#define GF_NAME_MAX NAME_MAX
+#else
+#define GF_NAME_MAX 255
+#endif
+
+#define READDIRBUF_SIZE (sizeof(struct dirent) + GF_NAME_MAX + 1)
+
+int
+glfs_loc_link (loc_t *loc, struct iatt *iatt)
+{
+ int ret = -1;
+ inode_t *linked_inode = NULL;
+
+ if (!loc->inode) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ linked_inode = inode_link (loc->inode, loc->parent, loc->name, iatt);
+ if (linked_inode) {
+ inode_lookup (linked_inode);
+ inode_unref (linked_inode);
+ ret = 0;
+ } else {
+ ret = -1;
+ errno = ENOMEM;
+ }
+
+ return ret;
+}
+
+
+void
+glfs_iatt_to_stat (struct glfs *fs, struct iatt *iatt, struct stat *stat)
+{
+ iatt_to_stat (iatt, stat);
+ stat->st_dev = fs->dev_id;
+}
+
+
+int
+glfs_loc_unlink (loc_t *loc)
+{
+ inode_unlink (loc->inode, loc->parent, loc->name);
+
+ return 0;
+}
struct glfs_fd *
@@ -23,6 +73,7 @@ glfs_open (struct glfs *fs, const char *path, int flags)
xlator_t *subvol = NULL;
loc_t loc = {0, };
struct iatt iatt = {0, };
+ int reval = 0;
__glfs_entry_fs (fs);
@@ -33,11 +84,15 @@ glfs_open (struct glfs *fs, const char *path, int flags)
goto out;
}
- glfd = GF_CALLOC (1, sizeof (*glfd), glfs_mt_glfs_fd_t);
+ glfd = glfs_fd_new (fs);
if (!glfd)
goto out;
- ret = glfs_resolve (fs, subvol, path, &loc, &iatt);
+retry:
+ ret = glfs_resolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
if (ret)
goto out;
@@ -53,6 +108,14 @@ glfs_open (struct glfs *fs, const char *path, int flags)
goto out;
}
+ if (glfd->fd) {
+ /* Retry. Safe to touch glfd->fd as we
+ still have not glfs_fd_bind() yet.
+ */
+ fd_unref (glfd->fd);
+ glfd->fd = NULL;
+ }
+
glfd->fd = fd_create (loc.inode, getpid());
if (!glfd->fd) {
ret = -1;
@@ -61,14 +124,22 @@ glfs_open (struct glfs *fs, const char *path, int flags)
}
ret = syncop_open (subvol, &loc, flags, glfd->fd);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
out:
loc_wipe (&loc);
if (ret && glfd) {
glfs_fd_destroy (glfd);
glfd = NULL;
+ } else if (glfd) {
+ glfd->fd->flags = flags;
+ fd_bind (glfd->fd);
+ glfs_fd_bind (glfd);
}
+ glfs_subvol_done (fs, subvol);
+
return glfd;
}
@@ -78,15 +149,35 @@ glfs_close (struct glfs_fd *glfd)
{
xlator_t *subvol = NULL;
int ret = -1;
+ fd_t *fd = NULL;
+ struct glfs *fs = NULL;
__glfs_entry_fd (glfd);
- subvol = glfs_fd_subvol (glfd);
+ subvol = glfs_active_subvol (glfd->fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
- ret = syncop_flush (subvol, glfd->fd);
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+ ret = syncop_flush (subvol, fd);
+out:
+ fs = glfd->fs;
glfs_fd_destroy (glfd);
+ if (fd)
+ fd_unref (fd);
+
+ glfs_subvol_done (fs, subvol);
+
return ret;
}
@@ -98,6 +189,7 @@ glfs_lstat (struct glfs *fs, const char *path, struct stat *stat)
xlator_t *subvol = NULL;
loc_t loc = {0, };
struct iatt iatt = {0, };
+ int reval = 0;
__glfs_entry_fs (fs);
@@ -107,14 +199,18 @@ glfs_lstat (struct glfs *fs, const char *path, struct stat *stat)
errno = EIO;
goto out;
}
+retry:
+ ret = glfs_lresolve (fs, subvol, path, &loc, &iatt, reval);
- ret = glfs_lresolve (fs, subvol, path, &loc, &iatt);
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
if (ret == 0 && stat)
- iatt_to_stat (&iatt, stat);
+ glfs_iatt_to_stat (fs, &iatt, stat);
out:
loc_wipe (&loc);
+ glfs_subvol_done (fs, subvol);
+
return ret;
}
@@ -126,6 +222,7 @@ glfs_stat (struct glfs *fs, const char *path, struct stat *stat)
xlator_t *subvol = NULL;
loc_t loc = {0, };
struct iatt iatt = {0, };
+ int reval = 0;
__glfs_entry_fs (fs);
@@ -135,14 +232,18 @@ glfs_stat (struct glfs *fs, const char *path, struct stat *stat)
errno = EIO;
goto out;
}
+retry:
+ ret = glfs_resolve (fs, subvol, path, &loc, &iatt, reval);
- ret = glfs_resolve (fs, subvol, path, &loc, &iatt);
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
if (ret == 0 && stat)
- iatt_to_stat (&iatt, stat);
+ glfs_iatt_to_stat (fs, &iatt, stat);
out:
loc_wipe (&loc);
+ glfs_subvol_done (fs, subvol);
+
return ret;
}
@@ -153,21 +254,34 @@ glfs_fstat (struct glfs_fd *glfd, struct stat *stat)
int ret = -1;
xlator_t *subvol = NULL;
struct iatt iatt = {0, };
+ fd_t *fd = NULL;
__glfs_entry_fd (glfd);
- subvol = glfs_fd_subvol (glfd);
+ subvol = glfs_active_subvol (glfd->fs);
if (!subvol) {
ret = -1;
errno = EIO;
goto out;
}
- ret = syncop_fstat (subvol, glfd->fd, &iatt);
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ ret = syncop_fstat (subvol, fd, &iatt);
if (ret == 0 && stat)
- iatt_to_stat (&iatt, stat);
+ glfs_iatt_to_stat (glfd->fs, &iatt, stat);
out:
+ if (fd)
+ fd_unref (fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
return ret;
}
@@ -182,6 +296,7 @@ glfs_creat (struct glfs *fs, const char *path, int flags, mode_t mode)
struct iatt iatt = {0, };
uuid_t gfid;
dict_t *xattr_req = NULL;
+ int reval = 0;
__glfs_entry_fs (fs);
@@ -207,7 +322,7 @@ glfs_creat (struct glfs *fs, const char *path, int flags, mode_t mode)
goto out;
}
- glfd = GF_CALLOC (1, sizeof (*glfd), glfs_mt_glfs_fd_t);
+ glfd = glfs_fd_new (fs);
if (!glfd)
goto out;
@@ -216,7 +331,11 @@ glfs_creat (struct glfs *fs, const char *path, int flags, mode_t mode)
is a danging symlink must create the dangling
destinataion.
*/
- ret = glfs_resolve (fs, subvol, path, &loc, &iatt);
+retry:
+ ret = glfs_resolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
if (ret == -1 && errno != ENOENT)
/* Any other type of error is fatal */
goto out;
@@ -256,6 +375,14 @@ glfs_creat (struct glfs *fs, const char *path, int flags, mode_t mode)
}
}
+ if (glfd->fd) {
+ /* Retry. Safe to touch glfd->fd as we
+ still have not glfs_fd_bind() yet.
+ */
+ fd_unref (glfd->fd);
+ glfd->fd = NULL;
+ }
+
glfd->fd = fd_create (loc.inode, getpid());
if (!glfd->fd) {
ret = -1;
@@ -263,18 +390,34 @@ glfs_creat (struct glfs *fs, const char *path, int flags, mode_t mode)
goto out;
}
- ret = syncop_create (subvol, &loc, flags, mode, glfd->fd, xattr_req);
+ if (ret == 0) {
+ ret = syncop_open (subvol, &loc, flags, glfd->fd);
+ } else {
+ ret = syncop_create (subvol, &loc, flags, mode, glfd->fd,
+ xattr_req, &iatt);
+ }
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret == 0)
+ ret = glfs_loc_link (&loc, &iatt);
out:
loc_wipe (&loc);
if (xattr_req)
- dict_destroy (xattr_req);
+ dict_unref (xattr_req);
if (ret && glfd) {
glfs_fd_destroy (glfd);
glfd = NULL;
+ } else if (glfd) {
+ glfd->fd->flags = flags;
+ fd_bind (glfd->fd);
+ glfs_fd_bind (glfd);
}
+ glfs_subvol_done (fs, subvol);
+
return glfd;
}
@@ -315,33 +458,52 @@ glfs_preadv (struct glfs_fd *glfd, const struct iovec *iovec, int iovcnt,
off_t offset, int flags)
{
xlator_t *subvol = NULL;
- int ret = -1;
- size_t size = -1;
+ ssize_t ret = -1;
+ ssize_t size = -1;
struct iovec *iov = NULL;
int cnt = 0;
struct iobref *iobref = NULL;
+ fd_t *fd = NULL;
__glfs_entry_fd (glfd);
- subvol = glfs_fd_subvol (glfd);
+ subvol = glfs_active_subvol (glfd->fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
size = iov_length (iovec, iovcnt);
- ret = syncop_readv (subvol, glfd->fd, size, offset,
- 0, &iov, &cnt, &iobref);
+ ret = syncop_readv (subvol, fd, size, offset, 0, &iov, &cnt, &iobref);
if (ret <= 0)
- return ret;
+ goto out;
size = iov_copy (iovec, iovcnt, iov, cnt); /* FIXME!!! */
glfd->offset = (offset + size);
- if (iov)
- GF_FREE (iov);
- if (iobref)
- iobref_unref (iobref);
+ ret = size;
+out:
+ if (iov)
+ GF_FREE (iov);
+ if (iobref)
+ iobref_unref (iobref);
+
+ if (fd)
+ fd_unref (fd);
- return size;
+ glfs_subvol_done (glfd->fs, subvol);
+
+ return ret;
}
@@ -421,10 +583,6 @@ glfs_io_async_task (void *data)
ssize_t ret = 0;
switch (gio->op) {
- case GF_FOP_READ:
- ret = glfs_preadv (gio->glfd, gio->iov, gio->count,
- gio->offset, gio->flags);
- break;
case GF_FOP_WRITE:
ret = glfs_pwritev (gio->glfd, gio->iov, gio->count,
gio->offset, gio->flags);
@@ -438,6 +596,12 @@ glfs_io_async_task (void *data)
else
ret = glfs_fsync (gio->glfd);
break;
+ case GF_FOP_DISCARD:
+ ret = glfs_discard (gio->glfd, gio->offset, gio->count);
+ break;
+ case GF_FOP_ZEROFILL:
+ ret = glfs_zerofill(gio->glfd, gio->offset, gio->count);
+ break;
}
return (int) ret;
@@ -445,23 +609,90 @@ glfs_io_async_task (void *data)
int
+glfs_preadv_async_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iovec *iovec,
+ int count, struct iatt *stbuf, struct iobref *iobref,
+ dict_t *xdata)
+{
+ struct glfs_io *gio = NULL;
+ xlator_t *subvol = NULL;
+ struct glfs *fs = NULL;
+ struct glfs_fd *glfd = NULL;
+
+
+ gio = frame->local;
+ frame->local = NULL;
+ subvol = cookie;
+ glfd = gio->glfd;
+ fs = glfd->fs;
+
+ if (op_ret <= 0)
+ goto out;
+
+ op_ret = iov_copy (gio->iov, gio->count, iovec, count);
+
+ glfd->offset = gio->offset + op_ret;
+out:
+ errno = op_errno;
+ gio->fn (gio->glfd, op_ret, gio->data);
+
+ GF_FREE (gio->iov);
+ GF_FREE (gio);
+ STACK_DESTROY (frame->root);
+ glfs_subvol_done (fs, subvol);
+
+ return 0;
+}
+
+
+int
glfs_preadv_async (struct glfs_fd *glfd, const struct iovec *iovec, int count,
off_t offset, int flags, glfs_io_cbk fn, void *data)
{
struct glfs_io *gio = NULL;
int ret = 0;
+ call_frame_t *frame = NULL;
+ xlator_t *subvol = NULL;
+ glfs_t *fs = NULL;
+ fd_t *fd = NULL;
+
+ __glfs_entry_fd (glfd);
+
+ subvol = glfs_active_subvol (glfd->fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ fs = glfd->fs;
+
+ frame = syncop_create_frame (THIS);
+ if (!frame) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
gio = GF_CALLOC (1, sizeof (*gio), glfs_mt_glfs_io_t);
if (!gio) {
+ ret = -1;
errno = ENOMEM;
- return -1;
+ goto out;
}
gio->iov = iov_dup (iovec, count);
if (!gio->iov) {
- GF_FREE (gio);
+ ret = -1;
errno = ENOMEM;
- return -1;
+ goto out;
}
gio->op = GF_FOP_READ;
@@ -472,15 +703,23 @@ glfs_preadv_async (struct glfs_fd *glfd, const struct iovec *iovec, int count,
gio->fn = fn;
gio->data = data;
- ret = synctask_new (glfs_from_glfd (glfd)->ctx->env,
- glfs_io_async_task, glfs_io_async_cbk,
- NULL, gio);
+ frame->local = gio;
+ STACK_WIND_COOKIE (frame, glfs_preadv_async_cbk, subvol, subvol,
+ subvol->fops->readv, fd, iov_length (iovec, count),
+ offset, flags, NULL);
+
+out:
if (ret) {
GF_FREE (gio->iov);
GF_FREE (gio);
+ STACK_DESTROY (frame->root);
+ glfs_subvol_done (fs, subvol);
}
+ if (fd)
+ fd_unref (fd);
+
return ret;
}
@@ -540,24 +779,39 @@ glfs_pwritev (struct glfs_fd *glfd, const struct iovec *iovec, int iovcnt,
struct iobref *iobref = NULL;
struct iobuf *iobuf = NULL;
struct iovec iov = {0, };
+ fd_t *fd = NULL;
__glfs_entry_fd (glfd);
- subvol = glfs_fd_subvol (glfd);
+ subvol = glfs_active_subvol (glfd->fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
size = iov_length (iovec, iovcnt);
iobuf = iobuf_get2 (subvol->ctx->iobuf_pool, size);
if (!iobuf) {
+ ret = -1;
errno = ENOMEM;
- return -1;
+ goto out;
}
iobref = iobref_new ();
if (!iobref) {
iobuf_unref (iobuf);
errno = ENOMEM;
- return -1;
+ ret = -1;
+ goto out;
}
ret = iobref_add (iobref, iobuf);
@@ -565,7 +819,8 @@ glfs_pwritev (struct glfs_fd *glfd, const struct iovec *iovec, int iovcnt,
iobuf_unref (iobuf);
iobref_unref (iobref);
errno = ENOMEM;
- return -1;
+ ret = -1;
+ goto out;
}
iov_unload (iobuf_ptr (iobuf), iovec, iovcnt); /* FIXME!!! */
@@ -573,17 +828,22 @@ glfs_pwritev (struct glfs_fd *glfd, const struct iovec *iovec, int iovcnt,
iov.iov_base = iobuf_ptr (iobuf);
iov.iov_len = size;
- ret = syncop_writev (subvol, glfd->fd, &iov, 1, offset,
- iobref, flags);
+ ret = syncop_writev (subvol, fd, &iov, 1, offset, iobref, flags);
iobuf_unref (iobuf);
iobref_unref (iobref);
if (ret <= 0)
- return ret;
+ goto out;
glfd->offset = (offset + size);
+out:
+ if (fd)
+ fd_unref (fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
return ret;
}
@@ -722,18 +982,31 @@ glfs_fsync (struct glfs_fd *glfd)
{
int ret = -1;
xlator_t *subvol = NULL;
+ fd_t *fd = NULL;
__glfs_entry_fd (glfd);
- subvol = glfs_fd_subvol (glfd);
+ subvol = glfs_active_subvol (glfd->fs);
if (!subvol) {
ret = -1;
errno = EIO;
goto out;
}
- ret = syncop_fsync (subvol, glfd->fd, 0);
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ ret = syncop_fsync (subvol, fd, 0);
out:
+ if (fd)
+ fd_unref (fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
return ret;
}
@@ -783,18 +1056,31 @@ glfs_fdatasync (struct glfs_fd *glfd)
{
int ret = -1;
xlator_t *subvol = NULL;
+ fd_t *fd = NULL;
__glfs_entry_fd (glfd);
- subvol = glfs_fd_subvol (glfd);
+ subvol = glfs_active_subvol (glfd->fs);
if (!subvol) {
ret = -1;
errno = EIO;
goto out;
}
- ret = syncop_fsync (subvol, glfd->fd, 1);
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ ret = syncop_fsync (subvol, fd, 1);
out:
+ if (fd)
+ fd_unref (fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
return ret;
}
@@ -811,18 +1097,31 @@ glfs_ftruncate (struct glfs_fd *glfd, off_t offset)
{
int ret = -1;
xlator_t *subvol = NULL;
+ fd_t *fd = NULL;
__glfs_entry_fd (glfd);
- subvol = glfs_fd_subvol (glfd);
+ subvol = glfs_active_subvol (glfd->fs);
if (!subvol) {
ret = -1;
errno = EIO;
goto out;
}
- ret = syncop_ftruncate (subvol, glfd->fd, offset);
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ ret = syncop_ftruncate (subvol, fd, offset);
out:
+ if (fd)
+ fd_unref (fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
return ret;
}
@@ -866,6 +1165,7 @@ glfs_access (struct glfs *fs, const char *path, int mode)
xlator_t *subvol = NULL;
loc_t loc = {0, };
struct iatt iatt = {0, };
+ int reval = 0;
__glfs_entry_fs (fs);
@@ -875,15 +1175,22 @@ glfs_access (struct glfs *fs, const char *path, int mode)
errno = EIO;
goto out;
}
+retry:
+ ret = glfs_resolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
- ret = glfs_resolve (fs, subvol, path, &loc, &iatt);
if (ret)
goto out;
ret = syncop_access (subvol, &loc, mode);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
out:
loc_wipe (&loc);
+ glfs_subvol_done (fs, subvol);
+
return ret;
}
@@ -897,6 +1204,7 @@ glfs_symlink (struct glfs *fs, const char *data, const char *path)
struct iatt iatt = {0, };
uuid_t gfid;
dict_t *xattr_req = NULL;
+ int reval = 0;
__glfs_entry_fs (fs);
@@ -921,8 +1229,10 @@ glfs_symlink (struct glfs *fs, const char *data, const char *path)
errno = ENOMEM;
goto out;
}
+retry:
+ ret = glfs_lresolve (fs, subvol, path, &loc, &iatt, reval);
- ret = glfs_lresolve (fs, subvol, path, &loc, &iatt);
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
if (loc.inode) {
errno = EEXIST;
@@ -948,12 +1258,19 @@ glfs_symlink (struct glfs *fs, const char *data, const char *path)
goto out;
}
- ret = syncop_symlink (subvol, &loc, data, xattr_req);
+ ret = syncop_symlink (subvol, &loc, data, xattr_req, &iatt);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret == 0)
+ ret = glfs_loc_link (&loc, &iatt);
out:
loc_wipe (&loc);
if (xattr_req)
- dict_destroy (xattr_req);
+ dict_unref (xattr_req);
+
+ glfs_subvol_done (fs, subvol);
return ret;
}
@@ -966,6 +1283,8 @@ glfs_readlink (struct glfs *fs, const char *path, char *buf, size_t bufsiz)
xlator_t *subvol = NULL;
loc_t loc = {0, };
struct iatt iatt = {0, };
+ int reval = 0;
+ char *linkval = NULL;
__glfs_entry_fs (fs);
@@ -975,8 +1294,11 @@ glfs_readlink (struct glfs *fs, const char *path, char *buf, size_t bufsiz)
errno = EIO;
goto out;
}
+retry:
+ ret = glfs_lresolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
- ret = glfs_lresolve (fs, subvol, path, &loc, &iatt);
if (ret)
goto out;
@@ -986,10 +1308,18 @@ glfs_readlink (struct glfs *fs, const char *path, char *buf, size_t bufsiz)
goto out;
}
- ret = syncop_readlink (subvol, &loc, &buf, bufsiz);
+ ret = syncop_readlink (subvol, &loc, &linkval, bufsiz);
+ if (ret > 0) {
+ memcpy (buf, linkval, ret);
+ GF_FREE (linkval);
+ }
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
out:
loc_wipe (&loc);
+ glfs_subvol_done (fs, subvol);
+
return ret;
}
@@ -1003,6 +1333,7 @@ glfs_mknod (struct glfs *fs, const char *path, mode_t mode, dev_t dev)
struct iatt iatt = {0, };
uuid_t gfid;
dict_t *xattr_req = NULL;
+ int reval = 0;
__glfs_entry_fs (fs);
@@ -1027,8 +1358,10 @@ glfs_mknod (struct glfs *fs, const char *path, mode_t mode, dev_t dev)
errno = ENOMEM;
goto out;
}
+retry:
+ ret = glfs_lresolve (fs, subvol, path, &loc, &iatt, reval);
- ret = glfs_lresolve (fs, subvol, path, &loc, &iatt);
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
if (loc.inode) {
errno = EEXIST;
@@ -1054,12 +1387,19 @@ glfs_mknod (struct glfs *fs, const char *path, mode_t mode, dev_t dev)
goto out;
}
- ret = syncop_mknod (subvol, &loc, mode, dev, xattr_req);
+ ret = syncop_mknod (subvol, &loc, mode, dev, xattr_req, &iatt);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret == 0)
+ ret = glfs_loc_link (&loc, &iatt);
out:
loc_wipe (&loc);
if (xattr_req)
- dict_destroy (xattr_req);
+ dict_unref (xattr_req);
+
+ glfs_subvol_done (fs, subvol);
return ret;
}
@@ -1074,6 +1414,7 @@ glfs_mkdir (struct glfs *fs, const char *path, mode_t mode)
struct iatt iatt = {0, };
uuid_t gfid;
dict_t *xattr_req = NULL;
+ int reval = 0;
__glfs_entry_fs (fs);
@@ -1098,8 +1439,10 @@ glfs_mkdir (struct glfs *fs, const char *path, mode_t mode)
errno = ENOMEM;
goto out;
}
+retry:
+ ret = glfs_lresolve (fs, subvol, path, &loc, &iatt, reval);
- ret = glfs_lresolve (fs, subvol, path, &loc, &iatt);
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
if (loc.inode) {
errno = EEXIST;
@@ -1125,12 +1468,19 @@ glfs_mkdir (struct glfs *fs, const char *path, mode_t mode)
goto out;
}
- ret = syncop_mkdir (subvol, &loc, mode, xattr_req);
+ ret = syncop_mkdir (subvol, &loc, mode, xattr_req, &iatt);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret == 0)
+ ret = glfs_loc_link (&loc, &iatt);
out:
loc_wipe (&loc);
if (xattr_req)
- dict_destroy (xattr_req);
+ dict_unref (xattr_req);
+
+ glfs_subvol_done (fs, subvol);
return ret;
}
@@ -1143,6 +1493,7 @@ glfs_unlink (struct glfs *fs, const char *path)
xlator_t *subvol = NULL;
loc_t loc = {0, };
struct iatt iatt = {0, };
+ int reval = 0;
__glfs_entry_fs (fs);
@@ -1152,8 +1503,11 @@ glfs_unlink (struct glfs *fs, const char *path)
errno = EIO;
goto out;
}
+retry:
+ ret = glfs_lresolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
- ret = glfs_lresolve (fs, subvol, path, &loc, &iatt);
if (ret)
goto out;
@@ -1164,9 +1518,16 @@ glfs_unlink (struct glfs *fs, const char *path)
}
ret = syncop_unlink (subvol, &loc);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret == 0)
+ ret = glfs_loc_unlink (&loc);
out:
loc_wipe (&loc);
+ glfs_subvol_done (fs, subvol);
+
return ret;
}
@@ -1178,6 +1539,7 @@ glfs_rmdir (struct glfs *fs, const char *path)
xlator_t *subvol = NULL;
loc_t loc = {0, };
struct iatt iatt = {0, };
+ int reval = 0;
__glfs_entry_fs (fs);
@@ -1187,8 +1549,11 @@ glfs_rmdir (struct glfs *fs, const char *path)
errno = EIO;
goto out;
}
+retry:
+ ret = glfs_lresolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
- ret = glfs_lresolve (fs, subvol, path, &loc, &iatt);
if (ret)
goto out;
@@ -1199,9 +1564,16 @@ glfs_rmdir (struct glfs *fs, const char *path)
}
ret = syncop_rmdir (subvol, &loc);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret == 0)
+ ret = glfs_loc_unlink (&loc);
out:
loc_wipe (&loc);
+ glfs_subvol_done (fs, subvol);
+
return ret;
}
@@ -1215,6 +1587,7 @@ glfs_rename (struct glfs *fs, const char *oldpath, const char *newpath)
loc_t newloc = {0, };
struct iatt oldiatt = {0, };
struct iatt newiatt = {0, };
+ int reval = 0;
__glfs_entry_fs (fs);
@@ -1224,31 +1597,56 @@ glfs_rename (struct glfs *fs, const char *oldpath, const char *newpath)
errno = EIO;
goto out;
}
+retry:
+ ret = glfs_lresolve (fs, subvol, oldpath, &oldloc, &oldiatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &oldloc, retry);
- ret = glfs_lresolve (fs, subvol, oldpath, &oldloc, &oldiatt);
if (ret)
goto out;
+retrynew:
+ ret = glfs_lresolve (fs, subvol, newpath, &newloc, &newiatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &newloc, retrynew);
- ret = glfs_lresolve (fs, subvol, newpath, &newloc, &newiatt);
if (ret && errno != ENOENT && newloc.parent)
goto out;
- if ((oldiatt.ia_type == IA_IFDIR) != (newiatt.ia_type == IA_IFDIR)) {
- /* Either both old and new must be dirs, or both must be
- non-dirs. Else, fail.
- */
- ret = -1;
- errno = EISDIR;
- goto out;
- }
+ if (newiatt.ia_type != IA_INVAL) {
+ if ((oldiatt.ia_type == IA_IFDIR) !=
+ (newiatt.ia_type == IA_IFDIR)) {
+ /* Either both old and new must be dirs,
+ * or both must be non-dirs. Else, fail.
+ */
+ ret = -1;
+ errno = EISDIR;
+ goto out;
+ }
+ }
/* TODO: check if new or old is a prefix of the other, and fail EINVAL */
ret = syncop_rename (subvol, &oldloc, &newloc);
+
+ if (ret == -1 && errno == ESTALE) {
+ if (reval < DEFAULT_REVAL_COUNT) {
+ reval++;
+ loc_wipe (&oldloc);
+ loc_wipe (&newloc);
+ goto retry;
+ }
+ }
+
+ if (ret == 0)
+ inode_rename (oldloc.parent->table, oldloc.parent, oldloc.name,
+ newloc.parent, newloc.name, oldloc.inode,
+ &oldiatt);
out:
loc_wipe (&oldloc);
loc_wipe (&newloc);
+ glfs_subvol_done (fs, subvol);
+
return ret;
}
@@ -1262,6 +1660,7 @@ glfs_link (struct glfs *fs, const char *oldpath, const char *newpath)
loc_t newloc = {0, };
struct iatt oldiatt = {0, };
struct iatt newiatt = {0, };
+ int reval = 0;
__glfs_entry_fs (fs);
@@ -1271,12 +1670,18 @@ glfs_link (struct glfs *fs, const char *oldpath, const char *newpath)
errno = EIO;
goto out;
}
+retry:
+ ret = glfs_lresolve (fs, subvol, oldpath, &oldloc, &oldiatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &oldloc, retry);
- ret = glfs_lresolve (fs, subvol, oldpath, &oldloc, &oldiatt);
if (ret)
goto out;
+retrynew:
+ ret = glfs_lresolve (fs, subvol, newpath, &newloc, &newiatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &newloc, retrynew);
- ret = glfs_lresolve (fs, subvol, newpath, &newloc, &newiatt);
if (ret == 0) {
ret = -1;
errno = EEXIST;
@@ -1289,11 +1694,32 @@ glfs_link (struct glfs *fs, const char *oldpath, const char *newpath)
goto out;
}
+ /* Filling the inode of the hard link to be same as that of the
+ original file
+ */
+ if (newloc.inode) {
+ inode_unref (newloc.inode);
+ newloc.inode = NULL;
+ }
+ newloc.inode = inode_ref (oldloc.inode);
+
ret = syncop_link (subvol, &oldloc, &newloc);
+
+ if (ret == -1 && errno == ESTALE) {
+ loc_wipe (&oldloc);
+ loc_wipe (&newloc);
+ if (reval--)
+ goto retry;
+ }
+
+ if (ret == 0)
+ ret = glfs_loc_link (&newloc, &oldiatt);
out:
loc_wipe (&oldloc);
loc_wipe (&newloc);
+ glfs_subvol_done (fs, subvol);
+
return ret;
}
@@ -1306,6 +1732,7 @@ glfs_opendir (struct glfs *fs, const char *path)
xlator_t *subvol = NULL;
loc_t loc = {0, };
struct iatt iatt = {0, };
+ int reval = 0;
__glfs_entry_fs (fs);
@@ -1316,12 +1743,16 @@ glfs_opendir (struct glfs *fs, const char *path)
goto out;
}
- glfd = GF_CALLOC (1, sizeof (*glfd), glfs_mt_glfs_fd_t);
+ glfd = glfs_fd_new (fs);
if (!glfd)
goto out;
+
INIT_LIST_HEAD (&glfd->entries);
+retry:
+ ret = glfs_resolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
- ret = glfs_resolve (fs, subvol, path, &loc, &iatt);
if (ret)
goto out;
@@ -1331,6 +1762,14 @@ glfs_opendir (struct glfs *fs, const char *path)
goto out;
}
+ if (glfd->fd) {
+ /* Retry. Safe to touch glfd->fd as we
+ still have not glfs_fd_bind() yet.
+ */
+ fd_unref (glfd->fd);
+ glfd->fd = NULL;
+ }
+
glfd->fd = fd_create (loc.inode, getpid());
if (!glfd->fd) {
ret = -1;
@@ -1339,14 +1778,21 @@ glfs_opendir (struct glfs *fs, const char *path)
}
ret = syncop_opendir (subvol, &loc, glfd->fd);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
out:
loc_wipe (&loc);
if (ret && glfd) {
glfs_fd_destroy (glfd);
glfd = NULL;
+ } else {
+ fd_bind (glfd->fd);
+ glfs_fd_bind (glfd);
}
+ glfs_subvol_done (fs, subvol);
+
return glfd;
}
@@ -1398,6 +1844,70 @@ glfs_seekdir (struct glfs_fd *fd, long offset)
*/
}
+int
+glfs_discard_async (struct glfs_fd *glfd, off_t offset, size_t len,
+ glfs_io_cbk fn, void *data)
+{
+ struct glfs_io *gio = NULL;
+ int ret = 0;
+
+ gio = GF_CALLOC (1, sizeof (*gio), glfs_mt_glfs_io_t);
+ if (!gio) {
+ errno = ENOMEM;
+ return -1;
+ }
+
+ gio->op = GF_FOP_DISCARD;
+ gio->glfd = glfd;
+ gio->offset = offset;
+ gio->count = len;
+ gio->fn = fn;
+ gio->data = data;
+
+ ret = synctask_new (glfs_from_glfd (glfd)->ctx->env,
+ glfs_io_async_task, glfs_io_async_cbk,
+ NULL, gio);
+
+ if (ret) {
+ GF_FREE (gio->iov);
+ GF_FREE (gio);
+ }
+
+ return ret;
+}
+
+int
+glfs_zerofill_async (struct glfs_fd *glfd, off_t offset, size_t len,
+ glfs_io_cbk fn, void *data)
+{
+ struct glfs_io *gio = NULL;
+ int ret = 0;
+
+ gio = GF_CALLOC (1, sizeof (*gio), glfs_mt_glfs_io_t);
+ if (!gio) {
+ errno = ENOMEM;
+ return -1;
+ }
+
+ gio->op = GF_FOP_ZEROFILL;
+ gio->glfd = glfd;
+ gio->offset = offset;
+ gio->count = len;
+ gio->fn = fn;
+ gio->data = data;
+
+ ret = synctask_new (glfs_from_glfd (glfd)->ctx->env,
+ glfs_io_async_task, glfs_io_async_cbk,
+ NULL, gio);
+
+ if (ret) {
+ GF_FREE (gio->iov);
+ GF_FREE (gio);
+ }
+
+ return ret;
+}
+
void
gf_dirent_to_dirent (gf_dirent_t *gf_dirent, struct dirent *dirent)
@@ -1416,54 +1926,81 @@ gf_dirent_to_dirent (gf_dirent_t *gf_dirent, struct dirent *dirent)
dirent->d_namlen = strlen (gf_dirent->d_name);
#endif
- strncpy (dirent->d_name, gf_dirent->d_name, 256);
+ strncpy (dirent->d_name, gf_dirent->d_name, GF_NAME_MAX + 1);
}
int
-glfd_entry_refresh (struct glfs_fd *glfd)
+glfd_entry_refresh (struct glfs_fd *glfd, int plus)
{
xlator_t *subvol = NULL;
gf_dirent_t entries;
gf_dirent_t old;
int ret = -1;
+ fd_t *fd = NULL;
- subvol = glfs_fd_subvol (glfd);
+ subvol = glfs_active_subvol (glfd->fs);
if (!subvol) {
+ ret = -1;
errno = EIO;
- return -1;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ if (fd->inode->ia_type != IA_IFDIR) {
+ ret = -1;
+ errno = EBADF;
+ goto out;
}
INIT_LIST_HEAD (&entries.list);
INIT_LIST_HEAD (&old.list);
- ret = syncop_readdir (subvol, glfd->fd, 131072, glfd->offset,
- &entries);
+ if (plus)
+ ret = syncop_readdirp (subvol, fd, 131072, glfd->offset,
+ NULL, &entries);
+ else
+ ret = syncop_readdir (subvol, fd, 131072, glfd->offset,
+ &entries);
if (ret >= 0) {
- /* spurious errno is dangerous for glfd_entry_next() */
- errno = 0;
+ if (plus)
+ gf_link_inodes_from_dirent (THIS, fd->inode, &entries);
list_splice_init (&glfd->entries, &old.list);
list_splice_init (&entries.list, &glfd->entries);
+
+ /* spurious errno is dangerous for glfd_entry_next() */
+ errno = 0;
}
if (ret > 0)
glfd->next = list_entry (glfd->entries.next, gf_dirent_t, list);
gf_dirent_free (&old);
+out:
+ if (fd)
+ fd_unref (fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
return ret;
}
gf_dirent_t *
-glfd_entry_next (struct glfs_fd *glfd)
+glfd_entry_next (struct glfs_fd *glfd, int plus)
{
gf_dirent_t *entry = NULL;
int ret = -1;
if (!glfd->offset || !glfd->next) {
- ret = glfd_entry_refresh (glfd);
+ ret = glfd_entry_refresh (glfd, plus);
if (ret < 0)
return NULL;
}
@@ -1483,22 +2020,57 @@ glfd_entry_next (struct glfs_fd *glfd)
}
+static struct dirent *
+glfs_readdirbuf_get (struct glfs_fd *glfd)
+{
+ struct dirent *buf = NULL;
+
+ LOCK (&glfd->fd->lock);
+ {
+ buf = glfd->readdirbuf;
+ if (buf) {
+ memset (buf, 0, READDIRBUF_SIZE);
+ goto unlock;
+ }
+
+ buf = GF_CALLOC (1, READDIRBUF_SIZE, glfs_mt_readdirbuf_t);
+ if (!buf) {
+ errno = ENOMEM;
+ goto unlock;
+ }
+
+ glfd->readdirbuf = buf;
+ }
+unlock:
+ UNLOCK (&glfd->fd->lock);
+
+ return buf;
+}
+
+
int
-glfs_readdir_r (struct glfs_fd *glfd, struct dirent *buf, struct dirent **res)
+glfs_readdirplus_r (struct glfs_fd *glfd, struct stat *stat, struct dirent *ext,
+ struct dirent **res)
{
int ret = 0;
gf_dirent_t *entry = NULL;
+ struct dirent *buf = NULL;
__glfs_entry_fd (glfd);
- if (glfd->fd->inode->ia_type != IA_IFDIR) {
- ret = -1;
- errno = EBADF;
- goto out;
+ errno = 0;
+
+ if (ext)
+ buf = ext;
+ else
+ buf = glfs_readdirbuf_get (glfd);
+
+ if (!buf) {
+ errno = ENOMEM;
+ return -1;
}
- errno = 0;
- entry = glfd_entry_next (glfd);
+ entry = glfd_entry_next (glfd, !!stat);
if (errno)
ret = -1;
@@ -1509,20 +2081,53 @@ glfs_readdir_r (struct glfs_fd *glfd, struct dirent *buf, struct dirent **res)
*res = NULL;
}
- if (entry)
+ if (entry) {
gf_dirent_to_dirent (entry, buf);
-out:
+ if (stat)
+ glfs_iatt_to_stat (glfd->fs, &entry->d_stat, stat);
+ }
+
return ret;
}
int
+glfs_readdir_r (struct glfs_fd *glfd, struct dirent *buf, struct dirent **res)
+{
+ return glfs_readdirplus_r (glfd, 0, buf, res);
+}
+
+
+struct dirent *
+glfs_readdirplus (struct glfs_fd *glfd, struct stat *stat)
+{
+ struct dirent *res = NULL;
+ int ret = -1;
+
+ ret = glfs_readdirplus_r (glfd, stat, NULL, &res);
+ if (ret)
+ return NULL;
+
+ return res;
+}
+
+
+
+struct dirent *
+glfs_readdir (struct glfs_fd *glfd)
+{
+ return glfs_readdirplus (glfd, NULL);
+}
+
+
+int
glfs_statvfs (struct glfs *fs, const char *path, struct statvfs *buf)
{
int ret = -1;
xlator_t *subvol = NULL;
loc_t loc = {0, };
struct iatt iatt = {0, };
+ int reval = 0;
__glfs_entry_fs (fs);
@@ -1532,15 +2137,22 @@ glfs_statvfs (struct glfs *fs, const char *path, struct statvfs *buf)
errno = EIO;
goto out;
}
+retry:
+ ret = glfs_resolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
- ret = glfs_resolve (fs, subvol, path, &loc, &iatt);
if (ret)
goto out;
ret = syncop_statfs (subvol, &loc, buf);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
out:
loc_wipe (&loc);
+ glfs_subvol_done (fs, subvol);
+
return ret;
}
@@ -1553,6 +2165,7 @@ glfs_setattr (struct glfs *fs, const char *path, struct iatt *iatt,
xlator_t *subvol = NULL;
loc_t loc = {0, };
struct iatt riatt = {0, };
+ int reval = 0;
__glfs_entry_fs (fs);
@@ -1562,19 +2175,25 @@ glfs_setattr (struct glfs *fs, const char *path, struct iatt *iatt,
errno = EIO;
goto out;
}
-
+retry:
if (follow)
- ret = glfs_resolve (fs, subvol, path, &loc, &riatt);
+ ret = glfs_resolve (fs, subvol, path, &loc, &riatt, reval);
else
- ret = glfs_lresolve (fs, subvol, path, &loc, &riatt);
+ ret = glfs_lresolve (fs, subvol, path, &loc, &riatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
if (ret)
goto out;
ret = syncop_setattr (subvol, &loc, iatt, valid, 0, 0);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
out:
loc_wipe (&loc);
+ glfs_subvol_done (fs, subvol);
+
return ret;
}
@@ -1584,18 +2203,31 @@ glfs_fsetattr (struct glfs_fd *glfd, struct iatt *iatt, int valid)
{
int ret = -1;
xlator_t *subvol = NULL;
+ fd_t *fd = NULL;
__glfs_entry_fd (glfd);
- subvol = glfs_fd_subvol (glfd);
+ subvol = glfs_active_subvol (glfd->fs);
if (!subvol) {
ret = -1;
errno = EIO;
goto out;
}
- ret = syncop_fsetattr (subvol, glfd->fd, iatt, valid, 0, 0);
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ ret = syncop_fsetattr (subvol, fd, iatt, valid, 0, 0);
out:
+ if (fd)
+ fd_unref (fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
return ret;
}
@@ -1784,6 +2416,7 @@ glfs_getxattr_common (struct glfs *fs, const char *path, const char *name,
loc_t loc = {0, };
struct iatt iatt = {0, };
dict_t *xattr = NULL;
+ int reval = 0;
__glfs_entry_fs (fs);
@@ -1793,15 +2426,21 @@ glfs_getxattr_common (struct glfs *fs, const char *path, const char *name,
errno = EIO;
goto out;
}
-
+retry:
if (follow)
- ret = glfs_resolve (fs, subvol, path, &loc, &iatt);
+ ret = glfs_resolve (fs, subvol, path, &loc, &iatt, reval);
else
- ret = glfs_lresolve (fs, subvol, path, &loc, &iatt);
+ ret = glfs_lresolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
if (ret)
goto out;
ret = syncop_getxattr (subvol, &loc, &xattr, name);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
if (ret)
goto out;
@@ -1809,6 +2448,8 @@ glfs_getxattr_common (struct glfs *fs, const char *path, const char *name,
out:
loc_wipe (&loc);
+ glfs_subvol_done (fs, subvol);
+
return ret;
}
@@ -1836,52 +2477,45 @@ glfs_fgetxattr (struct glfs_fd *glfd, const char *name, void *value,
int ret = -1;
xlator_t *subvol = NULL;
dict_t *xattr = NULL;
+ fd_t *fd = NULL;
__glfs_entry_fd (glfd);
- subvol = glfs_fd_subvol (glfd);
+ subvol = glfs_active_subvol (glfd->fs);
if (!subvol) {
ret = -1;
errno = EIO;
goto out;
}
- ret = syncop_fgetxattr (subvol, glfd->fd, &xattr, name);
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ ret = syncop_fgetxattr (subvol, fd, &xattr, name);
if (ret)
goto out;
ret = glfs_getxattr_process (value, size, xattr, name);
out:
- return ret;
-}
-
-
-static int
-dict_keys_join (void *value, int size, dict_t *dict)
-{
- int len = 0;
-
- int add_key_len (dict_t *d, char *k, data_t *v, void *o)
- {
- if (value && size > len)
- strncpy (value + len, k, size - len);
-
- len += (strlen (k) + 1);
-
- return 0;
- }
+ if (fd)
+ fd_unref (fd);
- dict_foreach (dict, add_key_len, 0);
+ glfs_subvol_done (glfd->fs, subvol);
- return len;
+ return ret;
}
+
int
glfs_listxattr_process (void *value, size_t size, dict_t *xattr)
{
int ret = -1;
- ret = dict_keys_join (NULL, 0, xattr);
+ ret = dict_keys_join (NULL, 0, xattr, NULL);
if (!value || !size)
goto out;
@@ -1892,7 +2526,7 @@ glfs_listxattr_process (void *value, size_t size, dict_t *xattr)
goto out;
}
- dict_keys_join (value, size, xattr);
+ dict_keys_join (value, size, xattr, NULL);
out:
if (xattr)
dict_unref (xattr);
@@ -1909,6 +2543,7 @@ glfs_listxattr_common (struct glfs *fs, const char *path, void *value,
loc_t loc = {0, };
struct iatt iatt = {0, };
dict_t *xattr = NULL;
+ int reval = 0;
__glfs_entry_fs (fs);
@@ -1919,14 +2554,21 @@ glfs_listxattr_common (struct glfs *fs, const char *path, void *value,
goto out;
}
+retry:
if (follow)
- ret = glfs_resolve (fs, subvol, path, &loc, &iatt);
+ ret = glfs_resolve (fs, subvol, path, &loc, &iatt, reval);
else
- ret = glfs_lresolve (fs, subvol, path, &loc, &iatt);
+ ret = glfs_lresolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
if (ret)
goto out;
ret = syncop_getxattr (subvol, &loc, &xattr, NULL);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
if (ret)
goto out;
@@ -1934,6 +2576,8 @@ glfs_listxattr_common (struct glfs *fs, const char *path, void *value,
out:
loc_wipe (&loc);
+ glfs_subvol_done (fs, subvol);
+
return ret;
}
@@ -1958,22 +2602,35 @@ glfs_flistxattr (struct glfs_fd *glfd, void *value, size_t size)
int ret = -1;
xlator_t *subvol = NULL;
dict_t *xattr = NULL;
+ fd_t *fd = NULL;
__glfs_entry_fd (glfd);
- subvol = glfs_fd_subvol (glfd);
+ subvol = glfs_active_subvol (glfd->fs);
if (!subvol) {
ret = -1;
errno = EIO;
goto out;
}
- ret = syncop_fgetxattr (subvol, glfd->fd, &xattr, NULL);
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ ret = syncop_fgetxattr (subvol, fd, &xattr, NULL);
if (ret)
goto out;
ret = glfs_listxattr_process (value, size, xattr);
out:
+ if (fd)
+ fd_unref (fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
return ret;
}
@@ -2007,6 +2664,7 @@ glfs_setxattr_common (struct glfs *fs, const char *path, const char *name,
loc_t loc = {0, };
struct iatt iatt = {0, };
dict_t *xattr = NULL;
+ int reval = 0;
__glfs_entry_fs (fs);
@@ -2016,11 +2674,14 @@ glfs_setxattr_common (struct glfs *fs, const char *path, const char *name,
errno = EIO;
goto out;
}
-
+retry:
if (follow)
- ret = glfs_resolve (fs, subvol, path, &loc, &iatt);
+ ret = glfs_resolve (fs, subvol, path, &loc, &iatt, reval);
else
- ret = glfs_lresolve (fs, subvol, path, &loc, &iatt);
+ ret = glfs_lresolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
if (ret)
goto out;
@@ -2032,11 +2693,16 @@ glfs_setxattr_common (struct glfs *fs, const char *path, const char *name,
}
ret = syncop_setxattr (subvol, &loc, xattr, flags);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
out:
loc_wipe (&loc);
if (xattr)
dict_unref (xattr);
+ glfs_subvol_done (fs, subvol);
+
return ret;
}
@@ -2064,16 +2730,24 @@ glfs_fsetxattr (struct glfs_fd *glfd, const char *name, const void *value,
int ret = -1;
xlator_t *subvol = NULL;
dict_t *xattr = NULL;
+ fd_t *fd = NULL;
__glfs_entry_fd (glfd);
- subvol = glfs_fd_subvol (glfd);
+ subvol = glfs_active_subvol (glfd->fs);
if (!subvol) {
ret = -1;
errno = EIO;
goto out;
}
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
xattr = dict_for_key_value (name, value, size);
if (!xattr) {
ret = -1;
@@ -2081,11 +2755,16 @@ glfs_fsetxattr (struct glfs_fd *glfd, const char *name, const void *value,
goto out;
}
- ret = syncop_fsetxattr (subvol, glfd->fd, xattr, flags);
+ ret = syncop_fsetxattr (subvol, fd, xattr, flags);
out:
if (xattr)
dict_unref (xattr);
+ if (fd)
+ fd_unref (fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
return ret;
}
@@ -2098,6 +2777,7 @@ glfs_removexattr_common (struct glfs *fs, const char *path, const char *name,
xlator_t *subvol = NULL;
loc_t loc = {0, };
struct iatt iatt = {0, };
+ int reval = 0;
__glfs_entry_fs (fs);
@@ -2107,18 +2787,26 @@ glfs_removexattr_common (struct glfs *fs, const char *path, const char *name,
errno = EIO;
goto out;
}
-
+retry:
if (follow)
- ret = glfs_resolve (fs, subvol, path, &loc, &iatt);
+ ret = glfs_resolve (fs, subvol, path, &loc, &iatt, reval);
else
- ret = glfs_lresolve (fs, subvol, path, &loc, &iatt);
+ ret = glfs_lresolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
if (ret)
goto out;
ret = syncop_removexattr (subvol, &loc, name);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
out:
loc_wipe (&loc);
+ glfs_subvol_done (fs, subvol);
+
return ret;
}
@@ -2142,17 +2830,423 @@ glfs_fremovexattr (struct glfs_fd *glfd, const char *name)
{
int ret = -1;
xlator_t *subvol = NULL;
+ fd_t *fd = NULL;
+
+ __glfs_entry_fd (glfd);
+
+ subvol = glfs_active_subvol (glfd->fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ ret = syncop_fremovexattr (subvol, fd, name);
+out:
+ if (fd)
+ fd_unref (fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
+ return ret;
+}
+
+
+int
+glfs_fallocate (struct glfs_fd *glfd, int keep_size, off_t offset, size_t len)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ fd_t *fd = NULL;
__glfs_entry_fd (glfd);
- subvol = glfs_fd_subvol (glfd);
+ subvol = glfs_active_subvol (glfd->fs);
if (!subvol) {
ret = -1;
errno = EIO;
goto out;
}
- ret = syncop_fremovexattr (subvol, glfd->fd, name);
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ ret = syncop_fallocate (subvol, fd, keep_size, offset, len);
out:
+ if (fd)
+ fd_unref(fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
return ret;
}
+
+
+int
+glfs_discard (struct glfs_fd *glfd, off_t offset, size_t len)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ fd_t *fd = NULL;
+
+ __glfs_entry_fd (glfd);
+
+ subvol = glfs_active_subvol (glfd->fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ ret = syncop_discard (subvol, fd, offset, len);
+out:
+ if (fd)
+ fd_unref(fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
+ return ret;
+}
+
+int
+glfs_zerofill (struct glfs_fd *glfd, off_t offset, size_t len)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ fd_t *fd = NULL;
+
+ __glfs_entry_fd (glfd);
+
+ subvol = glfs_active_subvol (glfd->fs);
+ if (!subvol) {
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ errno = EBADFD;
+ goto out;
+ }
+
+ ret = syncop_zerofill (subvol, fd, offset, len);
+out:
+ if (fd)
+ fd_unref(fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
+ return ret;
+}
+
+int
+glfs_chdir (struct glfs *fs, const char *path)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ int reval = 0;
+
+ __glfs_entry_fs (fs);
+
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+retry:
+ ret = glfs_resolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret)
+ goto out;
+
+ if (!IA_ISDIR (iatt.ia_type)) {
+ ret = -1;
+ errno = ENOTDIR;
+ goto out;
+ }
+
+ glfs_cwd_set (fs, loc.inode);
+
+out:
+ loc_wipe (&loc);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+
+int
+glfs_fchdir (struct glfs_fd *glfd)
+{
+ int ret = -1;
+ inode_t *inode = NULL;
+ xlator_t *subvol = NULL;
+ fd_t *fd = NULL;
+
+ __glfs_entry_fd (glfd);
+
+ subvol = glfs_active_subvol (glfd->fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ inode = fd->inode;
+
+ if (!IA_ISDIR (inode->ia_type)) {
+ ret = -1;
+ errno = ENOTDIR;
+ goto out;
+ }
+
+ glfs_cwd_set (glfd->fs, inode);
+ ret = 0;
+out:
+ if (fd)
+ fd_unref (fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
+ return ret;
+}
+
+
+char *
+glfs_realpath (struct glfs *fs, const char *path, char *resolved_path)
+{
+ int ret = -1;
+ char *retpath = NULL;
+ char *allocpath = NULL;
+ xlator_t *subvol = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ int reval = 0;
+
+ __glfs_entry_fs (fs);
+
+ if (resolved_path)
+ retpath = resolved_path;
+ else
+ retpath = allocpath = malloc (PATH_MAX + 1);
+
+ if (!retpath) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+retry:
+ ret = glfs_resolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret)
+ goto out;
+
+ if (loc.path) {
+ strncpy (retpath, loc.path, PATH_MAX);
+ retpath[PATH_MAX] = 0;
+ }
+
+out:
+ loc_wipe (&loc);
+
+ if (ret == -1) {
+ if (allocpath)
+ free (allocpath);
+ retpath = NULL;
+ }
+
+ glfs_subvol_done (fs, subvol);
+
+ return retpath;
+}
+
+
+char *
+glfs_getcwd (struct glfs *fs, char *buf, size_t n)
+{
+ int ret = -1;
+ inode_t *inode = NULL;
+ char *path = NULL;
+
+ __glfs_entry_fs (fs);
+
+ if (!buf || n < 2) {
+ ret = -1;
+ errno = EINVAL;
+ goto out;
+ }
+
+ inode = glfs_cwd_get (fs);
+
+ if (!inode) {
+ strncpy (buf, "/", n);
+ ret = 0;
+ goto out;
+ }
+
+ ret = inode_path (inode, 0, &path);
+ if (n <= ret) {
+ ret = -1;
+ errno = ERANGE;
+ goto out;
+ }
+
+ strncpy (buf, path, n);
+ ret = 0;
+out:
+ GF_FREE (path);
+
+ if (inode)
+ inode_unref (inode);
+
+ if (ret < 0)
+ return NULL;
+
+ return buf;
+}
+
+
+static void
+gf_flock_to_flock (struct gf_flock *gf_flock, struct flock *flock)
+{
+ flock->l_type = gf_flock->l_type;
+ flock->l_whence = gf_flock->l_whence;
+ flock->l_start = gf_flock->l_start;
+ flock->l_len = gf_flock->l_len;
+ flock->l_pid = gf_flock->l_pid;
+}
+
+
+static void
+gf_flock_from_flock (struct gf_flock *gf_flock, struct flock *flock)
+{
+ gf_flock->l_type = flock->l_type;
+ gf_flock->l_whence = flock->l_whence;
+ gf_flock->l_start = flock->l_start;
+ gf_flock->l_len = flock->l_len;
+ gf_flock->l_pid = flock->l_pid;
+}
+
+
+int
+glfs_posix_lock (struct glfs_fd *glfd, int cmd, struct flock *flock)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ struct gf_flock gf_flock = {0, };
+ struct gf_flock saved_flock = {0, };
+ fd_t *fd = NULL;
+
+ __glfs_entry_fd (glfd);
+
+ subvol = glfs_active_subvol (glfd->fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ gf_flock_from_flock (&gf_flock, flock);
+ gf_flock_from_flock (&saved_flock, flock);
+ ret = syncop_lk (subvol, fd, cmd, &gf_flock);
+ gf_flock_to_flock (&gf_flock, flock);
+
+ if (ret == 0 && (cmd == F_SETLK || cmd == F_SETLKW))
+ fd_lk_insert_and_merge (fd, cmd, &saved_flock);
+out:
+ if (fd)
+ fd_unref (fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
+ return ret;
+}
+
+
+struct glfs_fd *
+glfs_dup (struct glfs_fd *glfd)
+{
+ xlator_t *subvol = NULL;
+ fd_t *fd = NULL;
+ glfs_fd_t *dupfd = NULL;
+ struct glfs *fs = NULL;
+
+ __glfs_entry_fd (glfd);
+
+ fs = glfd->fs;
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (fs, subvol, glfd);
+ if (!fd) {
+ errno = EBADFD;
+ goto out;
+ }
+
+ dupfd = glfs_fd_new (fs);
+ if (!dupfd) {
+ errno = ENOMEM;
+ goto out;
+ }
+
+ dupfd->fd = fd_ref (fd);
+out:
+ if (fd)
+ fd_unref (fd);
+ if (dupfd)
+ glfs_fd_bind (dupfd);
+
+ glfs_subvol_done (fs, subvol);
+
+ return dupfd;
+}
diff --git a/api/src/glfs-handleops.c b/api/src/glfs-handleops.c
new file mode 100644
index 000000000..9c707a619
--- /dev/null
+++ b/api/src/glfs-handleops.c
@@ -0,0 +1,1278 @@
+/*
+ * Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ * This file is part of GlusterFS.
+ *
+ * This file is licensed to you under your choice of the GNU Lesser
+ * General Public License, version 3 or any later version (LGPLv3 or
+ * later), or the GNU General Public License, version 2 (GPLv2), in all
+ * cases as published by the Free Software Foundation.
+ */
+
+
+#include "glfs-internal.h"
+#include "glfs-mem-types.h"
+#include "syncop.h"
+#include "glfs.h"
+#include "glfs-handles.h"
+
+static void
+glfs_iatt_from_stat (struct stat *stat, int valid, struct iatt *iatt,
+ int *glvalid)
+{
+ /* validate in args */
+ if ((stat == NULL) || (iatt == NULL) || (glvalid == NULL)) {
+ errno = EINVAL;
+ return;
+ }
+
+ *glvalid = 0;
+
+ if (valid & GFAPI_SET_ATTR_MODE) {
+ iatt->ia_prot = ia_prot_from_st_mode (stat->st_mode);
+ *glvalid |= GF_SET_ATTR_MODE;
+ }
+
+ if (valid & GFAPI_SET_ATTR_UID) {
+ iatt->ia_uid = stat->st_uid;
+ *glvalid |= GF_SET_ATTR_UID;
+ }
+
+ if (valid & GFAPI_SET_ATTR_GID) {
+ iatt->ia_gid = stat->st_gid;
+ *glvalid |= GF_SET_ATTR_GID;
+ }
+
+ if (valid & GFAPI_SET_ATTR_ATIME) {
+ iatt->ia_atime = stat->st_atime;
+ iatt->ia_atime_nsec = ST_ATIM_NSEC (stat);
+ *glvalid |= GF_SET_ATTR_ATIME;
+ }
+
+ if (valid & GFAPI_SET_ATTR_MTIME) {
+ iatt->ia_mtime = stat->st_mtime;
+ iatt->ia_mtime_nsec = ST_MTIM_NSEC (stat);
+ *glvalid |= GF_SET_ATTR_MTIME;
+ }
+
+ return;
+}
+
+struct glfs_object *
+glfs_h_lookupat (struct glfs *fs, struct glfs_object *parent,
+ const char *path, struct stat *stat)
+{
+ int ret = 0;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+ struct iatt iatt = {0, };
+ struct glfs_object *object = NULL;
+ loc_t loc = {0, };
+
+ /* validate in args */
+ if ((fs == NULL) || (path == NULL)) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in arg objects inode in correlation to the xlator */
+ if (parent) {
+ inode = glfs_resolve_inode (fs, subvol, parent);
+ if (!inode) {
+ errno = ESTALE;
+ goto out;
+ }
+ }
+
+ /* fop/op */
+ ret = glfs_resolve_at (fs, subvol, inode, path, &loc, &iatt,
+ 0 /*TODO: links? */, 0);
+
+ /* populate out args */
+ if (!ret) {
+ if (stat)
+ glfs_iatt_to_stat (fs, &iatt, stat);
+
+ ret = glfs_create_object (&loc, &object);
+ }
+
+out:
+ loc_wipe (&loc);
+
+ if (inode)
+ inode_unref (inode);
+
+ glfs_subvol_done (fs, subvol);
+
+ return object;
+}
+
+int
+glfs_h_stat (struct glfs *fs, struct glfs_object *object, struct stat *stat)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+
+ /* validate in args */
+ if ((fs == NULL) || (object == NULL)) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in arg objects inode in correlation to the xlator */
+ inode = glfs_resolve_inode (fs, subvol, object);
+ if (!inode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ /* populate loc */
+ GLFS_LOC_FILL_INODE (inode, loc, out);
+
+ /* fop/op */
+ ret = syncop_stat (subvol, &loc, &iatt);
+
+ /* populate out args */
+ if (!ret && stat) {
+ glfs_iatt_to_stat (fs, &iatt, stat);
+ }
+out:
+ loc_wipe (&loc);
+
+ if (inode)
+ inode_unref (inode);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+int
+glfs_h_getattrs (struct glfs *fs, struct glfs_object *object, struct stat *stat)
+{
+ int ret = 0;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+ struct iatt iatt = {0, };
+
+ /* validate in args */
+ if ((fs == NULL) || (object == NULL)) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in arg objects inode in correlation to the xlator */
+ inode = glfs_resolve_inode (fs, subvol, object);
+ if (!inode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ /* fop/op */
+ ret = glfs_resolve_base (fs, subvol, inode, &iatt);
+
+ /* populate out args */
+ if (!ret && stat) {
+ glfs_iatt_to_stat (fs, &iatt, stat);
+ }
+
+out:
+ if (inode)
+ inode_unref (inode);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+int
+glfs_h_setattrs (struct glfs *fs, struct glfs_object *object, struct stat *stat,
+ int valid)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ int glvalid = 0;
+
+ /* validate in args */
+ if ((fs == NULL) || (object == NULL) || (stat == NULL)) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in arg objects inode in correlation to the xlator */
+ inode = glfs_resolve_inode (fs, subvol, object);
+ if (!inode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ /* map valid masks from in args */
+ glfs_iatt_from_stat (stat, valid, &iatt, &glvalid);
+
+ /* populate loc */
+ GLFS_LOC_FILL_INODE (inode, loc, out);
+
+ /* fop/op */
+ ret = syncop_setattr (subvol, &loc, &iatt, glvalid, 0, 0);
+out:
+ loc_wipe (&loc);
+
+ if (inode)
+ inode_unref (inode);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+struct glfs_fd *
+glfs_h_open (struct glfs *fs, struct glfs_object *object, int flags)
+{
+ int ret = -1;
+ struct glfs_fd *glfd = NULL;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+ loc_t loc = {0, };
+
+ /* validate in args */
+ if ((fs == NULL) || (object == NULL)) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in arg objects inode in correlation to the xlator */
+ inode = glfs_resolve_inode (fs, subvol, object);
+ if (!inode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ /* check types to open */
+ if (IA_ISDIR (inode->ia_type)) {
+ ret = -1;
+ errno = EISDIR;
+ goto out;
+ }
+
+ if (!IA_ISREG (inode->ia_type)) {
+ ret = -1;
+ errno = EINVAL;
+ goto out;
+ }
+
+ glfd = glfs_fd_new (fs);
+ if (!glfd) {
+ errno = ENOMEM;
+ goto out;
+ }
+
+ glfd->fd = fd_create (inode, getpid());
+ if (!glfd->fd) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ /* populate loc */
+ GLFS_LOC_FILL_INODE (inode, loc, out);
+
+ /* fop/op */
+ ret = syncop_open (subvol, &loc, flags, glfd->fd);
+
+out:
+ loc_wipe (&loc);
+
+ if (inode)
+ inode_unref (inode);
+
+ if (ret && glfd) {
+ glfs_fd_destroy (glfd);
+ glfd = NULL;
+ } else {
+ glfd->fd->flags = flags;
+ fd_bind (glfd->fd);
+ glfs_fd_bind (glfd);
+ }
+
+ glfs_subvol_done (fs, subvol);
+
+ return glfd;
+}
+
+struct glfs_object *
+glfs_h_creat (struct glfs *fs, struct glfs_object *parent, const char *path,
+ int flags, mode_t mode, struct stat *stat)
+{
+ int ret = -1;
+ struct glfs_fd *glfd = NULL;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ uuid_t gfid;
+ dict_t *xattr_req = NULL;
+ struct glfs_object *object = NULL;
+
+ /* validate in args */
+ if ((fs == NULL) || (parent == NULL) || (path == NULL)) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in arg objects inode in correlation to the xlator */
+ inode = glfs_resolve_inode (fs, subvol, parent);
+ if (!inode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ xattr_req = dict_new ();
+ if (!xattr_req) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ uuid_generate (gfid);
+ ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16);
+ if (ret) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ GLFS_LOC_FILL_PINODE (inode, loc, ret, errno, out, path);
+
+ glfd = glfs_fd_new (fs);
+ if (!glfd)
+ goto out;
+
+ glfd->fd = fd_create (loc.inode, getpid());
+ if (!glfd->fd) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ /* fop/op */
+ ret = syncop_create (subvol, &loc, flags, mode, glfd->fd,
+ xattr_req, &iatt);
+
+ /* populate out args */
+ if (ret == 0) {
+ /* TODO: If the inode existed in the cache (say file already
+ exists), then the glfs_loc_link will not update the
+ loc.inode, as a result we will have a 0000 GFID that we
+ would copy out to the object, this needs to be fixed.
+ */
+ ret = glfs_loc_link (&loc, &iatt);
+ if (ret != 0) {
+ goto out;
+ }
+
+ if (stat)
+ glfs_iatt_to_stat (fs, &iatt, stat);
+
+ ret = glfs_create_object (&loc, &object);
+ }
+
+out:
+ if (ret && object != NULL) {
+ glfs_h_close (object);
+ object = NULL;
+ }
+
+ loc_wipe(&loc);
+
+ if (inode)
+ inode_unref (inode);
+
+ if (xattr_req)
+ dict_unref (xattr_req);
+
+ if (glfd) {
+ glfs_fd_destroy (glfd);
+ glfd = NULL;
+ }
+
+ glfs_subvol_done (fs, subvol);
+
+ return object;
+}
+
+struct glfs_object *
+glfs_h_mkdir (struct glfs *fs, struct glfs_object *parent, const char *path,
+ mode_t mode, struct stat *stat)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ uuid_t gfid;
+ dict_t *xattr_req = NULL;
+ struct glfs_object *object = NULL;
+
+ /* validate in args */
+ if ((fs == NULL) || (parent == NULL) || (path == NULL)) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in arg objects inode in correlation to the xlator */
+ inode = glfs_resolve_inode (fs, subvol, parent);
+ if (!inode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ xattr_req = dict_new ();
+ if (!xattr_req) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ uuid_generate (gfid);
+ ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16);
+ if (ret) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ GLFS_LOC_FILL_PINODE (inode, loc, ret, errno, out, path);
+
+ /* fop/op */
+ ret = syncop_mkdir (subvol, &loc, mode, xattr_req, &iatt);
+
+ /* populate out args */
+ if ( ret == 0 ) {
+ ret = glfs_loc_link (&loc, &iatt);
+ if (ret != 0) {
+ goto out;
+ }
+
+ if (stat)
+ glfs_iatt_to_stat (fs, &iatt, stat);
+
+ ret = glfs_create_object (&loc, &object);
+ }
+
+out:
+ if (ret && object != NULL) {
+ glfs_h_close (object);
+ object = NULL;
+ }
+
+ loc_wipe(&loc);
+
+ if (inode)
+ inode_unref (inode);
+
+ if (xattr_req)
+ dict_unref (xattr_req);
+
+ glfs_subvol_done (fs, subvol);
+
+ return object;
+}
+
+struct glfs_object *
+glfs_h_mknod (struct glfs *fs, struct glfs_object *parent, const char *path,
+ mode_t mode, dev_t dev, struct stat *stat)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ uuid_t gfid;
+ dict_t *xattr_req = NULL;
+ struct glfs_object *object = NULL;
+
+ /* validate in args */
+ if ((fs == NULL) || (parent == NULL) || (path == NULL)) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in arg objects inode in correlation to the xlator */
+ inode = glfs_resolve_inode (fs, subvol, parent);
+ if (!inode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ xattr_req = dict_new ();
+ if (!xattr_req) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ uuid_generate (gfid);
+ ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16);
+ if (ret) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ GLFS_LOC_FILL_PINODE (inode, loc, ret, errno, out, path);
+
+ /* fop/op */
+ ret = syncop_mknod (subvol, &loc, mode, dev, xattr_req, &iatt);
+
+ /* populate out args */
+ if (ret == 0) {
+ ret = glfs_loc_link (&loc, &iatt);
+ if (ret != 0) {
+ goto out;
+ }
+
+ if (stat)
+ glfs_iatt_to_stat (fs, &iatt, stat);
+
+ ret = glfs_create_object (&loc, &object);
+ }
+out:
+ if (ret && object != NULL) {
+ glfs_h_close (object);
+ object = NULL;
+ }
+
+ loc_wipe(&loc);
+
+ if (inode)
+ inode_unref (inode);
+
+ if (xattr_req)
+ dict_unref (xattr_req);
+
+ glfs_subvol_done (fs, subvol);
+
+ return object;
+}
+
+int
+glfs_h_unlink (struct glfs *fs, struct glfs_object *parent, const char *path)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+ loc_t loc = {0, };
+
+ /* validate in args */
+ if ((fs == NULL) || (parent == NULL) || (path == NULL)) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if ( !subvol ) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in arg objects inode in correlation to the xlator */
+ inode = glfs_resolve_inode (fs, subvol, parent);
+ if (!inode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ ret = glfs_resolve_at (fs, subvol, inode, path, &loc, NULL, 0 , 0);
+ if (ret != 0) {
+ goto out;
+ }
+
+ if (!IA_ISDIR(loc.inode->ia_type)) {
+ ret = syncop_unlink (subvol, &loc);
+ if (ret != 0) {
+ goto out;
+ }
+ } else {
+ ret = syncop_rmdir (subvol, &loc);
+ if (ret != 0) {
+ goto out;
+ }
+ }
+
+ if (ret == 0)
+ ret = glfs_loc_unlink (&loc);
+
+out:
+ loc_wipe (&loc);
+
+ if (inode)
+ inode_unref (inode);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+struct glfs_fd *
+glfs_h_opendir (struct glfs *fs, struct glfs_object *object)
+{
+ int ret = -1;
+ struct glfs_fd *glfd = NULL;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+ loc_t loc = {0, };
+
+ /* validate in args */
+ if ((fs == NULL) || (object == NULL)) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in arg objects inode in correlation to the xlator */
+ inode = glfs_resolve_inode (fs, subvol, object);
+ if (!inode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ if (!IA_ISDIR (inode->ia_type)) {
+ ret = -1;
+ errno = ENOTDIR;
+ goto out;
+ }
+
+ glfd = glfs_fd_new (fs);
+ if (!glfd)
+ goto out;
+
+ INIT_LIST_HEAD (&glfd->entries);
+
+ glfd->fd = fd_create (inode, getpid());
+ if (!glfd->fd) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ GLFS_LOC_FILL_INODE (inode, loc, out);
+
+ /* fop/op */
+ ret = syncop_opendir (subvol, &loc, glfd->fd);
+
+out:
+ loc_wipe (&loc);
+
+ if (inode)
+ inode_unref (inode);
+
+ if (ret && glfd) {
+ glfs_fd_destroy (glfd);
+ glfd = NULL;
+ } else {
+ fd_bind (glfd->fd);
+ glfs_fd_bind (glfd);
+ }
+
+ glfs_subvol_done (fs, subvol);
+
+ return glfd;
+}
+
+ssize_t
+glfs_h_extract_handle (struct glfs_object *object, unsigned char *handle,
+ int len)
+{
+ ssize_t ret = -1;
+
+ /* validate in args */
+ if (object == NULL) {
+ errno = EINVAL;
+ goto out;
+ }
+
+ if (!handle || !len) {
+ ret = GFAPI_HANDLE_LENGTH;
+ goto out;
+ }
+
+ if (len < GFAPI_HANDLE_LENGTH)
+ {
+ errno = ERANGE;
+ goto out;
+ }
+
+ memcpy (handle, object->gfid, GFAPI_HANDLE_LENGTH);
+
+ ret = GFAPI_HANDLE_LENGTH;
+
+out:
+ return ret;
+}
+
+struct glfs_object *
+glfs_h_create_from_handle (struct glfs *fs, unsigned char *handle, int len,
+ struct stat *stat)
+{
+ loc_t loc = {0, };
+ int ret = -1;
+ struct iatt iatt = {0, };
+ inode_t *newinode = NULL;
+ xlator_t *subvol = NULL;
+ struct glfs_object *object = NULL;
+
+ /* validate in args */
+ if ((fs == NULL) || (handle == NULL) || (len != GFAPI_HANDLE_LENGTH)) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ errno = EIO;
+ goto out;
+ }
+
+ memcpy (loc.gfid, handle, GFAPI_HANDLE_LENGTH);
+
+ newinode = inode_find (subvol->itable, loc.gfid);
+ if (newinode)
+ loc.inode = newinode;
+ else {
+ loc.inode = inode_new (subvol->itable);
+ if (!loc.inode) {
+ errno = ENOMEM;
+ goto out;
+ }
+ }
+
+ ret = syncop_lookup (subvol, &loc, 0, &iatt, 0, 0);
+ if (ret) {
+ gf_log (subvol->name, GF_LOG_WARNING,
+ "inode refresh of %s failed: %s",
+ uuid_utoa (loc.gfid), strerror (errno));
+ goto out;
+ }
+
+ newinode = inode_link (loc.inode, 0, 0, &iatt);
+ if (newinode)
+ inode_lookup (newinode);
+ else {
+ gf_log (subvol->name, GF_LOG_WARNING,
+ "inode linking of %s failed: %s",
+ uuid_utoa (loc.gfid), strerror (errno));
+ errno = EINVAL;
+ goto out;
+ }
+
+ /* populate stat */
+ if (stat)
+ glfs_iatt_to_stat (fs, &iatt, stat);
+
+ object = GF_CALLOC (1, sizeof(struct glfs_object),
+ glfs_mt_glfs_object_t);
+ if (object == NULL) {
+ errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
+ /* populate the return object */
+ object->inode = newinode;
+ uuid_copy (object->gfid, object->inode->gfid);
+
+out:
+ /* TODO: Check where the inode ref is being held? */
+ loc_wipe (&loc);
+
+ glfs_subvol_done (fs, subvol);
+
+ return object;
+}
+
+int
+glfs_h_close (struct glfs_object *object)
+{
+ /* Release the held reference */
+ inode_unref (object->inode);
+ GF_FREE (object);
+
+ return 0;
+}
+
+int
+glfs_h_truncate (struct glfs *fs, struct glfs_object *object, off_t offset)
+{
+ loc_t loc = {0, };
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+
+ /* validate in args */
+ if ((fs == NULL) || (object == NULL)) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in arg objects inode in correlation to the xlator */
+ inode = glfs_resolve_inode (fs, subvol, object);
+ if (!inode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ GLFS_LOC_FILL_INODE (inode, loc, out);
+
+ /* fop/op */
+ ret = syncop_truncate (subvol, &loc, (off_t)offset);
+
+ /* populate out args */
+ if (ret == 0)
+ ret = glfs_loc_unlink (&loc);
+
+out:
+ loc_wipe (&loc);
+
+ if (inode)
+ inode_unref (inode);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+struct glfs_object *
+glfs_h_symlink (struct glfs *fs, struct glfs_object *parent, const char *name,
+ const char *data, struct stat *stat)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ uuid_t gfid;
+ dict_t *xattr_req = NULL;
+ struct glfs_object *object = NULL;
+
+ /* validate in args */
+ if ((fs == NULL) || (parent == NULL) || (name == NULL) ||
+ (data == NULL)) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in arg objects inode in correlation to the xlator */
+ inode = glfs_resolve_inode (fs, subvol, parent);
+ if (!inode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ xattr_req = dict_new ();
+ if (!xattr_req) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ uuid_generate (gfid);
+ ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16);
+ if (ret) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ GLFS_LOC_FILL_PINODE (inode, loc, ret, errno, out, name);
+
+ /* fop/op */
+ ret = syncop_symlink (subvol, &loc, data, xattr_req, &iatt);
+
+ /* populate out args */
+ if (ret == 0) {
+ /* TODO: If the inode existed in the cache (say file already
+ * exists), then the glfs_loc_link will not update the
+ * loc.inode, as a result we will have a 0000 GFID that we
+ * would copy out to the object, this needs to be fixed.
+ */
+ ret = glfs_loc_link (&loc, &iatt);
+ if (ret != 0) {
+ goto out;
+ }
+
+ if (stat)
+ glfs_iatt_to_stat (fs, &iatt, stat);
+
+ ret = glfs_create_object (&loc, &object);
+ }
+
+out:
+ if (ret && object != NULL) {
+ glfs_h_close (object);
+ object = NULL;
+ }
+
+ loc_wipe(&loc);
+
+ if (inode)
+ inode_unref (inode);
+
+ if (xattr_req)
+ dict_unref (xattr_req);
+
+ glfs_subvol_done (fs, subvol);
+
+ return object;
+}
+
+int
+glfs_h_readlink (struct glfs *fs, struct glfs_object *object, char *buf,
+ size_t bufsiz)
+{
+ loc_t loc = {0, };
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+ char *linkval = NULL;
+
+ /* validate in args */
+ if ((fs == NULL) || (object == NULL) || (buf == NULL)) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in arg objects inode in correlation to the xlator */
+ inode = glfs_resolve_inode (fs, subvol, object);
+ if (!inode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ GLFS_LOC_FILL_INODE (inode, loc, out);
+
+ /* fop/op */
+ ret = syncop_readlink (subvol, &loc, &linkval, bufsiz);
+
+ /* populate out args */
+ if (ret > 0)
+ memcpy (buf, linkval, ret);
+
+out:
+ loc_wipe (&loc);
+
+ if (inode)
+ inode_unref (inode);
+
+ if (linkval)
+ GF_FREE (linkval);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+int
+glfs_h_link (struct glfs *fs, struct glfs_object *linksrc,
+ struct glfs_object *parent, const char *name)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+ inode_t *pinode = NULL;
+ loc_t oldloc = {0, };
+ loc_t newloc = {0, };
+
+ /* validate in args */
+ if ((fs == NULL) || (linksrc == NULL) || (parent == NULL) ||
+ (name == NULL)) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in arg objects inode in correlation to the xlator */
+ inode = glfs_resolve_inode (fs, subvol, linksrc);
+ if (!inode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ if (inode->ia_type == IA_IFDIR) {
+ ret = -1;
+ errno = EISDIR;
+ goto out;
+ }
+
+ GLFS_LOC_FILL_INODE (inode, oldloc, out);
+
+ /* get/refresh the in arg objects inode in correlation to the xlator */
+ pinode = glfs_resolve_inode (fs, subvol, parent);
+ if (!pinode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ /* setup newloc based on parent */
+ newloc.parent = inode_ref (pinode);
+ newloc.name = name;
+ ret = glfs_loc_touchup (&newloc);
+ if (ret != 0) {
+ errno = EINVAL;
+ goto out;
+ }
+
+ /* Filling the inode of the hard link to be same as that of the
+ * original file
+ */
+ newloc.inode = inode_ref (inode);
+
+ /* fop/op */
+ ret = syncop_link (subvol, &oldloc, &newloc);
+
+ if (ret == 0)
+ /* TODO: No iatt to pass as there has been no lookup */
+ ret = glfs_loc_link (&newloc, NULL);
+out:
+ loc_wipe (&oldloc);
+ loc_wipe (&newloc);
+
+ if (inode)
+ inode_unref (inode);
+
+ if (pinode)
+ inode_unref (pinode);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+int
+glfs_h_rename (struct glfs *fs, struct glfs_object *olddir, const char *oldname,
+ struct glfs_object *newdir, const char *newname)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ inode_t *oldpinode = NULL;
+ inode_t *newpinode = NULL;
+ loc_t oldloc = {0, };
+ loc_t newloc = {0, };
+ struct iatt oldiatt = {0, };
+ struct iatt newiatt = {0, };
+
+ /* validate in args */
+ if ((fs == NULL) || (olddir == NULL) || (oldname == NULL) ||
+ (newdir == NULL) || (newname == NULL)) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if ( !subvol ) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in arg objects inode in correlation to the xlator */
+ oldpinode = glfs_resolve_inode (fs, subvol, olddir);
+ if (!oldpinode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ ret = glfs_resolve_at (fs, subvol, oldpinode, oldname, &oldloc,
+ &oldiatt, 0 , 0);
+ if (ret != 0) {
+ goto out;
+ }
+
+ /* get/refresh the in arg objects inode in correlation to the xlator */
+ newpinode = glfs_resolve_inode (fs, subvol, newdir);
+ if (!newpinode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ ret = glfs_resolve_at (fs, subvol, newpinode, newname, &newloc,
+ &newiatt, 0, 0);
+
+ if (ret && errno != ENOENT && newloc.parent)
+ goto out;
+
+ if (newiatt.ia_type != IA_INVAL) {
+ if ((oldiatt.ia_type == IA_IFDIR) !=
+ (newiatt.ia_type == IA_IFDIR)) {
+ /* Either both old and new must be dirs,
+ * or both must be non-dirs. Else, fail.
+ */
+ ret = -1;
+ errno = EISDIR;
+ goto out;
+ }
+ }
+
+ /* TODO: check if new or old is a prefix of the other, and fail EINVAL */
+
+ ret = syncop_rename (subvol, &oldloc, &newloc);
+
+ if (ret == 0)
+ inode_rename (oldloc.parent->table, oldloc.parent, oldloc.name,
+ newloc.parent, newloc.name, oldloc.inode,
+ &oldiatt);
+
+out:
+ loc_wipe (&oldloc);
+ loc_wipe (&newloc);
+
+ if (oldpinode)
+ inode_unref (oldpinode);
+
+ if (newpinode)
+ inode_unref (newpinode);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
diff --git a/api/src/glfs-handles.h b/api/src/glfs-handles.h
new file mode 100644
index 000000000..437f2cbc8
--- /dev/null
+++ b/api/src/glfs-handles.h
@@ -0,0 +1,143 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GLFS_HANDLES_H
+#define _GLFS_HANDLES_H
+
+#include "glfs.h"
+
+/* GLFS OBJECT BASED OPERATIONS
+ *
+ * The following APIs are introduced to provide an API framework that can work
+ * with gluster objects (files and directories), instead of absolute paths.
+ *
+ * The following API set can be related to the POSIX *at interfaces (like
+ * openat (2)). The intention of these APIs is to be able to operate based
+ * on parent object and looking up or creating child objects within, OR to be
+ * used on the actual object thus looked up or created, and retrieve information
+ * regarding the same.
+ *
+ * The APIs also provide for generating an opaque invariant handle to the
+ * object, that can later be used to lookup the object, instead of the regular
+ * glfs_h_* variants. The APIs that provide this behaviour are,
+ * glfs_h_extract_handle and glfs_h_create_from_handle.
+ *
+ * The object handles can be transitioned to fd based operations as supported
+ * by glfs.h calls, using the glfs_h_open call. This provides a way to move
+ * from objects to fd's akin to moving from path to fd for required operations.
+ *
+ * NOTE: The opaque invariant handle is the GFID of the object in reality, but
+ * maintained as an opaque data value, for potential internal changes to the
+ * same without impacting the caller.
+ *
+ * NOTE: Currently looking up an object can create multiple object handles to
+ * the same, i.e distinct glfs_object *. Hence each such looked up or received
+ * handle from other calls, would need to be closed. In the future, for a given
+ * object these pointers would be the same, and an ease of use API to forget all
+ * instances of this bject would be provided (instead of a per lookup close).
+ * This should not change the APIs in their current form.
+ *
+ */
+
+/* Values for valid falgs to be used when using XXXsetattr, to set multiple
+ attribute values passed via the related stat structure.
+ */
+#define GFAPI_SET_ATTR_MODE 0x1
+#define GFAPI_SET_ATTR_UID 0x2
+#define GFAPI_SET_ATTR_GID 0x4
+#define GFAPI_SET_ATTR_SIZE 0x8
+#define GFAPI_SET_ATTR_ATIME 0x10
+#define GFAPI_SET_ATTR_MTIME 0x20
+
+/* Handle length for object handles returned from glfs_h_extract_handle or
+ * glfs_h_create_from_handle */
+#define GFAPI_HANDLE_LENGTH 16
+
+__BEGIN_DECLS
+
+/*
+ * Notes:
+ *
+ * The file object handle. One per looked up, created file/directory
+ *
+ * This had been introduced to facilitate gfid/inode based gfapi
+ * - a requirement introduced by nfs-ganesha
+ */
+struct glfs_object;
+typedef struct glfs_object glfs_object_t;
+
+/* Handle based operations */
+/* Operations that generate handles */
+struct glfs_object *glfs_h_lookupat (struct glfs *fs,
+ struct glfs_object *parent,
+ const char *path, struct stat *stat);
+
+struct glfs_object *glfs_h_creat (struct glfs *fs, struct glfs_object *parent,
+ const char *path, int flags, mode_t mode,
+ struct stat *sb);
+
+struct glfs_object *glfs_h_mkdir (struct glfs *fs, struct glfs_object *parent,
+ const char *path, mode_t flags,
+ struct stat *sb);
+
+struct glfs_object *glfs_h_mknod (struct glfs *fs, struct glfs_object *parent,
+ const char *path, mode_t mode, dev_t dev,
+ struct stat *sb);
+
+struct glfs_object *glfs_h_symlink (struct glfs *fs, struct glfs_object *parent,
+ const char *name, const char *data,
+ struct stat *stat);
+
+/* Operations on the actual objects */
+int glfs_h_unlink (struct glfs *fs, struct glfs_object *parent,
+ const char *path);
+
+int glfs_h_close (struct glfs_object *object);
+
+int glfs_caller_specific_init (void *uid_caller_key, void *gid_caller_key,
+ void *future);
+
+int glfs_h_truncate (struct glfs *fs, struct glfs_object *object, off_t offset);
+
+int glfs_h_stat(struct glfs *fs, struct glfs_object *object, struct stat *stat);
+
+int glfs_h_getattrs (struct glfs *fs, struct glfs_object *object,
+ struct stat *stat);
+
+int glfs_h_setattrs (struct glfs *fs, struct glfs_object *object,
+ struct stat *sb, int valid);
+
+int glfs_h_readlink (struct glfs *fs, struct glfs_object *object, char *buf,
+ size_t bufsiz);
+
+int glfs_h_link (struct glfs *fs, struct glfs_object *linktgt,
+ struct glfs_object *parent, const char *name);
+
+int glfs_h_rename (struct glfs *fs, struct glfs_object *olddir,
+ const char *oldname, struct glfs_object *newdir,
+ const char *newname);
+
+/* Operations enabling opaque invariant handle to object transitions */
+ssize_t glfs_h_extract_handle (struct glfs_object *object,
+ unsigned char *handle, int len);
+
+struct glfs_object *glfs_h_create_from_handle (struct glfs *fs,
+ unsigned char *handle, int len,
+ struct stat *stat);
+
+/* Operations enabling object handles to fd transitions */
+struct glfs_fd *glfs_h_opendir (struct glfs *fs, struct glfs_object *object);
+
+struct glfs_fd *glfs_h_open (struct glfs *fs, struct glfs_object *object,
+ int flags);
+
+__END_DECLS
+
+#endif /* !_GLFS_HANDLES_H */ \ No newline at end of file
diff --git a/api/src/glfs-internal.h b/api/src/glfs-internal.h
index c2fc0ecc1..ec1d5579d 100644
--- a/api/src/glfs-internal.h
+++ b/api/src/glfs-internal.h
@@ -14,6 +14,46 @@
#include "xlator.h"
+#define GLFS_SYMLINK_MAX_FOLLOW 2048
+
+#define DEFAULT_REVAL_COUNT 1
+
+#define ESTALE_RETRY(ret,errno,reval,loc,label) do { \
+ if (ret == -1 && errno == ESTALE) { \
+ if (reval < DEFAULT_REVAL_COUNT) { \
+ reval++; \
+ loc_wipe (loc); \
+ goto label; \
+ } \
+ } \
+ } while (0)
+
+#define GLFS_LOC_FILL_INODE(oinode, loc, label) do { \
+ loc.inode = inode_ref (oinode); \
+ uuid_copy (loc.gfid, oinode->gfid); \
+ ret = glfs_loc_touchup (&loc); \
+ if (ret != 0) { \
+ errno = EINVAL; \
+ goto label; \
+ } \
+ } while (0)
+
+#define GLFS_LOC_FILL_PINODE(pinode, loc, ret, errno, label, path) do { \
+ loc.inode = inode_new (pinode->table); \
+ if (!loc.inode) { \
+ ret = -1; \
+ errno = ENOMEM; \
+ goto label; \
+ } \
+ loc.parent = inode_ref (pinode); \
+ loc.name = path; \
+ ret = glfs_loc_touchup (&loc); \
+ if (ret != 0) { \
+ errno = EINVAL; \
+ goto label; \
+ } \
+ } while (0)
+
struct glfs;
typedef int (*glfs_init_cbk) (struct glfs *fs, int ret);
@@ -33,16 +73,37 @@ struct glfs {
int err;
xlator_t *active_subvol;
+ xlator_t *next_subvol;
+ xlator_t *old_subvol;
char *oldvolfile;
ssize_t oldvollen;
+
+ inode_t *cwd;
+
+ uint32_t dev_id; /* Used to fill st_dev in struct stat */
+
+ struct list_head openfds;
+
+ gf_boolean_t migration_in_progress;
};
struct glfs_fd {
+ struct list_head openfds;
+ struct glfs *fs;
off_t offset;
- fd_t *fd;
+ fd_t *fd; /* Currently guared by @fs->mutex. TODO: per-glfd lock */
struct list_head entries;
gf_dirent_t *next;
+ struct dirent *readdirbuf;
+};
+
+/* glfs object handle introduced for the alternate gfapi implementation based
+ on glfs handles/gfid/inode
+*/
+struct glfs_object {
+ inode_t *inode;
+ uuid_t gfid;
};
#define DEFAULT_EVENT_POOL_SIZE 16384
@@ -54,10 +115,14 @@ int glfs_mgmt_init (struct glfs *fs);
void glfs_init_done (struct glfs *fs, int ret);
int glfs_process_volfp (struct glfs *fs, FILE *fp);
int glfs_resolve (struct glfs *fs, xlator_t *subvol, const char *path, loc_t *loc,
- struct iatt *iatt);
+ struct iatt *iatt, int reval);
int glfs_lresolve (struct glfs *fs, xlator_t *subvol, const char *path, loc_t *loc,
- struct iatt *iatt);
-void glfs_first_lookup (xlator_t *subvol);
+ struct iatt *iatt, int reval);
+fd_t *glfs_resolve_fd (struct glfs *fs, xlator_t *subvol, struct glfs_fd *glfd);
+
+fd_t *__glfs_migrate_fd (struct glfs *fs, xlator_t *subvol, struct glfs_fd *glfd);
+
+int glfs_first_lookup (xlator_t *subvol);
static inline void
__glfs_entry_fs (struct glfs *fs)
@@ -73,10 +138,63 @@ __glfs_entry_fd (struct glfs_fd *fd)
}
+/*
+ By default all lock attempts from user context must
+ use glfs_lock() and glfs_unlock(). This allows
+ for a safe implementation of graph migration where
+ we can give up the mutex during syncop calls so
+ that bottom up calls (particularly CHILD_UP notify)
+ can do a mutex_lock() on @glfs without deadlocking
+ the filesystem
+*/
+static inline int
+glfs_lock (struct glfs *fs)
+{
+ pthread_mutex_lock (&fs->mutex);
+
+ while (!fs->init)
+ pthread_cond_wait (&fs->cond, &fs->mutex);
+
+ while (fs->migration_in_progress)
+ pthread_cond_wait (&fs->cond, &fs->mutex);
+
+ return 0;
+}
+
+
+static inline void
+glfs_unlock (struct glfs *fs)
+{
+ pthread_mutex_unlock (&fs->mutex);
+}
+
+
void glfs_fd_destroy (struct glfs_fd *glfd);
-xlator_t * glfs_fd_subvol (struct glfs_fd *glfd);
+struct glfs_fd *glfs_fd_new (struct glfs *fs);
+void glfs_fd_bind (struct glfs_fd *glfd);
xlator_t * glfs_active_subvol (struct glfs *fs);
+xlator_t * __glfs_active_subvol (struct glfs *fs);
+void glfs_subvol_done (struct glfs *fs, xlator_t *subvol);
+
+inode_t * glfs_refresh_inode (xlator_t *subvol, inode_t *inode);
+
+inode_t *glfs_cwd_get (struct glfs *fs);
+int glfs_cwd_set (struct glfs *fs, inode_t *inode);
+inode_t *glfs_resolve_inode (struct glfs *fs, xlator_t *subvol,
+ struct glfs_object *object);
+int glfs_create_object (loc_t *loc, struct glfs_object **retobject);
+int __glfs_cwd_set (struct glfs *fs, inode_t *inode);
+
+int glfs_resolve_base (struct glfs *fs, xlator_t *subvol, inode_t *inode,
+ struct iatt *iatt);
+int glfs_resolve_at (struct glfs *fs, xlator_t *subvol, inode_t *at,
+ const char *origpath, loc_t *loc, struct iatt *iatt,
+ int follow, int reval);
+int glfs_loc_touchup (loc_t *loc);
+void glfs_iatt_to_stat (struct glfs *fs, struct iatt *iatt, struct stat *stat);
+int glfs_loc_link (loc_t *loc, struct iatt *iatt);
+int glfs_loc_unlink (loc_t *loc);
#endif /* !_GLFS_INTERNAL_H */
diff --git a/api/src/glfs-master.c b/api/src/glfs-master.c
index 0806c3077..c02534c18 100644
--- a/api/src/glfs-master.c
+++ b/api/src/glfs-master.c
@@ -22,26 +22,58 @@
#include "xlator.h"
#include "glusterfs.h"
+
#include "glfs-internal.h"
+#include "glfs-mem-types.h"
int
glfs_graph_setup (struct glfs *fs, glusterfs_graph_t *graph)
{
- if (fs->active_subvol == graph->top)
- return 0;
+ xlator_t *new_subvol = NULL;
+ xlator_t *old_subvol = NULL;
+ inode_table_t *itable = NULL;
+ int ret = -1;
+
+ new_subvol = graph->top;
+ /* This is called in a bottom-up context, it should specifically
+ NOT be glfs_lock()
+ */
pthread_mutex_lock (&fs->mutex);
{
- fs->active_subvol = graph->top;
- pthread_cond_broadcast (&fs->cond);
+ if (new_subvol->switched ||
+ new_subvol == fs->active_subvol ||
+ new_subvol == fs->next_subvol) {
+ /* Spurious CHILD_UP event on old graph */
+ ret = 0;
+ goto unlock;
+ }
+
+ if (!new_subvol->itable) {
+ itable = inode_table_new (131072, new_subvol);
+ if (!itable) {
+ errno = ENOMEM;
+ ret = -1;
+ goto unlock;
+ }
+
+ new_subvol->itable = itable;
+ }
+
+ old_subvol = fs->next_subvol;
+ fs->next_subvol = new_subvol;
+ fs->next_subvol->winds++; /* first ref */
+ ret = 0;
}
+unlock:
pthread_mutex_unlock (&fs->mutex);
- gf_log ("glfs-master", GF_LOG_INFO, "switched to graph %s (%d)",
- uuid_utoa ((unsigned char *)graph->graph_uuid), graph->id);
+ if (old_subvol)
+ /* wasn't picked up so far, skip */
+ glfs_subvol_done (fs, old_subvol);
- return 0;
+ return ret;
}
@@ -83,6 +115,18 @@ notify (xlator_t *this, int event, void *data, ...)
int
mem_acct_init (xlator_t *this)
{
+ int ret = -1;
+
+ if (!this)
+ return ret;
+
+ ret = xlator_mem_acct_init (this, glfs_mt_end + 1);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to initialise "
+ "memory accounting");
+ return ret;
+ }
+
return 0;
}
@@ -101,13 +145,10 @@ fini (xlator_t *this)
}
-struct xlator_dumpops dumpops = {
-};
+struct xlator_dumpops dumpops;
-struct xlator_fops fops = {
-};
+struct xlator_fops fops;
-struct xlator_cbks cbks = {
-};
+struct xlator_cbks cbks;
diff --git a/api/src/glfs-mem-types.h b/api/src/glfs-mem-types.h
index e2e9b4c1e..3301b3da5 100644
--- a/api/src/glfs-mem-types.h
+++ b/api/src/glfs-mem-types.h
@@ -16,13 +16,16 @@
#define GF_MEM_TYPE_START (gf_common_mt_end + 1)
enum glfs_mem_types_ {
- glfs_mt_glfs_t,
+ glfs_mt_glfs_t = GF_MEM_TYPE_START,
glfs_mt_call_pool_t,
glfs_mt_xlator_t,
glfs_mt_glfs_fd_t,
glfs_mt_glfs_io_t,
glfs_mt_volfile_t,
- glfs_mt_end
+ glfs_mt_xlator_cmdline_option_t,
+ glfs_mt_glfs_object_t,
+ glfs_mt_readdirbuf_t,
+ glfs_mt_end
};
#endif
diff --git a/api/src/glfs-mgmt.c b/api/src/glfs-mgmt.c
index 2ead93863..6843e9cb3 100644
--- a/api/src/glfs-mgmt.c
+++ b/api/src/glfs-mgmt.c
@@ -114,7 +114,7 @@ mgmt_cbk_event (struct rpc_clnt *rpc, void *mydata, void *data)
}
-rpcclnt_cb_actor_t gluster_cbk_actors[] = {
+rpcclnt_cb_actor_t mgmt_cbk_actors[] = {
[GF_CBK_FETCHSPEC] = {"FETCHSPEC", GF_CBK_FETCHSPEC, mgmt_cbk_spec },
[GF_CBK_EVENT_NOTIFY] = {"EVENTNOTIFY", GF_CBK_EVENT_NOTIFY,
mgmt_cbk_event},
@@ -125,7 +125,7 @@ struct rpcclnt_cb_program mgmt_cbk_prog = {
.progname = "GlusterFS Callback",
.prognum = GLUSTER_CBK_PROGRAM,
.progver = GLUSTER_CBK_VERSION,
- .actors = gluster_cbk_actors,
+ .actors = mgmt_cbk_actors,
.numactors = GF_CBK_MAXVALUE,
};
@@ -203,162 +203,6 @@ out:
static int
-xlator_equal_rec (xlator_t *xl1, xlator_t *xl2)
-{
- xlator_list_t *trav1 = NULL;
- xlator_list_t *trav2 = NULL;
- int ret = 0;
-
- if (xl1 == NULL || xl2 == NULL) {
- gf_log ("xlator", GF_LOG_DEBUG, "invalid argument");
- return -1;
- }
-
- trav1 = xl1->children;
- trav2 = xl2->children;
-
- while (trav1 && trav2) {
- ret = xlator_equal_rec (trav1->xlator, trav2->xlator);
- if (ret) {
- gf_log ("glfs-mgmt", GF_LOG_DEBUG,
- "xlators children not equal");
- goto out;
- }
-
- trav1 = trav1->next;
- trav2 = trav2->next;
- }
-
- if (trav1 || trav2) {
- ret = -1;
- goto out;
- }
-
- if (strcmp (xl1->name, xl2->name)) {
- ret = -1;
- goto out;
- }
-out :
- return ret;
-}
-
-
-static gf_boolean_t
-is_graph_topology_equal (glusterfs_graph_t *graph1,
- glusterfs_graph_t *graph2)
-{
- xlator_t *trav1 = NULL;
- xlator_t *trav2 = NULL;
- gf_boolean_t ret = _gf_true;
-
- trav1 = graph1->first;
- trav2 = graph2->first;
-
- ret = xlator_equal_rec (trav1, trav2);
-
- if (ret) {
- gf_log ("glfs-mgmt", GF_LOG_DEBUG,
- "graphs are not equal");
- ret = _gf_false;
- goto out;
- }
-
- ret = _gf_true;
- gf_log ("glfs-mgmt", GF_LOG_DEBUG,
- "graphs are equal");
-
-out:
- return ret;
-}
-
-
-/* Function has 3types of return value 0, -ve , 1
- * return 0 =======> reconfiguration of options has succeeded
- * return 1 =======> the graph has to be reconstructed and all the xlators should be inited
- * return -1(or -ve) =======> Some Internal Error occurred during the operation
- */
-static int
-glusterfs_volfile_reconfigure (struct glfs *fs, FILE *newvolfile_fp)
-{
- glusterfs_graph_t *oldvolfile_graph = NULL;
- glusterfs_graph_t *newvolfile_graph = NULL;
- FILE *oldvolfile_fp = NULL;
- glusterfs_ctx_t *ctx = NULL;
-
- int ret = -1;
-
- oldvolfile_fp = tmpfile ();
- if (!oldvolfile_fp)
- goto out;
-
- if (!fs->oldvollen) {
- ret = 1; // Has to call INIT for the whole graph
- goto out;
- }
- fwrite (fs->oldvolfile, fs->oldvollen, 1, oldvolfile_fp);
- fflush (oldvolfile_fp);
- if (ferror (oldvolfile_fp)) {
- goto out;
- }
-
- oldvolfile_graph = glusterfs_graph_construct (oldvolfile_fp);
- if (!oldvolfile_graph) {
- goto out;
- }
-
- newvolfile_graph = glusterfs_graph_construct (newvolfile_fp);
- if (!newvolfile_graph) {
- goto out;
- }
-
- if (!is_graph_topology_equal (oldvolfile_graph,
- newvolfile_graph)) {
-
- ret = 1;
- gf_log ("glfs-mgmt", GF_LOG_DEBUG,
- "Graph topology not equal(should call INIT)");
- goto out;
- }
-
- gf_log ("glfs-mgmt", GF_LOG_DEBUG,
- "Only options have changed in the new "
- "graph");
-
- ctx = fs->ctx;
-
- if (!ctx) {
- gf_log ("glfs-mgmt", GF_LOG_ERROR,
- "glusterfs_ctx_get() returned NULL");
- goto out;
- }
-
- oldvolfile_graph = ctx->active;
-
- if (!oldvolfile_graph) {
- gf_log ("glfs-mgmt", GF_LOG_ERROR,
- "glusterfs_ctx->active is NULL");
- goto out;
- }
-
- /* */
- ret = glusterfs_graph_reconfigure (oldvolfile_graph,
- newvolfile_graph);
- if (ret) {
- gf_log ("glfs-mgmt", GF_LOG_DEBUG,
- "Could not reconfigure new options in old graph");
- goto out;
- }
-
- ret = 0;
-out:
- if (oldvolfile_fp)
- fclose (oldvolfile_fp);
-
- return ret;
-}
-
-
-static int
glusterfs_oldvolfile_update (struct glfs *fs, char *volfile, ssize_t size)
{
int ret = -1;
@@ -416,6 +260,7 @@ mgmt_getspec_cbk (struct rpc_req *req, struct iovec *iov, int count,
gf_log (frame->this->name, GF_LOG_ERROR,
"failed to get the 'volume file' from server");
ret = -1;
+ errno = rsp.op_errno;
goto out;
}
@@ -450,7 +295,8 @@ mgmt_getspec_cbk (struct rpc_req *req, struct iovec *iov, int count,
* return -1(or -ve) =======> Some Internal Error occurred during the operation
*/
- ret = glusterfs_volfile_reconfigure (fs, tmpfp);
+ ret = glusterfs_volfile_reconfigure (fs->oldvollen, tmpfp, fs->ctx,
+ fs->oldvolfile);
if (ret == 0) {
gf_log ("glusterfsd-mgmt", GF_LOG_DEBUG,
"No need to re-load volfile, reconfigure done");
@@ -477,6 +323,14 @@ out:
if (rsp.spec)
free (rsp.spec);
+ // Stop if server is running at an unsupported op-version
+ if (ENOTSUP == ret) {
+ gf_log ("mgmt", GF_LOG_ERROR, "Server is operating at an "
+ "op-version which is not supported");
+ errno = ENOTSUP;
+ glfs_init_done (fs, -1);
+ }
+
if (ret && ctx && !ctx->active) {
/* Do it only for the first time */
/* Failed to get the volume file, something wrong,
@@ -484,8 +338,11 @@ out:
gf_log ("glfs-mgmt", GF_LOG_ERROR,
"failed to fetch volume file (key:%s)",
ctx->cmd_args.volfile_id);
- if (!need_retry)
+ if (!need_retry) {
+ if (!errno)
+ errno = EINVAL;
glfs_init_done (fs, -1);
+ }
}
if (tmpfp)
@@ -503,6 +360,7 @@ glfs_volfile_fetch (struct glfs *fs)
int ret = 0;
call_frame_t *frame = NULL;
glusterfs_ctx_t *ctx = NULL;
+ dict_t *dict = NULL;
ctx = fs->ctx;
cmd_args = &ctx->cmd_args;
@@ -512,10 +370,41 @@ glfs_volfile_fetch (struct glfs *fs)
req.key = cmd_args->volfile_id;
req.flags = 0;
+ dict = dict_new ();
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+
+ // Set the supported min and max op-versions, so glusterd can make a
+ // decision
+ ret = dict_set_int32 (dict, "min-op-version", GD_OP_VERSION_MIN);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Failed to set min-op-version"
+ " in request dict");
+ goto out;
+ }
+
+ ret = dict_set_int32 (dict, "max-op-version", GD_OP_VERSION_MAX);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Failed to set max-op-version"
+ " in request dict");
+ goto out;
+ }
+
+ ret = dict_allocate_and_serialize (dict, &req.xdata.xdata_val,
+ &req.xdata.xdata_len);
+ if (ret < 0) {
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "Failed to serialize dictionary");
+ goto out;
+ }
+
ret = mgmt_submit_request (&req, frame, ctx, &clnt_handshake_prog,
GF_HNDSK_GETSPEC, mgmt_getspec_cbk,
(xdrproc_t)xdr_gf_getspec_req);
- return ret;
+out:
+ return ret;
}
@@ -544,9 +433,10 @@ mgmt_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
gf_log ("glfs-mgmt", GF_LOG_INFO,
"%d connect attempts left",
cmd_args->max_connect_attempts);
- if (0 >= cmd_args->max_connect_attempts)
+ if (0 >= cmd_args->max_connect_attempts) {
+ errno = ENOTCONN;
glfs_init_done (fs, -1);
- break;
+ }
}
break;
case RPC_CLNT_CONNECT:
@@ -555,10 +445,11 @@ mgmt_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
ret = glfs_volfile_fetch (fs);
if (ret && ctx && (ctx->active == NULL)) {
/* Do it only for the first time */
- /* Exit the process.. there is some wrong options */
+ /* Exit the process.. there are some wrong options */
gf_log ("glfs-mgmt", GF_LOG_ERROR,
"failed to fetch volume file (key:%s)",
ctx->cmd_args.volfile_id);
+ errno = EINVAL;
glfs_init_done (fs, -1);
}
diff --git a/api/src/glfs-resolve.c b/api/src/glfs-resolve.c
index f7754d201..4ca2eb6fc 100644
--- a/api/src/glfs-resolve.c
+++ b/api/src/glfs-resolve.c
@@ -32,9 +32,11 @@
#include "glfs-internal.h"
+#define graphid_str(subvol) (uuid_utoa((unsigned char *)subvol->graph->graph_uuid))
-void
-glfs_first_lookup (xlator_t *subvol)
+
+int
+glfs_first_lookup_safe (xlator_t *subvol)
{
loc_t loc = {0, };
int ret = -1;
@@ -49,10 +51,89 @@ glfs_first_lookup (xlator_t *subvol)
gf_log (subvol->name, GF_LOG_DEBUG, "first lookup complete %d", ret);
- return;
+ return ret;
+}
+
+
+int
+__glfs_first_lookup (struct glfs *fs, xlator_t *subvol)
+{
+ int ret = -1;
+
+ fs->migration_in_progress = 1;
+ pthread_mutex_unlock (&fs->mutex);
+ {
+ ret = glfs_first_lookup_safe (subvol);
+ }
+ pthread_mutex_lock (&fs->mutex);
+ fs->migration_in_progress = 0;
+ pthread_cond_broadcast (&fs->cond);
+
+ return ret;
+}
+
+
+inode_t *
+glfs_refresh_inode_safe (xlator_t *subvol, inode_t *oldinode)
+{
+ loc_t loc = {0, };
+ int ret = -1;
+ struct iatt iatt = {0, };
+ inode_t *newinode = NULL;
+
+
+ if (!oldinode)
+ return NULL;
+
+ if (oldinode->table->xl == subvol)
+ return inode_ref (oldinode);
+
+ newinode = inode_find (subvol->itable, oldinode->gfid);
+ if (newinode)
+ return newinode;
+
+ uuid_copy (loc.gfid, oldinode->gfid);
+ loc.inode = inode_new (subvol->itable);
+ if (!loc.inode)
+ return NULL;
+
+ ret = syncop_lookup (subvol, &loc, 0, &iatt, 0, 0);
+
+ if (ret) {
+ gf_log (subvol->name, GF_LOG_WARNING,
+ "inode refresh of %s failed: %s",
+ uuid_utoa (oldinode->gfid), strerror (errno));
+ loc_wipe (&loc);
+ return NULL;
+ }
+
+ newinode = inode_link (loc.inode, 0, 0, &iatt);
+ if (newinode)
+ inode_lookup (newinode);
+
+ loc_wipe (&loc);
+
+ return newinode;
}
+inode_t *
+__glfs_refresh_inode (struct glfs *fs, xlator_t *subvol, inode_t *inode)
+{
+ inode_t *newinode = NULL;
+
+ fs->migration_in_progress = 1;
+ pthread_mutex_unlock (&fs->mutex);
+ {
+ newinode = glfs_refresh_inode_safe (subvol, inode);
+ }
+ pthread_mutex_lock (&fs->mutex);
+ fs->migration_in_progress = 0;
+ pthread_cond_broadcast (&fs->cond);
+
+ return newinode;
+}
+
int
glfs_loc_touchup (loc_t *loc)
{
@@ -60,7 +141,10 @@ glfs_loc_touchup (loc_t *loc)
int ret = -1;
char *bn = NULL;
- ret = inode_path (loc->parent, loc->name, &path);
+ if (loc->parent)
+ ret = inode_path (loc->parent, loc->name, &path);
+ else
+ ret = inode_path (loc->inode, 0, &path);
loc->path = path;
@@ -98,7 +182,7 @@ glfs_resolve_symlink (struct glfs *fs, xlator_t *subvol, inode_t *inode,
ret = syncop_readlink (subvol, &loc, &path, 4096);
- if (ret)
+ if (ret < 0)
goto out;
if (lpath)
@@ -109,7 +193,7 @@ out:
}
-void
+int
glfs_resolve_base (struct glfs *fs, xlator_t *subvol, inode_t *inode,
struct iatt *iatt)
{
@@ -128,12 +212,15 @@ glfs_resolve_base (struct glfs *fs, xlator_t *subvol, inode_t *inode,
ret = syncop_lookup (subvol, &loc, NULL, iatt, NULL, NULL);
out:
loc_wipe (&loc);
+
+ return ret;
}
inode_t *
glfs_resolve_component (struct glfs *fs, xlator_t *subvol, inode_t *parent,
- const char *component, struct iatt *iatt)
+ const char *component, struct iatt *iatt,
+ int force_lookup)
{
loc_t loc = {0, };
inode_t *inode = NULL;
@@ -149,23 +236,23 @@ glfs_resolve_component (struct glfs *fs, xlator_t *subvol, inode_t *parent,
loc.parent = inode_ref (parent);
uuid_copy (loc.pargfid, parent->gfid);
- xattr_req = dict_new ();
- if (!xattr_req) {
- errno = ENOMEM;
- goto out;
- }
-
- ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16);
- if (ret) {
- errno = ENOMEM;
- goto out;
- }
- loc.inode = inode_grep (parent->table, parent, component);
+ if (strcmp (component, ".") == 0)
+ loc.inode = inode_ref (parent);
+ else if (strcmp (component, "..") == 0)
+ loc.inode = inode_parent (parent, 0, 0);
+ else
+ loc.inode = inode_grep (parent->table, parent, component);
if (loc.inode) {
uuid_copy (loc.gfid, loc.inode->gfid);
reval = 1;
+
+ if (!force_lookup) {
+ inode = inode_ref (loc.inode);
+ ciatt.ia_type = inode->ia_type;
+ goto found;
+ }
} else {
uuid_generate (gfid);
loc.inode = inode_new (parent->table);
@@ -174,20 +261,35 @@ glfs_resolve_component (struct glfs *fs, xlator_t *subvol, inode_t *parent,
if (!loc.inode)
goto out;
-
glret = glfs_loc_touchup (&loc);
if (glret < 0) {
ret = -1;
goto out;
}
- ret = syncop_lookup (subvol, &loc, xattr_req, &ciatt, NULL, NULL);
+ ret = syncop_lookup (subvol, &loc, NULL, &ciatt, NULL, NULL);
if (ret && reval) {
inode_unref (loc.inode);
loc.inode = inode_new (parent->table);
- if (!loc.inode)
+ if (!loc.inode) {
+ errno = ENOMEM;
goto out;
+ }
+
+ xattr_req = dict_new ();
+ if (!xattr_req) {
+ errno = ENOMEM;
+ goto out;
+ }
+
uuid_generate (gfid);
+
+ ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16);
+ if (ret) {
+ errno = ENOMEM;
+ goto out;
+ }
+
ret = syncop_lookup (subvol, &loc, xattr_req, &ciatt,
NULL, NULL);
}
@@ -195,13 +297,14 @@ glfs_resolve_component (struct glfs *fs, xlator_t *subvol, inode_t *parent,
goto out;
inode = inode_link (loc.inode, loc.parent, component, &ciatt);
+found:
if (inode)
inode_lookup (inode);
if (iatt)
*iatt = ciatt;
out:
if (xattr_req)
- dict_destroy (xattr_req);
+ dict_unref (xattr_req);
loc_wipe (&loc);
@@ -212,7 +315,7 @@ out:
int
glfs_resolve_at (struct glfs *fs, xlator_t *subvol, inode_t *at,
const char *origpath, loc_t *loc, struct iatt *iatt,
- int follow)
+ int follow, int reval)
{
inode_t *inode = NULL;
inode_t *parent = NULL;
@@ -238,7 +341,8 @@ glfs_resolve_at (struct glfs *fs, xlator_t *subvol, inode_t *at,
} else {
inode = inode_ref (subvol->itable->root);
- glfs_resolve_base (fs, subvol, inode, &ciatt);
+ if (strcmp (path, "/") == 0)
+ glfs_resolve_base (fs, subvol, inode, &ciatt);
}
for (component = strtok_r (path, "/", &saveptr);
@@ -252,7 +356,13 @@ glfs_resolve_at (struct glfs *fs, xlator_t *subvol, inode_t *at,
parent = inode;
inode = glfs_resolve_component (fs, subvol, parent,
- component, &ciatt);
+ component, &ciatt,
+ /* force hard lookup on the last
+ component, as the caller
+ wants proper iatt filled
+ */
+ (reval || (!next_component &&
+ iatt)));
if (!inode)
break;
@@ -264,6 +374,16 @@ glfs_resolve_at (struct glfs *fs, xlator_t *subvol, inode_t *at,
char *lpath = NULL;
loc_t sym_loc = {0,};
+ if (follow > GLFS_SYMLINK_MAX_FOLLOW) {
+ errno = ELOOP;
+ ret = -1;
+ if (inode) {
+ inode_unref (inode);
+ inode = NULL;
+ }
+ break;
+ }
+
ret = glfs_resolve_symlink (fs, subvol, inode, &lpath);
inode_unref (inode);
inode = NULL;
@@ -279,7 +399,7 @@ glfs_resolve_at (struct glfs *fs, xlator_t *subvol, inode_t *at,
/* always recurisvely follow while
following symlink
*/
- 1);
+ follow + 1, reval);
if (ret == 0)
inode = inode_ref (sym_loc.inode);
loc_wipe (&sym_loc);
@@ -335,12 +455,34 @@ out:
int
+glfs_resolve_path (struct glfs *fs, xlator_t *subvol, const char *origpath,
+ loc_t *loc, struct iatt *iatt, int follow, int reval)
+{
+ int ret = -1;
+ inode_t *cwd = NULL;
+
+ if (origpath[0] == '/')
+ return glfs_resolve_at (fs, subvol, NULL, origpath, loc, iatt,
+ follow, reval);
+
+ cwd = glfs_cwd_get (fs);
+
+ ret = glfs_resolve_at (fs, subvol, cwd, origpath, loc, iatt,
+ follow, reval);
+ if (cwd)
+ inode_unref (cwd);
+
+ return ret;
+}
+
+
+int
glfs_resolve (struct glfs *fs, xlator_t *subvol, const char *origpath,
- loc_t *loc, struct iatt *iatt)
+ loc_t *loc, struct iatt *iatt, int reval)
{
int ret = -1;
- ret = glfs_resolve_at (fs, subvol, NULL, origpath, loc, iatt, 1);
+ ret = glfs_resolve_path (fs, subvol, origpath, loc, iatt, 1, reval);
return ret;
}
@@ -348,12 +490,480 @@ glfs_resolve (struct glfs *fs, xlator_t *subvol, const char *origpath,
int
glfs_lresolve (struct glfs *fs, xlator_t *subvol, const char *origpath,
- loc_t *loc, struct iatt *iatt)
+ loc_t *loc, struct iatt *iatt, int reval)
+{
+ int ret = -1;
+
+ ret = glfs_resolve_path (fs, subvol, origpath, loc, iatt, 0, reval);
+
+ return ret;
+}
+
+
+int
+glfs_migrate_fd_locks_safe (struct glfs *fs, xlator_t *oldsubvol, fd_t *oldfd,
+ xlator_t *newsubvol, fd_t *newfd)
+{
+ dict_t *lockinfo = NULL;
+ int ret = 0;
+ char uuid1[64];
+
+ if (!oldfd->lk_ctx || fd_lk_ctx_empty (oldfd->lk_ctx))
+ return 0;
+
+ newfd->lk_ctx = fd_lk_ctx_ref (oldfd->lk_ctx);
+
+ ret = syncop_fgetxattr (oldsubvol, oldfd, &lockinfo,
+ GF_XATTR_LOCKINFO_KEY);
+ if (ret < 0) {
+ gf_log (fs->volname, GF_LOG_WARNING,
+ "fgetxattr (%s) failed (%s) on graph %s (%d)",
+ uuid_utoa_r (oldfd->inode->gfid, uuid1),
+ strerror (errno),
+ graphid_str (oldsubvol), oldsubvol->graph->id);
+ goto out;
+ }
+
+ if (!dict_get (lockinfo, GF_XATTR_LOCKINFO_KEY)) {
+ gf_log (fs->volname, GF_LOG_WARNING,
+ "missing lokinfo key (%s) on graph %s (%d)",
+ uuid_utoa_r (oldfd->inode->gfid, uuid1),
+ graphid_str (oldsubvol), oldsubvol->graph->id);
+ goto out;
+ }
+
+ ret = syncop_fsetxattr (newsubvol, newfd, lockinfo, 0);
+ if (ret < 0) {
+ gf_log (fs->volname, GF_LOG_WARNING,
+ "fsetxattr (%s) failed (%s) on graph %s (%d)",
+ uuid_utoa_r (newfd->inode->gfid, uuid1),
+ strerror (errno),
+ graphid_str (newsubvol), newsubvol->graph->id);
+ goto out;
+ }
+out:
+ if (lockinfo)
+ dict_unref (lockinfo);
+ return ret;
+}
+
+
+fd_t *
+glfs_migrate_fd_safe (struct glfs *fs, xlator_t *newsubvol, fd_t *oldfd)
{
+ fd_t *newfd = NULL;
+ inode_t *oldinode = NULL;
+ inode_t *newinode = NULL;
+ xlator_t *oldsubvol = NULL;
int ret = -1;
+ loc_t loc = {0, };
+ char uuid1[64];
+
+
+ oldinode = oldfd->inode;
+ oldsubvol = oldinode->table->xl;
+
+ if (oldsubvol == newsubvol)
+ return fd_ref (oldfd);
+
+ if (!oldsubvol->switched) {
+ ret = syncop_fsync (oldsubvol, oldfd, 0);
+ if (ret) {
+ gf_log (fs->volname, GF_LOG_WARNING,
+ "fsync() failed (%s) on %s graph %s (%d)",
+ strerror (errno),
+ uuid_utoa_r (oldfd->inode->gfid, uuid1),
+ graphid_str (oldsubvol), oldsubvol->graph->id);
+ }
+ }
+
+ newinode = glfs_refresh_inode_safe (newsubvol, oldinode);
+ if (!newinode) {
+ gf_log (fs->volname, GF_LOG_WARNING,
+ "inode (%s) refresh failed (%s) on graph %s (%d)",
+ uuid_utoa_r (oldinode->gfid, uuid1),
+ strerror (errno),
+ graphid_str (newsubvol), newsubvol->graph->id);
+ goto out;
+ }
+
+ newfd = fd_create (newinode, getpid());
+ if (!newfd) {
+ gf_log (fs->volname, GF_LOG_WARNING,
+ "fd_create (%s) failed (%s) on graph %s (%d)",
+ uuid_utoa_r (newinode->gfid, uuid1),
+ strerror (errno),
+ graphid_str (newsubvol), newsubvol->graph->id);
+ goto out;
+ }
+
+ loc.inode = inode_ref (newinode);
+
+ ret = inode_path (oldfd->inode, NULL, (char **)&loc.path);
+ if (ret < 0) {
+ gf_log (fs->volname, GF_LOG_INFO, "inode_path failed");
+ goto out;
+ }
+
+ uuid_copy (loc.gfid, oldinode->gfid);
+
+
+ if (IA_ISDIR (oldinode->ia_type))
+ ret = syncop_opendir (newsubvol, &loc, newfd);
+ else
+ ret = syncop_open (newsubvol, &loc,
+ oldfd->flags & ~(O_TRUNC|O_EXCL|O_CREAT),
+ newfd);
+ loc_wipe (&loc);
+
+ if (ret) {
+ gf_log (fs->volname, GF_LOG_WARNING,
+ "syncop_open%s (%s) failed (%s) on graph %s (%d)",
+ IA_ISDIR (oldinode->ia_type) ? "dir" : "",
+ uuid_utoa_r (newinode->gfid, uuid1),
+ strerror (errno),
+ graphid_str (newsubvol), newsubvol->graph->id);
+ goto out;
+ }
+
+ ret = glfs_migrate_fd_locks_safe (fs, oldsubvol, oldfd, newsubvol,
+ newfd);
+
+ if (ret) {
+ gf_log (fs->volname, GF_LOG_WARNING,
+ "lock migration (%s) failed (%s) on graph %s (%d)",
+ uuid_utoa_r (newinode->gfid, uuid1),
+ strerror (errno),
+ graphid_str (newsubvol), newsubvol->graph->id);
+ goto out;
+ }
+
+ newfd->flags = oldfd->flags;
+ fd_bind (newfd);
+out:
+ if (newinode)
+ inode_unref (newinode);
+
+ if (ret) {
+ fd_unref (newfd);
+ newfd = NULL;
+ }
- ret = glfs_resolve_at (fs, subvol, NULL, origpath, loc, iatt, 0);
+ return newfd;
+}
+
+
+fd_t *
+__glfs_migrate_fd (struct glfs *fs, xlator_t *newsubvol, struct glfs_fd *glfd)
+{
+ fd_t *oldfd = NULL;
+ fd_t *newfd = NULL;
+
+ oldfd = glfd->fd;
+
+ fs->migration_in_progress = 1;
+ pthread_mutex_unlock (&fs->mutex);
+ {
+ newfd = glfs_migrate_fd_safe (fs, newsubvol, oldfd);
+ }
+ pthread_mutex_lock (&fs->mutex);
+ fs->migration_in_progress = 0;
+ pthread_cond_broadcast (&fs->cond);
+
+ return newfd;
+}
+
+
+fd_t *
+__glfs_resolve_fd (struct glfs *fs, xlator_t *subvol, struct glfs_fd *glfd)
+{
+ fd_t *fd = NULL;
+
+ if (glfd->fd->inode->table->xl == subvol)
+ return fd_ref (glfd->fd);
+
+ fd = __glfs_migrate_fd (fs, subvol, glfd);
+ if (!fd)
+ return NULL;
+
+ if (subvol == fs->active_subvol) {
+ fd_unref (glfd->fd);
+ glfd->fd = fd_ref (fd);
+ }
+
+ return fd;
+}
+
+
+fd_t *
+glfs_resolve_fd (struct glfs *fs, xlator_t *subvol, struct glfs_fd *glfd)
+{
+ fd_t *fd = NULL;
+
+ glfs_lock (fs);
+ {
+ fd = __glfs_resolve_fd (fs, subvol, glfd);
+ }
+ glfs_unlock (fs);
+
+ return fd;
+}
+
+
+void
+__glfs_migrate_openfds (struct glfs *fs, xlator_t *subvol)
+{
+ struct glfs_fd *glfd = NULL;
+ fd_t *fd = NULL;
+
+ list_for_each_entry (glfd, &fs->openfds, openfds) {
+ if (uuid_is_null (glfd->fd->inode->gfid)) {
+ gf_log (fs->volname, GF_LOG_INFO,
+ "skipping openfd %p/%p in graph %s (%d)",
+ glfd, glfd->fd, graphid_str(subvol),
+ subvol->graph->id);
+ /* create in progress, defer */
+ continue;
+ }
+
+ fd = __glfs_migrate_fd (fs, subvol, glfd);
+ if (fd) {
+ fd_unref (glfd->fd);
+ glfd->fd = fd;
+ }
+ }
+}
+
+
+xlator_t *
+__glfs_active_subvol (struct glfs *fs)
+{
+ xlator_t *new_subvol = NULL;
+ int ret = -1;
+ inode_t *new_cwd = NULL;
+
+ if (!fs->next_subvol)
+ return fs->active_subvol;
+
+ new_subvol = fs->next_subvol;
+
+ ret = __glfs_first_lookup (fs, new_subvol);
+ if (ret) {
+ gf_log (fs->volname, GF_LOG_INFO,
+ "first lookup on graph %s (%d) failed (%s)",
+ graphid_str (new_subvol), new_subvol->graph->id,
+ strerror (errno));
+ return NULL;
+ }
+
+ if (fs->cwd) {
+ new_cwd = __glfs_refresh_inode (fs, new_subvol, fs->cwd);
+
+ if (!new_cwd) {
+ char buf1[64];
+ gf_log (fs->volname, GF_LOG_INFO,
+ "cwd refresh of %s graph %s (%d) failed (%s)",
+ uuid_utoa_r (fs->cwd->gfid, buf1),
+ graphid_str (new_subvol),
+ new_subvol->graph->id, strerror (errno));
+ return NULL;
+ }
+ }
+
+ __glfs_migrate_openfds (fs, new_subvol);
+
+ /* switching @active_subvol and @cwd
+ should be atomic
+ */
+ fs->old_subvol = fs->active_subvol;
+ fs->active_subvol = fs->next_subvol;
+ fs->next_subvol = NULL;
+
+ if (new_cwd) {
+ __glfs_cwd_set (fs, new_cwd);
+ inode_unref (new_cwd);
+ }
+
+ gf_log (fs->volname, GF_LOG_INFO, "switched to graph %s (%d)",
+ graphid_str (new_subvol), new_subvol->graph->id);
+
+ return new_subvol;
+}
+
+xlator_t *
+glfs_active_subvol (struct glfs *fs)
+{
+ xlator_t *subvol = NULL;
+ xlator_t *old_subvol = NULL;
+
+ glfs_lock (fs);
+ {
+ subvol = __glfs_active_subvol (fs);
+
+ if (subvol)
+ subvol->winds++;
+
+ if (fs->old_subvol) {
+ old_subvol = fs->old_subvol;
+ fs->old_subvol = NULL;
+ old_subvol->switched = 1;
+ }
+ }
+ glfs_unlock (fs);
+
+ if (old_subvol)
+ glfs_subvol_done (fs, old_subvol);
+
+ return subvol;
+}
+
+
+void
+glfs_subvol_done (struct glfs *fs, xlator_t *subvol)
+{
+ int ref = 0;
+ xlator_t *active_subvol = NULL;
+
+ if (!subvol)
+ return;
+
+ glfs_lock (fs);
+ {
+ ref = (--subvol->winds);
+ active_subvol = fs->active_subvol;
+ }
+ glfs_unlock (fs);
+
+ if (ref == 0) {
+ assert (subvol != active_subvol);
+ xlator_notify (subvol, GF_EVENT_PARENT_DOWN, subvol, NULL);
+ }
+}
+
+
+int
+__glfs_cwd_set (struct glfs *fs, inode_t *inode)
+{
+ if (inode->table->xl != fs->active_subvol) {
+ inode = __glfs_refresh_inode (fs, fs->active_subvol, inode);
+ if (!inode)
+ return -1;
+ } else {
+ inode_ref (inode);
+ }
+
+ if (fs->cwd)
+ inode_unref (fs->cwd);
+
+ fs->cwd = inode;
+
+ return 0;
+}
+
+
+int
+glfs_cwd_set (struct glfs *fs, inode_t *inode)
+{
+ int ret = 0;
+
+ glfs_lock (fs);
+ {
+ ret = __glfs_cwd_set (fs, inode);
+ }
+ glfs_unlock (fs);
return ret;
}
+
+inode_t *
+__glfs_cwd_get (struct glfs *fs)
+{
+ inode_t *cwd = NULL;
+
+ if (!fs->cwd)
+ return NULL;
+
+ if (fs->cwd->table->xl == fs->active_subvol) {
+ cwd = inode_ref (fs->cwd);
+ return cwd;
+ }
+
+ cwd = __glfs_refresh_inode (fs, fs->active_subvol, fs->cwd);
+
+ return cwd;
+}
+
+inode_t *
+glfs_cwd_get (struct glfs *fs)
+{
+ inode_t *cwd = NULL;
+
+ glfs_lock (fs);
+ {
+ cwd = __glfs_cwd_get (fs);
+ }
+ glfs_unlock (fs);
+
+ return cwd;
+}
+
+inode_t *
+__glfs_resolve_inode (struct glfs *fs, xlator_t *subvol,
+ struct glfs_object *object)
+{
+ inode_t *inode = NULL;
+
+ if (object->inode->table->xl == subvol)
+ return inode_ref (object->inode);
+
+ inode = __glfs_refresh_inode (fs, fs->active_subvol,
+ object->inode);
+ if (!inode)
+ return NULL;
+
+ if (subvol == fs->active_subvol) {
+ inode_unref (object->inode);
+ object->inode = inode_ref (inode);
+ }
+
+ return inode;
+}
+
+inode_t *
+glfs_resolve_inode (struct glfs *fs, xlator_t *subvol,
+ struct glfs_object *object)
+{
+ inode_t *inode = NULL;
+
+ glfs_lock (fs);
+ {
+ inode = __glfs_resolve_inode(fs, subvol, object);
+ }
+ glfs_unlock (fs);
+
+ return inode;
+}
+
+int
+glfs_create_object (loc_t *loc, struct glfs_object **retobject)
+{
+ struct glfs_object *object = NULL;
+
+ object = GF_CALLOC (1, sizeof(struct glfs_object),
+ glfs_mt_glfs_object_t);
+ if (object == NULL) {
+ errno = ENOMEM;
+ return -1;
+ }
+
+ object->inode = loc->inode;
+ uuid_copy (object->gfid, object->inode->gfid);
+
+ /* we hold the reference */
+ loc->inode = NULL;
+
+ *retobject = object;
+
+ return 0;
+}
diff --git a/api/src/glfs.c b/api/src/glfs.c
index f0bdc86f0..29ed47c0c 100644
--- a/api/src/glfs.c
+++ b/api/src/glfs.c
@@ -11,17 +11,15 @@
/*
TODO:
+ - merge locks in glfs_posix_lock for lock self-healing
- set proper pid/lk_owner to call frames (currently buried in syncop)
- fix logging.c/h to store logfp and loglevel in glusterfs_ctx_t and
reach it via THIS.
- - fd migration on graph switch.
- update syncop functions to accept/return xdata. ???
- protocol/client to reconnect immediately after portmap disconnect.
- handle SEEK_END failure in _lseek()
- handle umask (per filesystem?)
- - implement glfs_set_xlator_option(), like --xlator-option
- make itables LRU based
- - implement glfs_fini()
- 0-copy for readv/writev
- reconcile the open/creat mess
*/
@@ -51,6 +49,8 @@
#include "glfs.h"
#include "glfs-internal.h"
+#include "hashfn.h"
+#include "rpc-clnt.h"
static gf_boolean_t
@@ -66,7 +66,7 @@ glusterfs_ctx_defaults_init (glusterfs_ctx_t *ctx)
call_pool_t *pool = NULL;
int ret = -1;
- xlator_mem_acct_init (THIS, glfs_mt_end);
+ xlator_mem_acct_init (THIS, glfs_mt_end + 1);
ctx->process_uuid = generate_glusterfs_ctx_id ();
if (!ctx->process_uuid) {
@@ -85,7 +85,7 @@ glusterfs_ctx_defaults_init (glusterfs_ctx_t *ctx)
goto err;
}
- ctx->env = syncenv_new (0);
+ ctx->env = syncenv_new (0, 0, 0);
if (!ctx->env) {
goto err;
}
@@ -277,69 +277,117 @@ out:
///////////////////////////////////////////////////////////////////////////////
-struct glfs *
-glfs_from_glfd (struct glfs_fd *glfd)
+int
+glfs_set_xlator_option (struct glfs *fs, const char *xlator, const char *key,
+ const char *value)
{
- return ((xlator_t *)glfd->fd->inode->table->xl->ctx->master)->private;
+ xlator_cmdline_option_t *option = NULL;
+
+ option = GF_CALLOC (1, sizeof (*option),
+ glfs_mt_xlator_cmdline_option_t);
+ if (!option)
+ goto enomem;
+
+ INIT_LIST_HEAD (&option->cmd_args);
+
+ option->volume = gf_strdup (xlator);
+ if (!option->volume)
+ goto enomem;
+ option->key = gf_strdup (key);
+ if (!option->key)
+ goto enomem;
+ option->value = gf_strdup (value);
+ if (!option->value)
+ goto enomem;
+
+ list_add (&option->cmd_args, &fs->ctx->cmd_args.xlator_options);
+
+ return 0;
+enomem:
+ errno = ENOMEM;
+
+ if (!option)
+ return -1;
+
+ GF_FREE (option->volume);
+ GF_FREE (option->key);
+ GF_FREE (option->value);
+ GF_FREE (option);
+
+ return -1;
}
+int glfs_setfsuid (uid_t fsuid)
+{
+ return syncopctx_setfsuid (&fsuid);
+}
-void
-glfs_fd_destroy (struct glfs_fd *glfd)
+int glfs_setfsgid (gid_t fsgid)
{
- if (!glfd)
- return;
- if (glfd->fd)
- fd_unref (glfd->fd);
- GF_FREE (glfd);
+ return syncopctx_setfsgid (&fsgid);
}
+int glfs_setfsgroups (size_t size, const gid_t *list)
+{
+ return syncopctx_setfsgroups(size, list);
+}
-xlator_t *
-glfs_fd_subvol (struct glfs_fd *glfd)
+struct glfs *
+glfs_from_glfd (struct glfs_fd *glfd)
{
- xlator_t *subvol = NULL;
+ return glfd->fs;
+}
+
+
+struct glfs_fd *
+glfs_fd_new (struct glfs *fs)
+{
+ struct glfs_fd *glfd = NULL;
+ glfd = GF_CALLOC (1, sizeof (*glfd), glfs_mt_glfs_fd_t);
if (!glfd)
return NULL;
- subvol = glfd->fd->inode->table->xl;
+ glfd->fs = fs;
- return subvol;
+ INIT_LIST_HEAD (&glfd->openfds);
+
+ return glfd;
}
-xlator_t *
-glfs_active_subvol (struct glfs *fs)
+void
+glfs_fd_bind (struct glfs_fd *glfd)
{
- xlator_t *subvol = NULL;
- inode_table_t *itable = NULL;
+ struct glfs *fs = NULL;
- pthread_mutex_lock (&fs->mutex);
- {
- while (!fs->init)
- pthread_cond_wait (&fs->cond, &fs->mutex);
+ fs = glfd->fs;
- subvol = fs->active_subvol;
+ glfs_lock (fs);
+ {
+ list_add_tail (&glfd->openfds, &fs->openfds);
}
- pthread_mutex_unlock (&fs->mutex);
+ glfs_unlock (fs);
+}
- if (!subvol)
- return NULL;
+void
+glfs_fd_destroy (struct glfs_fd *glfd)
+{
+ if (!glfd)
+ return;
- if (!subvol->itable) {
- itable = inode_table_new (0, subvol);
- if (!itable) {
- errno = ENOMEM;
- return NULL;
- }
+ glfs_lock (glfd->fs);
+ {
+ list_del_init (&glfd->openfds);
+ }
+ glfs_unlock (glfd->fs);
- subvol->itable = itable;
+ if (glfd->fd)
+ fd_unref (glfd->fd);
- glfs_first_lookup (subvol);
- }
+ GF_FREE (glfd->readdirbuf);
- return subvol;
+ GF_FREE (glfd);
}
@@ -368,7 +416,9 @@ glfs_new (const char *volname)
return NULL;
}
+#ifdef DEBUG
gf_mem_acct_enable_set (ctx);
+#endif
/* first globals init, for gf_mem_acct_enable_set () */
ret = glusterfs_globals_init (ctx);
@@ -396,6 +446,8 @@ glfs_new (const char *volname)
pthread_mutex_init (&fs->mutex, NULL);
pthread_cond_init (&fs->cond, NULL);
+ INIT_LIST_HEAD (&fs->openfds);
+
return fs;
}
@@ -439,14 +491,26 @@ glfs_set_volfile_server (struct glfs *fs, const char *transport,
int
glfs_set_logging (struct glfs *fs, const char *logfile, int loglevel)
{
- int ret = -1;
+ int ret = 0;
+ char *tmplog = NULL;
- ret = gf_log_init (fs->ctx, logfile);
- if (ret)
- return ret;
+ if (!logfile) {
+ ret = gf_set_log_file_path (&fs->ctx->cmd_args);
+ if (ret)
+ goto out;
+ tmplog = fs->ctx->cmd_args.log_file;
+ } else {
+ tmplog = (char *)logfile;
+ }
- gf_log_set_loglevel (loglevel);
+ ret = gf_log_init (fs->ctx, tmplog, NULL);
+ if (ret)
+ goto out;
+ if (loglevel >= 0)
+ gf_log_set_loglevel (loglevel);
+
+out:
return ret;
}
@@ -456,7 +520,8 @@ glfs_init_wait (struct glfs *fs)
{
int ret = -1;
- pthread_mutex_lock (&fs->mutex);
+ /* Always a top-down call, use glfs_lock() */
+ glfs_lock (fs);
{
while (!fs->init)
pthread_cond_wait (&fs->cond,
@@ -464,7 +529,7 @@ glfs_init_wait (struct glfs *fs)
ret = fs->ret;
errno = fs->err;
}
- pthread_mutex_unlock (&fs->mutex);
+ glfs_unlock (fs);
return ret;
}
@@ -473,20 +538,32 @@ glfs_init_wait (struct glfs *fs)
void
glfs_init_done (struct glfs *fs, int ret)
{
- if (fs->init_cbk) {
- fs->init_cbk (fs, ret);
- return;
+ glfs_init_cbk init_cbk;
+
+ if (!fs) {
+ gf_log ("glfs", GF_LOG_ERROR,
+ "fs is NULL");
+ goto out;
}
+ init_cbk = fs->init_cbk;
+
+ /* Always a bottom-up call, use mutex_lock() */
pthread_mutex_lock (&fs->mutex);
{
fs->init = 1;
fs->ret = ret;
fs->err = errno;
- pthread_cond_broadcast (&fs->cond);
+ if (!init_cbk)
+ pthread_cond_broadcast (&fs->cond);
}
pthread_mutex_unlock (&fs->mutex);
+
+ if (init_cbk)
+ init_cbk (fs, ret);
+out:
+ return;
}
@@ -499,7 +576,7 @@ glfs_init_common (struct glfs *fs)
if (ret)
return ret;
- ret = pthread_create (&fs->poller, NULL, glfs_poller, fs);
+ ret = gf_thread_create (&fs->poller, NULL, glfs_poller, fs);
if (ret)
return ret;
@@ -507,6 +584,7 @@ glfs_init_common (struct glfs *fs)
if (ret)
return ret;
+ fs->dev_id = gf_dm_hashfn (fs->volname, strlen (fs->volname));
return ret;
}
@@ -542,7 +620,54 @@ glfs_init (struct glfs *fs)
int
glfs_fini (struct glfs *fs)
{
- int ret = -1;
-
- return ret;
+ int ret = -1;
+ int countdown = 100;
+ xlator_t *subvol = NULL;
+ glusterfs_ctx_t *ctx = NULL;
+ call_pool_t *call_pool = NULL;
+
+ ctx = fs->ctx;
+
+ if (ctx->mgmt) {
+ rpc_clnt_disable (ctx->mgmt);
+ ctx->mgmt = NULL;
+ }
+
+ __glfs_entry_fs (fs);
+
+ call_pool = fs->ctx->pool;
+
+ while (countdown--) {
+ /* give some time for background frames to finish */
+ if (!call_pool->cnt)
+ break;
+ usleep (100000);
+ }
+ /* leaked frames may exist, we ignore */
+
+ /*We deem glfs_fini as successful if there are no pending frames in the call
+ *pool*/
+ ret = (call_pool->cnt == 0)? 0: -1;
+
+ subvol = glfs_active_subvol (fs);
+ if (subvol) {
+ /* PARENT_DOWN within glfs_subvol_done() is issued only
+ on graph switch (new graph should activiate and
+ decrement the extra @winds count taken in glfs_graph_setup()
+
+ Since we are explicitly destroying, PARENT_DOWN is necessary
+ */
+ xlator_notify (subvol, GF_EVENT_PARENT_DOWN, subvol, 0);
+ /* TBD: wait for CHILD_DOWN before exiting, in case of
+ asynchronous cleanup like graceful socket disconnection
+ in the future.
+ */
+ }
+
+ glfs_subvol_done (fs, subvol);
+
+ if (ctx->log.logfile)
+ fclose (ctx->log.logfile);
+
+ return ret;
}
diff --git a/api/src/glfs.h b/api/src/glfs.h
index ded42feba..18fda496e 100644
--- a/api/src/glfs.h
+++ b/api/src/glfs.h
@@ -136,9 +136,9 @@ int glfs_set_volfile (glfs_t *fs, const char *volfile);
@transport: String specifying the transport used to connect to the
management daemon. Specifying NULL will result in the usage
- of the default (socket) transport type. Permitted values
+ of the default (tcp) transport type. Permitted values
are those what you specify as transport-type in a volume
- specification file (e.g "socket", "rdma", "unix".)
+ specification file (e.g "tcp", "rdma", "unix".)
@host: String specifying the address of where to find the management
daemon. Depending on the transport type this would either be
@@ -176,7 +176,9 @@ int glfs_set_volfile_server (glfs_t *fs, const char *transport,
@fs: The 'virtual mount' object to be configured with the logging parameters.
@logfile: The logfile to be used for logging. Will be created if it does not
- already exist (provided system permissions allow.)
+ already exist (provided system permissions allow). If NULL, a new
+ logfile will be created in default log directory associated with
+ the glusterfs installation.
@loglevel: Numerical value specifying the degree of verbosity. Higher the
value, more verbose the logging.
@@ -218,6 +220,36 @@ int glfs_set_logging (glfs_t *fs, const char *logfile, int loglevel);
int glfs_init (glfs_t *fs);
+/*
+ SYNOPSIS
+
+ glfs_fini: Cleanup and destroy the 'virtual mount'
+
+ DESCRIPTION
+
+ This function attempts to gracefully destroy glfs_t object. An attempt is
+ made to wait for all background processing to complete before returning.
+
+ glfs_fini() must be called after all operations on glfs_t is finished.
+
+ IMPORTANT
+
+ IT IS NECESSARY TO CALL glfs_fini() ON ALL THE INITIALIZED glfs_t
+ OBJECTS BEFORE TERMINATING THE PROGRAM. THERE MAY BE CACHED AND
+ UNWRITTEN / INCOMPLETE OPERATIONS STILL IN PROGRESS EVEN THOUGH THE
+ API CALLS HAVE RETURNED. glfs_fini() WILL WAIT FOR BACKGROUND OPERATIONS
+ TO COMPLETE BEFORE RETURNING, THEREBY MAKING IT SAFE FOR THE PROGRAM TO
+ EXIT.
+
+ PARAMETERS
+
+ @fs: The 'virtual mount' object to be destroyed.
+
+ RETURN VALUES
+
+ 0 : Success.
+*/
+
int glfs_fini (glfs_t *fs);
/*
@@ -239,6 +271,32 @@ int glfs_fini (glfs_t *fs);
struct glfs_fd;
typedef struct glfs_fd glfs_fd_t;
+/*
+ * PER THREAD IDENTITY MODIFIERS
+ *
+ * The following operations enable to set a per thread identity context
+ * for the glfs APIs to perform operations as. The calls here are kept as close
+ * to POSIX equivalents as possible.
+ *
+ * NOTES:
+ *
+ * - setgroups is a per thread setting, hence this is named as fsgroups to be
+ * close in naming to the fs(u/g)id APIs
+ * - Typical mode of operation is to set the IDs as required, with the
+ * supplementary groups being optionally set, make the glfs call and post the
+ * glfs operation set them back to eu/gid or uid/gid as appropriate to the
+ * caller
+ * - The groups once set, need to be unset by setting the size to 0 (in which
+ * case the list argument is a do not care)
+ * - Once a process for a thread of operation choses to set the IDs, all glfs
+ * calls made from that thread would default to the IDs set for the thread.
+ * As a result use these APIs with care and ensure that the set IDs are
+ * reverted to global process defaults as required.
+ *
+ */
+int glfs_setfsuid (uid_t fsuid);
+int glfs_setfsgid (gid_t fsgid);
+int glfs_setfsgroups (size_t size, const gid_t *list);
/*
SYNOPSIS
@@ -301,6 +359,32 @@ int glfs_close (glfs_fd_t *fd);
glfs_t *glfs_from_glfd (glfs_fd_t *fd);
+int glfs_set_xlator_option (glfs_t *fs, const char *xlator, const char *key,
+ const char *value);
+
+/*
+
+ glfs_io_cbk
+
+ The following is the function type definition of the callback
+ function pointer which has to be provided by the caller to the
+ *_async() versions of the IO calls.
+
+ The callback function is called on completion of the requested
+ IO, and the appropriate return value is returned in @ret.
+
+ In case of an error in completing the IO, @ret will be -1 and
+ @errno will be set with the appropriate error.
+
+ @ret will be same as the return value of the non _async() variant
+ of the particular call
+
+ @data is the same context pointer provided by the caller at the
+ time of issuing the async IO call. This can be used by the
+ caller to differentiate different instances of the async requests
+ in a common callback function.
+*/
+
typedef void (*glfs_io_cbk) (glfs_fd_t *fd, ssize_t ret, void *data);
// glfs_{read,write}[_async]
@@ -384,9 +468,31 @@ int glfs_link (glfs_t *fs, const char *oldpath, const char *newpath);
glfs_fd_t *glfs_opendir (glfs_t *fs, const char *path);
+/*
+ * @glfs_readdir_r and @glfs_readdirplus_r ARE thread safe AND re-entrant,
+ * but the interface has ambiguity about the size of @dirent to be allocated
+ * before calling the APIs. 512 byte buffer (for @dirent) is sufficient for
+ * all known systems which are tested againt glusterfs/gfapi, but may be
+ * insufficient in the future.
+ */
+
int glfs_readdir_r (glfs_fd_t *fd, struct dirent *dirent,
struct dirent **result);
+int glfs_readdirplus_r (glfs_fd_t *fd, struct stat *stat, struct dirent *dirent,
+ struct dirent **result);
+
+/*
+ * @glfs_readdir and @glfs_readdirplus are NEITHER thread safe NOR re-entrant
+ * when called on the same directory handle. However they ARE thread safe
+ * AND re-entrant when called on different directory handles (which may be
+ * referring to the same directory too.)
+ */
+
+struct dirent *glfs_readdir (glfs_fd_t *fd);
+
+struct dirent *glfs_readdirplus (glfs_fd_t *fd, struct stat *stat);
+
long glfs_telldir (glfs_fd_t *fd);
void glfs_seekdir (glfs_fd_t *fd, long offset);
@@ -442,6 +548,34 @@ int glfs_lremovexattr (glfs_t *fs, const char *path, const char *name);
int glfs_fremovexattr (glfs_fd_t *fd, const char *name);
+int glfs_fallocate(glfs_fd_t *fd, int keep_size, off_t offset, size_t len);
+
+int glfs_discard(glfs_fd_t *fd, off_t offset, size_t len);
+
+
+int glfs_discard_async (glfs_fd_t *fd, off_t length, size_t lent,
+ glfs_io_cbk fn, void *data);
+
+int glfs_zerofill(glfs_fd_t *fd, off_t offset, size_t len);
+
+int glfs_zerofill_async (glfs_fd_t *fd, off_t length, size_t len,
+ glfs_io_cbk fn, void *data);
+
+char *glfs_getcwd (glfs_t *fs, char *buf, size_t size);
+
+int glfs_chdir (glfs_t *fs, const char *path);
+
+int glfs_fchdir (glfs_fd_t *fd);
+
+char *glfs_realpath (glfs_t *fs, const char *path, char *resolved_path);
+
+/*
+ * @cmd and @flock are as specified in man fcntl(2).
+ */
+int glfs_posix_lock (glfs_fd_t *fd, int cmd, struct flock *flock);
+
+glfs_fd_t *glfs_dup (glfs_fd_t *fd);
+
__END_DECLS
#endif /* !_GLFS_H */
diff --git a/argp-standalone/configure.ac b/argp-standalone/configure.ac
index 65ebc4518..2ecd2a801 100644
--- a/argp-standalone/configure.ac
+++ b/argp-standalone/configure.ac
@@ -8,7 +8,7 @@ AC_CONFIG_SRCDIR([argp-ba.c])
AC_CONFIG_AUX_DIR([.])
AM_INIT_AUTOMAKE
-AM_CONFIG_HEADER(config.h)
+AC_CONFIG_HEADERS(config.h)
m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES(yes)])
@@ -22,7 +22,7 @@ AC_GNU_SOURCE
AC_PROG_CC
AC_PROG_MAKE_SET
AC_PROG_RANLIB
-AM_PROG_CC_STDC
+AC_PROG_CC
if test "x$am_cv_prog_cc_stdc" = xno ; then
AC_ERROR([the C compiler doesn't handle ANSI-C])
diff --git a/autogen.sh b/autogen.sh
index ca69d4624..f937e6be0 100755
--- a/autogen.sh
+++ b/autogen.sh
@@ -1,8 +1,105 @@
#!/bin/sh
-aclocal -I ./contrib/aclocal
-autoheader
-(libtoolize --automake --copy --force || glibtoolize --automake --copy --force)
-autoconf
-automake --add-missing --copy --foreign
+echo
+echo ... GlusterFS autogen ...
+echo
+
+## Check all dependencies are present
+MISSING=""
+
+# Check for aclocal
+env aclocal --version > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+ ACLOCAL=aclocal
+else
+ MISSING="$MISSING aclocal"
+fi
+
+# Check for autoconf
+env autoconf --version > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+ AUTOCONF=autoconf
+else
+ MISSING="$MISSING autoconf"
+fi
+
+# Check for autoheader
+env autoheader --version > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+ AUTOHEADER=autoheader
+else
+ MISSING="$MISSING autoheader"
+fi
+
+# Check for automake
+env automake --version > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+ AUTOMAKE=automake
+else
+ MISSING="$MISSING automake"
+fi
+
+# Check for libtoolize or glibtoolize
+env libtoolize --version > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+ # libtoolize was found, so use it
+ TOOL=libtoolize
+else
+ # libtoolize wasn't found, so check for glibtoolize
+ env glibtoolize --version > /dev/null 2>&1
+ if [ $? -eq 0 ]; then
+ TOOL=glibtoolize
+ else
+ MISSING="$MISSING libtoolize/glibtoolize"
+ fi
+fi
+
+# Check for tar
+env tar --version > /dev/null 2>&1
+if [ $? -ne 0 ]; then
+ MISSING="$MISSING tar"
+fi
+
+## If dependencies are missing, warn the user and abort
+if [ "x$MISSING" != "x" ]; then
+ echo "Aborting."
+ echo
+ echo "The following build tools are missing:"
+ echo
+ for pkg in $MISSING; do
+ echo " * $pkg"
+ done
+ echo
+ echo "Please install them and try again."
+ echo
+ exit 1
+fi
+
+## generate gf-error-codes.h from error-codes.json
+echo "Generate gf-error-codes.h ..."
+if ./gen-headers.py; then
+ if ! mv -fv gf-error-codes.h libglusterfs/src/gf-error-codes.h; then
+ exit 1
+ fi
+else
+ exit 1
+fi
+
+## Do the autogeneration
+echo Running ${ACLOCAL}...
+$ACLOCAL -I ./contrib/aclocal
+echo Running ${AUTOHEADER}...
+$AUTOHEADER
+echo Running ${TOOL}...
+$TOOL --automake --copy --force
+echo Running ${AUTOCONF}...
+$AUTOCONF
+echo Running ${AUTOMAKE}...
+$AUTOMAKE --add-missing --copy --foreign
+
+# Run autogen in the argp-standalone sub-directory
cd argp-standalone;./autogen.sh
+
+# Instruct user on next steps
+echo
+echo "Please proceed with configuring, compiling, and installing."
diff --git a/cli/src/Makefile.am b/cli/src/Makefile.am
index 393077688..216d1bb55 100644
--- a/cli/src/Makefile.am
+++ b/cli/src/Makefile.am
@@ -2,12 +2,12 @@ sbin_PROGRAMS = gluster
gluster_SOURCES = cli.c registry.c input.c cli-cmd.c cli-rl.c \
cli-cmd-volume.c cli-cmd-peer.c cli-rpc-ops.c cli-cmd-parser.c\
- cli-cmd-system.c cli-cmd-misc.c cli-xml-output.c
+ cli-cmd-system.c cli-cmd-misc.c cli-xml-output.c cli-cmd-snapshot.c
gluster_LDADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(GF_LDADD)\
$(RLLIBS) $(top_builddir)/rpc/xdr/src/libgfxdr.la \
$(top_builddir)/rpc/rpc-lib/src/libgfrpc.la \
- $(GF_GLUSTERFS_LIBS) $(LIBXML2_LIBS)
+ $(GF_GLUSTERFS_LIBS) $(XML_LIBS)
gluster_LDFLAGS = $(GF_LDFLAGS)
noinst_HEADERS = cli.h cli-mem-types.h cli-cmd.h
@@ -19,7 +19,7 @@ AM_CPPFLAGS = $(GF_CPPFLAGS) \
-DCONFDIR=\"$(sysconfdir)/glusterfs\" \
-DGSYNCD_PREFIX=\"$(libexecdir)/glusterfs\"\
-DSYNCDAEMON_COMPILE=$(SYNCDAEMON_COMPILE) -DSBIN_DIR=\"$(sbindir)\"\
- $(LIBXML2_CFLAGS)
+ $(XML_CPPFLAGS)
AM_CFLAGS = -Wall $(GF_GLUSTERFS_CFLAGS)
diff --git a/cli/src/cli-cmd-misc.c b/cli/src/cli-cmd-misc.c
index f3ef12147..566d7c978 100644
--- a/cli/src/cli-cmd-misc.c
+++ b/cli/src/cli-cmd-misc.c
@@ -31,6 +31,8 @@ extern struct cli_cmd volume_cmds[];
extern struct cli_cmd cli_probe_cmds[];
extern struct cli_cmd cli_log_cmds[];
extern struct cli_cmd cli_system_cmds[];
+extern struct cli_cmd cli_bd_cmds[];
+extern struct cli_cmd snapshot_cmds[];
struct cli_cmd cli_misc_cmds[];
int
@@ -45,7 +47,8 @@ cli_cmd_display_help (struct cli_state *state, struct cli_cmd_word *in_word,
const char **words, int wordcount)
{
struct cli_cmd *cmd[] = {volume_cmds, cli_probe_cmds,
- cli_misc_cmds, NULL};
+ cli_misc_cmds, snapshot_cmds,
+ NULL};
struct cli_cmd *cmd_ind = NULL;
int i = 0;
diff --git a/cli/src/cli-cmd-parser.c b/cli/src/cli-cmd-parser.c
index 4181e6c81..5ab208b8f 100644
--- a/cli/src/cli-cmd-parser.c
+++ b/cli/src/cli-cmd-parser.c
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+ Copyright (c) 2010-2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
@@ -26,6 +26,38 @@
#include "protocol-common.h"
#include "cli1-xdr.h"
+#define MAX_SNAP_DESCRIPTION_LEN 1024
+
+struct snap_config_opt_vals_ snap_confopt_vals[] = {
+ {.op_name = "snap-max-hard-limit",
+ .question = "Changing snapshot-max-hard-limit "
+ "will lead to deletion of snapshots "
+ "if they exceed the new limit.\n"
+ "Do you want to continue?"
+ },
+ {.op_name = "snap-max-soft-limit",
+ .question = "Changing snapshot-max-soft-limit "
+ "will lead to deletion of snapshots "
+ "if they exceed the new limit.\n"
+ "Do you want to continue?"
+ },
+ {.op_name = "both",
+ .question = "Changing snapshot-max-hard-limit & "
+ "snapshot-max-soft-limit will lead to "
+ "deletion of snapshots if they exceed "
+ "the new limit.\nDo you want to continue?"
+ },
+ {.op_name = NULL,
+ }
+};
+
+enum cli_snap_config_set_types {
+ GF_SNAP_CONFIG_SET_HARD = 0,
+ GF_SNAP_CONFIG_SET_SOFT = 1,
+ GF_SNAP_CONFIG_SET_BOTH = 2,
+};
+typedef enum cli_snap_config_set_types cli_snap_config_set_types;
+
static const char *
id_sel (void *wcon)
{
@@ -160,12 +192,18 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options
char *bricks = NULL;
int32_t brick_count = 0;
char *opwords[] = { "replica", "stripe", "transport", NULL };
+
char *invalid_volnames[] = {"volume", "type", "subvolumes", "option",
- "end-volume", "all", NULL};
+ "end-volume", "all", "volume_not_in_ring",
+ "description", "force",
+ "snap-max-hard-limit",
+ "snap-max-soft-limit", NULL};
char *w = NULL;
int op_count = 0;
int32_t replica_count = 1;
int32_t stripe_count = 1;
+ gf_boolean_t is_force = _gf_false;
+ int wc = wordcount;
GF_ASSERT (words);
GF_ASSERT (options);
@@ -300,7 +338,7 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options
goto out;
}
index += 2;
- } else {
+ } else {
GF_ASSERT (!"opword mismatch");
ret = -1;
goto out;
@@ -323,7 +361,12 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options
brick_index = index;
- ret = cli_cmd_bricks_parse (words, wordcount, brick_index, &bricks,
+ if (strcmp (words[wordcount - 1], "force") == 0) {
+ is_force = _gf_true;
+ wc = wordcount - 1;
+ }
+
+ ret = cli_cmd_bricks_parse (words, wc, brick_index, &bricks,
&brick_count);
if (ret)
goto out;
@@ -373,6 +416,10 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options
if (ret)
goto out;
+ ret = dict_set_int32 (dict, "force", is_force);
+ if (ret)
+ goto out;
+
*options = dict;
out:
@@ -652,7 +699,7 @@ cli_is_key_spl (char *key)
#define GLUSTERD_DEFAULT_WORKDIR "/var/lib/glusterd"
static int
-cli_add_key_group (dict_t *dict, char *key, char *value)
+cli_add_key_group (dict_t *dict, char *key, char *value, char **op_errstr)
{
int ret = -1;
int opt_count = 0;
@@ -666,6 +713,7 @@ cli_add_key_group (dict_t *dict, char *key, char *value)
char *tagpath = NULL;
char *buf = NULL;
char line[PATH_MAX + 256] = {0,};
+ char errstr[2048] = "";
FILE *fp = NULL;
ret = gf_asprintf (&tagpath, "%s/groups/%s",
@@ -678,6 +726,10 @@ cli_add_key_group (dict_t *dict, char *key, char *value)
fp = fopen (tagpath, "r");
if (!fp) {
ret = -1;
+ snprintf(errstr, sizeof(errstr), "Unable to open file '%s'."
+ " Error: %s", tagpath, strerror (errno));
+ if (op_errstr)
+ *op_errstr = gf_strdup(errstr);
goto out;
}
@@ -690,6 +742,10 @@ cli_add_key_group (dict_t *dict, char *key, char *value)
tok_val = strtok_r (NULL, "=", &saveptr);
if (!tok_key || !tok_val) {
ret = -1;
+ snprintf(errstr, sizeof(errstr), "'%s' file format "
+ "not valid.", tagpath);
+ if (op_errstr)
+ *op_errstr = gf_strdup(errstr);
goto out;
}
@@ -711,6 +767,10 @@ cli_add_key_group (dict_t *dict, char *key, char *value)
if (!opt_count) {
ret = -1;
+ snprintf(errstr, sizeof(errstr), "'%s' file format "
+ "not valid.", tagpath);
+ if (op_errstr)
+ *op_errstr = gf_strdup(errstr);
goto out;
}
ret = dict_set_int32 (dict, "count", opt_count);
@@ -731,7 +791,8 @@ out:
#undef GLUSTERD_DEFAULT_WORKDIR
int32_t
-cli_cmd_volume_set_parse (const char **words, int wordcount, dict_t **options)
+cli_cmd_volume_set_parse (const char **words, int wordcount, dict_t **options,
+ char **op_errstr)
{
dict_t *dict = NULL;
char *volname = NULL;
@@ -784,7 +845,12 @@ cli_cmd_volume_set_parse (const char **words, int wordcount, dict_t **options)
if (ret == -1)
goto out;
- ret = cli_add_key_group (dict, key, value);
+ if (strlen (value) == 0) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = cli_add_key_group (dict, key, value, op_errstr);
if (ret == 0)
*options = dict;
goto out;
@@ -806,6 +872,11 @@ cli_cmd_volume_set_parse (const char **words, int wordcount, dict_t **options)
if (ret == -1)
goto out;
+ if (strlen (value) == 0) {
+ ret = -1;
+ goto out;
+ }
+
if (cli_is_key_spl (key)) {
ret = -1;
goto out;
@@ -851,6 +922,8 @@ cli_cmd_volume_add_brick_parse (const char **words, int wordcount,
int count = 1;
char *w = NULL;
int index;
+ gf_boolean_t is_force = _gf_false;
+ int wc = wordcount;
GF_ASSERT (words);
GF_ASSERT (options);
@@ -928,7 +1001,13 @@ cli_cmd_volume_add_brick_parse (const char **words, int wordcount,
brick_index = index;
parse_bricks:
- ret = cli_cmd_bricks_parse (words, wordcount, brick_index, &bricks,
+
+ if (strcmp (words[wordcount - 1], "force") == 0) {
+ is_force = _gf_true;
+ wc = wordcount - 1;
+ }
+
+ ret = cli_cmd_bricks_parse (words, wc, brick_index, &bricks,
&brick_count);
if (ret)
goto out;
@@ -942,6 +1021,10 @@ parse_bricks:
if (ret)
goto out;
+ ret = dict_set_int32 (dict, "force", is_force);
+ if (ret)
+ goto out;
+
*options = dict;
out:
@@ -1144,6 +1227,7 @@ cli_cmd_volume_replace_brick_parse (const char **words, int wordcount,
char *opwords[] = { "start", "commit", "pause", "abort", "status",
NULL };
char *w = NULL;
+ gf_boolean_t is_force = _gf_false;
GF_ASSERT (words);
GF_ASSERT (options);
@@ -1241,12 +1325,17 @@ cli_cmd_volume_replace_brick_parse (const char **words, int wordcount,
}
if (wordcount == (op_index + 1)) {
- if (replace_op != GF_REPLACE_OP_COMMIT) {
+ if ((replace_op != GF_REPLACE_OP_COMMIT) &&
+ (replace_op != GF_REPLACE_OP_START)) {
ret = -1;
goto out;
}
if (!strcmp ("force", words[op_index])) {
- replace_op = GF_REPLACE_OP_COMMIT_FORCE;
+ if (replace_op == GF_REPLACE_OP_COMMIT)
+ replace_op = GF_REPLACE_OP_COMMIT_FORCE;
+
+ else if (replace_op == GF_REPLACE_OP_START)
+ is_force = _gf_true;
}
}
@@ -1260,6 +1349,9 @@ cli_cmd_volume_replace_brick_parse (const char **words, int wordcount,
if (ret)
goto out;
+ ret = dict_set_int32 (dict, "force", is_force);
+ if (ret)
+ goto out;
*options = dict;
@@ -1504,22 +1596,161 @@ gsyncd_glob_check (const char *w)
return !!strpbrk (w, "*?[");
}
+static int
+config_parse (const char **words, int wordcount, dict_t *dict,
+ unsigned cmdi, unsigned glob)
+{
+ int32_t ret = -1;
+ int32_t i = -1;
+ char *append_str = NULL;
+ size_t append_len = 0;
+ char *subop = NULL;
+
+ switch ((wordcount - 1) - cmdi) {
+ case 0:
+ subop = gf_strdup ("get-all");
+ break;
+ case 1:
+ if (words[cmdi + 1][0] == '!') {
+ (words[cmdi + 1])++;
+ if (gf_asprintf (&subop, "del%s",
+ glob ? "-glob" : "") == -1)
+ subop = NULL;
+ } else
+ subop = gf_strdup ("get");
+
+ ret = dict_set_str (dict, "op_name", ((char *)words[cmdi + 1]));
+ if (ret < 0)
+ goto out;
+ break;
+ default:
+ if (gf_asprintf (&subop, "set%s", glob ? "-glob" : "") == -1)
+ subop = NULL;
+
+ ret = dict_set_str (dict, "op_name", ((char *)words[cmdi + 1]));
+ if (ret < 0)
+ goto out;
+
+ /* join the varargs by spaces to get the op_value */
+
+ for (i = cmdi + 2; i < wordcount; i++)
+ append_len += (strlen (words[i]) + 1);
+ /* trailing strcat will add two bytes, make space for that */
+ append_len++;
+
+ append_str = GF_CALLOC (1, append_len, cli_mt_append_str);
+ if (!append_str) {
+ ret = -1;
+ goto out;
+ }
+
+ for (i = cmdi + 2; i < wordcount; i++) {
+ strcat (append_str, words[i]);
+ strcat (append_str, " ");
+ }
+ append_str[append_len - 2] = '\0';
+ /* "checkpoint now" is special: we resolve that "now" */
+ if (strcmp (words[cmdi + 1], "checkpoint") == 0 &&
+ strcmp (append_str, "now") == 0) {
+ struct timeval tv = {0,};
+
+ ret = gettimeofday (&tv, NULL);
+ if (ret == -1)
+ goto out; /* FIXME: free append_str? */
+
+ GF_FREE (append_str);
+ append_str = GF_CALLOC (1, 300, cli_mt_append_str);
+ if (!append_str) {
+ ret = -1;
+ goto out;
+ }
+ strcpy (append_str, "as of ");
+ gf_time_fmt (append_str + strlen ("as of "),
+ 300 - strlen ("as of "),
+ tv.tv_sec, gf_timefmt_FT);
+ }
+
+ ret = dict_set_dynstr (dict, "op_value", append_str);
+ }
+
+ ret = -1;
+ if (subop) {
+ ret = dict_set_dynstr (dict, "subop", subop);
+ if (!ret)
+ subop = NULL;
+ }
+
+out:
+ if (ret && append_str)
+ GF_FREE (append_str);
+
+ GF_FREE (subop);
+
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+static int32_t
+force_push_pem_parse (const char **words, int wordcount,
+ dict_t *dict, unsigned *cmdi)
+{
+ int32_t ret = 0;
+
+ if (!strcmp ((char *)words[wordcount-1], "force")) {
+ if ((strcmp ((char *)words[wordcount-2], "start")) &&
+ (strcmp ((char *)words[wordcount-2], "stop")) &&
+ (strcmp ((char *)words[wordcount-2], "create")) &&
+ (strcmp ((char *)words[wordcount-2], "push-pem"))) {
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_uint32 (dict, "force",
+ _gf_true);
+ if (ret)
+ goto out;
+ (*cmdi)++;
+
+ if (!strcmp ((char *)words[wordcount-2], "push-pem")) {
+ if (strcmp ((char *)words[wordcount-3], "create")) {
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_int32 (dict, "push_pem", 1);
+ if (ret)
+ goto out;
+ (*cmdi)++;
+ }
+ } else if (!strcmp ((char *)words[wordcount-1], "push-pem")) {
+ if (strcmp ((char *)words[wordcount-2], "create")) {
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_int32 (dict, "push_pem", 1);
+ if (ret)
+ goto out;
+ (*cmdi)++;
+ }
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+
int32_t
cli_cmd_gsync_set_parse (const char **words, int wordcount, dict_t **options)
{
int32_t ret = -1;
dict_t *dict = NULL;
gf1_cli_gsync_set type = GF_GSYNC_OPTION_TYPE_NONE;
- char *append_str = NULL;
- size_t append_len = 0;
- char *subop = NULL;
int i = 0;
unsigned masteri = 0;
unsigned slavei = 0;
unsigned glob = 0;
unsigned cmdi = 0;
- char *opwords[] = { "status", "start", "stop", "config",
- "log-rotate", NULL };
+ char *opwords[] = { "create", "status", "start", "stop",
+ "config", "force", "delete",
+ "push-pem", "detail", NULL };
char *w = NULL;
GF_ASSERT (words);
@@ -1531,10 +1762,11 @@ cli_cmd_gsync_set_parse (const char **words, int wordcount, dict_t **options)
/* new syntax:
*
- * volume geo-replication [$m [$s]] status
+ * volume geo-replication $m $s create [push-pem] [force]
+ * volume geo-replication [$m [$s]] status [detail]
* volume geo-replication [$m] $s config [[!]$opt [$val]]
- * volume geo-replication $m $s start|stop
- * volume geo-replication $m [$s] log-rotate
+ * volume geo-replication $m $s start|stop [force]
+ * volume geo-replication $m $s delete
*/
if (wordcount < 3)
@@ -1565,6 +1797,13 @@ cli_cmd_gsync_set_parse (const char **words, int wordcount, dict_t **options)
if (slavei == 3)
masteri = 2;
} else if (i <= 3) {
+ if (!strcmp ((char *)words[wordcount-1], "detail")) {
+ /* For status detail it is mandatory to provide
+ * both master and slave */
+ ret = -1;
+ goto out;
+ }
+
/* no $s, can only be status cmd
* (with either a single $m before it or nothing)
* -- these conditions imply that i <= 3 after
@@ -1591,7 +1830,12 @@ cli_cmd_gsync_set_parse (const char **words, int wordcount, dict_t **options)
if (!w)
goto out;
- if (strcmp (w, "status") == 0) {
+ if (strcmp (w, "create") == 0) {
+ type = GF_GSYNC_OPTION_TYPE_CREATE;
+
+ if (!masteri || !slavei)
+ goto out;
+ } else if (strcmp (w, "status") == 0) {
type = GF_GSYNC_OPTION_TYPE_STATUS;
if (slavei && !masteri)
@@ -1611,14 +1855,33 @@ cli_cmd_gsync_set_parse (const char **words, int wordcount, dict_t **options)
if (!masteri || !slavei)
goto out;
- } else if (strcmp(w, "log-rotate") == 0) {
- type = GF_GSYNC_OPTION_TYPE_ROTATE;
+ } else if (strcmp (w, "delete") == 0) {
+ type = GF_GSYNC_OPTION_TYPE_DELETE;
- if (slavei && !masteri)
+ if (!masteri || !slavei)
goto out;
} else
GF_ASSERT (!"opword mismatch");
+ ret = force_push_pem_parse (words, wordcount, dict, &cmdi);
+ if (ret)
+ goto out;
+
+ if (!strcmp ((char *)words[wordcount-1], "detail")) {
+ if (strcmp ((char *)words[wordcount-2], "status")) {
+ ret = -1;
+ goto out;
+ }
+ if (!slavei || !masteri) {
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_uint32 (dict, "status-detail", _gf_true);
+ if (ret)
+ goto out;
+ cmdi++;
+ }
+
if (type != GF_GSYNC_OPTION_TYPE_CONFIG &&
(cmdi < wordcount - 1 || glob))
goto out;
@@ -1627,97 +1890,26 @@ cli_cmd_gsync_set_parse (const char **words, int wordcount, dict_t **options)
ret = 0;
- if (masteri)
+ if (masteri) {
ret = dict_set_str (dict, "master", (char *)words[masteri]);
+ if (!ret)
+ ret = dict_set_str (dict, "volname",
+ (char *)words[masteri]);
+ }
if (!ret && slavei)
ret = dict_set_str (dict, "slave", (char *)words[slavei]);
if (!ret)
ret = dict_set_int32 (dict, "type", type);
- if (!ret && type == GF_GSYNC_OPTION_TYPE_CONFIG) {
- switch ((wordcount - 1) - cmdi) {
- case 0:
- subop = gf_strdup ("get-all");
- break;
- case 1:
- if (words[cmdi + 1][0] == '!') {
- (words[cmdi + 1])++;
- if (gf_asprintf (&subop, "del%s", glob ? "-glob" : "") == -1)
- subop = NULL;
- } else
- subop = gf_strdup ("get");
-
- ret = dict_set_str (dict, "op_name", ((char *)words[cmdi + 1]));
- if (ret < 0)
- goto out;
- break;
- default:
- if (gf_asprintf (&subop, "set%s", glob ? "-glob" : "") == -1)
- subop = NULL;
-
- ret = dict_set_str (dict, "op_name", ((char *)words[cmdi + 1]));
- if (ret < 0)
- goto out;
-
- /* join the varargs by spaces to get the op_value */
-
- for (i = cmdi + 2; i < wordcount; i++)
- append_len += (strlen (words[i]) + 1);
- /* trailing strcat will add two bytes, make space for that */
- append_len++;
-
- append_str = GF_CALLOC (1, append_len, cli_mt_append_str);
- if (!append_str) {
- ret = -1;
- goto out;
- }
-
- for (i = cmdi + 2; i < wordcount; i++) {
- strcat (append_str, words[i]);
- strcat (append_str, " ");
- }
- append_str[append_len - 2] = '\0';
-
- /* "checkpoint now" is special: we resolve that "now" */
- if (strcmp (words[cmdi + 1], "checkpoint") == 0 &&
- strcmp (append_str, "now") == 0) {
- struct timeval tv = {0,};
-
- ret = gettimeofday (&tv, NULL);
- if (ret == -1)
- goto out; /* FIXME: free append_str? */
-
- GF_FREE (append_str);
- append_str = GF_CALLOC (1, 300, cli_mt_append_str);
- if (!append_str) {
- ret = -1;
- goto out;
- }
- strcpy (append_str, "as of ");
- gf_time_fmt (append_str + strlen ("as of "),
- 300 - strlen ("as of "),
- tv.tv_sec, gf_timefmt_FT);
- }
-
- ret = dict_set_dynstr (dict, "op_value", append_str);
- }
-
- ret = -1;
- if (subop) {
- ret = dict_set_dynstr (dict, "subop", subop);
- if (!ret)
- subop = NULL;
- }
- }
+ if (!ret && type == GF_GSYNC_OPTION_TYPE_CONFIG)
+ ret = config_parse (words, wordcount, dict, cmdi, glob);
out:
if (ret) {
if (dict)
dict_destroy (dict);
- GF_FREE (append_str);
} else
*options = dict;
- GF_FREE (subop);
return ret;
}
@@ -1977,7 +2169,7 @@ cli_cmd_get_statusop (const char *arg)
uint32_t ret = GF_CLI_STATUS_NONE;
char *w = NULL;
char *opwords[] = {"detail", "mem", "clients", "fd",
- "inode", "callpool", NULL};
+ "inode", "callpool", "tasks", NULL};
struct {
char *opname;
uint32_t opcode;
@@ -1988,6 +2180,7 @@ cli_cmd_get_statusop (const char *arg)
{ "fd", GF_CLI_STATUS_FD },
{ "inode", GF_CLI_STATUS_INODE },
{ "callpool", GF_CLI_STATUS_CALLPOOL },
+ { "tasks", GF_CLI_STATUS_TASKS },
{ NULL }
};
@@ -2102,8 +2295,9 @@ cli_cmd_volume_status_parse (const char **words, int wordcount,
if (!strcmp (words[3], "nfs")) {
if (cmd == GF_CLI_STATUS_FD ||
- cmd == GF_CLI_STATUS_DETAIL) {
- cli_err ("Detail/FD status not available"
+ cmd == GF_CLI_STATUS_DETAIL ||
+ cmd == GF_CLI_STATUS_TASKS) {
+ cli_err ("Detail/FD/Tasks status not available"
" for NFS Servers");
ret = -1;
goto out;
@@ -2112,14 +2306,21 @@ cli_cmd_volume_status_parse (const char **words, int wordcount,
} else if (!strcmp (words[3], "shd")){
if (cmd == GF_CLI_STATUS_FD ||
cmd == GF_CLI_STATUS_CLIENTS ||
- cmd == GF_CLI_STATUS_DETAIL) {
- cli_err ("Detail/FD/Clients status not "
+ cmd == GF_CLI_STATUS_DETAIL ||
+ cmd == GF_CLI_STATUS_TASKS) {
+ cli_err ("Detail/FD/Clients/Tasks status not "
"available for Self-heal Daemons");
ret = -1;
goto out;
}
cmd |= GF_CLI_STATUS_SHD;
} else {
+ if (cmd == GF_CLI_STATUS_TASKS) {
+ cli_err ("Tasks status not available for "
+ "bricks");
+ ret = -1;
+ goto out;
+ }
cmd |= GF_CLI_STATUS_BRICK;
ret = dict_set_str (dict, "brick", (char *)words[3]);
}
@@ -2268,12 +2469,103 @@ out:
return ret;
}
+static int
+extract_hostname_path_from_token (const char *tmp_words, char **hostname,
+ char **path)
+{
+ int ret = 0;
+ char *delimiter = NULL;
+ char *tmp_host = NULL;
+ char *host_name = NULL;
+ char *words = NULL;
+
+ *hostname = NULL;
+ *path = NULL;
+
+ words = GF_CALLOC (1, strlen (tmp_words) + 1, gf_common_mt_char);
+ if (!words){
+ ret = -1;
+ goto out;
+ }
+
+ strncpy (words, tmp_words, strlen (tmp_words) + 1);
+
+ if (validate_brick_name (words)) {
+ cli_err ("Wrong brick type: %s, use <HOSTNAME>:"
+ "<export-dir-abs-path>", words);
+ ret = -1;
+ goto out;
+ } else {
+ delimiter = strrchr (words, ':');
+ ret = gf_canonicalize_path (delimiter + 1);
+ if (ret) {
+ goto out;
+ } else {
+ *path = GF_CALLOC (1, strlen (delimiter+1) +1,
+ gf_common_mt_char);
+ if (!*path) {
+ ret = -1;
+ goto out;
+
+ }
+ strncpy (*path, delimiter +1,
+ strlen(delimiter + 1) + 1);
+ }
+ }
+
+ tmp_host = gf_strdup (words);
+ if (!tmp_host) {
+ gf_log ("cli", GF_LOG_ERROR, "Out of memory");
+ ret = -1;
+ goto out;
+ }
+ get_host_name (tmp_host, &host_name);
+ if (!host_name) {
+ ret = -1;
+ gf_log("cli",GF_LOG_ERROR, "Unable to allocate "
+ "memory");
+ goto out;
+ }
+ if (!(strcmp (host_name, "localhost") &&
+ strcmp (host_name, "127.0.0.1") &&
+ strncmp (host_name, "0.", 2))) {
+ cli_err ("Please provide a valid hostname/ip other "
+ "than localhost, 127.0.0.1 or loopback "
+ "address (0.0.0.0 to 0.255.255.255).");
+ ret = -1;
+ goto out;
+ }
+ if (!valid_internet_address (host_name, _gf_false)) {
+ cli_err ("internet address '%s' does not conform to "
+ "standards", host_name);
+ ret = -1;
+ goto out;
+ }
+
+ *hostname = GF_CALLOC (1, strlen (host_name) + 1,
+ gf_common_mt_char);
+ if (!*hostname) {
+ ret = -1;
+ goto out;
+ }
+ strncpy (*hostname, host_name, strlen (host_name) + 1);
+ ret = 0;
+
+out:
+ GF_FREE (words);
+ GF_FREE (tmp_host);
+ return ret;
+}
+
+
int
cli_cmd_volume_heal_options_parse (const char **words, int wordcount,
dict_t **options)
{
int ret = 0;
dict_t *dict = NULL;
+ char *hostname = NULL;
+ char *path = NULL;
dict = dict_new ();
if (!dict)
@@ -2295,6 +2587,11 @@ cli_cmd_volume_heal_options_parse (const char **words, int wordcount,
ret = dict_set_int32 (dict, "heal-op",
GF_AFR_OP_HEAL_FULL);
goto done;
+ } else if (!strcmp (words[3], "statistics")) {
+ ret = dict_set_int32 (dict, "heal-op",
+ GF_AFR_OP_STATISTICS);
+ goto done;
+
} else if (!strcmp (words[3], "info")) {
ret = dict_set_int32 (dict, "heal-op",
GF_AFR_OP_INDEX_SUMMARY);
@@ -2305,28 +2602,66 @@ cli_cmd_volume_heal_options_parse (const char **words, int wordcount,
}
}
if (wordcount == 5) {
- if (strcmp (words[3], "info")) {
+ if (strcmp (words[3], "info") &&
+ strcmp (words[3], "statistics")) {
ret = -1;
goto out;
}
- if (!strcmp (words[4], "healed")) {
- ret = dict_set_int32 (dict, "heal-op",
- GF_AFR_OP_HEALED_FILES);
- goto done;
- }
- if (!strcmp (words[4], "heal-failed")) {
- ret = dict_set_int32 (dict, "heal-op",
- GF_AFR_OP_HEAL_FAILED_FILES);
- goto done;
+
+ if (!strcmp (words[3], "info")) {
+ if (!strcmp (words[4], "healed")) {
+ ret = dict_set_int32 (dict, "heal-op",
+ GF_AFR_OP_HEALED_FILES);
+ goto done;
+ }
+ if (!strcmp (words[4], "heal-failed")) {
+ ret = dict_set_int32 (dict, "heal-op",
+ GF_AFR_OP_HEAL_FAILED_FILES);
+ goto done;
+ }
+ if (!strcmp (words[4], "split-brain")) {
+ ret = dict_set_int32 (dict, "heal-op",
+ GF_AFR_OP_SPLIT_BRAIN_FILES);
+ goto done;
+ }
}
- if (!strcmp (words[4], "split-brain")) {
- ret = dict_set_int32 (dict, "heal-op",
- GF_AFR_OP_SPLIT_BRAIN_FILES);
- goto done;
+
+ if (!strcmp (words[3], "statistics")) {
+ if (!strcmp (words[4], "heal-count")) {
+ ret = dict_set_int32 (dict, "heal-op",
+ GF_AFR_OP_STATISTICS_HEAL_COUNT);
+ goto done;
+ }
}
ret = -1;
goto out;
}
+ if (wordcount == 7) {
+ if (!strcmp (words[3], "statistics")
+ && !strcmp (words[4], "heal-count")
+ && !strcmp (words[5], "replica")) {
+
+ ret = dict_set_int32 (dict, "heal-op",
+ GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA);
+ if (ret)
+ goto out;
+ ret = extract_hostname_path_from_token (words[6],
+ &hostname, &path);
+ if (ret)
+ goto out;
+ ret = dict_set_dynstr (dict, "per-replica-cmd-hostname",
+ hostname);
+ if (ret)
+ goto out;
+ ret = dict_set_dynstr (dict, "per-replica-cmd-path",
+ path);
+ if (ret)
+ goto out;
+ else
+ goto done;
+
+ }
+ }
ret = -1;
goto out;
done:
@@ -2431,3 +2766,917 @@ out:
return ret;
}
+
+int32_t
+cli_snap_create_desc_parse (dict_t *dict, const char **words,
+ size_t wordcount, int32_t desc_opt_loc)
+{
+ int32_t ret = -1;
+ char *desc = NULL;
+ int32_t desc_len = 0;
+
+ desc = GF_CALLOC (MAX_SNAP_DESCRIPTION_LEN + 1, sizeof(char),
+ gf_common_mt_char);
+ if (!desc) {
+ ret = -1;
+ goto out;
+ }
+
+
+ if (strlen (words[desc_opt_loc]) >= MAX_SNAP_DESCRIPTION_LEN) {
+ cli_out ("snapshot create: description truncated: "
+ "Description provided is longer than 1024 characters");
+ desc_len = MAX_SNAP_DESCRIPTION_LEN;
+ } else {
+ desc_len = strlen (words[desc_opt_loc]);
+ }
+
+ strncpy (desc, words[desc_opt_loc], desc_len);
+ desc[desc_len] = '\0';
+ /* Calculating the size of the description as given by the user */
+
+ ret = dict_set_dynstr (dict, "description", desc);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to save snap "
+ "description");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (ret && desc)
+ GF_FREE (desc);
+
+ return ret;
+}
+
+/* Function to check whether the Volume name is repeated */
+int
+cli_check_if_volname_repeated (const char **words, unsigned int start_index,
+ uint64_t cur_index) {
+ uint64_t i = -1;
+ int ret = 0;
+
+ GF_ASSERT (words);
+
+ for (i = start_index ; i < cur_index ; i++) {
+ if (strcmp (words[i], words[cur_index]) == 0) {
+ ret = -1;
+ goto out;
+ }
+ }
+out :
+ return ret;
+}
+
+/* snapshot create <snapname> <vol-name(s)> [description <description>]
+ * [force]
+ * @arg-0, dict : Request Dictionary to be sent to server side.
+ * @arg-1, words : Contains individual words of CLI command.
+ * @arg-2, wordcount: Contains number of words present in the CLI command.
+ *
+ * return value : -1 on failure
+ * 0 on success
+ */
+int
+cli_snap_create_parse (dict_t *dict, const char **words, int wordcount) {
+ uint64_t i = 0;
+ int ret = -1;
+ uint64_t volcount = 0;
+ char key[PATH_MAX] = "";
+ char *snapname = NULL;
+ unsigned int cmdi = 2;
+ /* cmdi is command index, here cmdi is "2" (gluster snapshot create)*/
+
+ GF_ASSERT (words);
+ GF_ASSERT (dict);
+
+ if (wordcount <= cmdi + 1) {
+ cli_err ("Invalid Syntax.");
+ gf_log ("cli", GF_LOG_ERROR,
+ "Too less words for snap create command");
+ goto out;
+ }
+
+ if (strlen(words[cmdi]) >= GLUSTERD_MAX_SNAP_NAME) {
+ cli_err ("snapshot create: failed: snapname cannot exceed "
+ "255 characters.");
+ gf_log ("cli", GF_LOG_ERROR, "Snapname too long");
+
+ goto out;
+ }
+
+ snapname = (char *) words[cmdi];
+ for (i = 0 ; i < strlen (snapname); i++) {
+ /* Following volume name convention */
+ if (!isalnum (snapname[i]) && (snapname[i] != '_'
+ && (snapname[i] != '-'))) {
+ /* TODO : Is this message enough?? */
+ cli_err ("Snapname can contain only alphanumeric, "
+ "\"-\" and \"_\" characters");
+ goto out;
+ }
+ }
+
+ ret = dict_set_str (dict, "snapname", (char *)words[cmdi]);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not save snap "
+ "name");
+ goto out;
+ }
+
+ /* Filling volume name in the dictionary */
+ for (i = cmdi + 1 ; i < wordcount
+ && (strcmp (words[i], "description")) != 0
+ && (strcmp (words[i], "force") != 0); i++) {
+ volcount++;
+ /* volume index starts from 1 */
+ ret = snprintf (key, sizeof (key),"volname%ld", volcount);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_set_str (dict, key, (char *)words[i]);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not "
+ "save volume name");
+ goto out;
+ }
+
+ if (i >= cmdi + 2) {
+ ret = -1;
+ cli_err("Creating multiple volume snapshot is not "
+ "supported as of now");
+ goto out;
+ }
+ /* TODO : remove this above condition check once
+ * multiple volume snapshot is supported */
+ }
+
+ if (volcount == 0) {
+ ret = -1;
+ cli_err ("Please provide the volume name");
+ gf_log ("cli", GF_LOG_ERROR, "Invalid Syntax");
+ goto out;
+ }
+
+ ret = dict_set_int32 (dict, "volcount", volcount);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not save volcount");
+ goto out;
+ }
+
+ /* Verify how we got out of "for" loop,
+ * if it is by reaching wordcount limit then goto "out",
+ * because we need not parse for "description" and "force"
+ * after this.
+ */
+ if (i == wordcount) {
+ goto out;
+ }
+
+ if ((strcmp (words[i], "description")) == 0) {
+ ++i;
+ if (i > (wordcount - 1)) {
+ ret = -1;
+ cli_err ("Please provide a description");
+ gf_log ("cli", GF_LOG_ERROR,
+ "Description not provided");
+ goto out;
+ }
+
+ ret = cli_snap_create_desc_parse(dict, words, wordcount, i);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not save snap "
+ "description");
+ goto out;
+ }
+
+ if ( i == (wordcount - 1))
+ goto out;
+ i++;
+ /* point the index to next word.
+ * As description might be follwed by force option.
+ * Before that, check if wordcount limit is reached
+ */
+ }
+
+ if ((strcmp (words[i], "force") != 0)) {
+ ret = -1;
+ cli_err ("Invalid Syntax.");
+ gf_log ("cli", GF_LOG_ERROR, "Invalid Syntax");
+ goto out;
+ }
+ ret = dict_set_int8 (dict, "snap-force", 1);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not save "
+ "snap force option");
+ goto out;
+ }
+
+ /* Check if the command has anything after "force" keyword */
+ if (++i < wordcount) {
+ ret = -1;
+ gf_log ("cli", GF_LOG_ERROR, "Invalid Syntax");
+ goto out;
+ }
+
+ ret = 0;
+
+out :
+ return ret;
+}
+
+/* snapshot list [volname]
+ * @arg-0, dict : Request Dictionary to be sent to server side.
+ * @arg-1, words : Contains individual words of CLI command.
+ * @arg-2, wordcount: Contains number of words present in the CLI command.
+ *
+ * return value : -1 on failure
+ * 0 on success
+ */
+int
+cli_snap_list_parse (dict_t *dict, const char **words, int wordcount) {
+ int ret = -1;
+
+ GF_ASSERT (words);
+ GF_ASSERT (dict);
+
+ if (wordcount < 2 || wordcount > 3) {
+ gf_log ("cli", GF_LOG_ERROR, "Invalid Syntax");
+ goto out;
+ }
+
+ if (wordcount == 2) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = dict_set_str (dict, "volname", (char *)words[2]);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Failed to save volname in dictionary");
+ goto out;
+ }
+out :
+ return ret;
+}
+
+/* snapshot info [(snapname | volume <volname>)]
+ * @arg-0, dict : Request Dictionary to be sent to server side.
+ * @arg-1, words : Contains individual words of CLI command.
+ * @arg-2, wordcount: Contains number of words present in the CLI command.
+ *
+ * return value : -1 on failure
+ * 0 on success
+ */
+int
+cli_snap_info_parse (dict_t *dict, const char **words, int wordcount)
+{
+
+ int ret = -1;
+ int32_t cmd = GF_SNAP_INFO_TYPE_ALL;
+ unsigned int cmdi = 2;
+ /* cmdi is command index, here cmdi is "2" (gluster snapshot info)*/
+
+ GF_ASSERT (words);
+ GF_ASSERT (dict);
+
+ if (wordcount > 4 || wordcount < cmdi) {
+ gf_log ("", GF_LOG_ERROR, "Invalid syntax");
+ goto out;
+ }
+
+ if (wordcount == cmdi) {
+ ret = 0;
+ goto out;
+ }
+
+ /* If 3rd word is not "volume", then it must
+ * be snapname.
+ */
+ if (strcmp (words[cmdi], "volume") != 0) {
+ ret = dict_set_str (dict, "snapname",
+ (char *)words[cmdi]);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to save "
+ "snapname %s", words[cmdi]);
+ goto out;
+ }
+
+ /* Once snap name is parsed, if we encounter any other
+ * word then fail it. Invalid Syntax.
+ * example : snapshot info <snapname> word
+ */
+ if ((cmdi + 1) != wordcount) {
+ ret = -1;
+ gf_log ("cli", GF_LOG_ERROR, "Invalid Syntax");
+ goto out;
+ }
+
+ cmd = GF_SNAP_INFO_TYPE_SNAP;
+ ret = 0;
+ goto out;
+ /* No need to continue the parsing once we
+ * get the snapname
+ */
+ }
+
+ /* If 3rd word is "volume", then check if next word
+ * is present. As, "snapshot info volume" is an
+ * invalid command.
+ */
+ if ((cmdi + 1) == wordcount) {
+ ret = -1;
+ gf_log ("cli", GF_LOG_ERROR, "Invalid Syntax");
+ goto out;
+ }
+
+ ret = dict_set_str (dict, "volname", (char *)words[wordcount - 1]);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Count not save "
+ "volume name %s", words[wordcount - 1]);
+ goto out;
+ }
+ cmd = GF_SNAP_INFO_TYPE_VOL;
+out :
+ if (ret == 0) {
+ ret = dict_set_int32 (dict, "cmd", cmd);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not save "
+ "type of snapshot info");
+ }
+ }
+ return ret;
+}
+
+
+
+/* snapshot restore <snapname>
+ * @arg-0, dict : Request Dictionary to be sent to server side.
+ * @arg-1, words : Contains individual words of CLI command.
+ * @arg-2, wordcount: Contains number of words present in the CLI command.
+ *
+ * return value : -1 on failure
+ * 0 on success
+ */
+int
+cli_snap_restore_parse (dict_t *dict, const char **words, int wordcount)
+{
+
+ int ret = -1;
+
+ GF_ASSERT (words);
+ GF_ASSERT (dict);
+
+ if (wordcount != 3) {
+ gf_log ("cli", GF_LOG_ERROR, "Invalid Syntax");
+ goto out;
+ }
+
+ ret = dict_set_str (dict, "snapname", (char *)words[2]);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to save snap-name %s",
+ words[2]);
+ goto out;
+ }
+out :
+ return ret;
+}
+
+/* snapshot delete <snapname>
+ * @arg-0, dict : Request Dictionary to be sent to server side.
+ * @arg-1, words : Contains individual words of CLI command.
+ * @arg-2, wordcount: Contains number of words present in the CLI command.
+ *
+ * return value : -1 on failure
+ * 0 on success
+ * 1 if user cancel the operation
+ */
+int
+cli_snap_delete_parse (dict_t *dict, const char **words, int wordcount,
+ struct cli_state *state) {
+
+ int ret = -1;
+ const char *question = NULL;
+ gf_answer_t answer = GF_ANSWER_NO;
+
+ question = "Deleting snap will erase all the information about "
+ "the snap. Do you still want to continue?";
+
+ GF_ASSERT (words);
+ GF_ASSERT (dict);
+
+ if (wordcount != 3) {
+ gf_log ("cli", GF_LOG_ERROR, "Invalid Syntax");
+ goto out;
+ }
+
+ ret = dict_set_str (dict, "snapname", (char *)words[2]);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to save snapname %s",
+ words[2]);
+ goto out;
+ }
+
+ answer = cli_cmd_get_confirmation (state, question);
+ if (GF_ANSWER_NO == answer) {
+ ret = 1;
+ gf_log ("cli", GF_LOG_DEBUG, "User cancelled "
+ "snapshot delete operation");
+ goto out;
+ }
+out :
+ return ret;
+}
+
+/* snapshot status [(snapname | volume <volname>)]
+ * @arg-0, dict : Request Dictionary to be sent to server side.
+ * @arg-1, words : Contains individual words of CLI command.
+ * @arg-2, wordcount: Contains number of words present in the CLI command.
+ *
+ * return value : -1 on failure
+ * 0 on success
+ */
+int
+cli_snap_status_parse (dict_t *dict, const char **words, int wordcount)
+{
+
+ int ret = -1;
+ int32_t cmd = GF_SNAP_STATUS_TYPE_ALL;
+ unsigned int cmdi = 2;
+ /* cmdi is command index, here cmdi is "2" (gluster snapshot status)*/
+
+ GF_ASSERT (words);
+ GF_ASSERT (dict);
+
+ if (wordcount > 4 || wordcount < cmdi) {
+ gf_log ("cli", GF_LOG_ERROR, "Invalid Syntax");
+ goto out;
+ }
+
+ if (wordcount == cmdi) {
+ ret = 0;
+ goto out;
+ }
+
+ /* if 3rd word is not "volume", then it must be "snapname"
+ */
+ if (strcmp (words[cmdi], "volume") != 0) {
+ ret = dict_set_str (dict, "snapname",
+ (char *)words[cmdi]);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Count not save "
+ "snap name %s", words[cmdi]);
+ goto out;
+ }
+
+ if ((cmdi + 1) != wordcount) {
+ ret = -1;
+ gf_log ("cli", GF_LOG_ERROR, "Invalid Syntax");
+ goto out;
+ }
+
+ ret = 0;
+ cmd = GF_SNAP_STATUS_TYPE_SNAP;
+ goto out;
+ }
+
+ /* If 3rd word is "volume", then check if next word is present.
+ * As, "snapshot info volume" is an invalid command
+ */
+ if ((cmdi + 1) == wordcount) {
+ ret = -1;
+ gf_log ("cli", GF_LOG_ERROR, "Invalid Syntax");
+ goto out;
+ }
+
+ ret = dict_set_str (dict, "volname", (char *)words [wordcount - 1]);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Count not save "
+ "volume name %s", words[wordcount - 1]);
+ goto out;
+ }
+ cmd = GF_SNAP_STATUS_TYPE_VOL;
+
+out :
+ if (ret == 0) {
+ ret = dict_set_int32 (dict, "cmd", cmd);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not save cmd "
+ "of snapshot status");
+ }
+ }
+ return ret;
+}
+
+
+int32_t
+cli_snap_config_limit_parse (const char **words, dict_t *dict,
+ unsigned int wordcount, unsigned int index,
+ char *key)
+{
+ int ret = -1;
+ int limit = 0;
+
+ GF_ASSERT (words);
+ GF_ASSERT (dict);
+ GF_ASSERT (key);
+
+ if (index >= wordcount) {
+ ret = -1;
+ cli_err ("Please provide a value for %s.",key);
+ gf_log ("cli", GF_LOG_ERROR, "Value not provided for %s", key);
+ goto out;
+ }
+
+ limit = strtol (words[index], NULL, 0);
+ if (limit <= 0) {
+ ret = -1;
+ cli_err ("%s should be greater than 0.", key);
+ goto out;
+ }
+
+ ret = dict_set_int32 (dict, key, limit);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not set "
+ "%s in dictionary", key);
+ goto out;
+ }
+
+out :
+ return ret;
+}
+
+/* function cli_snap_config_parse
+ * Config Syntax : gluster snapshot config [volname]
+ * [snap-max-hard-limit <count>]
+ * [snap-max-soft-limit <count>]
+ *
+ return value: <0 on failure
+ 1 if user cancels the operation
+ 0 on success
+
+ NOTE : snap-max-soft-limit can only be set for system.
+*/
+int32_t
+cli_snap_config_parse (const char **words, int wordcount, dict_t *dict,
+ struct cli_state *state)
+{
+ int ret = -1;
+ gf_answer_t answer = GF_ANSWER_NO;
+ gf_boolean_t vol_presence = _gf_false;
+ struct snap_config_opt_vals_ *conf_vals = NULL;
+ int8_t hard_limit = 0;
+ int8_t soft_limit = 0;
+ int8_t config_type = -1;
+ const char *question = NULL;
+ unsigned int cmdi = 2;
+ /* cmdi is command index, here cmdi is "2" (gluster snapshot config)*/
+
+ GF_ASSERT (words);
+ GF_ASSERT (dict);
+ GF_ASSERT (state);
+
+ if ((wordcount < 2) || (wordcount > 7)) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Invalid wordcount(%d)", wordcount);
+ goto out;
+ }
+
+ if (wordcount == 2) {
+ config_type = GF_SNAP_CONFIG_DISPLAY;
+ ret = 0;
+ goto set;
+ }
+
+ /* Check whether the 3rd word is volname */
+ if (strcmp (words[cmdi], "snap-max-hard-limit") != 0
+ && strcmp (words[cmdi], "snap-max-soft-limit") != 0) {
+ ret = dict_set_str (dict, "volname", (char *)words[cmdi]);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to set volname");
+ goto out;
+ }
+ cmdi++;
+ vol_presence = _gf_true;
+
+ if (cmdi == wordcount) {
+ config_type = GF_SNAP_CONFIG_DISPLAY;
+ ret = 0;
+ goto set;
+ }
+ }
+
+ config_type = GF_SNAP_CONFIG_TYPE_SET;
+
+ if (strcmp (words[cmdi], "snap-max-hard-limit") == 0) {
+ ret = cli_snap_config_limit_parse (words, dict, wordcount,
+ ++cmdi, "snap-max-hard-limit");
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to parse snap "
+ "config hard limit");
+ goto out;
+ }
+ hard_limit = 1;
+
+ if (++cmdi == wordcount) {
+ ret = 0;
+ goto set;
+ }
+ }
+
+ if (strcmp (words[cmdi], "snap-max-soft-limit") == 0) {
+ if (vol_presence == 1) {
+ ret = -1;
+ cli_err ("Soft limit cannot be set to individual "
+ "volumes.");
+ gf_log ("cli", GF_LOG_ERROR, "Soft limit cannot be "
+ "set to volumes");
+ goto out;
+ }
+
+ ret = cli_snap_config_limit_parse (words, dict, wordcount,
+ ++cmdi, "snap-max-soft-limit");
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to parse snap "
+ "config soft limit");
+ goto out;
+ }
+
+ if (++cmdi != wordcount) {
+ ret = -1;
+ gf_log ("cli", GF_LOG_ERROR, "Invalid Syntax");
+ goto out;
+ }
+ soft_limit = 1;
+ } else {
+ ret = -1;
+ gf_log ("cli", GF_LOG_ERROR, "Invalid Syntax");
+ goto out;
+ }
+ ret = 0; /* Success */
+
+set:
+ ret = dict_set_int32 (dict, "config-command", config_type);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to set "
+ "config-command");
+ goto out;
+ }
+
+ if (config_type == GF_SNAP_CONFIG_TYPE_SET) {
+ conf_vals = snap_confopt_vals;
+ if (hard_limit && soft_limit) {
+ question = conf_vals[GF_SNAP_CONFIG_SET_BOTH].question;
+ } else if (soft_limit) {
+ question = conf_vals[GF_SNAP_CONFIG_SET_SOFT].question;
+ } else if (hard_limit) {
+ question = conf_vals[GF_SNAP_CONFIG_SET_HARD].question;
+ }
+
+ answer = cli_cmd_get_confirmation (state, question);
+ if (GF_ANSWER_NO == answer) {
+ ret = 1;
+ gf_log ("cli", GF_LOG_DEBUG, "User cancelled "
+ "snapshot config operation");
+ }
+ }
+
+out:
+ return ret;
+}
+
+int
+validate_snapname (const char *snapname, char **opwords) {
+ int ret = -1;
+ int i = 0;
+
+ GF_ASSERT (snapname);
+ GF_ASSERT (opwords);
+
+ for (i = 0 ; opwords[i] != NULL; i++) {
+ if (strcmp (opwords[i], snapname) == 0) {
+ cli_out ("\"%s\" cannot be a snapname", snapname);
+ goto out;
+ }
+ }
+ ret = 0;
+out :
+ return ret;
+}
+
+int32_t
+cli_cmd_snapshot_parse (const char **words, int wordcount, dict_t **options,
+ struct cli_state *state)
+{
+ int32_t ret = -1;
+ dict_t *dict = NULL;
+ gf1_cli_snapshot type = GF_SNAP_OPTION_TYPE_NONE;
+ char *w = NULL;
+ char *opwords[] = {"create", "delete", "restore", "start",
+ "stop", "list", "status", "config",
+ "info", NULL};
+ char *invalid_snapnames[] = {"description", "force",
+ "volume", NULL};
+
+ GF_ASSERT (words);
+ GF_ASSERT (options);
+ GF_ASSERT (state);
+
+ dict = dict_new ();
+ if (!dict)
+ goto out;
+
+ /* Lowest wordcount possible */
+ if (wordcount < 2) {
+ gf_log ("", GF_LOG_ERROR,
+ "Invalid command: Not enough arguments");
+ goto out;
+ }
+
+ w = str_getunamb (words[1], opwords);
+ if (!w) {
+ /* Checks if the operation is a valid operation */
+ gf_log ("", GF_LOG_ERROR, "Opword Mismatch");
+ goto out;
+ }
+
+ if (!strcmp (w, "create")) {
+ type = GF_SNAP_OPTION_TYPE_CREATE;
+ } else if (!strcmp (w, "list")) {
+ type = GF_SNAP_OPTION_TYPE_LIST;
+ } else if (!strcmp (w, "info")) {
+ type = GF_SNAP_OPTION_TYPE_INFO;
+ } else if (!strcmp (w, "delete")) {
+ type = GF_SNAP_OPTION_TYPE_DELETE;
+ } else if (!strcmp (w, "config")) {
+ type = GF_SNAP_OPTION_TYPE_CONFIG;
+ } else if (!strcmp (w, "restore")) {
+ type = GF_SNAP_OPTION_TYPE_RESTORE;
+ } else if (!strcmp (w, "status")) {
+ type = GF_SNAP_OPTION_TYPE_STATUS;
+ }
+
+ if (type != GF_SNAP_OPTION_TYPE_CONFIG) {
+ ret = dict_set_int32 (dict, "hold_snap_locks", _gf_true);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Unable to set hold-snap-locks value "
+ "as _gf_true");
+ goto out;
+ }
+ }
+
+ /* Check which op is intended */
+ switch (type) {
+ case GF_SNAP_OPTION_TYPE_CREATE:
+ {
+ /* Syntax :
+ * gluster snapshot create <snapname> <vol-name(s)>
+ * [description <description>]
+ * [force]
+ */
+
+ /* In cases where the snapname is not given then
+ * parsing fails & snapname cannot be "description",
+ * "force" and "volume", that check is made here
+ */
+ if (wordcount == 2){
+ ret = -1;
+ gf_log ("cli", GF_LOG_ERROR,
+ "Invalid Syntax");
+ goto out;
+ }
+
+ ret = validate_snapname (words[2], invalid_snapnames);
+ if (ret) {
+ goto out;
+ }
+
+ ret = cli_snap_create_parse (dict, words, wordcount);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "create command parsing failed.");
+ goto out;
+ }
+ break;
+ }
+ case GF_SNAP_OPTION_TYPE_INFO:
+ {
+ /* Syntax :
+ * gluster snapshot info [(snapname] | [vol <volname>)]
+ */
+ ret = cli_snap_info_parse (dict, words, wordcount);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to parse "
+ "snapshot info command");
+ goto out;
+ }
+ break;
+ }
+
+ case GF_SNAP_OPTION_TYPE_LIST:
+ {
+ /* Syntax :
+ * gluster snaphsot list [volname]
+ */
+
+ ret = cli_snap_list_parse (dict, words, wordcount);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to parse "
+ "snapshot list command");
+ goto out;
+ }
+ break;
+ }
+
+ case GF_SNAP_OPTION_TYPE_DELETE:
+ {
+ /* Syntax :
+ * gluster snapshot delete <snapname>
+ */
+ ret = cli_snap_delete_parse (dict, words, wordcount,
+ state);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to parse "
+ "snapshot delete command");
+ goto out;
+ }
+ break;
+ }
+
+ case GF_SNAP_OPTION_TYPE_CONFIG:
+ {
+ /* snapshot config [volname] [snap-max-hard-limit <count>]
+ * [snap-max-soft-limit <percent>] */
+ ret = cli_snap_config_parse (words, wordcount, dict,
+ state);
+ if (ret) {
+ if (ret < 0)
+ gf_log ("cli", GF_LOG_ERROR,
+ "config command parsing failed.");
+ goto out;
+ }
+
+ ret = dict_set_int32 (dict, "type",
+ GF_SNAP_OPTION_TYPE_CONFIG);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to set "
+ "config type");
+ ret = -1;
+ goto out;
+ }
+ break;
+ }
+ case GF_SNAP_OPTION_TYPE_STATUS:
+ {
+ /* Syntax :
+ * gluster snapshot status [(snapname |
+ * volume <volname>)]
+ */
+ ret = cli_snap_status_parse (dict, words, wordcount);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to parse "
+ "snapshot status command");
+ goto out;
+ }
+ break;
+ }
+
+ case GF_SNAP_OPTION_TYPE_RESTORE:
+ {
+ /* Syntax:
+ * snapshot restore <snapname>
+ */
+ ret = cli_snap_restore_parse (dict, words, wordcount);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to parse "
+ "restore command");
+ goto out;
+ }
+ break;
+ }
+ default:
+ gf_log ("", GF_LOG_ERROR, "Opword Mismatch");
+ goto out;
+ break;
+ }
+
+ ret = dict_set_int32 (dict, "type", type);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Failed to set type.");
+ goto out;
+ }
+ /* If you got so far, input is valid */
+ ret = 0;
+out:
+ if (ret) {
+ if (dict)
+ dict_destroy (dict);
+ } else
+ *options = dict;
+
+ return ret;
+}
diff --git a/cli/src/cli-cmd-peer.c b/cli/src/cli-cmd-peer.c
index 08651e72f..551312411 100644
--- a/cli/src/cli-cmd-peer.c
+++ b/cli/src/cli-cmd-peer.c
@@ -41,6 +41,7 @@ cli_cmd_peer_probe_cbk (struct cli_state *state, struct cli_cmd_word *word,
dict_t *dict = NULL;
int sent = 0;
int parse_error = 0;
+ cli_local_t *local = NULL;
if (!(wordcount == 3)) {
cli_usage_out (word->pattern);
@@ -78,6 +79,9 @@ cli_cmd_peer_probe_cbk (struct cli_state *state, struct cli_cmd_word *word,
goto out;
}
*/
+
+ CLI_LOCAL_INIT (local, words, frame, dict);
+
if (proc->fn) {
ret = proc->fn (frame, THIS, dict);
}
@@ -106,6 +110,7 @@ cli_cmd_peer_deprobe_cbk (struct cli_state *state, struct cli_cmd_word *word,
int flags = 0;
int sent = 0;
int parse_error = 0;
+ cli_local_t *local = NULL;
if ((wordcount < 3) || (wordcount > 4)) {
cli_usage_out (word->pattern);
@@ -145,6 +150,8 @@ cli_cmd_peer_deprobe_cbk (struct cli_state *state, struct cli_cmd_word *word,
if (ret)
goto out;
+ CLI_LOCAL_INIT (local, words, frame, dict);
+
if (proc->fn) {
ret = proc->fn (frame, THIS, dict);
}
@@ -184,7 +191,7 @@ cli_cmd_peer_status_cbk (struct cli_state *state, struct cli_cmd_word *word,
goto out;
if (proc->fn) {
- ret = proc->fn (frame, THIS, (char *)words[1] );
+ ret = proc->fn (frame, THIS, (void *)GF_CLI_LIST_PEERS);
}
out:
@@ -199,6 +206,45 @@ out:
return ret;
}
+int
+cli_cmd_pool_list_cbk (struct cli_state *state, struct cli_cmd_word *word,
+ const char **words, int wordcount)
+{
+ int ret = -1;
+ rpc_clnt_procedure_t *proc = NULL;
+ call_frame_t *frame = NULL;
+ int sent = 0;
+ int parse_error = 0;
+
+ if (wordcount != 2) {
+ cli_usage_out (word->pattern);
+ parse_error = 1;
+ goto out;
+ }
+
+ proc = &cli_rpc_prog->proctable[GLUSTER_CLI_LIST_FRIENDS];
+
+ frame = create_frame (THIS, THIS->ctx->pool);
+ if (!frame)
+ goto out;
+
+ if (proc->fn) {
+ ret = proc->fn (frame, THIS,
+ (void *)GF_CLI_LIST_POOL_NODES);
+ }
+
+out:
+ if (ret) {
+ cli_cmd_sent_status_get (&sent);
+ if ((sent == 0) && (parse_error == 0))
+ cli_err ("pool list: command execution failed");
+ }
+
+ CLI_STACK_DESTROY (frame);
+
+ return ret;
+}
+
struct cli_cmd cli_probe_cmds[] = {
{ "peer probe <HOSTNAME>",
cli_cmd_peer_probe_cbk,
@@ -216,6 +262,10 @@ struct cli_cmd cli_probe_cmds[] = {
cli_cmd_peer_help_cbk,
"Help command for peer "},
+ { "pool list",
+ cli_cmd_pool_list_cbk,
+ "list all the nodes in the pool (including localhost)"},
+
{ NULL, NULL, NULL }
};
diff --git a/cli/src/cli-cmd-snapshot.c b/cli/src/cli-cmd-snapshot.c
new file mode 100644
index 000000000..de492d683
--- /dev/null
+++ b/cli/src/cli-cmd-snapshot.c
@@ -0,0 +1,146 @@
+/*
+ Copyright (c) 2013-2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <pthread.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "cli.h"
+#include "cli-cmd.h"
+
+extern rpc_clnt_prog_t *cli_rpc_prog;
+
+int
+cli_cmd_snapshot_help_cbk (struct cli_state *state, struct cli_cmd_word *in_word,
+ const char **words, int wordcount);
+
+int
+cli_cmd_snapshot_cbk (struct cli_state *state, struct cli_cmd_word *word,
+ const char **words, int wordcount)
+{
+ int ret = 0;
+ int parse_err = 0;
+ dict_t *options = NULL;
+ rpc_clnt_procedure_t *proc = NULL;
+ call_frame_t *frame = NULL;
+ cli_local_t *local = NULL;
+
+ proc = &cli_rpc_prog->proctable [GLUSTER_CLI_SNAP];
+ if (proc == NULL) {
+ ret = -1;
+ goto out;
+ }
+
+ frame = create_frame (THIS, THIS->ctx->pool);
+ if (frame == NULL) {
+ ret = -1;
+ goto out;
+ }
+
+ /* Parses the command entered by the user */
+ ret = cli_cmd_snapshot_parse (words, wordcount, &options, state);
+ if (ret) {
+ if (ret < 0) {
+ cli_usage_out (word->pattern);
+ parse_err = 1;
+ }
+ else {
+ /* User might have cancelled the snapshot operation */
+ ret = 0;
+ }
+ goto out;
+ }
+
+ CLI_LOCAL_INIT (local, words, frame, options);
+
+ if (proc->fn)
+ ret = proc->fn (frame, THIS, options);
+
+out:
+ if (ret && parse_err == 0)
+ cli_out ("Snapshot command failed");
+
+ CLI_STACK_DESTROY (frame);
+
+ return ret;
+}
+
+struct cli_cmd snapshot_cmds[] = {
+ { "snapshot help",
+ cli_cmd_snapshot_help_cbk,
+ "display help for snapshot commands"
+ },
+ { "snapshot create <snapname> <volname(s)> [description <description>] [force]",
+ cli_cmd_snapshot_cbk,
+ "Snapshot Create."
+ },
+ { "snapshot restore <snapname>",
+ cli_cmd_snapshot_cbk,
+ "Snapshot Restore."
+ },
+ { "snapshot status [(snapname | volume <volname>)]",
+ cli_cmd_snapshot_cbk,
+ "Snapshot Status."
+ },
+ { "snapshot info [(snapname | volume <volname>)]",
+ cli_cmd_snapshot_cbk,
+ "Snapshot Info."
+ },
+ { "snapshot list [volname]",
+ cli_cmd_snapshot_cbk,
+ "Snapshot List."
+ },
+ {"snapshot config [volname] [snap-max-hard-limit <count>] [snap-max-soft-limit <percent>]",
+ cli_cmd_snapshot_cbk,
+ "Snapshot Config."
+ },
+ {"snapshot delete <snapname>",
+ cli_cmd_snapshot_cbk,
+ "Snapshot Delete."
+ },
+ { NULL, NULL, NULL }
+};
+
+int
+cli_cmd_snapshot_help_cbk (struct cli_state *state,
+ struct cli_cmd_word *in_word,
+ const char **words,
+ int wordcount)
+{
+ struct cli_cmd *cmd = NULL;
+
+ for (cmd = snapshot_cmds; cmd->pattern; cmd++)
+ if (_gf_false == cmd->disable)
+ cli_out ("%s - %s", cmd->pattern, cmd->desc);
+
+ return 0;
+}
+
+int
+cli_cmd_snapshot_register (struct cli_state *state)
+{
+ int ret = 0;
+ struct cli_cmd *cmd = NULL;
+
+ for (cmd = snapshot_cmds; cmd->pattern; cmd++) {
+
+ ret = cli_cmd_register (&state->tree, cmd);
+ if (ret)
+ goto out;
+ }
+out:
+ return ret;
+}
diff --git a/cli/src/cli-cmd-system.c b/cli/src/cli-cmd-system.c
index 255eb605e..8cfa5e70c 100644
--- a/cli/src/cli-cmd-system.c
+++ b/cli/src/cli-cmd-system.c
@@ -31,6 +31,12 @@ extern rpc_clnt_prog_t *cli_rpc_prog;
int cli_cmd_system_help_cbk (struct cli_state *state, struct cli_cmd_word *in_word,
const char **words, int wordcount);
+int cli_cmd_copy_file_cbk (struct cli_state *state, struct cli_cmd_word *word,
+ const char **words, int wordcount);
+
+int cli_cmd_sys_exec_cbk (struct cli_state *state, struct cli_cmd_word *word,
+ const char **words, int wordcount);
+
int
cli_cmd_getspec_cbk (struct cli_state *state, struct cli_cmd_word *word,
const char **words, int wordcount)
@@ -278,6 +284,114 @@ cli_cmd_umount_cbk (struct cli_state *state, struct cli_cmd_word *word,
return ret;
}
+int
+cli_cmd_uuid_get_cbk (struct cli_state *state, struct cli_cmd_word *word,
+ const char **words, int wordcount)
+{
+ int ret = -1;
+ int sent = 0;
+ int parse_error = 0;
+ dict_t *dict = NULL;
+ rpc_clnt_procedure_t *proc = NULL;
+ call_frame_t *frame = NULL;
+ cli_local_t *local = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ if (wordcount != 3) {
+ cli_usage_out (word->pattern);
+ parse_error = 1;
+ goto out;
+ }
+
+ proc = &cli_rpc_prog->proctable[GLUSTER_CLI_UUID_GET];
+ frame = create_frame (this, this->ctx->pool);
+ if (!frame)
+ goto out;
+
+ dict = dict_new ();
+ if (!dict)
+ goto out;
+
+ CLI_LOCAL_INIT (local, words, frame, dict);
+ if (proc->fn)
+ ret = proc->fn (frame, this, dict);
+
+out:
+ if (ret) {
+ cli_cmd_sent_status_get (&sent);
+ if ((sent == 0) && (parse_error == 0))
+ cli_out ("uuid get failed");
+ }
+
+ if (dict)
+ dict_unref (dict);
+
+ CLI_STACK_DESTROY (frame);
+ return ret;
+}
+
+int
+cli_cmd_uuid_reset_cbk (struct cli_state *state, struct cli_cmd_word *word,
+ const char **words, int wordcount)
+{
+ int ret = -1;
+ rpc_clnt_procedure_t *proc = NULL;
+ call_frame_t *frame = NULL;
+ int sent = 0;
+ int parse_error = 0;
+ gf_answer_t answer = GF_ANSWER_NO;
+ char *question = NULL;
+ cli_local_t *local = NULL;
+ dict_t *dict = NULL;
+ xlator_t *this = NULL;
+
+ question = "Resetting uuid changes the uuid of local glusterd. "
+ "Do you want to continue?";
+
+ if (wordcount != 3) {
+ cli_usage_out (word->pattern);
+ parse_error = 1;
+ goto out;
+ }
+
+ proc = &cli_rpc_prog->proctable[GLUSTER_CLI_UUID_RESET];
+
+ this = THIS;
+ frame = create_frame (this, this->ctx->pool);
+ if (!frame)
+ goto out;
+
+ dict = dict_new ();
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+ CLI_LOCAL_INIT (local, words, frame, dict);
+ answer = cli_cmd_get_confirmation (state, question);
+
+ if (GF_ANSWER_NO == answer) {
+ ret = 0;
+ goto out;
+ }
+
+ //send NULL as argument since no dictionary is sent to glusterd
+ if (proc->fn) {
+ ret = proc->fn (frame, this, dict);
+ }
+
+out:
+ if (ret) {
+ cli_cmd_sent_status_get (&sent);
+ if ((sent == 0) && (parse_error == 0))
+ cli_out ("uuid reset failed");
+ }
+
+ CLI_STACK_DESTROY (frame);
+
+ return ret;
+}
+
struct cli_cmd cli_system_cmds[] = {
{ "system:: getspec <VOLID>",
cli_cmd_getspec_cbk,
@@ -303,14 +417,163 @@ struct cli_cmd cli_system_cmds[] = {
cli_cmd_umount_cbk,
"request an umount"},
+ { "system:: uuid get",
+ cli_cmd_uuid_get_cbk,
+ "get uuid of glusterd"},
+
+ { "system:: uuid reset",
+ cli_cmd_uuid_reset_cbk,
+ "reset the uuid of glusterd"},
+
{ "system:: help",
cli_cmd_system_help_cbk,
"display help for system commands"},
+ { "system:: copy file [<filename>]",
+ cli_cmd_copy_file_cbk,
+ "Copy file from current node's $working_dir to "
+ "$working_dir of all cluster nodes"},
+
+ { "system:: execute <command> <args>",
+ cli_cmd_sys_exec_cbk,
+ "Execute the command on all the nodes "
+ "in the cluster and display their output."},
+
{ NULL, NULL, NULL }
};
int
+cli_cmd_sys_exec_cbk (struct cli_state *state, struct cli_cmd_word *word,
+ const char **words, int wordcount)
+{
+ char cmd_arg_name[PATH_MAX] = "";
+ char *command = NULL;
+ char *saveptr = NULL;
+ char *tmp = NULL;
+ int ret = -1;
+ int i = -1;
+ int cmd_args_count = 0;
+ int in_cmd_args_count = 0;
+ rpc_clnt_procedure_t *proc = NULL;
+ call_frame_t *frame = NULL;
+ dict_t *dict = NULL;
+ cli_local_t *local = NULL;
+
+ if (wordcount < 3) {
+ cli_usage_out (word->pattern);
+ goto out;
+ }
+
+ dict = dict_new ();
+ if (!dict)
+ goto out;
+
+ command = strtok_r ((char *)words[2], " ", &saveptr);
+ do {
+ tmp = strtok_r (NULL, " ", &saveptr);
+ if (tmp) {
+ in_cmd_args_count++;
+ memset (cmd_arg_name, '\0', sizeof(cmd_arg_name));
+ snprintf (cmd_arg_name, sizeof(cmd_arg_name),
+ "cmd_arg_%d", in_cmd_args_count);
+ ret = dict_set_str (dict, cmd_arg_name, tmp);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to set "
+ "%s in dict", cmd_arg_name);
+ goto out;
+ }
+ }
+ } while (tmp);
+
+ cmd_args_count = wordcount - 3;
+
+ ret = dict_set_str (dict, "command", command);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to set command in dict");
+ goto out;
+ }
+
+ for (i=1; i <= cmd_args_count; i++) {
+ in_cmd_args_count++;
+ memset (cmd_arg_name, '\0', sizeof(cmd_arg_name));
+ snprintf (cmd_arg_name, sizeof(cmd_arg_name),
+ "cmd_arg_%d", in_cmd_args_count);
+ ret = dict_set_str (dict, cmd_arg_name,
+ (char *)words[2+i]);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to set %s in dict",
+ cmd_arg_name);
+ goto out;
+ }
+ }
+
+ ret = dict_set_int32 (dict, "cmd_args_count", in_cmd_args_count);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to set cmd_args_count in dict");
+ goto out;
+ }
+
+ ret = dict_set_str (dict, "volname", "N/A");
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to set volname in dict");
+ goto out;
+ }
+
+ proc = &cli_rpc_prog->proctable[GLUSTER_CLI_SYS_EXEC];
+ if (proc && proc->fn) {
+ frame = create_frame (THIS, THIS->ctx->pool);
+ if (!frame)
+ goto out;
+ CLI_LOCAL_INIT (local, words, frame, dict);
+ ret = proc->fn (frame, THIS, (void*)dict);
+ }
+out:
+ return ret;
+}
+
+int
+cli_cmd_copy_file_cbk (struct cli_state *state, struct cli_cmd_word *word,
+ const char **words, int wordcount)
+{
+ int ret = -1;
+ rpc_clnt_procedure_t *proc = NULL;
+ call_frame_t *frame = NULL;
+ char *filename = "";
+ dict_t *dict = NULL;
+ cli_local_t *local = NULL;
+
+ if (wordcount != 4) {
+ cli_usage_out (word->pattern);
+ goto out;
+ }
+
+ dict = dict_new ();
+ if (!dict)
+ goto out;
+
+ filename = (char*)words[3];
+ ret = dict_set_str (dict, "source", filename);
+ if (ret)
+ gf_log ("", GF_LOG_ERROR, "Unable to set filename in dict");
+
+ ret = dict_set_str (dict, "volname", "N/A");
+ if (ret)
+ gf_log ("", GF_LOG_ERROR, "Unable to set volname in dict");
+
+ proc = &cli_rpc_prog->proctable[GLUSTER_CLI_COPY_FILE];
+ if (proc && proc->fn) {
+ frame = create_frame (THIS, THIS->ctx->pool);
+ if (!frame)
+ goto out;
+ CLI_LOCAL_INIT (local, words, frame, dict);
+ ret = proc->fn (frame, THIS, (void*)dict);
+ }
+out:
+ return ret;
+}
+
+int
cli_cmd_system_help_cbk (struct cli_state *state, struct cli_cmd_word *in_word,
const char **words, int wordcount)
{
diff --git a/cli/src/cli-cmd-volume.c b/cli/src/cli-cmd-volume.c
index ac70cd5c0..100be0b73 100644
--- a/cli/src/cli-cmd-volume.c
+++ b/cli/src/cli-cmd-volume.c
@@ -29,19 +29,6 @@
#include "cli1-xdr.h"
#include "run.h"
-#define CLI_LOCAL_INIT(local, words, frame, dictionary) \
- do { \
- local = cli_local_get (); \
- \
- if (local) { \
- local->words = words; \
- if (dictionary) \
- local->dict = dictionary; \
- if (frame) \
- frame->local = local; \
- } \
- } while (0)
-
extern struct rpc_clnt *global_rpc;
extern rpc_clnt_prog_t *cli_rpc_prog;
@@ -125,6 +112,11 @@ cli_cmd_sync_volume_cbk (struct cli_state *state, struct cli_cmd_word *word,
int parse_error = 0;
dict_t *dict = NULL;
cli_local_t *local = NULL;
+ gf_answer_t answer = GF_ANSWER_NO;
+ const char *question = "Sync volume may make data "
+ "inaccessible while the sync "
+ "is in progress. Do you want "
+ "to continue?";
if ((wordcount < 3) || (wordcount > 4)) {
cli_usage_out (word->pattern);
@@ -159,6 +151,14 @@ cli_cmd_sync_volume_cbk (struct cli_state *state, struct cli_cmd_word *word,
goto out;
}
+ if (!(state->mode & GLUSTER_MODE_SCRIPT)) {
+ answer = cli_cmd_get_confirmation (state, question);
+ if (GF_ANSWER_NO == answer) {
+ ret = 0;
+ goto out;
+ }
+ }
+
proc = &cli_rpc_prog->proctable[GLUSTER_CLI_SYNC_VOLUME];
frame = create_frame (THIS, THIS->ctx->pool);
@@ -392,6 +392,15 @@ cli_cmd_volume_create_cbk (struct cli_state *state, struct cli_cmd_word *word,
}
}
+ if (state->mode & GLUSTER_MODE_SCRIPT) {
+ ret = dict_set_int32 (options, "force", _gf_true);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to set force "
+ "option");
+ goto out;
+ }
+ }
+
CLI_LOCAL_INIT (local, words, frame, options);
if (proc->fn) {
@@ -888,6 +897,7 @@ cli_cmd_volume_set_cbk (struct cli_state *state, struct cli_cmd_word *word,
call_frame_t *frame = NULL;
dict_t *options = NULL;
cli_local_t *local = NULL;
+ char *op_errstr = NULL;
proc = &cli_rpc_prog->proctable[GLUSTER_CLI_SET_VOLUME];
@@ -895,9 +905,14 @@ cli_cmd_volume_set_cbk (struct cli_state *state, struct cli_cmd_word *word,
if (!frame)
goto out;
- ret = cli_cmd_volume_set_parse (words, wordcount, &options);
+ ret = cli_cmd_volume_set_parse (words, wordcount, &options, &op_errstr);
if (ret) {
- cli_usage_out (word->pattern);
+ if (op_errstr) {
+ cli_err ("%s", op_errstr);
+ GF_FREE (op_errstr);
+ } else
+ cli_usage_out (word->pattern);
+
parse_error = 1;
goto out;
}
@@ -963,6 +978,15 @@ cli_cmd_volume_add_brick_cbk (struct cli_state *state,
}
}
+ if (state->mode & GLUSTER_MODE_SCRIPT) {
+ ret = dict_set_int32 (options, "force", _gf_true);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to set force "
+ "option");
+ goto out;
+ }
+ }
+
proc = &cli_rpc_prog->proctable[GLUSTER_CLI_ADD_BRICK];
CLI_LOCAL_INIT (local, words, frame, options);
@@ -1131,6 +1155,15 @@ cli_cmd_volume_replace_brick_cbk (struct cli_state *state,
goto out;
}
+ if (state->mode & GLUSTER_MODE_SCRIPT) {
+ ret = dict_set_int32 (options, "force", _gf_true);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to set force"
+ "option");
+ goto out;
+ }
+ }
+
CLI_LOCAL_INIT (local, words, frame, options);
if (proc->fn) {
@@ -1490,7 +1523,10 @@ void
cli_print_detailed_status (cli_volume_status_t *status)
{
cli_out ("%-20s : %-20s", "Brick", status->brick);
- cli_out ("%-20s : %-20d", "Port", status->port);
+ if (status->online)
+ cli_out ("%-20s : %-20d", "Port", status->port);
+ else
+ cli_out ("%-20s : %-20s", "Port", "N/A");
cli_out ("%-20s : %-20c", "Online", (status->online) ? 'Y' : 'N');
cli_out ("%-20s : %-20s", "Pid", status->pid_str);
@@ -1565,10 +1601,18 @@ cli_print_brick_status (cli_volume_status_t *status)
printf ("%s", p);
while (num_tabs-- != 0)
printf ("\t");
- if (status->port)
- cli_out ("%d\t%c\t%s",
- status->port, status->online?'Y':'N',
- status->pid_str);
+ if (status->port) {
+ if (status->online)
+ cli_out ("%d\t%c\t%s",
+ status->port,
+ status->online?'Y':'N',
+ status->pid_str);
+ else
+ cli_out ("%s\t%c\t%s",
+ "N/A",
+ status->online?'Y':'N',
+ status->pid_str);
+ }
else
cli_out ("%s\t%c\t%s",
"N/A", status->online?'Y':'N',
@@ -1784,7 +1828,13 @@ struct cli_cmd volume_cmds[] = {
cli_cmd_volume_info_cbk,
"list information of all volumes"},
- { "volume create <NEW-VOLNAME> [stripe <COUNT>] [replica <COUNT>] [transport <tcp|rdma|tcp,rdma>] <NEW-BRICK> ...",
+ { "volume create <NEW-VOLNAME> [stripe <COUNT>] [replica <COUNT>] "
+ "[transport <tcp|rdma|tcp,rdma>] <NEW-BRICK>"
+#ifdef HAVE_BD_XLATOR
+ "?<vg_name>"
+#endif
+ "... [force]",
+
cli_cmd_volume_create_cbk,
"create a new volume of specified type with mentioned bricks"},
@@ -1804,11 +1854,11 @@ struct cli_cmd volume_cmds[] = {
cli_cmd_volume_rename_cbk,
"rename volume <VOLNAME> to <NEW-VOLNAME>"},*/
- { "volume add-brick <VOLNAME> [<stripe|replica> <COUNT>] <NEW-BRICK> ...",
+ { "volume add-brick <VOLNAME> [<stripe|replica> <COUNT>] <NEW-BRICK> ... [force]",
cli_cmd_volume_add_brick_cbk,
"add brick to volume <VOLNAME>"},
- { "volume remove-brick <VOLNAME> [replica <COUNT>] <BRICK> ... {start|stop|status|commit|force}",
+ { "volume remove-brick <VOLNAME> [replica <COUNT>] <BRICK> ... [start|stop|status|commit|force]",
cli_cmd_volume_remove_brick_cbk,
"remove brick from volume <VOLNAME>"},
@@ -1816,7 +1866,7 @@ struct cli_cmd volume_cmds[] = {
cli_cmd_volume_defrag_cbk,
"rebalance operations"},
- { "volume replace-brick <VOLNAME> <BRICK> <NEW-BRICK> {start|pause|abort|status|commit [force]}",
+ { "volume replace-brick <VOLNAME> <BRICK> <NEW-BRICK> {start [force]|pause|abort|status|commit [force]}",
cli_cmd_volume_replace_brick_cbk,
"replace-brick operations"},
@@ -1845,13 +1895,14 @@ struct cli_cmd volume_cmds[] = {
"reset all the reconfigured options"},
#if (SYNCDAEMON_COMPILE)
- {"volume "GEOREP" [<VOLNAME>] [<SLAVE-URL>] {start|stop|config|status|log-rotate} [options...]",
+ {"volume "GEOREP" [<VOLNAME>] [<SLAVE-URL>] {create [push-pem] [force]"
+ "|start [force]|stop [force]|config|status [detail]|delete} [options...]",
cli_cmd_volume_gsync_set_cbk,
"Geo-sync operations",
cli_cmd_check_gsync_exists_cbk},
#endif
- { "volume profile <VOLNAME> {start|info|stop} [nfs]",
+ { "volume profile <VOLNAME> {start|stop|info [nfs]}",
cli_cmd_volume_profile_cbk,
"volume profile operations"},
@@ -1859,18 +1910,17 @@ struct cli_cmd volume_cmds[] = {
cli_cmd_quota_cbk,
"quota translator specific operations"},
- { "volume top <VOLNAME> {[open|read|write|opendir|readdir [nfs]] "
- "|[read-perf|write-perf [nfs|{bs <size> count <count>}]]"
- "|[clear [nfs]]} [brick <brick>] [list-cnt <count>]",
+ { "volume top <VOLNAME> {open|read|write|opendir|readdir|clear} [nfs|brick <brick>] [list-cnt <value>] |\n"
+ "volume top <VOLNAME> {read-perf|write-perf} [bs <size> count <count>] [brick <brick>] [list-cnt <value>]",
cli_cmd_volume_top_cbk,
"volume top operations"},
{ "volume status [all | <VOLNAME> [nfs|shd|<BRICK>]]"
- " [detail|clients|mem|inode|fd|callpool]",
+ " [detail|clients|mem|inode|fd|callpool|tasks]",
cli_cmd_volume_status_cbk,
"display status of all or specified volume(s)/brick"},
- { "volume heal <VOLNAME> [{full | info {healed | heal-failed | split-brain}}]",
+ { "volume heal <VOLNAME> [{full | statistics {heal-count {replica <hostname:brickname>}} |info {healed | heal-failed | split-brain}}]",
cli_cmd_volume_heal_cbk,
"self-heal commands on volume specified by <VOLNAME>"},
diff --git a/cli/src/cli-cmd.c b/cli/src/cli-cmd.c
index 64aba5d9f..b81f75b5b 100644
--- a/cli/src/cli-cmd.c
+++ b/cli/src/cli-cmd.c
@@ -231,6 +231,9 @@ cli_cmds_register (struct cli_state *state)
if (ret)
goto out;
+ ret = cli_cmd_snapshot_register (state);
+ if (ret)
+ goto out;
out:
return ret;
}
@@ -360,8 +363,11 @@ cli_cmd_submit (void *req, call_frame_t *frame,
int ret = -1;
unsigned timeout = 0;
- timeout = (GLUSTER_CLI_PROFILE_VOLUME == procnum) ?
- CLI_TOP_CMD_TIMEOUT : CLI_DEFAULT_CMD_TIMEOUT;
+ if ((GLUSTER_CLI_PROFILE_VOLUME == procnum) ||
+ (GLUSTER_CLI_HEAL_VOLUME == procnum))
+ timeout = CLI_TEN_MINUTES_TIMEOUT;
+ else
+ timeout = CLI_DEFAULT_CMD_TIMEOUT;
cli_cmd_lock ();
cmd_sent = 0;
diff --git a/cli/src/cli-cmd.h b/cli/src/cli-cmd.h
index 0ec316774..041729276 100644
--- a/cli/src/cli-cmd.h
+++ b/cli/src/cli-cmd.h
@@ -20,6 +20,19 @@
#include "cli.h"
#include "list.h"
+#define CLI_LOCAL_INIT(local, words, frame, dictionary) \
+ do { \
+ local = cli_local_get (); \
+ \
+ if (local) { \
+ local->words = words; \
+ if (dictionary) \
+ local->dict = dictionary; \
+ if (frame) \
+ frame->local = local; \
+ } \
+ } while (0)
+
#define CLI_STACK_DESTROY(_frame) \
do { \
if (_frame) { \
@@ -80,6 +93,8 @@ int cli_cmd_probe_register (struct cli_state *state);
int cli_cmd_system_register (struct cli_state *state);
+int cli_cmd_snapshot_register (struct cli_state *state);
+
int cli_cmd_misc_register (struct cli_state *state);
struct cli_cmd_word *cli_cmd_nextword (struct cli_cmd_word *word,
@@ -105,4 +120,5 @@ cli_cmd_submit (void *req, call_frame_t *frame,
gf_answer_t
cli_cmd_get_confirmation (struct cli_state *state, const char *question);
int cli_cmd_sent_status_get (int *status);
+
#endif /* __CLI_CMD_H__ */
diff --git a/cli/src/cli-rl.c b/cli/src/cli-rl.c
index 6f75b6f4c..ade1c8ebb 100644
--- a/cli/src/cli-rl.c
+++ b/cli/src/cli-rl.c
@@ -365,7 +365,8 @@ cli_rl_input (void *_data)
if (!line)
exit(0); //break;
- cli_rl_process_line (line);
+ if (*line)
+ cli_rl_process_line (line);
free (line);
}
diff --git a/cli/src/cli-rpc-ops.c b/cli/src/cli-rpc-ops.c
index 256f3299f..bfeb854ad 100644
--- a/cli/src/cli-rpc-ops.c
+++ b/cli/src/cli-rpc-ops.c
@@ -13,10 +13,6 @@
#include "config.h"
#endif
-#ifndef GSYNC_CONF
-#define GSYNC_CONF GEOREP"/gsyncd.conf"
-#endif
-
/* Widths of various columns in top read/write-perf output
* Total width of top read/write-perf should be 80 chars
* including one space between column
@@ -26,6 +22,8 @@
#define VOL_TOP_PERF_SPEED_WIDTH 4
#define VOL_TOP_PERF_TIME_WIDTH 26
+#define INDENT_MAIN_HEAD "%-25s %s "
+
#include "cli.h"
#include "compat-errno.h"
#include "cli-cmd.h"
@@ -62,6 +60,17 @@ char *cli_vol_status_str[] = {"Created",
"Stopped",
};
+char *cli_vol_task_status_str[] = {"not started",
+ "in progress",
+ "stopped",
+ "completed",
+ "failed",
+ "fix-layout in progress",
+ "fix-layout stopped",
+ "fix-layout completed",
+ "fix-layout failed",
+};
+
int32_t
gf_cli_get_volume (call_frame_t *frame, xlator_t *this,
void *data);
@@ -87,7 +96,7 @@ int
gf_cli_probe_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- gf1_cli_probe_rsp rsp = {0,};
+ gf_cli_rsp rsp = {0,};
int ret = -1;
char msg[1024] = {0,};
@@ -95,92 +104,28 @@ gf_cli_probe_cbk (struct rpc_req *req, struct iovec *iov,
goto out;
}
- ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_probe_rsp);
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
//rsp.op_ret = -1;
//rsp.op_errno = EINVAL;
goto out;
}
gf_log ("cli", GF_LOG_INFO, "Received resp to probe");
- if (!rsp.op_ret) {
- switch (rsp.op_errno) {
- case GF_PROBE_SUCCESS:
- snprintf (msg, sizeof (msg),
- "success");
- break;
- case GF_PROBE_LOCALHOST:
- snprintf (msg, sizeof (msg),
- "success: on localhost not needed");
- break;
- case GF_PROBE_FRIEND:
- snprintf (msg, sizeof (msg),
- "success: host %s port %d already"
- " in peer list", rsp.hostname,
- rsp.port);
- break;
- default:
- rsp.op_ret = -1;
- snprintf (msg, sizeof (msg),
- "Probe returned with unknown errno"
- " %d", rsp.op_errno);
- break;
- }
- }
- if (rsp.op_ret) {
- if (rsp.op_errstr && (strlen (rsp.op_errstr) > 0)) {
- snprintf (msg, sizeof (msg), "%s", rsp.op_errstr);
- } else {
- switch (rsp.op_errno) {
- case GF_PROBE_ANOTHER_CLUSTER:
- snprintf (msg, sizeof (msg),
- "%s is already part of "
- "another cluster",
- rsp.hostname);
- break;
- case GF_PROBE_VOLUME_CONFLICT:
- snprintf (msg, sizeof (msg),
- "Atleast one volume on %s "
- "conflicts with existing "
- "volumes in the cluster",
- rsp.hostname);
- break;
- case GF_PROBE_UNKNOWN_PEER:
- snprintf (msg, sizeof (msg),
- "%s responded with 'unknown "
- "peer' error, this could "
- "happen if %s doesn't have "
- "localhost in its peer "
- "database", rsp.hostname,
- rsp.hostname);
- break;
- case GF_PROBE_ADD_FAILED:
- snprintf (msg, sizeof (msg),
- "Failed to add peer "
- "information on %s" ,
- rsp.hostname);
- break;
- case GF_PROBE_SAME_UUID:
- snprintf (msg, sizeof (msg),
- "Peer uuid (host %s) is"
- "same as local uuid",
- rsp.hostname);
- break;
- default:
- snprintf (msg, sizeof (msg),
- "Probe returned with unknown "
- "errno %d", rsp.op_errno);
- break;
- }
- }
- gf_log ("cli", GF_LOG_ERROR, "%s", msg);
+ if (rsp.op_errstr && (strlen (rsp.op_errstr) > 0)) {
+ snprintf (msg, sizeof (msg), "%s", rsp.op_errstr);
+ if (rsp.op_ret)
+ gf_log ("cli", GF_LOG_ERROR, "%s", msg);
}
if (global_state->mode & GLUSTER_MODE_XML) {
- ret = cli_xml_output_str ("peerProbe", msg, rsp.op_ret,
- rsp.op_errno, NULL);
+ ret = cli_xml_output_str (NULL,
+ (rsp.op_ret)? NULL : msg,
+ rsp.op_ret, rsp.op_errno,
+ (rsp.op_ret)? msg : NULL);
if (ret)
gf_log ("cli", GF_LOG_ERROR,
"Error outputting to xml");
@@ -188,7 +133,7 @@ gf_cli_probe_cbk (struct rpc_req *req, struct iovec *iov,
}
if (!rsp.op_ret)
- cli_out ("peer probe: %s", msg);
+ cli_out ("peer probe: success. %s", msg);
else
cli_err ("peer probe: failed: %s", msg);
@@ -203,67 +148,39 @@ int
gf_cli_deprobe_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- gf1_cli_deprobe_rsp rsp = {0,};
+ gf_cli_rsp rsp = {0,};
int ret = -1;
- char msg[1024] = {0,};
+ char msg[1024] = {0,};
if (-1 == req->rpc_status) {
goto out;
}
- ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_deprobe_rsp);
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
//rsp.op_ret = -1;
//rsp.op_errno = EINVAL;
goto out;
}
gf_log ("cli", GF_LOG_INFO, "Received resp to deprobe");
+
if (rsp.op_ret) {
if (strlen (rsp.op_errstr) > 0) {
snprintf (msg, sizeof (msg), "%s", rsp.op_errstr);
gf_log ("cli", GF_LOG_ERROR, "%s", rsp.op_errstr);
- } else {
- switch (rsp.op_errno) {
- case GF_DEPROBE_LOCALHOST:
- snprintf (msg, sizeof (msg),
- "%s is localhost",
- rsp.hostname);
- break;
- case GF_DEPROBE_NOT_FRIEND:
- snprintf (msg, sizeof (msg),
- "%s is not part of cluster",
- rsp.hostname);
- break;
- case GF_DEPROBE_BRICK_EXIST:
- snprintf (msg, sizeof (msg),
- "Brick(s) with the peer %s "
- "exist in cluster",
- rsp.hostname);
- break;
- case GF_DEPROBE_FRIEND_DOWN:
- snprintf (msg, sizeof (msg),
- "One of the peers is probably"
- " down. Check with 'peer "
- "status'.");
- break;
- default:
- snprintf (msg, sizeof (msg),
- "Detach returned with unknown"
- " errno %d", rsp.op_errno);
- break;
- }
- gf_log ("cli", GF_LOG_ERROR,"Detach failed with op_ret "
- "%d and op_errno %d", rsp.op_ret, rsp.op_errno);
}
} else {
snprintf (msg, sizeof (msg), "success");
}
if (global_state->mode & GLUSTER_MODE_XML) {
- ret = cli_xml_output_str ("peerDetach", msg, rsp.op_ret,
- rsp.op_errno, NULL);
+ ret = cli_xml_output_str (NULL,
+ (rsp.op_ret)? NULL : msg,
+ rsp.op_ret, rsp.op_errno,
+ (rsp.op_ret)? msg : NULL);
if (ret)
gf_log ("cli", GF_LOG_ERROR,
"Error outputting to xml");
@@ -283,21 +200,128 @@ out:
}
int
-gf_cli_list_friends_cbk (struct rpc_req *req, struct iovec *iov,
- int count, void *myframe)
+gf_cli_output_peer_status (dict_t *dict, int count)
{
- gf1_cli_peer_list_rsp rsp = {0,};
int ret = -1;
- dict_t *dict = NULL;
char *uuid_buf = NULL;
char *hostname_buf = NULL;
int32_t i = 1;
char key[256] = {0,};
char *state = NULL;
- int32_t port = 0;
int32_t connected = 0;
char *connected_str = NULL;
+
+ cli_out ("Number of Peers: %d", count);
+ i = 1;
+ while ( i <= count) {
+ snprintf (key, 256, "friend%d.uuid", i);
+ ret = dict_get_str (dict, key, &uuid_buf);
+ if (ret)
+ goto out;
+
+ snprintf (key, 256, "friend%d.hostname", i);
+ ret = dict_get_str (dict, key, &hostname_buf);
+ if (ret)
+ goto out;
+
+ snprintf (key, 256, "friend%d.connected", i);
+ ret = dict_get_int32 (dict, key, &connected);
+ if (ret)
+ goto out;
+ if (connected)
+ connected_str = "Connected";
+ else
+ connected_str = "Disconnected";
+
+
+ snprintf (key, 256, "friend%d.state", i);
+ ret = dict_get_str (dict, key, &state);
+ if (ret)
+ goto out;
+
+ cli_out ("\nHostname: %s\nUuid: %s\nState: %s (%s)",
+ hostname_buf, uuid_buf, state, connected_str);
+ i++;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int
+gf_cli_output_pool_list (dict_t *dict, int count)
+{
+ int ret = -1;
+ char *uuid_buf = NULL;
+ char *hostname_buf = NULL;
+ int32_t i = 1;
+ char key[256] = {0,};
+ int32_t connected = 0;
+ char *connected_str = NULL;
+
+ if (count >= 1)
+ cli_out ("UUID\t\t\t\t\tHostname\tState");
+
+ while ( i <= count) {
+ snprintf (key, 256, "friend%d.uuid", i);
+ ret = dict_get_str (dict, key, &uuid_buf);
+ if (ret)
+ goto out;
+
+ snprintf (key, 256, "friend%d.hostname", i);
+ ret = dict_get_str (dict, key, &hostname_buf);
+ if (ret)
+ goto out;
+
+ snprintf (key, 256, "friend%d.connected", i);
+ ret = dict_get_int32 (dict, key, &connected);
+ if (ret)
+ goto out;
+ if (connected)
+ connected_str = "Connected";
+ else
+ connected_str = "Disconnected";
+
+ cli_out ("%s\t%-9s\t%s ", uuid_buf, hostname_buf,
+ connected_str);
+ i++;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+/* function pointer for gf_cli_output_{pool_list,peer_status} */
+typedef int (*cli_friend_output_fn) (dict_t*, int);
+
+int
+gf_cli_list_friends_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ gf1_cli_peer_list_rsp rsp = {0,};
+ int ret = -1;
+ dict_t *dict = NULL;
char msg[1024] = {0,};
+ char *cmd = NULL;
+ cli_friend_output_fn friend_output_fn;
+ call_frame_t *frame = NULL;
+ unsigned long flags = 0;
+
+ frame = myframe;
+ flags = (long)frame->local;
+
+ if (flags == GF_CLI_LIST_POOL_NODES) {
+ cmd = "pool list";
+ friend_output_fn = &gf_cli_output_pool_list;
+ } else {
+ cmd = "peer status";
+ friend_output_fn = &gf_cli_output_peer_status;
+ }
+
+ /* 'free' the flags set by gf_cli_list_friends */
+ frame->local = NULL;
if (-1 == req->rpc_status) {
goto out;
@@ -305,13 +329,14 @@ gf_cli_list_friends_cbk (struct rpc_req *req, struct iovec *iov,
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_peer_list_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
//rsp.op_ret = -1;
//rsp.op_errno = EINVAL;
goto out;
}
- gf_log ("cli", GF_LOG_INFO, "Received resp to list: %d",
+ gf_log ("cli", GF_LOG_DEBUG, "Received resp to list: %d",
rsp.op_ret);
ret = rsp.op_ret;
@@ -320,7 +345,7 @@ gf_cli_list_friends_cbk (struct rpc_req *req, struct iovec *iov,
if (!rsp.friends.friends_len) {
snprintf (msg, sizeof (msg),
- "peer status: No peers present");
+ "%s: No peers present", cmd);
if (global_state->mode & GLUSTER_MODE_XML) {
ret = cli_xml_output_peer_status (dict,
rsp.op_ret,
@@ -367,49 +392,9 @@ gf_cli_list_friends_cbk (struct rpc_req *req, struct iovec *iov,
goto out;
}
- cli_out ("Number of Peers: %d", count);
- i = 1;
- while ( i <= count) {
- snprintf (key, 256, "friend%d.uuid", i);
- ret = dict_get_str (dict, key, &uuid_buf);
- if (ret)
- goto out;
-
- snprintf (key, 256, "friend%d.hostname", i);
- ret = dict_get_str (dict, key, &hostname_buf);
- if (ret)
- goto out;
-
- snprintf (key, 256, "friend%d.connected", i);
- ret = dict_get_int32 (dict, key, &connected);
- if (ret)
- goto out;
- if (connected)
- connected_str = "Connected";
- else
- connected_str = "Disconnected";
-
- snprintf (key, 256, "friend%d.port", i);
- ret = dict_get_int32 (dict, key, &port);
- if (ret)
- goto out;
-
- snprintf (key, 256, "friend%d.state", i);
- ret = dict_get_str (dict, key, &state);
- if (ret)
- goto out;
-
- if (!port) {
- cli_out ("\nHostname: %s\nUuid: %s\nState: %s "
- "(%s)",
- hostname_buf, uuid_buf, state,
- connected_str);
- } else {
- cli_out ("\nHostname: %s\nPort: %d\nUuid: %s\n"
- "State: %s (%s)", hostname_buf, port,
- uuid_buf, state, connected_str);
- }
- i++;
+ ret = friend_output_fn (dict, count);
+ if (ret) {
+ goto out;
}
} else {
if (global_state->mode & GLUSTER_MODE_XML) {
@@ -430,7 +415,7 @@ gf_cli_list_friends_cbk (struct rpc_req *req, struct iovec *iov,
out:
cli_cmd_broadcast_response (ret);
if (ret)
- cli_err ("peer status: failed");
+ cli_err ("%s: failed", cmd);
if (dict)
dict_destroy (dict);
@@ -464,6 +449,30 @@ cli_out_options ( char *substr, char *optstr, char *valstr)
cli_out ("%s: %s",ptr2 , valstr);
}
+static int
+_gf_cli_output_volinfo_opts (dict_t *d, char *k,
+ data_t *v, void *tmp)
+{
+ int ret = 0;
+ char *key = NULL;
+ char *ptr = NULL;
+ data_t *value = NULL;
+
+ key = tmp;
+
+ ptr = strstr (k, "option.");
+ if (ptr) {
+ value = v;
+ if (!value) {
+ ret = -1;
+ goto out;
+ }
+ cli_out_options (key, k, v->data);
+ }
+out:
+ return ret;
+}
+
int
gf_cli_get_volume_cbk (struct rpc_req *req, struct iovec *iov,
@@ -481,23 +490,27 @@ gf_cli_get_volume_cbk (struct rpc_req *req, struct iovec *iov,
int32_t replica_count = 0;
int32_t vol_type = 0;
int32_t transport = 0;
- char *ptr = NULL;
char *volume_id_str = NULL;
char *brick = NULL;
char *volname = NULL;
dict_t *dict = NULL;
- data_t *value = NULL;
cli_local_t *local = NULL;
char key[1024] = {0};
char err_str[2048] = {0};
gf_cli_rsp rsp = {0};
+ char *caps = NULL;
+ int k __attribute__((unused)) = 0;
+ // snap_volume variable helps in showing whether a volume is a normal
+ //volume or a volume for the snapshot
+ int32_t snap_volume = 0;
if (-1 == req->rpc_status)
goto out;
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("cli", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
@@ -611,6 +624,11 @@ xml_output:
if (ret)
goto out;
+ snprintf (key, sizeof (key), "volume%d.snap_volume", i);
+ ret = dict_get_int32 (dict, key, &snap_volume);
+ if (ret)
+ goto out;
+
snprintf (key, 256, "volume%d.brick_count", i);
ret = dict_get_int32 (dict, key, &brick_count);
if (ret)
@@ -651,6 +669,44 @@ xml_output:
cli_out ("Type: %s", cli_vol_type_str[vol_type]);
cli_out ("Volume ID: %s", volume_id_str);
cli_out ("Status: %s", cli_vol_status_str[status]);
+ if (snap_volume)
+ cli_out ("Snap Volume: %s", "yes");
+ else
+ cli_out ("Snap Volume: %s", "no");
+
+#ifdef HAVE_BD_XLATOR
+ k = 0;
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "volume%d.xlator%d", i, k);
+ ret = dict_get_str (dict, key, &caps);
+ if (ret)
+ goto next;
+ do {
+ j = 0;
+ cli_out ("Xlator %d: %s", k + 1, caps);
+ do {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key),
+ "volume%d.xlator%d.caps%d",
+ i, k, j++);
+ ret = dict_get_str (dict, key, &caps);
+ if (ret)
+ break;
+ cli_out ("Capability %d: %s", j, caps);
+ } while (1);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key),
+ "volume%d.xlator%d", i, ++k);
+ ret = dict_get_str (dict, key, &caps);
+ if (ret)
+ break;
+ } while (1);
+
+next:
+#else
+ caps = 0; /* Avoid compiler warnings when BD not enabled */
+#endif
if (type == GF_CLUSTER_TYPE_STRIPE_REPLICATE) {
cli_out ("Number of Bricks: %d x %d x %d = %d",
@@ -658,10 +714,8 @@ xml_output:
stripe_count,
replica_count,
brick_count);
-
} else if (type == GF_CLUSTER_TYPE_NONE) {
cli_out ("Number of Bricks: %d", brick_count);
-
} else {
/* For both replicate and stripe, dist_count is
good enough */
@@ -689,6 +743,12 @@ xml_output:
goto out;
cli_out ("Brick%d: %s", j, brick);
+#ifdef HAVE_BD_XLATOR
+ snprintf (key, 256, "volume%d.vg%d", i, j);
+ ret = dict_get_str (dict, key, &caps);
+ if (!ret)
+ cli_out ("Brick%d VG: %s", j, caps);
+#endif
j++;
}
@@ -703,22 +763,8 @@ xml_output:
cli_out ("Options Reconfigured:");
snprintf (key, 256, "volume%d.option.",i);
- int _output_volinfo_opts (dict_t *d, char *k,
- data_t *v, void *tmp)
- {
- ptr = strstr (k, "option.");
- if (ptr) {
- value = v;
- if (!value) {
- ret = -1;
- goto internal_out;
- }
- cli_out_options (key, k, v->data);
- }
- internal_out:
- return ret;
- }
- ret = dict_foreach (dict, _output_volinfo_opts, NULL);
+
+ ret = dict_foreach (dict, _gf_cli_output_volinfo_opts, key);
if (ret)
goto out;
@@ -762,7 +808,8 @@ gf_cli_create_volume_cbk (struct rpc_req *req, struct iovec *iov,
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
@@ -829,20 +876,22 @@ gf_cli_delete_volume_cbk (struct rpc_req *req, struct iovec *iov,
goto out;
}
+ frame = myframe;
+
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
- frame = myframe;
local = frame->local;
if (local)
dict = local->dict;
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR,
+ gf_log (frame->this->name, GF_LOG_ERROR,
"dict get failed");
goto out;
}
@@ -890,6 +939,144 @@ out:
}
int
+gf_cli3_1_uuid_get_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ char *uuid_str = NULL;
+ gf_cli_rsp rsp = {0,};
+ int ret = -1;
+ cli_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ dict_t *dict = NULL;
+
+ if (-1 == req->rpc_status)
+ goto out;
+
+ frame = myframe;
+
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
+ if (ret < 0) {
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
+ goto out;
+ }
+
+ local = frame->local;
+ frame->local = NULL;
+
+ gf_log ("cli", GF_LOG_INFO, "Received resp to uuid get");
+
+ dict = dict_new ();
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_unserialize (rsp.dict.dict_val, rsp.dict.dict_len,
+ &dict);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to unserialize "
+ "response for uuid get");
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "uuid", &uuid_str);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to get uuid "
+ "from dictionary");
+ goto out;
+ }
+
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ ret = cli_xml_output_dict ("uuidGenerate", dict, rsp.op_ret,
+ rsp.op_errno, rsp.op_errstr);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ goto out;
+ }
+
+ if (rsp.op_ret) {
+ if (strcmp (rsp.op_errstr, "") == 0)
+ cli_err ("Get uuid was unsuccessful");
+ else
+ cli_err ("%s", rsp.op_errstr);
+
+ } else {
+ cli_out ("UUID: %s", uuid_str);
+
+ }
+ ret = rsp.op_ret;
+
+out:
+ cli_cmd_broadcast_response (ret);
+ cli_local_wipe (local);
+ if (rsp.dict.dict_val)
+ free (rsp.dict.dict_val);
+ if (dict)
+ dict_unref (dict);
+
+ gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+ return ret;
+}
+
+int
+gf_cli3_1_uuid_reset_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ gf_cli_rsp rsp = {0,};
+ int ret = -1;
+ cli_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ dict_t *dict = NULL;
+
+ if (-1 == req->rpc_status) {
+ goto out;
+ }
+
+ frame = myframe;
+
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
+ if (ret < 0) {
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
+ goto out;
+ }
+
+ local = frame->local;
+ frame->local = NULL;
+
+ gf_log ("cli", GF_LOG_INFO, "Received resp to uuid reset");
+
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ ret = cli_xml_output_dict ("uuidReset", dict, rsp.op_ret,
+ rsp.op_errno, rsp.op_errstr);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ goto out;
+ }
+
+ if (rsp.op_ret && strcmp (rsp.op_errstr, ""))
+ cli_err ("%s", rsp.op_errstr);
+ else
+ cli_out ("resetting the peer uuid has been %s",
+ (rsp.op_ret) ? "unsuccessful": "successful");
+ ret = rsp.op_ret;
+
+out:
+ cli_cmd_broadcast_response (ret);
+ cli_local_wipe (local);
+ if (rsp.dict.dict_val)
+ free (rsp.dict.dict_val);
+ if (dict)
+ dict_unref (dict);
+
+ gf_log ("", GF_LOG_INFO, "Returning with %d", ret);
+ return ret;
+}
+
+int
gf_cli_start_volume_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
@@ -905,14 +1092,15 @@ gf_cli_start_volume_cbk (struct rpc_req *req, struct iovec *iov,
goto out;
}
+ frame = myframe;
+
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
- frame = myframe;
-
if (frame)
local = frame->local;
@@ -921,7 +1109,7 @@ gf_cli_start_volume_cbk (struct rpc_req *req, struct iovec *iov,
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "dict get failed");
+ gf_log (frame->this->name, GF_LOG_ERROR, "dict get failed");
goto out;
}
@@ -982,14 +1170,15 @@ gf_cli_stop_volume_cbk (struct rpc_req *req, struct iovec *iov,
goto out;
}
+ frame = myframe;
+
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
- frame = myframe;
-
if (frame)
local = frame->local;
@@ -997,7 +1186,7 @@ gf_cli_stop_volume_cbk (struct rpc_req *req, struct iovec *iov,
dict = local->dict;
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR,
+ gf_log (frame->this->name, GF_LOG_ERROR,
"Unable to get volname from dict");
goto out;
}
@@ -1063,26 +1252,29 @@ gf_cli_defrag_volume_cbk (struct rpc_req *req, struct iovec *iov,
char msg[1024] = {0,};
gf_defrag_status_t status_rcd = GF_DEFRAG_STATUS_NOT_STARTED;
int32_t counter = 0;
- char *node_uuid = NULL;
+ char *node_name = NULL;
char key[256] = {0,};
int32_t i = 1;
uint64_t failures = 0;
+ uint64_t skipped = 0;
double elapsed = 0;
char *size_str = NULL;
+ char *task_id_str = NULL;
if (-1 == req->rpc_status) {
goto out;
}
+ frame = myframe;
+
ret = xdr_to_generic (*iov, &rsp,
(xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
- frame = myframe;
-
if (frame)
local = frame->local;
@@ -1091,14 +1283,14 @@ gf_cli_defrag_volume_cbk (struct rpc_req *req, struct iovec *iov,
ret = dict_get_str (local_dict, "volname", &volname);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR,
+ gf_log (frame->this->name, GF_LOG_ERROR,
"Failed to get volname");
goto out;
}
ret = dict_get_int32 (local_dict, "rebalance-command", (int32_t*)&cmd);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR,
+ gf_log (frame->this->name, GF_LOG_ERROR,
"Failed to get command");
goto out;
}
@@ -1118,15 +1310,24 @@ gf_cli_defrag_volume_cbk (struct rpc_req *req, struct iovec *iov,
}
}
- if (!((cmd == GF_DEFRAG_CMD_STOP) || (cmd == GF_DEFRAG_CMD_STATUS))) {
- /* All other possibility is about starting a volume */
- if (rsp.op_ret && strcmp (rsp.op_errstr, ""))
+ if (!((cmd == GF_DEFRAG_CMD_STOP) || (cmd == GF_DEFRAG_CMD_STATUS)) &&
+ !(global_state->mode & GLUSTER_MODE_XML)) {
+ /* All other possibilites are about starting a rebalance */
+ ret = dict_get_str (dict, GF_REBALANCE_TID_KEY, &task_id_str);
+ if (rsp.op_ret && strcmp (rsp.op_errstr, "")) {
snprintf (msg, sizeof (msg), "%s", rsp.op_errstr);
- else
- snprintf (msg, sizeof (msg),
- "Starting rebalance on volume %s has been %s",
- volname, (rsp.op_ret) ? "unsuccessful":
- "successful");
+ } else {
+ if (!rsp.op_ret) {
+ snprintf (msg, sizeof (msg),
+ "Starting rebalance on volume %s has "
+ "been successful.\nID: %s", volname,
+ task_id_str);
+ } else {
+ snprintf (msg, sizeof (msg),
+ "Starting rebalance on volume %s has "
+ "been unsuccessful.", volname);
+ }
+ }
goto done;
}
@@ -1142,8 +1343,12 @@ gf_cli_defrag_volume_cbk (struct rpc_req *req, struct iovec *iov,
goto done;
} else {
snprintf (msg, sizeof (msg),
- "Stopped rebalance process on volume %s \n",
- volname);
+ "rebalance process may be in the middle of a "
+ "file migration.\nThe process will be fully "
+ "stopped once the migration of the file is "
+ "complete.\nPlease check rebalance process "
+ "for completion before doing any further "
+ "brick related tasks on the volume.");
}
}
if (cmd == GF_DEFRAG_CMD_STATUS) {
@@ -1168,86 +1373,76 @@ gf_cli_defrag_volume_cbk (struct rpc_req *req, struct iovec *iov,
ret = dict_get_int32 (dict, "count", &counter);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "count not set");
+ gf_log (frame->this->name, GF_LOG_ERROR, "count not set");
goto out;
}
- cli_out ("%40s %16s %13s %13s %13s %14s %s", "Node", "Rebalanced-files",
- "size", "scanned", "failures", "status", "run time in secs");
- cli_out ("%40s %16s %13s %13s %13s %14s %16s", "---------",
+ cli_out ("%40s %16s %13s %13s %13s %13s %20s %18s", "Node",
+ "Rebalanced-files", "size", "scanned", "failures", "skipped",
+ "status", "run time in secs");
+ cli_out ("%40s %16s %13s %13s %13s %13s %20s %18s", "---------",
"-----------", "-----------", "-----------", "-----------",
- "------------", "--------------");
+ "-----------", "------------", "--------------");
do {
- snprintf (key, 256, "node-uuid-%d", i);
- ret = dict_get_str (dict, key, &node_uuid);
+ snprintf (key, 256, "node-name-%d", i);
+ ret = dict_get_str (dict, key, &node_name);
if (ret)
- gf_log (THIS->name, GF_LOG_TRACE,
- "failed to get node-uuid");
+ gf_log (frame->this->name, GF_LOG_TRACE,
+ "failed to get node-name");
memset (key, 0, 256);
snprintf (key, 256, "files-%d", i);
ret = dict_get_uint64 (dict, key, &files);
if (ret)
- gf_log (THIS->name, GF_LOG_TRACE,
+ gf_log (frame->this->name, GF_LOG_TRACE,
"failed to get file count");
memset (key, 0, 256);
snprintf (key, 256, "size-%d", i);
ret = dict_get_uint64 (dict, key, &size);
if (ret)
- gf_log (THIS->name, GF_LOG_TRACE,
+ gf_log (frame->this->name, GF_LOG_TRACE,
"failed to get size of xfer");
memset (key, 0, 256);
snprintf (key, 256, "lookups-%d", i);
ret = dict_get_uint64 (dict, key, &lookup);
if (ret)
- gf_log (THIS->name, GF_LOG_TRACE,
+ gf_log (frame->this->name, GF_LOG_TRACE,
"failed to get lookedup file count");
memset (key, 0, 256);
snprintf (key, 256, "status-%d", i);
ret = dict_get_int32 (dict, key, (int32_t *)&status_rcd);
if (ret)
- gf_log (THIS->name, GF_LOG_TRACE,
+ gf_log (frame->this->name, GF_LOG_TRACE,
"failed to get status");
memset (key, 0, 256);
snprintf (key, 256, "failures-%d", i);
ret = dict_get_uint64 (dict, key, &failures);
if (ret)
- gf_log (THIS->name, GF_LOG_TRACE,
+ gf_log (frame->this->name, GF_LOG_TRACE,
"failed to get failures count");
memset (key, 0, 256);
+ snprintf (key, 256, "skipped-%d", i);
+ ret = dict_get_uint64 (dict, key, &skipped);
+ if (ret)
+ gf_log (frame->this->name, GF_LOG_TRACE,
+ "failed to get skipped count");
+ memset (key, 0, 256);
snprintf (key, 256, "run-time-%d", i);
ret = dict_get_double (dict, key, &elapsed);
if (ret)
- gf_log (THIS->name, GF_LOG_TRACE,
+ gf_log (frame->this->name, GF_LOG_TRACE,
"failed to get run-time");
- switch (status_rcd) {
- case GF_DEFRAG_STATUS_NOT_STARTED:
- status = "not started";
- break;
- case GF_DEFRAG_STATUS_STARTED:
- status = "in progress";
- break;
- case GF_DEFRAG_STATUS_STOPPED:
- status = "stopped";
- break;
- case GF_DEFRAG_STATUS_COMPLETE:
- status = "completed";
- break;
- case GF_DEFRAG_STATUS_FAILED:
- status = "failed";
- break;
- }
-
+ status = cli_vol_task_status_str[status_rcd];
size_str = gf_uint64_2human_readable(size);
cli_out ("%40s %16"PRIu64 " %13s" " %13"PRIu64 " %13"PRIu64
- " %14s %16.2f", node_uuid, files, size_str, lookup,
- failures, status, elapsed);
+ " %13"PRIu64 " %20s %18.2f", node_name, files,
+ size_str, lookup, failures, skipped, status, elapsed);
GF_FREE(size_str);
i++;
@@ -1255,10 +1450,18 @@ gf_cli_defrag_volume_cbk (struct rpc_req *req, struct iovec *iov,
done:
- if (rsp.op_ret)
- cli_err ("volume rebalance: %s: failed: %s", volname, msg);
- else
- cli_out ("volume rebalance: %s: success: %s", volname, msg);
+ if (global_state->mode & GLUSTER_MODE_XML)
+ cli_xml_output_str ("volRebalance", msg,
+ rsp.op_ret, rsp.op_errno,
+ rsp.op_errstr);
+ else {
+ if (rsp.op_ret)
+ cli_err ("volume rebalance: %s: failed: %s", volname,
+ msg);
+ else
+ cli_out ("volume rebalance: %s: success: %s", volname,
+ msg);
+ }
ret = rsp.op_ret;
out:
@@ -1284,7 +1487,8 @@ gf_cli_rename_volume_cbk (struct rpc_req *req, struct iovec *iov,
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
@@ -1328,13 +1532,14 @@ gf_cli_reset_volume_cbk (struct rpc_req *req, struct iovec *iov,
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
gf_log ("cli", GF_LOG_INFO, "Received resp to reset");
- if (rsp.op_ret && strcmp (rsp.op_errstr, ""))
+ if (strcmp (rsp.op_errstr, ""))
snprintf (msg, sizeof (msg), "%s", rsp.op_errstr);
else
snprintf (msg, sizeof (msg), "reset volume %s",
@@ -1352,7 +1557,7 @@ gf_cli_reset_volume_cbk (struct rpc_req *req, struct iovec *iov,
if (rsp.op_ret)
cli_err ("volume reset: failed: %s", msg);
else
- cli_out ("volume reset: success");
+ cli_out ("volume reset: success: %s", msg);
ret = rsp.op_ret;
@@ -1361,6 +1566,53 @@ out:
return ret;
}
+char *
+is_server_debug_xlator (void *myframe)
+{
+ call_frame_t *frame = NULL;
+ cli_local_t *local = NULL;
+ char **words = NULL;
+ char *key = NULL;
+ char *value = NULL;
+ char *debug_xlator = NULL;
+
+ frame = myframe;
+ local = frame->local;
+ words = (char **)local->words;
+
+ while (*words != NULL) {
+ if (strstr (*words, "trace") == NULL &&
+ strstr (*words, "error-gen") == NULL) {
+ words++;
+ continue;
+ }
+
+ key = *words;
+ words++;
+ value = *words;
+ if (value == NULL)
+ break;
+ if (strstr (value, "client")) {
+ words++;
+ continue;
+ } else {
+ if (!(strstr (value, "posix") || strstr (value, "acl")
+ || strstr (value, "locks") ||
+ strstr (value, "io-threads") ||
+ strstr (value, "marker") ||
+ strstr (value, "index"))) {
+ words++;
+ continue;
+ } else {
+ debug_xlator = gf_strdup (key);
+ break;
+ }
+ }
+ }
+
+ return debug_xlator;
+}
+
int
gf_cli_set_volume_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
@@ -1370,6 +1622,8 @@ gf_cli_set_volume_cbk (struct rpc_req *req, struct iovec *iov,
dict_t *dict = NULL;
char *help_str = NULL;
char msg[1024] = {0,};
+ char *debug_xlator = _gf_false;
+ char tmp_str[512] = {0,};
if (-1 == req->rpc_status) {
goto out;
@@ -1377,7 +1631,8 @@ gf_cli_set_volume_cbk (struct rpc_req *req, struct iovec *iov,
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
@@ -1392,9 +1647,21 @@ gf_cli_set_volume_cbk (struct rpc_req *req, struct iovec *iov,
ret = dict_unserialize (rsp.dict.dict_val, rsp.dict.dict_len, &dict);
+ /* For brick processes graph change does not happen on the fly.
+ * The proces has to be restarted. So this is a check from the
+ * volume set option such that if debug xlators such as trace/errorgen
+ * are provided in the set command, warn the user.
+ */
+ debug_xlator = is_server_debug_xlator (myframe);
+
if (dict_get_str (dict, "help-str", &help_str) && !msg[0])
snprintf (msg, sizeof (msg), "Set volume %s",
(rsp.op_ret) ? "unsuccessful": "successful");
+ if (rsp.op_ret == 0 && debug_xlator) {
+ snprintf (tmp_str, sizeof (tmp_str), "\n%s translator has been "
+ "added to the server volume file. Please restart the"
+ " volume for enabling the translator", debug_xlator);
+ }
if ((global_state->mode & GLUSTER_MODE_XML) && (help_str == NULL)) {
ret = cli_xml_output_str ("volSet", msg, rsp.op_ret,
@@ -1405,16 +1672,20 @@ gf_cli_set_volume_cbk (struct rpc_req *req, struct iovec *iov,
goto out;
}
- if (rsp.op_ret && strcmp (rsp.op_errstr, ""))
- cli_err ("volume set: failed: %s", rsp.op_errstr);
-
- if (!rsp.op_ret) {
- if (help_str == NULL)
- cli_out ("volume set: success");
+ if (rsp.op_ret) {
+ if (strcmp (rsp.op_errstr, ""))
+ cli_err ("volume set: failed: %s", rsp.op_errstr);
else
- cli_out ("%s", help_str);
+ cli_err ("volume set: failed");
} else {
- cli_err ("volume set: failed");
+ if (help_str == NULL) {
+ if (debug_xlator == NULL)
+ cli_out ("volume set: success");
+ else
+ cli_out ("volume set: success%s", tmp_str);
+ }else {
+ cli_out ("%s", help_str);
+ }
}
ret = rsp.op_ret;
@@ -1422,6 +1693,7 @@ gf_cli_set_volume_cbk (struct rpc_req *req, struct iovec *iov,
out:
if (dict)
dict_unref (dict);
+ GF_FREE (debug_xlator);
cli_cmd_broadcast_response (ret);
return ret;
}
@@ -1440,7 +1712,8 @@ gf_cli_add_brick_cbk (struct rpc_req *req, struct iovec *iov,
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
@@ -1490,31 +1763,58 @@ gf_cli3_remove_brick_status_cbk (struct rpc_req *req, struct iovec *iov,
char key[256] = {0,};
int32_t i = 1;
int32_t counter = 0;
- char *node_uuid = 0;
+ char *node_name = 0;
gf_defrag_status_t status_rcd = GF_DEFRAG_STATUS_NOT_STARTED;
uint64_t failures = 0;
+ uint64_t skipped = 0;
double elapsed = 0;
char *size_str = NULL;
+ int32_t command = 0;
+ gf1_op_commands cmd = GF_OP_CMD_NONE;
+ cli_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ char *cmd_str = "unknown";
if (-1 == req->rpc_status) {
goto out;
}
+ frame = myframe;
+
ret = xdr_to_generic (*iov, &rsp,
(xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
+ if (frame)
+ local = frame->local;
+ ret = dict_get_int32 (local->dict, "command", &command);
+ if (ret)
+ goto out;
+ cmd = command;
+
+ switch (cmd) {
+ case GF_OP_CMD_STOP:
+ cmd_str = "stop";
+ break;
+ case GF_OP_CMD_STATUS:
+ cmd_str = "status";
+ break;
+ default:
+ break;
+ }
+
ret = rsp.op_ret;
if (rsp.op_ret == -1) {
if (strcmp (rsp.op_errstr, ""))
- snprintf (msg, sizeof (msg), "volume remove-brick: "
- "failed: %s", rsp.op_errstr);
+ snprintf (msg, sizeof (msg), "volume remove-brick %s: "
+ "failed: %s", cmd_str, rsp.op_errstr);
else
- snprintf (msg, sizeof (msg), "volume remove-brick: "
- "failed: status getting failed");
+ snprintf (msg, sizeof (msg), "volume remove-brick %s: "
+ "failed", cmd_str);
if (global_state->mode & GLUSTER_MODE_XML)
goto xml_output;
@@ -1562,63 +1862,69 @@ xml_output:
ret = dict_get_int32 (dict, "count", &counter);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "count not set");
+ gf_log (frame->this->name, GF_LOG_ERROR, "count not set");
goto out;
}
- cli_out ("%40s %16s %13s %13s %13s %14s %s", "Node", "Rebalanced-files",
- "size", "scanned", "failures", "status", "run-time in secs");
- cli_out ("%40s %16s %13s %13s %13s %14s %16s", "---------",
+ cli_out ("%40s %16s %13s %13s %13s %13s %14s %s", "Node",
+ "Rebalanced-files", "size", "scanned", "failures", "skipped",
+ "status", "run-time in secs");
+ cli_out ("%40s %16s %13s %13s %13s %13s %14s %16s", "---------",
"-----------", "-----------", "-----------", "-----------",
- "------------", "--------------");
+ "-----------","------------", "--------------");
do {
- snprintf (key, 256, "node-uuid-%d", i);
- ret = dict_get_str (dict, key, &node_uuid);
+ snprintf (key, 256, "node-name-%d", i);
+ ret = dict_get_str (dict, key, &node_name);
if (ret)
- gf_log (THIS->name, GF_LOG_TRACE,
- "failed to get node-uuid");
+ gf_log (frame->this->name, GF_LOG_TRACE,
+ "failed to get node-name");
memset (key, 0, 256);
snprintf (key, 256, "files-%d", i);
ret = dict_get_uint64 (dict, key, &files);
if (ret)
- gf_log (THIS->name, GF_LOG_TRACE,
+ gf_log (frame->this->name, GF_LOG_TRACE,
"failed to get file count");
memset (key, 0, 256);
snprintf (key, 256, "size-%d", i);
ret = dict_get_uint64 (dict, key, &size);
if (ret)
- gf_log (THIS->name, GF_LOG_TRACE,
+ gf_log (frame->this->name, GF_LOG_TRACE,
"failed to get size of xfer");
memset (key, 0, 256);
snprintf (key, 256, "lookups-%d", i);
ret = dict_get_uint64 (dict, key, &lookup);
if (ret)
- gf_log (THIS->name, GF_LOG_TRACE,
+ gf_log (frame->this->name, GF_LOG_TRACE,
"failed to get lookedup file count");
memset (key, 0, 256);
snprintf (key, 256, "status-%d", i);
ret = dict_get_int32 (dict, key, (int32_t *)&status_rcd);
if (ret)
- gf_log (THIS->name, GF_LOG_TRACE,
+ gf_log (frame->this->name, GF_LOG_TRACE,
"failed to get status");
snprintf (key, 256, "failures-%d", i);
ret = dict_get_uint64 (dict, key, &failures);
if (ret)
- gf_log (THIS->name, GF_LOG_TRACE,
+ gf_log (frame->this->name, GF_LOG_TRACE,
"Failed to get failure on files");
+ snprintf (key, 256, "failures-%d", i);
+ ret = dict_get_uint64 (dict, key, &skipped);
+ if (ret)
+ gf_log (frame->this->name, GF_LOG_TRACE,
+ "Failed to get skipped files");
memset (key, 0, 256);
snprintf (key, 256, "run-time-%d", i);
ret = dict_get_double (dict, key, &elapsed);
if (ret)
- gf_log (THIS->name, GF_LOG_TRACE,
+ gf_log (frame->this->name, GF_LOG_TRACE,
"Failed to get run-time");
switch (status_rcd) {
@@ -1637,17 +1943,32 @@ xml_output:
case GF_DEFRAG_STATUS_FAILED:
status = "failed";
break;
+ default:
+ break;
}
size_str = gf_uint64_2human_readable(size);
- cli_out ("%40s %16"PRIu64 " %13s" " %13"PRIu64 " %13"PRIu64
- " %14s %16.2f", node_uuid, files, size_str, lookup,
- failures, status, elapsed);
+
+ if (strcmp (status, "not started")) {
+ cli_out ("%40s %16"PRIu64 " %13s" " %13"PRIu64 " %13"
+ PRIu64 " %13"PRIu64 " %14s %16.2f", node_name,
+ files, size_str, lookup, failures, skipped,
+ status, elapsed);
+ }
GF_FREE(size_str);
i++;
} while (i <= counter);
+ if ((cmd == GF_OP_CMD_STOP) && (rsp.op_ret == 0)) {
+ cli_out ("'remove-brick' process may be in the middle of a "
+ "file migration.\nThe process will be fully stopped "
+ "once the migration of the file is complete.\nPlease "
+ "check remove-brick process for completion before "
+ "doing any further brick related tasks on the "
+ "volume.");
+ }
+
out:
free (rsp.dict.dict_val); //malloced by xdr
if (dict)
@@ -1668,6 +1989,8 @@ gf_cli_remove_brick_cbk (struct rpc_req *req, struct iovec *iov,
char *cmd_str = "unknown";
cli_local_t *local = NULL;
call_frame_t *frame = NULL;
+ char *task_id_str = NULL;
+ dict_t *rsp_dict = NULL;
if (-1 == req->rpc_status) {
goto out;
@@ -1678,7 +2001,8 @@ gf_cli_remove_brick_cbk (struct rpc_req *req, struct iovec *iov,
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
@@ -1688,10 +2012,31 @@ gf_cli_remove_brick_cbk (struct rpc_req *req, struct iovec *iov,
goto out;
}
- switch (cmd) {
+ if (rsp.dict.dict_len) {
+ rsp_dict = dict_new ();
+ if (!rsp_dict) {
+ ret = -1;
+ goto out;
+ }
+ ret = dict_unserialize (rsp.dict.dict_val, rsp.dict.dict_len,
+ &rsp_dict);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Failed to unserialize rsp_dict");
+ goto out;
+ }
+ }
+
+ switch (cmd) {
case GF_OP_CMD_START:
cmd_str = "start";
+
+ ret = dict_get_str (rsp_dict, GF_REMOVE_BRICK_TID_KEY, &task_id_str);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "remove-brick-id is not present in dict");
+ }
break;
case GF_OP_CMD_COMMIT:
cmd_str = "commit";
@@ -1713,19 +2058,23 @@ gf_cli_remove_brick_cbk (struct rpc_req *req, struct iovec *iov,
(rsp.op_ret) ? "unsuccessful": "successful");
if (global_state->mode & GLUSTER_MODE_XML) {
- ret = cli_xml_output_vol_remove_brick (_gf_false, NULL,
+ ret = cli_xml_output_vol_remove_brick (_gf_false, rsp_dict,
rsp.op_ret, rsp.op_errno,
- rsp.op_errstr);
+ msg);
if (ret)
gf_log ("cli", GF_LOG_ERROR,
"Error outputting to xml");
goto out;
}
- if (rsp.op_ret)
- cli_err ("volume remove-brick: failed: %s", rsp.op_errstr);
- else
- cli_out ("volume remove-brick: success");
+ if (rsp.op_ret) {
+ cli_err ("volume remove-brick %s: failed: %s", cmd_str,
+ msg);
+ } else {
+ cli_out ("volume remove-brick %s: success", cmd_str);
+ if (GF_OP_CMD_START == cmd && task_id_str != NULL)
+ cli_out ("ID: %s", task_id_str);
+ }
ret = rsp.op_ret;
@@ -1754,7 +2103,8 @@ gf_cli_replace_brick_cbk (struct rpc_req *req, struct iovec *iov,
gf1_cli_replace_op replace_op = 0;
char *rb_operation_str = NULL;
dict_t *rsp_dict = NULL;
- char msg[1024] = {0,};
+ char msg[1024] = {0,};
+ char *task_id_str = NULL;
if (-1 == req->rpc_status) {
goto out;
@@ -1764,7 +2114,8 @@ gf_cli_replace_brick_cbk (struct rpc_req *req, struct iovec *iov,
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
@@ -1779,58 +2130,77 @@ gf_cli_replace_brick_cbk (struct rpc_req *req, struct iovec *iov,
goto out;
}
+ if (rsp.dict.dict_len) {
+ /* Unserialize the dictionary */
+ rsp_dict = dict_new ();
+
+ ret = dict_unserialize (rsp.dict.dict_val,
+ rsp.dict.dict_len,
+ &rsp_dict);
+ if (ret < 0) {
+ gf_log ("glusterd", GF_LOG_ERROR,
+ "failed to "
+ "unserialize rsp buffer to dictionary");
+ goto out;
+ }
+ }
+
switch (replace_op) {
case GF_REPLACE_OP_START:
- if (rsp.op_ret)
- rb_operation_str = "replace-brick failed to start";
- else
- rb_operation_str = "replace-brick started successfully";
+ if (rsp.op_ret) {
+ rb_operation_str = gf_strdup ("replace-brick failed to"
+ " start");
+ } else {
+ ret = dict_get_str (rsp_dict, GF_REPLACE_BRICK_TID_KEY,
+ &task_id_str);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to get "
+ "\"replace-brick-id\" from dict");
+ goto out;
+ }
+ ret = gf_asprintf (&rb_operation_str,
+ "replace-brick started successfully"
+ "\nID: %s", task_id_str);
+ if (ret < 0)
+ goto out;
+ }
break;
case GF_REPLACE_OP_STATUS:
- if (rsp.op_ret || ret)
- rb_operation_str = "replace-brick status unknown";
- else {
- if (rsp.dict.dict_len) {
- /* Unserialize the dictionary */
- rsp_dict = dict_new ();
-
- ret = dict_unserialize (rsp.dict.dict_val,
- rsp.dict.dict_len,
- &rsp_dict);
- if (ret < 0) {
- gf_log ("glusterd", GF_LOG_ERROR,
- "failed to "
- "unserialize req-buffer to dictionary");
- goto out;
- }
- }
+ if (rsp.op_ret || ret) {
+ rb_operation_str = gf_strdup ("replace-brick status "
+ "unknown");
+ } else {
ret = dict_get_str (rsp_dict, "status-reply",
&status_reply);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "failed to"
+ gf_log (frame->this->name, GF_LOG_ERROR, "failed to"
"get status");
goto out;
}
- rb_operation_str = status_reply;
+ rb_operation_str = gf_strdup (status_reply);
}
break;
case GF_REPLACE_OP_PAUSE:
if (rsp.op_ret)
- rb_operation_str = "replace-brick pause failed";
+ rb_operation_str = gf_strdup ("replace-brick pause "
+ "failed");
else
- rb_operation_str = "replace-brick paused successfully";
+ rb_operation_str = gf_strdup ("replace-brick paused "
+ "successfully");
break;
case GF_REPLACE_OP_ABORT:
if (rsp.op_ret)
- rb_operation_str = "replace-brick abort failed";
+ rb_operation_str = gf_strdup ("replace-brick abort "
+ "failed");
else
- rb_operation_str = "replace-brick aborted successfully";
+ rb_operation_str = gf_strdup ("replace-brick aborted "
+ "successfully");
break;
case GF_REPLACE_OP_COMMIT:
@@ -1851,9 +2221,11 @@ gf_cli_replace_brick_cbk (struct rpc_req *req, struct iovec *iov,
if (rsp.op_ret || ret)
- rb_operation_str = "replace-brick commit failed";
+ rb_operation_str = gf_strdup ("replace-brick commit "
+ "failed");
else
- rb_operation_str = "replace-brick commit successful";
+ rb_operation_str = gf_strdup ("replace-brick commit "
+ "successful");
break;
@@ -1864,7 +2236,7 @@ gf_cli_replace_brick_cbk (struct rpc_req *req, struct iovec *iov,
}
if (rsp.op_ret && (strcmp (rsp.op_errstr, ""))) {
- rb_operation_str = rsp.op_errstr;
+ rb_operation_str = gf_strdup (rsp.op_errstr);
}
gf_log ("cli", GF_LOG_INFO, "Received resp to replace brick");
@@ -1888,6 +2260,17 @@ gf_cli_replace_brick_cbk (struct rpc_req *req, struct iovec *iov,
ret = rsp.op_ret;
out:
+ if (frame)
+ frame->local = NULL;
+
+ if (local) {
+ dict_unref (local->dict);
+ cli_local_wipe (local);
+ }
+
+ if (rb_operation_str)
+ GF_FREE (rb_operation_str);
+
cli_cmd_broadcast_response (ret);
free (rsp.dict.dict_val);
if (rsp_dict)
@@ -1911,7 +2294,8 @@ gf_cli_log_rotate_cbk (struct rpc_req *req, struct iovec *iov,
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
@@ -1959,7 +2343,8 @@ gf_cli_sync_volume_cbk (struct rpc_req *req, struct iovec *iov,
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
@@ -1998,7 +2383,7 @@ gf_cli_print_limit_list (char *volname, char *limit_list,
{
int64_t size = 0;
int64_t limit_value = 0;
- int32_t i, j, k;
+ int32_t i, j;
int32_t len = 0, ret = -1;
char *size_str = NULL;
char path [PATH_MAX] = {0, };
@@ -2006,6 +2391,7 @@ gf_cli_print_limit_list (char *volname, char *limit_list,
char value [1024] = {0, };
char mountdir [] = "/tmp/mntXXXXXX";
char abspath [PATH_MAX] = {0, };
+ char *colon_ptr = NULL;
runner_t runner = {0,};
GF_VALIDATE_OR_GOTO ("cli", volname, out);
@@ -2053,19 +2439,16 @@ gf_cli_print_limit_list (char *volname, char *limit_list,
"-----------------------");
while (i < len) {
j = 0;
- k = 0;
-
- while (limit_list [i] != ':') {
- path [k++] = limit_list [i++];
- }
- path [k] = '\0';
-
- i++; //skip ':'
while (limit_list [i] != ',' && limit_list [i] != '\0') {
- value [j++] = limit_list[i++];
+ path [j++] = limit_list[i++];
}
- value [j] = '\0';
+ path [j] = '\0';
+ //here path[] contains both path and limit value
+
+ colon_ptr = strrchr (path, ':');
+ *colon_ptr = '\0';
+ strcpy (value, ++colon_ptr);
snprintf (abspath, sizeof (abspath), "%s/%s", mountdir, path);
@@ -2117,15 +2500,19 @@ gf_cli_quota_cbk (struct rpc_req *req, struct iovec *iov,
char *volname = NULL;
char *limit_list = NULL;
int32_t type = 0;
- char msg[1024] = {0,};
+ char msg[1024] = {0,};
+ call_frame_t *frame = NULL;
if (-1 == req->rpc_status) {
goto out;
}
+ frame = myframe;
+
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
@@ -2156,17 +2543,17 @@ gf_cli_quota_cbk (struct rpc_req *req, struct iovec *iov,
ret = dict_get_str (dict, "volname", &volname);
if (ret)
- gf_log (THIS->name, GF_LOG_TRACE,
+ gf_log (frame->this->name, GF_LOG_TRACE,
"failed to get volname");
ret = dict_get_str (dict, "limit_list", &limit_list);
if (ret)
- gf_log (THIS->name, GF_LOG_TRACE,
+ gf_log (frame->this->name, GF_LOG_TRACE,
"failed to get limit_list");
ret = dict_get_int32 (dict, "type", &type);
if (ret)
- gf_log (THIS->name, GF_LOG_TRACE,
+ gf_log (frame->this->name, GF_LOG_TRACE,
"failed to get type");
if (type == GF_QUOTA_OPTION_TYPE_LIST) {
@@ -2235,14 +2622,24 @@ gf_cli_getspec_cbk (struct rpc_req *req, struct iovec *iov,
gf_getspec_rsp rsp = {0,};
int ret = -1;
char *spec = NULL;
+ call_frame_t *frame = NULL;
if (-1 == req->rpc_status) {
goto out;
}
+ frame = myframe;
+
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_getspec_rsp);
- if (ret < 0 || rsp.op_ret == -1) {
- gf_log ("", GF_LOG_ERROR, "error");
+ if (ret < 0) {
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
+ goto out;
+ }
+
+ if (rsp.op_ret == -1) {
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "getspec failed");
goto out;
}
@@ -2272,14 +2669,24 @@ gf_cli_pmap_b2p_cbk (struct rpc_req *req, struct iovec *iov,
pmap_port_by_brick_rsp rsp = {0,};
int ret = -1;
char *spec = NULL;
+ call_frame_t *frame = NULL;
if (-1 == req->rpc_status) {
goto out;
}
+ frame = myframe;
+
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_pmap_port_by_brick_rsp);
- if (ret < 0 || rsp.op_ret == -1) {
- gf_log ("", GF_LOG_ERROR, "error");
+ if (ret < 0) {
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
+ goto out;
+ }
+
+ if (rsp.op_ret == -1) {
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "pump_b2p failed");
goto out;
}
@@ -2300,10 +2707,9 @@ int32_t
gf_cli_probe (call_frame_t *frame, xlator_t *this,
void *data)
{
- gf1_cli_probe_req req = {0,};
+ gf_cli_req req = {{0,},};
int ret = 0;
dict_t *dict = NULL;
- char *hostname = NULL;
int port = 0;
if (!frame || !this || !data) {
@@ -2312,24 +2718,22 @@ gf_cli_probe (call_frame_t *frame, xlator_t *this,
}
dict = data;
- ret = dict_get_str (dict, "hostname", &hostname);
- if (ret)
- goto out;
ret = dict_get_int32 (dict, "port", &port);
- if (ret)
- port = CLI_GLUSTERD_PORT;
-
- req.hostname = hostname;
- req.port = port;
+ if (ret) {
+ ret = dict_set_int32 (dict, "port", CLI_GLUSTERD_PORT);
+ if (ret)
+ goto out;
+ }
- ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
- GLUSTER_CLI_PROBE, NULL,
- this, gf_cli_probe_cbk,
- (xdrproc_t)xdr_gf1_cli_probe_req);
+ ret = cli_to_glusterd (&req, frame, gf_cli_probe_cbk,
+ (xdrproc_t) xdr_gf_cli_req, dict,
+ GLUSTER_CLI_PROBE, this, cli_rpc_prog, NULL);
out:
+ GF_FREE (req.dict.dict_val);
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+
return ret;
}
@@ -2337,10 +2741,9 @@ int32_t
gf_cli_deprobe (call_frame_t *frame, xlator_t *this,
void *data)
{
- gf1_cli_deprobe_req req = {0,};
+ gf_cli_req req = {{0,},};
int ret = 0;
dict_t *dict = NULL;
- char *hostname = NULL;
int port = 0;
int flags = 0;
@@ -2350,51 +2753,64 @@ gf_cli_deprobe (call_frame_t *frame, xlator_t *this,
}
dict = data;
- ret = dict_get_str (dict, "hostname", &hostname);
- if (ret)
- goto out;
-
ret = dict_get_int32 (dict, "port", &port);
- if (ret)
- port = CLI_GLUSTERD_PORT;
+ if (ret) {
+ ret = dict_set_int32 (dict, "port", CLI_GLUSTERD_PORT);
+ if (ret)
+ goto out;
+ }
ret = dict_get_int32 (dict, "flags", &flags);
- if (ret)
- flags = 0;
+ if (ret) {
+ ret = dict_set_int32 (dict, "flags", 0);
+ if (ret)
+ goto out;
+ }
- req.hostname = hostname;
- req.port = port;
- req.flags = flags;
- ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
- GLUSTER_CLI_DEPROBE, NULL,
- this, gf_cli_deprobe_cbk,
- (xdrproc_t)xdr_gf1_cli_deprobe_req);
+ ret = cli_to_glusterd (&req, frame, gf_cli_deprobe_cbk,
+ (xdrproc_t)xdr_gf_cli_req, dict,
+ GLUSTER_CLI_DEPROBE, this, cli_rpc_prog, NULL);
out:
+ GF_FREE (req.dict.dict_val);
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+
return ret;
}
int32_t
gf_cli_list_friends (call_frame_t *frame, xlator_t *this,
- void *data)
+ void *data)
{
gf1_cli_peer_list_req req = {0,};
int ret = 0;
+ unsigned long flags = 0;
if (!frame || !this) {
ret = -1;
goto out;
}
- req.flags = GF_CLI_LIST_ALL;
+ GF_ASSERT (frame->local == NULL);
+ flags = (long)data;
+ req.flags = flags;
+ frame->local = (void*)flags;
ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
GLUSTER_CLI_LIST_FRIENDS, NULL,
this, gf_cli_list_friends_cbk,
(xdrproc_t) xdr_gf1_cli_peer_list_req);
out:
+ if (ret) {
+ /*
+ * If everything goes fine, gf_cli_list_friends_cbk()
+ * [invoked through cli_cmd_submit()]resets the
+ * frame->local to NULL. In case cli_cmd_submit()
+ * fails in between, RESET frame->local here.
+ */
+ frame->local = NULL;
+ }
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -2487,7 +2903,7 @@ gf_cli_get_volume (call_frame_t *frame, xlator_t *this,
flags = ctx->flags;
ret = dict_set_int32 (dict, "flags", flags);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "failed to set flags");
+ gf_log (frame->this->name, GF_LOG_ERROR, "failed to set flags");
goto out;
}
@@ -2509,6 +2925,51 @@ out:
return ret;
}
+int32_t
+gf_cli3_1_uuid_get (call_frame_t *frame, xlator_t *this,
+ void *data)
+{
+ gf_cli_req req = {{0,}};
+ int ret = 0;
+ dict_t *dict = NULL;
+
+ if (!frame || !this || !data) {
+ ret = -1;
+ goto out;
+ }
+
+ dict = data;
+ ret = cli_to_glusterd (&req, frame, gf_cli3_1_uuid_get_cbk,
+ (xdrproc_t)xdr_gf_cli_req, dict,
+ GLUSTER_CLI_UUID_GET, this, cli_rpc_prog,
+ NULL);
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int32_t
+gf_cli3_1_uuid_reset (call_frame_t *frame, xlator_t *this,
+ void *data)
+{
+ gf_cli_req req = {{0,}};
+ int ret = 0;
+ dict_t *dict = NULL;
+
+ if (!frame || !this || !data) {
+ ret = -1;
+ goto out;
+ }
+
+ dict = data;
+ ret = cli_to_glusterd (&req, frame, gf_cli3_1_uuid_reset_cbk,
+ (xdrproc_t)xdr_gf_cli_req, dict,
+ GLUSTER_CLI_UUID_RESET, this, cli_rpc_prog,
+ NULL);
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
int32_t
gf_cli_create_volume (call_frame_t *frame, xlator_t *this,
@@ -2805,7 +3266,7 @@ gf_cli_remove_brick (call_frame_t *frame, xlator_t *this,
GLUSTER_CLI_REMOVE_BRICK, this,
cli_rpc_prog, NULL);
} else {
- /* Need rebalance status to e sent :-) */
+ /* Need rebalance status to be sent :-) */
req_dict = dict_new ();
if (!req_dict) {
ret = -1;
@@ -2814,7 +3275,7 @@ gf_cli_remove_brick (call_frame_t *frame, xlator_t *this,
ret = dict_set_str (req_dict, "volname", volname);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"Failed to set dict");
goto out;
}
@@ -2826,7 +3287,7 @@ gf_cli_remove_brick (call_frame_t *frame, xlator_t *this,
ret = dict_set_int32 (req_dict, "rebalance-command", (int32_t) cmd);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"Failed to set dict");
goto out;
}
@@ -2977,6 +3438,7 @@ gf_cli_getspec (call_frame_t *frame, xlator_t *this,
gf_getspec_req req = {0,};
int ret = 0;
dict_t *dict = NULL;
+ dict_t *op_dict = NULL;
if (!frame || !this || !data) {
ret = -1;
@@ -2989,12 +3451,45 @@ gf_cli_getspec (call_frame_t *frame, xlator_t *this,
if (ret)
goto out;
+ op_dict = dict_new ();
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+
+ // Set the supported min and max op-versions, so glusterd can make a
+ // decision
+ ret = dict_set_int32 (op_dict, "min-op-version", GD_OP_VERSION_MIN);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Failed to set min-op-version"
+ " in request dict");
+ goto out;
+ }
+
+ ret = dict_set_int32 (op_dict, "max-op-version", GD_OP_VERSION_MAX);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Failed to set max-op-version"
+ " in request dict");
+ goto out;
+ }
+
+ ret = dict_allocate_and_serialize (op_dict, &req.xdata.xdata_val,
+ &req.xdata.xdata_len);
+ if (ret < 0) {
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "Failed to serialize dictionary");
+ goto out;
+ }
+
ret = cli_cmd_submit (&req, frame, &cli_handshake_prog,
GF_HNDSK_GETSPEC, NULL,
this, gf_cli_getspec_cbk,
(xdrproc_t) xdr_gf_getspec_req);
out:
+ if (op_dict) {
+ dict_unref(op_dict);
+ }
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
@@ -3075,7 +3570,8 @@ gf_cli_fsm_log_cbk (struct rpc_req *req, struct iovec *iov,
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_fsm_log_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
@@ -3175,8 +3671,11 @@ gf_cli_gsync_config_command (dict_t *dict)
char *subop = NULL;
char *gwd = NULL;
char *slave = NULL;
+ char *confpath = NULL;
char *master = NULL;
char *op_name = NULL;
+ int ret = -1;
+ char conf_path[PATH_MAX] = "";
if (dict_get_str (dict, "subop", &subop) != 0)
return -1;
@@ -3195,9 +3694,17 @@ gf_cli_gsync_config_command (dict_t *dict)
if (dict_get_str (dict, "op_name", &op_name) != 0)
op_name = NULL;
+ ret = dict_get_str (dict, "conf_path", &confpath);
+ if (!confpath) {
+ ret = snprintf (conf_path, sizeof(conf_path) - 1,
+ "%s/"GEOREP"/gsyncd_template.conf", gwd);
+ conf_path[ret] = '\0';
+ confpath = conf_path;
+ }
+
runinit (&runner);
runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "-c", NULL);
- runner_argprintf (&runner, "%s/"GSYNC_CONF, gwd);
+ runner_argprintf (&runner, "%s", confpath);
if (master)
runner_argprintf (&runner, ":%s", master);
runner_add_arg (&runner, slave);
@@ -3209,60 +3716,639 @@ gf_cli_gsync_config_command (dict_t *dict)
}
int
-gf_cli_gsync_out_status (dict_t *dict)
+gf_cli_fetch_gsyncd_status_values (char *status,
+ gf_cli_gsync_status_t *sts_val)
{
- int gsync_count = 0;
- int i = 0;
- int ret = 0;
- char mst[PATH_MAX] = {0, };
- char slv[PATH_MAX]= {0, };
- char sts[PATH_MAX] = {0, };
- char hyphens[81] = {0, };
- char *mst_val = NULL;
- char *slv_val = NULL;
- char *sts_val = NULL;
-
- cli_out ("%-20s %-50s %-10s", "MASTER", "SLAVE", "STATUS");
-
- for (i=0; i<sizeof(hyphens)-1; i++)
- hyphens[i] = '-';
+ int32_t ret = -1;
+ char *tmp = NULL;
+ char *save_ptr = NULL;
+ char *key = NULL;
+ char *value = NULL;
- cli_out ("%s", hyphens);
+ if (!status || !sts_val) {
+ gf_log ("", GF_LOG_ERROR, "status or sts_val is null");
+ goto out;
+ }
+ tmp = strtok_r (status, "\n", &save_ptr);
- ret = dict_get_int32 (dict, "gsync-count", &gsync_count);
+ if (tmp)
+ sts_val->health = gf_strdup (tmp);
+
+ while (tmp) {
+ key = strtok_r (tmp, "=", &value);
+
+ if ((key) && (!strcmp(key, "Uptime")))
+ sts_val->uptime = gf_strdup (value);
+
+ if ((key) && (!strcmp(key, "FilesSyncd")))
+ sts_val->files_syncd = gf_strdup (value);
+
+ if ((key) && (!strcmp(key, "FilesPending")))
+ sts_val->files_pending = gf_strdup (value);
+
+ if ((key) && (!strcmp(key, "BytesPending"))) {
+ value = gf_uint64_2human_readable(atol(value));
+ sts_val->bytes_pending = gf_strdup (value);
+ }
+
+ if ((key) && (!strcmp(key, "DeletesPending")))
+ sts_val->deletes_pending = gf_strdup (value);
+
+ tmp = strtok_r (NULL, ";", &save_ptr);
+ }
+
+ if (sts_val->health)
+ ret = 0;
+
+ if (!sts_val->uptime)
+ sts_val->uptime = gf_strdup ("N/A");
+
+ if (!sts_val->files_syncd)
+ sts_val->files_syncd = gf_strdup ("N/A");
+
+ if (!sts_val->files_pending)
+ sts_val->files_pending = gf_strdup ("N/A");
+
+ if (!sts_val->bytes_pending)
+ sts_val->bytes_pending = gf_strdup ("N/A");
+
+ if (!sts_val->deletes_pending)
+ sts_val->deletes_pending = gf_strdup ("N/A");
+
+out:
+ gf_log ("", GF_LOG_DEBUG, "Returning %d.", ret);
+ return ret;
+}
+
+char*
+get_struct_variable (int mem_num, gf_cli_gsync_status_t *sts_val)
+{
+ switch (mem_num) {
+ case 0: return (sts_val->node);
+ case 1: return (sts_val->master);
+ case 2: return (sts_val->slave);
+ case 3: return (sts_val->health);
+ case 4: return (sts_val->uptime);
+ case 5: return (sts_val->files_syncd);
+ case 6: return (sts_val->files_pending);
+ case 7: return (sts_val->bytes_pending);
+ case 8: return (sts_val->deletes_pending);
+ default:
+ goto out;
+ }
+
+out:
+ return NULL;
+}
+
+int
+gf_cli_print_status (char **title_values,
+ gf_cli_gsync_status_t **sts_vals,
+ int *spacing, int gsync_count,
+ int number_of_fields, int is_detail)
+{
+ int indents = 0;
+ int i = 0;
+ int j = 0;
+ int ret = 0;
+ int total_spacing = 0;
+ char **output_values = NULL;
+ char *tmp = NULL;
+ char *hyphens = NULL;
+ char heading[PATH_MAX] = {0, };
+ char indent_spaces[PATH_MAX] = {0, };
+
+ /* calculating spacing for hyphens */
+ for (i = 0; i < number_of_fields; i++) {
+ /* Suppressing master and slave output for status detail */
+ if ((is_detail) && ((i == 1) || (i == 2))) {
+ total_spacing++;
+ continue;
+ } else if ((!is_detail) && (i > 4)) {
+ /* Suppressing detailed output for
+ * status */
+ continue;
+ }
+ spacing[i] += 3; /* Adding extra space to
+ distinguish between fields */
+ total_spacing += spacing[i];
+ }
+ total_spacing += 4; /* For the spacing between the fields */
+
+ /* char pointers for each field */
+ output_values = GF_CALLOC (number_of_fields, sizeof (char *),
+ gf_common_mt_char);
+ if (!output_values) {
+ ret = -1;
+ goto out;
+ }
+ for (i = 0; i < number_of_fields; i++) {
+ output_values[i] = GF_CALLOC (spacing[i] + 1, sizeof (char),
+ gf_common_mt_char);
+ if (!output_values[i]) {
+ ret = -1;
+ goto out;
+ }
+ }
+
+ hyphens = GF_CALLOC (total_spacing + 1, sizeof (char),
+ gf_common_mt_char);
+ if (!hyphens) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = snprintf(heading, sizeof(heading), "MASTER: %s SLAVE: %s",
+ sts_vals[0]->master, sts_vals[0]->slave);
if (ret) {
- gf_log ("cli", GF_LOG_INFO, "No active geo-replication sessions"
- "present for the selected");
+ if (ret < sizeof(heading))
+ heading[ret] = '\0';
+ else
+ heading[sizeof(heading) - 1] = '\0';
ret = 0;
+ } else {
+ ret = -1;
goto out;
}
- for (i = 1; i <= gsync_count; i++) {
- snprintf (mst, sizeof(mst), "master%d", i);
- snprintf (slv, sizeof(slv), "slave%d", i);
- snprintf (sts, sizeof(sts), "status%d", i);
+ if (is_detail) {
+ cli_out (" ");
+ if (strlen(heading) > total_spacing)
+ cli_out ("%s", heading);
+ else {
+ /* Printing the heading with centre justification */
+ indents = (total_spacing - strlen(heading)) / 2;
+ memset (indent_spaces, ' ', indents);
+ indent_spaces[indents] = '\0';
+ ret = snprintf (hyphens, total_spacing, "%s%s",
+ indent_spaces, heading);
+ if (ret) {
+ hyphens[ret] = '\0';
+ cli_out ("%s", hyphens);
+ ret = 0;
+ } else {
+ ret = -1;
+ goto out;
+ }
+ }
+ cli_out (" ");
+ }
+
+ /* setting the title "NODE", "MASTER", etc. from title_values[]
+ and printing the same */
+ for (j = 0; j < number_of_fields; j++) {
+ /* Suppressing master and slave output for status detail */
+ if ((is_detail) && ((j == 1) || (j == 2))) {
+ output_values[j][0] = '\0';
+ continue;
+ } else if ((!is_detail) && (j > 4)) {
+ /* Suppressing detailed output for
+ * status */
+ output_values[j][0] = '\0';
+ continue;
+ }
+ memset (output_values[j], ' ', spacing[j]);
+ memcpy (output_values[j], title_values[j],
+ strlen(title_values[j]));
+ output_values[j][spacing[j]] = '\0';
+ }
+ cli_out ("%s %s %s %s %s %s %s %s %s", output_values[0],
+ output_values[1], output_values[2], output_values[3],
+ output_values[4], output_values[5], output_values[6],
+ output_values[7], output_values[8]);
+
+ /* setting and printing the hyphens */
+ memset (hyphens, '-', total_spacing);
+ hyphens[total_spacing] = '\0';
+ cli_out ("%s", hyphens);
+
+ for (i = 0; i < gsync_count; i++) {
+ for (j = 0; j < number_of_fields; j++) {
+ /* Suppressing master and slave output for
+ * status detail */
+ if ((is_detail) && ((j == 1) || (j == 2))) {
+ output_values[j][0] = '\0';
+ continue;
+ } else if ((!is_detail) && (j > 4)) {
+ /* Suppressing detailed output for
+ * status */
+ output_values[j][0] = '\0';
+ continue;
+ }
+ tmp = get_struct_variable(j, sts_vals[i]);
+ if (!tmp) {
+ gf_log ("", GF_LOG_ERROR,
+ "struct member empty.");
+ ret = -1;
+ goto out;
+ }
+ memset (output_values[j], ' ', spacing[j]);
+ memcpy (output_values[j], tmp, strlen (tmp));
+ output_values[j][spacing[j]] = '\0';
+ }
+
+ cli_out ("%s %s %s %s %s %s %s %s %s", output_values[0],
+ output_values[1], output_values[2], output_values[3],
+ output_values[4], output_values[5], output_values[6],
+ output_values[7], output_values[8]);
+ }
+
+out:
+ if (output_values) {
+ for (i = 0; i < number_of_fields; i++) {
+ if (output_values[i])
+ GF_FREE (output_values[i]);
+ }
+ GF_FREE (output_values);
+ }
+
+ if (hyphens)
+ GF_FREE (hyphens);
+
+ return ret;
+}
+
+int
+gf_cli_read_status_data (dict_t *dict,
+ gf_cli_gsync_status_t **sts_vals,
+ int *spacing, int gsync_count,
+ int number_of_fields)
+{
+ int ret = 0;
+ int i = 0;
+ int j = 0;
+ char mst[PATH_MAX] = {0, };
+ char slv[PATH_MAX] = {0, };
+ char sts[PATH_MAX] = {0, };
+ char nds[PATH_MAX] = {0, };
+ char *status = NULL;
+ char *tmp = NULL;
+
+ /* Storing per node status info in each object */
+ for (i = 0; i < gsync_count; i++) {
+ snprintf (nds, sizeof(nds), "node%d", i + 1);
+ snprintf (mst, sizeof(mst), "master%d", i + 1);
+ snprintf (slv, sizeof(slv), "slave%d", i + 1);
+ snprintf (sts, sizeof(sts), "status%d", i + 1);
+
+ /* Fetching the values from dict, and calculating
+ the max length for each field */
+ ret = dict_get_str (dict, nds, &(sts_vals[i]->node));
+ if (ret)
+ goto out;
- ret = dict_get_str (dict, mst, &mst_val);
+ ret = dict_get_str (dict, mst, &(sts_vals[i]->master));
if (ret)
goto out;
- ret = dict_get_str (dict, slv, &slv_val);
+ ret = dict_get_str (dict, slv, &(sts_vals[i]->slave));
if (ret)
goto out;
- ret = dict_get_str (dict, sts, &sts_val);
+ ret = dict_get_str (dict, sts, &status);
if (ret)
goto out;
- cli_out ("%-20s %-50s %-10s", mst_val,
- slv_val, sts_val);
+ /* Fetching health and uptime from sts_val */
+ ret = gf_cli_fetch_gsyncd_status_values (status, sts_vals[i]);
+ if (ret)
+ goto out;
+ for (j = 0; j < number_of_fields; j++) {
+ tmp = get_struct_variable(j, sts_vals[i]);
+ if (!tmp) {
+ gf_log ("", GF_LOG_ERROR,
+ "struct member empty.");
+ ret = -1;
+ goto out;
+ }
+ if (strlen (tmp) > spacing[j])
+ spacing[j] = strlen (tmp);
+ }
}
- out:
+out:
+ return ret;
+}
+
+int
+gf_cli_gsync_status_output (dict_t *dict, int status_detail)
+{
+ int gsync_count = 0;
+ int i = 0;
+ int j = 0;
+ int ret = 0;
+ int spacing[10] = {0};
+ int num_of_fields = 9;
+ char errmsg[1024] = "";
+ char *master = NULL;
+ char *slave = NULL;
+ char *tmp = NULL;
+ char *title_values[] = {"NODE", "MASTER", "SLAVE",
+ "HEALTH", "UPTIME",
+ "FILES SYNCD",
+ "FILES PENDING",
+ "BYTES PENDING",
+ "DELETES PENDING"};
+ gf_cli_gsync_status_t **sts_vals = NULL;
+
+ /* Checks if any session is active or not */
+ ret = dict_get_int32 (dict, "gsync-count", &gsync_count);
+ if (ret) {
+ ret = dict_get_str (dict, "master", &master);
+
+ ret = dict_get_str (dict, "slave", &slave);
+
+ if (master) {
+ if (slave)
+ snprintf (errmsg, sizeof(errmsg), "No active "
+ "geo-replication sessions between %s"
+ " and %s", master, slave);
+ else
+ snprintf (errmsg, sizeof(errmsg), "No active "
+ "geo-replication sessions for %s",
+ master);
+ } else
+ snprintf (errmsg, sizeof(errmsg), "No active "
+ "geo-replication sessions");
+
+ gf_log ("cli", GF_LOG_INFO, "%s", errmsg);
+ cli_out ("%s", errmsg);
+ ret = 0;
+ goto out;
+ }
+
+ for (i = 0; i < num_of_fields; i++)
+ spacing[i] = strlen(title_values[i]);
+
+ /* gsync_count = number of nodes reporting output.
+ each sts_val object will store output of each
+ node */
+ sts_vals = GF_CALLOC (gsync_count, sizeof (gf_cli_gsync_status_t *),
+ gf_common_mt_char);
+ if (!sts_vals) {
+ ret = -1;
+ goto out;
+ }
+ for (i = 0; i < gsync_count; i++) {
+ sts_vals[i] = GF_CALLOC (1, sizeof (gf_cli_gsync_status_t),
+ gf_common_mt_char);
+ if (!sts_vals[i]) {
+ ret = -1;
+ goto out;
+ }
+ }
+
+ ret = gf_cli_read_status_data (dict, sts_vals, spacing,
+ gsync_count, num_of_fields);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to read status data");
+ goto out;
+ }
+
+ ret = gf_cli_print_status (title_values, sts_vals, spacing, gsync_count,
+ num_of_fields, status_detail);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to print status output");
+ goto out;
+ }
+
+out:
+ if (sts_vals) {
+ for (i = 0; i < gsync_count; i++) {
+ for (j = 3; j < num_of_fields; j++) {
+ tmp = get_struct_variable(j, sts_vals[i]);
+ if (tmp)
+ GF_FREE (tmp);
+ }
+ }
+ GF_FREE (sts_vals);
+ }
+
+ return ret;
+}
+
+static int32_t
+write_contents_to_common_pem_file (dict_t *dict, int output_count)
+{
+ char *workdir = NULL;
+ char common_pem_file[PATH_MAX] = "";
+ char *output = NULL;
+ char output_name[PATH_MAX] = "";
+ int bytes_writen = 0;
+ int fd = -1;
+ int ret = -1;
+ int i = -1;
+
+ ret = dict_get_str (dict, "glusterd_workdir", &workdir);
+ if (ret || !workdir) {
+ gf_log ("", GF_LOG_ERROR, "Unable to fetch workdir");
+ ret = -1;
+ goto out;
+ }
+
+ snprintf (common_pem_file, sizeof(common_pem_file),
+ "%s/geo-replication/common_secret.pem.pub",
+ workdir);
+
+ unlink (common_pem_file);
+
+ fd = open (common_pem_file, O_WRONLY | O_CREAT, 0600);
+ if (fd == -1) {
+ gf_log ("", GF_LOG_ERROR, "Failed to open %s"
+ " Error : %s", common_pem_file,
+ strerror (errno));
+ ret = -1;
+ goto out;
+ }
+
+ for (i = 1; i <= output_count; i++) {
+ memset (output_name, '\0', sizeof (output_name));
+ snprintf (output_name, sizeof (output_name),
+ "output_%d", i);
+ ret = dict_get_str (dict, output_name, &output);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Failed to get %s.",
+ output_name);
+ cli_out ("Unable to fetch output.");
+ }
+ if (output) {
+ bytes_writen = write (fd, output, strlen(output));
+ if (bytes_writen != strlen(output)) {
+ gf_log ("", GF_LOG_ERROR, "Failed to write "
+ "to %s", common_pem_file);
+ ret = -1;
+ goto out;
+ }
+ /* Adding the new line character */
+ bytes_writen = write (fd, "\n", strlen("\n"));
+ if (bytes_writen != strlen("\n")) {
+ gf_log ("", GF_LOG_ERROR,
+ "Failed to add new line char");
+ ret = -1;
+ goto out;
+ }
+ output = NULL;
+ }
+ }
+
+ cli_out ("Common secret pub file present at %s", common_pem_file);
+ ret = 0;
+out:
+ if (fd)
+ close (fd);
+
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+gf_cli_sys_exec_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ int ret = -1;
+ int output_count = -1;
+ int i = -1;
+ char *output = NULL;
+ char *command = NULL;
+ char output_name[PATH_MAX] = "";
+ gf_cli_rsp rsp = {0, };
+ dict_t *dict = NULL;
+ call_frame_t *frame = NULL;
+
+ if (req->rpc_status == -1) {
+ ret = -1;
+ goto out;
+ }
+
+ frame = myframe;
+
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
+ if (ret < 0) {
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
+ goto out;
+ }
+
+ dict = dict_new ();
+
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_unserialize (rsp.dict.dict_val, rsp.dict.dict_len, &dict);
+
+ if (ret)
+ goto out;
+
+ if (rsp.op_ret) {
+ cli_err ("%s", rsp.op_errstr ? rsp.op_errstr :
+ "Command failed.");
+ ret = rsp.op_ret;
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, "output_count", &output_count);
+ if (ret) {
+ cli_out ("Command executed successfully.");
+ ret = 0;
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "command", &command);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to get command from dict");
+ goto out;
+ }
+
+ if (!strcmp (command, "gsec_create")) {
+ ret = write_contents_to_common_pem_file (dict, output_count);
+ if (!ret)
+ goto out;
+ }
+
+ for (i = 1; i <= output_count; i++) {
+ memset (output_name, '\0', sizeof (output_name));
+ snprintf (output_name, sizeof (output_name),
+ "output_%d", i);
+ ret = dict_get_str (dict, output_name, &output);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Failed to get %s.",
+ output_name);
+ cli_out ("Unable to fetch output.");
+ }
+ if (output) {
+ cli_out ("%s", output);
+ output = NULL;
+ }
+ }
+
+ ret = 0;
+out:
+ if (dict)
+ dict_unref (dict);
+ cli_cmd_broadcast_response (ret);
+
+ free (rsp.dict.dict_val);
+
return ret;
+}
+int
+gf_cli_copy_file_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ int ret = -1;
+ gf_cli_rsp rsp = {0, };
+ dict_t *dict = NULL;
+ call_frame_t *frame = NULL;
+
+ if (req->rpc_status == -1) {
+ ret = -1;
+ goto out;
+ }
+
+ frame = myframe;
+
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
+ if (ret < 0) {
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
+ goto out;
+ }
+
+ dict = dict_new ();
+
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_unserialize (rsp.dict.dict_val, rsp.dict.dict_len, &dict);
+
+ if (ret)
+ goto out;
+
+ if (rsp.op_ret) {
+ cli_err ("%s", rsp.op_errstr ? rsp.op_errstr :
+ "Copy unsuccessful");
+ ret = rsp.op_ret;
+ goto out;
+ }
+
+ cli_out ("Successfully copied file.");
+
+out:
+ if (dict)
+ dict_unref (dict);
+ cli_cmd_broadcast_response (ret);
+
+ free (rsp.dict.dict_val);
+
+ return ret;
}
int
@@ -3275,17 +4361,22 @@ gf_cli_gsync_set_cbk (struct rpc_req *req, struct iovec *iov,
char *gsync_status = NULL;
char *master = NULL;
char *slave = NULL;
- int32_t type = 0;
+ int32_t type = 0;
+ call_frame_t *frame = NULL;
+ gf_boolean_t status_detail = _gf_false;
+
if (req->rpc_status == -1) {
ret = -1;
goto out;
}
+ frame = myframe;
+
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR,
- "Unable to get response structure");
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
@@ -3302,8 +4393,8 @@ gf_cli_gsync_set_cbk (struct rpc_req *req, struct iovec *iov,
goto out;
if (global_state->mode & GLUSTER_MODE_XML) {
- ret = cli_xml_output_dict ("volGeoRep", dict, rsp.op_ret,
- rsp.op_errno, rsp.op_errstr);
+ ret = cli_xml_output_vol_gsync (dict, rsp.op_ret, rsp.op_errno,
+ rsp.op_errstr);
if (ret)
gf_log ("cli", GF_LOG_ERROR,
"Error outputting to xml");
@@ -3325,7 +4416,7 @@ gf_cli_gsync_set_cbk (struct rpc_req *req, struct iovec *iov,
ret = dict_get_int32 (dict, "type", &type);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "failed to get type");
+ gf_log (frame->this->name, GF_LOG_ERROR, "failed to get type");
goto out;
}
@@ -3349,8 +4440,30 @@ gf_cli_gsync_set_cbk (struct rpc_req *req, struct iovec *iov,
break;
case GF_GSYNC_OPTION_TYPE_STATUS:
- ret = gf_cli_gsync_out_status (dict);
- goto out;
+ status_detail = dict_get_str_boolean (dict,
+ "status-detail",
+ _gf_false);
+ ret = gf_cli_gsync_status_output (dict, status_detail);
+ break;
+
+ case GF_GSYNC_OPTION_TYPE_DELETE:
+ if (dict_get_str (dict, "master", &master) != 0)
+ master = "???";
+ if (dict_get_str (dict, "slave", &slave) != 0)
+ slave = "???";
+ cli_out ("Deleting " GEOREP " session between %s & %s"
+ " has been successful", master, slave);
+ break;
+
+ case GF_GSYNC_OPTION_TYPE_CREATE:
+ if (dict_get_str (dict, "master", &master) != 0)
+ master = "???";
+ if (dict_get_str (dict, "slave", &slave) != 0)
+ slave = "???";
+ cli_out ("Creating " GEOREP " session between %s & %s"
+ " has been successful", master, slave);
+ break;
+
default:
cli_out (GEOREP" command executed successfully");
}
@@ -3366,6 +4479,54 @@ out:
}
int32_t
+gf_cli_sys_exec (call_frame_t *frame, xlator_t *this, void *data)
+{
+ int ret = 0;
+ dict_t *dict = NULL;
+ gf_cli_req req = {{0,}};
+
+ if (!frame || !this || !data) {
+ ret = -1;
+ gf_log ("cli", GF_LOG_ERROR, "Invalid data");
+ goto out;
+ }
+
+ dict = data;
+
+ ret = cli_to_glusterd (&req, frame, gf_cli_sys_exec_cbk,
+ (xdrproc_t) xdr_gf_cli_req, dict,
+ GLUSTER_CLI_SYS_EXEC, this, cli_rpc_prog,
+ NULL);
+out:
+ GF_FREE (req.dict.dict_val);
+ return ret;
+}
+
+int32_t
+gf_cli_copy_file (call_frame_t *frame, xlator_t *this, void *data)
+{
+ int ret = 0;
+ dict_t *dict = NULL;
+ gf_cli_req req = {{0,}};
+
+ if (!frame || !this || !data) {
+ ret = -1;
+ gf_log ("cli", GF_LOG_ERROR, "Invalid data");
+ goto out;
+ }
+
+ dict = data;
+
+ ret = cli_to_glusterd (&req, frame, gf_cli_copy_file_cbk,
+ (xdrproc_t) xdr_gf_cli_req, dict,
+ GLUSTER_CLI_COPY_FILE, this, cli_rpc_prog,
+ NULL);
+out:
+ GF_FREE (req.dict.dict_val);
+ return ret;
+}
+
+int32_t
gf_cli_gsync_set (call_frame_t *frame, xlator_t *this,
void *data)
{
@@ -3599,7 +4760,8 @@ gf_cli_profile_volume_cbk (struct rpc_req *req, struct iovec *iov,
gf_log ("cli", GF_LOG_DEBUG, "Received resp to profile");
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
@@ -3787,7 +4949,8 @@ gf_cli_top_volume_cbk (struct rpc_req *req, struct iovec *iov,
gf_log ("cli", GF_LOG_DEBUG, "Received resp to top");
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "Unable to decode response");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
@@ -4023,8 +5186,15 @@ gf_cli_getwd_cbk (struct rpc_req *req, struct iovec *iov,
}
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_getwd_rsp);
- if (ret < 0 || rsp.op_ret == -1) {
- gf_log ("", GF_LOG_ERROR, "error");
+ if (ret < 0) {
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
+ goto out;
+ }
+
+ if (rsp.op_ret == -1) {
+ cli_err ("getwd failed");
+ ret = rsp.op_ret;
goto out;
}
@@ -5040,6 +6210,123 @@ out:
return;
}
+static void
+cli_print_volume_status_tasks (dict_t *dict)
+{
+ int ret = -1;
+ int i = 0;
+ int j = 0;
+ int count = 0;
+ int task_count = 0;
+ int status = 0;
+ char *op = NULL;
+ char *task_id_str = NULL;
+ char *volname = NULL;
+ char key[1024] = {0,};
+ char task[1024] = {0,};
+ char *brick = NULL;
+ char *src_brick = NULL;
+ char *dest_brick = NULL;
+
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret)
+ goto out;
+
+ ret = dict_get_int32 (dict, "tasks", &task_count);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to get tasks count");
+ return;
+ }
+
+ cli_out ("Task Status of Volume %s", volname);
+ cli_print_line (CLI_BRICK_STATUS_LINE_LEN);
+
+ if (task_count == 0) {
+ cli_out ("There are no active volume tasks");
+ cli_out (" ");
+ return;
+ }
+
+ for (i = 0; i < task_count; i++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "task%d.type", i);
+ ret = dict_get_str(dict, key, &op);
+ if (ret)
+ return;
+ cli_out ("%-20s : %-20s", "Task", op);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "task%d.id", i);
+ ret = dict_get_str (dict, key, &task_id_str);
+ if (ret)
+ return;
+ cli_out ("%-20s : %-20s", "ID", task_id_str);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "task%d.status", i);
+ ret = dict_get_int32 (dict, key, &status);
+ if (ret)
+ return;
+
+ snprintf (task, sizeof (task), "task%d", i);
+
+ /*
+ Replace brick only has two states - In progress and Complete
+ Ref: xlators/mgmt/glusterd/src/glusterd-replace-brick.c
+ */
+
+ if (!strcmp (op, "Replace brick")) {
+ if (status)
+ status = GF_DEFRAG_STATUS_COMPLETE;
+ else
+ status = GF_DEFRAG_STATUS_STARTED;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.src-brick", task);
+ ret = dict_get_str (dict, key, &src_brick);
+ if (ret)
+ goto out;
+
+ cli_out ("%-20s : %-20s", "Source Brick", src_brick);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.dst-brick", task);
+ ret = dict_get_str (dict, key, &dest_brick);
+ if (ret)
+ goto out;
+
+ cli_out ("%-20s : %-20s", "Destination Brick",
+ dest_brick);
+
+ } else if (!strcmp (op, "Remove brick")) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.count", task);
+ ret = dict_get_int32 (dict, key, &count);
+ if (ret)
+ goto out;
+
+ cli_out ("%-20s", "Removed bricks:");
+
+ for (j = 1; j <= count; j++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key),"%s.brick%d",
+ task, j);
+ ret = dict_get_str (dict, key, &brick);
+ if (ret)
+ goto out;
+
+ cli_out ("%-20s", brick);
+ }
+ }
+ cli_out ("%-20s : %-20s", "Status",
+ cli_vol_task_status_str[status]);
+ cli_out (" ");
+ }
+
+out:
+ return;
+}
+
static int
gf_cli_status_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
@@ -5068,7 +6355,8 @@ gf_cli_status_cbk (struct rpc_req *req, struct iovec *iov,
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("cli", GF_LOG_ERROR, "Volume status response error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
@@ -5093,10 +6381,12 @@ gf_cli_status_cbk (struct rpc_req *req, struct iovec *iov,
"status information.");
if (global_state->mode & GLUSTER_MODE_XML) {
- cli_xml_output_str ("volStatus", msg, rsp.op_ret,
- rsp.op_errno, rsp.op_errstr);
- ret = 0;
- goto out;
+ if (!local->all)
+ cli_xml_output_str ("volStatus", msg,
+ rsp.op_ret, rsp.op_errno,
+ rsp.op_errstr);
+ ret = 0;
+ goto out;
}
cli_err ("%s", msg);
@@ -5138,23 +6428,6 @@ gf_cli_status_cbk (struct rpc_req *req, struct iovec *iov,
if ((cmd & GF_CLI_STATUS_NFS) || (cmd & GF_CLI_STATUS_SHD))
notbrick = _gf_true;
- ret = dict_get_int32 (dict, "count", &count);
- if (ret)
- goto out;
- if (count == 0) {
- ret = -1;
- goto out;
- }
-
- ret = dict_get_int32 (dict, "brick-index-max", &brick_index_max);
- if (ret)
- goto out;
- ret = dict_get_int32 (dict, "other-count", &other_count);
- if (ret)
- goto out;
-
- index_max = brick_index_max + other_count;
-
if (global_state->mode & GLUSTER_MODE_XML) {
if (!local->all) {
ret = cli_xml_output_vol_status_begin (local,
@@ -5167,11 +6440,21 @@ gf_cli_status_cbk (struct rpc_req *req, struct iovec *iov,
goto out;
}
}
- ret = cli_xml_output_vol_status (local, dict);
- if (ret) {
- gf_log ("cli", GF_LOG_ERROR,
- "Error outputting to xml");
- goto out;
+ if (cmd & GF_CLI_STATUS_TASKS) {
+ ret = cli_xml_output_vol_status_tasks_detail (local,
+ dict);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,"Error outputting "
+ "to xml");
+ goto out;
+ }
+ } else {
+ ret = cli_xml_output_vol_status (local, dict);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ goto out;
+ }
}
if (!local->all) {
@@ -5207,6 +6490,10 @@ gf_cli_status_cbk (struct rpc_req *req, struct iovec *iov,
cli_print_volume_status_callpool (dict, notbrick);
goto cont;
break;
+ case GF_CLI_STATUS_TASKS:
+ cli_print_volume_status_tasks (dict);
+ goto cont;
+ break;
default:
break;
}
@@ -5215,6 +6502,17 @@ gf_cli_status_cbk (struct rpc_req *req, struct iovec *iov,
if (ret)
goto out;
+ ret = dict_get_int32 (dict, "brick-index-max", &brick_index_max);
+ if (ret)
+ goto out;
+
+ ret = dict_get_int32 (dict, "other-count", &other_count);
+ if (ret)
+ goto out;
+
+ index_max = brick_index_max + other_count;
+
+
cli_out ("Status of volume: %s", volname);
if ((cmd & GF_CLI_STATUS_DETAIL) == 0) {
@@ -5286,6 +6584,9 @@ gf_cli_status_cbk (struct rpc_req *req, struct iovec *iov,
}
}
cli_out (" ");
+
+ if ((cmd & GF_CLI_STATUS_MASK) == GF_CLI_STATUS_NONE)
+ cli_print_volume_status_tasks (dict);
cont:
ret = rsp.op_ret;
@@ -5361,12 +6662,6 @@ gf_cli_status_volume_all (call_frame_t *frame, xlator_t *this, void *data)
goto out;
}
- if (vol_count == 0) {
- cli_err ("No volumes present");
- ret = 0;
- goto out;
- }
-
/* remove the "all" flag in cmd */
cmd &= ~GF_CLI_STATUS_ALL;
cmd |= GF_CLI_STATUS_VOL;
@@ -5381,6 +6676,12 @@ gf_cli_status_volume_all (call_frame_t *frame, xlator_t *this, void *data)
}
}
+ if (vol_count == 0 && !(global_state->mode & GLUSTER_MODE_XML)) {
+ cli_err ("No volumes present");
+ ret = 0;
+ goto out;
+ }
+
for (i = 0; i < vol_count; i++) {
dict = dict_new ();
@@ -5444,7 +6745,8 @@ gf_cli_mount_cbk (struct rpc_req *req, struct iovec *iov,
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_mount_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
@@ -5511,7 +6813,8 @@ gf_cli_umount_cbk (struct rpc_req *req, struct iovec *iov,
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_umount_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
@@ -5561,6 +6864,97 @@ gf_cli_umount (call_frame_t *frame, xlator_t *this, void *data)
}
void
+cmd_heal_volume_statistics_out (dict_t *dict, int brick)
+{
+
+ uint64_t num_entries = 0;
+ int ret = 0;
+ char key[256] = {0};
+ char *hostname = NULL;
+ uint64_t i = 0;
+ uint64_t healed_count = 0;
+ uint64_t split_brain_count = 0;
+ uint64_t heal_failed_count = 0;
+ char *start_time_str = NULL;
+ char *end_time_str = NULL;
+ char *crawl_type = NULL;
+ int progress = -1;
+
+ snprintf (key, sizeof key, "%d-hostname", brick);
+ ret = dict_get_str (dict, key, &hostname);
+ if (ret)
+ goto out;
+ cli_out ("------------------------------------------------");
+ cli_out ("\nCrawl statistics for brick no %d", brick);
+ cli_out ("Hostname of brick %s", hostname);
+
+ snprintf (key, sizeof key, "statistics-%d-count", brick);
+ ret = dict_get_uint64 (dict, key, &num_entries);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < num_entries; i++)
+ {
+ snprintf (key, sizeof key, "statistics_crawl_type-%d-%"PRIu64,
+ brick, i);
+ ret = dict_get_str (dict, key, &crawl_type);
+ if (ret)
+ goto out;
+
+ snprintf (key, sizeof key, "statistics_healed_cnt-%d-%"PRIu64,
+ brick,i);
+ ret = dict_get_uint64 (dict, key, &healed_count);
+ if (ret)
+ goto out;
+
+ snprintf (key, sizeof key, "statistics_sb_cnt-%d-%"PRIu64,
+ brick, i);
+ ret = dict_get_uint64 (dict, key, &split_brain_count);
+ if (ret)
+ goto out;
+ snprintf (key, sizeof key, "statistics_heal_failed_cnt-%d-%"PRIu64,
+ brick, i);
+ ret = dict_get_uint64 (dict, key, &heal_failed_count);
+ if (ret)
+ goto out;
+ snprintf (key, sizeof key, "statistics_strt_time-%d-%"PRIu64,
+ brick, i);
+ ret = dict_get_str (dict, key, &start_time_str);
+ if (ret)
+ goto out;
+ snprintf (key, sizeof key, "statistics_end_time-%d-%"PRIu64,
+ brick, i);
+ ret = dict_get_str (dict, key, &end_time_str);
+ if (ret)
+ goto out;
+ snprintf (key, sizeof key, "statistics_inprogress-%d-%"PRIu64,
+ brick, i);
+ ret = dict_get_int32 (dict, key, &progress);
+ if (ret)
+ goto out;
+
+ cli_out ("\nStarting time of crawl: %s", start_time_str);
+ if (progress == 1)
+ cli_out ("Crawl is in progress");
+ else
+ cli_out ("Ending time of crawl: %s", end_time_str);
+
+ cli_out ("Type of crawl: %s", crawl_type);
+ cli_out ("No. of entries healed: %"PRIu64,
+ healed_count);
+ cli_out ("No. of entries in split-brain: %"PRIu64,
+ split_brain_count);
+ cli_out ("No. of heal failed entries: %"PRIu64,
+ heal_failed_count);
+
+ }
+
+
+out:
+ return;
+}
+
+void
cmd_heal_volume_brick_out (dict_t *dict, int brick)
{
uint64_t num_entries = 0;
@@ -5572,6 +6966,7 @@ cmd_heal_volume_brick_out (dict_t *dict, int brick)
uint64_t i = 0;
uint32_t time = 0;
char timestr[32] = {0};
+ char *shd_status = NULL;
snprintf (key, sizeof key, "%d-hostname", brick);
ret = dict_get_str (dict, key, &hostname);
@@ -5582,37 +6977,96 @@ cmd_heal_volume_brick_out (dict_t *dict, int brick)
if (ret)
goto out;
cli_out ("\nBrick %s:%s", hostname, path);
- snprintf (key, sizeof key, "%d-count", brick);
- ret = dict_get_uint64 (dict, key, &num_entries);
- cli_out ("Number of entries: %"PRIu64, num_entries);
+
snprintf (key, sizeof key, "%d-status", brick);
ret = dict_get_str (dict, key, &status);
if (status && strlen (status))
cli_out ("Status: %s", status);
- for (i = 0; i < num_entries; i++) {
- snprintf (key, sizeof key, "%d-%"PRIu64, brick, i);
- ret = dict_get_str (dict, key, &path);
- if (ret)
- continue;
- time = 0;
- snprintf (key, sizeof key, "%d-%"PRIu64"-time", brick, i);
- ret = dict_get_uint32 (dict, key, &time);
- if (!time) {
- cli_out ("%s", path);
- } else {
- gf_time_fmt (timestr, sizeof timestr,
- time, gf_timefmt_FT);
- if (i == 0) {
+
+ snprintf (key, sizeof key, "%d-shd-status",brick);
+ ret = dict_get_str (dict, key, &shd_status);
+
+ if(!shd_status)
+ {
+ snprintf (key, sizeof key, "%d-count", brick);
+ ret = dict_get_uint64 (dict, key, &num_entries);
+ cli_out ("Number of entries: %"PRIu64, num_entries);
+
+
+ for (i = 0; i < num_entries; i++) {
+ snprintf (key, sizeof key, "%d-%"PRIu64, brick, i);
+ ret = dict_get_str (dict, key, &path);
+ if (ret)
+ continue;
+ time = 0;
+ snprintf (key, sizeof key, "%d-%"PRIu64"-time",
+ brick, i);
+ ret = dict_get_uint32 (dict, key, &time);
+ if (!time) {
+ cli_out ("%s", path);
+ } else {
+ gf_time_fmt (timestr, sizeof timestr,
+ time, gf_timefmt_FT);
+ if (i == 0) {
cli_out ("at path on brick");
cli_out ("-----------------------------------");
+ }
+ cli_out ("%s %s", timestr, path);
}
- cli_out ("%s %s", timestr, path);
}
}
+
out:
return;
}
+
+void
+cmd_heal_volume_statistics_heal_count_out (dict_t *dict, int brick)
+{
+ uint64_t num_entries = 0;
+ int ret = 0;
+ char key[256] = {0};
+ char *hostname = NULL;
+ char *path = NULL;
+ char *status = NULL;
+ char *shd_status = NULL;
+
+ snprintf (key, sizeof key, "%d-hostname", brick);
+ ret = dict_get_str (dict, key, &hostname);
+ if (ret)
+ goto out;
+ snprintf (key, sizeof key, "%d-path", brick);
+ ret = dict_get_str (dict, key, &path);
+ if (ret)
+ goto out;
+ cli_out ("\nBrick %s:%s", hostname, path);
+
+ snprintf (key, sizeof key, "%d-status", brick);
+ ret = dict_get_str (dict, key, &status);
+ if (status && strlen (status))
+ cli_out ("Status: %s", status);
+
+ snprintf (key, sizeof key, "%d-shd-status",brick);
+ ret = dict_get_str (dict, key, &shd_status);
+
+ if(!shd_status)
+ {
+ snprintf (key, sizeof key, "%d-hardlinks", brick);
+ ret = dict_get_uint64 (dict, key, &num_entries);
+ if (ret)
+ cli_out ("No gathered input for this brick");
+ else
+ cli_out ("Number of entries: %"PRIu64, num_entries);
+
+
+ }
+
+out:
+ return;
+}
+
+
int
gf_cli_heal_volume_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
@@ -5629,19 +7083,21 @@ gf_cli_heal_volume_cbk (struct rpc_req *req, struct iovec *iov,
gf_xl_afr_op_t heal_op = GF_AFR_OP_INVALID;
char *operation = NULL;
char *substr = NULL;
+ char *heal_op_str = NULL;
if (-1 == req->rpc_status) {
goto out;
}
+ frame = myframe;
+
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
- frame = myframe;
-
if (frame)
local = frame->local;
@@ -5664,18 +7120,51 @@ gf_cli_heal_volume_cbk (struct rpc_req *req, struct iovec *iov,
ret = dict_get_str (input_dict, "volname", &volname);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "failed to get volname");
+ gf_log (frame->this->name, GF_LOG_ERROR, "failed to get volname");
goto out;
}
gf_log ("cli", GF_LOG_INFO, "Received resp to heal volume");
+ switch (heal_op) {
+ case GF_AFR_OP_HEAL_INDEX:
+ heal_op_str = "to perform index self heal";
+ break;
+ case GF_AFR_OP_HEAL_FULL:
+ heal_op_str = "to perform full self heal";
+ break;
+ case GF_AFR_OP_INDEX_SUMMARY:
+ heal_op_str = "list of entries to be healed";
+ break;
+ case GF_AFR_OP_HEALED_FILES:
+ heal_op_str = "list of healed entries";
+ break;
+ case GF_AFR_OP_HEAL_FAILED_FILES:
+ heal_op_str = "list of heal failed entries";
+ break;
+ case GF_AFR_OP_SPLIT_BRAIN_FILES:
+ heal_op_str = "list of split brain entries";
+ break;
+ case GF_AFR_OP_STATISTICS:
+ heal_op_str = "crawl statistics";
+ break;
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT:
+ heal_op_str = "count of entries to be healed";
+ break;
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA:
+ heal_op_str = "count of entries to be healed per replica";
+ break;
+ case GF_AFR_OP_INVALID:
+ heal_op_str = "invalid heal op";
+ break;
+ }
+
if ((heal_op == GF_AFR_OP_HEAL_FULL) ||
(heal_op == GF_AFR_OP_HEAL_INDEX)) {
- operation = "Launching Heal operation";
+ operation = "Launching heal operation";
substr = "\nUse heal info commands to check status";
} else {
- operation = "Gathering Heal info";
+ operation = "Gathering";
substr = "";
}
@@ -5683,15 +7172,15 @@ gf_cli_heal_volume_cbk (struct rpc_req *req, struct iovec *iov,
if (strcmp (rsp.op_errstr, "")) {
cli_err ("%s", rsp.op_errstr);
} else {
- cli_err ("%s on volume %s has been unsuccessful",
- operation, volname);
+ cli_err ("%s %s on volume %s has been unsuccessful",
+ operation, heal_op_str, volname);
}
ret = rsp.op_ret;
goto out;
} else {
- cli_out ("%s on volume %s has been successful%s", operation,
- volname, substr);
+ cli_out ("%s %s on volume %s has been successful %s", operation,
+ heal_op_str, volname, substr);
}
ret = rsp.op_ret;
@@ -5725,8 +7214,28 @@ gf_cli_heal_volume_cbk (struct rpc_req *req, struct iovec *iov,
goto out;
}
- for (i = 0; i < brick_count; i++)
- cmd_heal_volume_brick_out (dict, i);
+ switch (heal_op) {
+ case GF_AFR_OP_STATISTICS:
+ for (i = 0; i < brick_count; i++)
+ cmd_heal_volume_statistics_out (dict, i);
+ break;
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT:
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA:
+ for (i = 0; i < brick_count; i++)
+ cmd_heal_volume_statistics_heal_count_out (dict,
+ i);
+ break;
+ case GF_AFR_OP_INDEX_SUMMARY:
+ case GF_AFR_OP_HEALED_FILES:
+ case GF_AFR_OP_HEAL_FAILED_FILES:
+ case GF_AFR_OP_SPLIT_BRAIN_FILES:
+ for (i = 0; i < brick_count; i++)
+ cmd_heal_volume_brick_out (dict, i);
+ break;
+ default:
+ break;
+ }
+
ret = rsp.op_ret;
out:
@@ -5778,7 +7287,8 @@ gf_cli_statedump_volume_cbk (struct rpc_req *req, struct iovec *iov,
ret = xdr_to_generic (*iov, &rsp,
(xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log (THIS->name, GF_LOG_ERROR, "XDR decoding failed");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
gf_log ("cli", GF_LOG_DEBUG, "Received response to statedump");
@@ -5849,7 +7359,8 @@ gf_cli_list_volume_cbk (struct rpc_req *req, struct iovec *iov,
ret = xdr_to_generic (*iov, &rsp,
(xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log (THIS->name, GF_LOG_ERROR, "XDR decoding failed");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
@@ -5936,8 +7447,8 @@ gf_cli_clearlocks_volume_cbk (struct rpc_req *req, struct iovec *iov,
ret = xdr_to_generic (*iov, &rsp,
(xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
-
- gf_log ("cli", GF_LOG_ERROR, "XDR decoding failed");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
gf_log ("cli", GF_LOG_DEBUG, "Received response to clear-locks");
@@ -6021,6 +7532,1024 @@ out:
return ret;
}
+int32_t
+cli_snapshot_remove_reply (gf_cli_rsp *rsp, dict_t *dict, call_frame_t *frame)
+{
+ int32_t ret = -1;
+ char *snap_name = NULL;
+
+ GF_ASSERT (rsp);
+ GF_ASSERT (dict);
+ GF_ASSERT (frame);
+
+ if (rsp->op_ret) {
+ cli_err("snapshot delete: failed: %s",
+ rsp->op_errstr ? rsp->op_errstr :
+ "Please check log file for details");
+ ret = rsp->op_ret;
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "snapname", &snap_name);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to get snapname");
+ goto out;
+ }
+
+ cli_out ("snapshot delete: %s: snap removed successfully",
+ snap_name);
+ ret = 0;
+
+out:
+ return ret;
+}
+
+int
+cli_snapshot_config_display (dict_t *dict, gf_cli_rsp *rsp)
+{
+ char buf[PATH_MAX] = "";
+ char *volname = NULL;
+ int ret = -1;
+ int config_command = 0;
+ uint64_t value = 0;
+ uint64_t hard_limit = 0;
+ uint64_t soft_limit = 0;
+ uint64_t i = 0;
+ uint64_t voldisplaycount = 0;
+
+ GF_ASSERT (dict);
+ GF_ASSERT (rsp);
+
+ if (rsp->op_ret) {
+ cli_err ("Snapshot Config : failed: %s",
+ rsp->op_errstr ? rsp->op_errstr :
+ "Please check log file for details");
+ ret = rsp->op_ret;
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, "config-command", &config_command);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch config type");
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "volname", &volname);
+ /* Ignore the error, as volname is optional */
+
+ if (!volname) {
+ volname = "System";
+ }
+
+ ret = dict_get_uint64 (dict, "snap-max-hard-limit", &hard_limit);
+ /* Ignore the error, as the key specified is optional */
+ ret = dict_get_uint64 (dict, "snap-max-soft-limit", &soft_limit);
+
+ if (!hard_limit && !soft_limit
+ && config_command != GF_SNAP_CONFIG_DISPLAY) {
+ ret = -1;
+ gf_log(THIS->name, GF_LOG_ERROR,
+ "Could not fetch config-key");
+ goto out;
+ }
+
+ switch (config_command) {
+ case GF_SNAP_CONFIG_TYPE_SET:
+ if (hard_limit && soft_limit) {
+ cli_out ("snapshot config: snap-max-hard-limit "
+ "& snap-max-soft-limit for system set "
+ "successfully");
+ } else if (hard_limit){
+ cli_out ("snapshot config: %s "
+ "for snap-max-hard-limit set successfully",
+ volname);
+ } else if (soft_limit) {
+ cli_out ("snapshot config: %s "
+ "for snap-max-soft-limit set successfully",
+ volname);
+ }
+ break;
+
+ case GF_SNAP_CONFIG_DISPLAY :
+ cli_out ("\nSnapshot System Configuration:");
+ ret = dict_get_uint64 (dict, "snap-max-hard-limit",
+ &value);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch "
+ "snap_max_hard_limit for %s", volname);
+ ret = -1;
+ goto out;
+ }
+ cli_out ("snap-max-hard-limit : %"PRIu64, value);
+
+ ret = dict_get_uint64 (dict, "snap-max-soft-limit",
+ &soft_limit);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch "
+ "snap-max-soft-limit for %s", volname);
+ ret = -1;
+ goto out;
+ }
+ cli_out ("snap-max-soft-limit : %"PRIu64"%%\n",
+ soft_limit);
+
+ cli_out ("Snapshot Volume Configuration:");
+
+ ret = dict_get_uint64 (dict, "voldisplaycount",
+ &voldisplaycount);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Could not fetch voldisplaycount");
+ ret = -1;
+ goto out;
+ }
+
+ for (i = 0; i < voldisplaycount; i++) {
+ snprintf (buf, sizeof(buf), "volume%ld-volname", i);
+ ret = dict_get_str (dict, buf, &volname);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch "
+ " %s", buf);
+ ret = -1;
+ goto out;
+ }
+ cli_out ("\nVolume : %s", volname);
+
+ snprintf (buf, sizeof(buf),
+ "volume%ld-snap-max-hard-limit", i);
+ ret = dict_get_uint64 (dict, buf, &value);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch "
+ " %s", buf);
+ ret = -1;
+ goto out;
+ }
+ cli_out ("snap-max-hard-limit : %"PRIu64, value);
+
+ snprintf (buf, sizeof(buf),
+ "volume%ld-active-hard-limit", i);
+ ret = dict_get_uint64 (dict, buf, &value);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch"
+ " effective snap_max_hard_limit for "
+ "%s", volname);
+ ret = -1;
+ goto out;
+ }
+ cli_out ("Effective snap-max-hard-limit : %"PRIu64,
+ value);
+
+ snprintf (buf, sizeof(buf),
+ "volume%ld-snap-max-soft-limit", i);
+ ret = dict_get_uint64 (dict, buf, &value);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch "
+ " %s", buf);
+ ret = -1;
+ goto out;
+ }
+ cli_out ("Effective snap-max-soft-limit : %"PRIu64" "
+ "(%"PRIu64"%%)", value, soft_limit);
+ }
+ break;
+ default :
+ break;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+/* This function is used to print the volume related information
+ * of a snap.
+ *
+ * arg - 0, dict : Response Dictionary.
+ * arg - 1, prefix str : snaplist.snap{0..}.vol{0..}.*
+ */
+int
+cli_get_each_volinfo_in_snap (dict_t *dict, char *keyprefix,
+ gf_boolean_t snap_driven) {
+ char key[PATH_MAX] = "";
+ char *get_buffer = NULL;
+ int value = 0;
+ int ret = -1;
+ char indent[5] = "\t";
+ char *volname = NULL;
+
+ GF_ASSERT (dict);
+ GF_ASSERT (keyprefix);
+
+ if (snap_driven) {
+ ret = snprintf (key, sizeof (key), "%s.volname", keyprefix);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_str (dict, key, &get_buffer);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to get %s", key);
+ goto out;
+ }
+ cli_out ("%s" INDENT_MAIN_HEAD "%s", indent,
+ "Snap Volume Name", ":", get_buffer);
+
+ ret = snprintf (key, sizeof (key),
+ "%s.origin-volname", keyprefix);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_str (dict, key, &volname);
+ if (ret) {
+ gf_log ("cli", GF_LOG_WARNING, "Failed to get %s", key);
+ cli_out ("%-12s", "Origin:");
+ }
+ cli_out ("%s" INDENT_MAIN_HEAD "%s", indent,
+ "Origin Volume name", ":", volname);
+
+
+ ret = snprintf (key, sizeof (key), "%s.snapcount",
+ keyprefix);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, key, &value);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to get %s", key);
+ goto out;
+ }
+ cli_out ("%s%s %s %s %d", indent, "Snaps taken for",
+ volname, ":", value);
+
+ ret = snprintf (key, sizeof (key), "%s.snaps-available",
+ keyprefix);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, key, &value);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to get %s", key);
+ goto out;
+ }
+ cli_out ("%s%s %s %s %d", indent, "Snaps available for",
+ volname, ":", value);
+ }
+
+
+ ret = snprintf (key, sizeof (key), "%s.vol-status", keyprefix);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_str (dict, key, &get_buffer);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to get %s", key);
+ goto out;
+ }
+ cli_out ("%s" INDENT_MAIN_HEAD "%s", indent, "Status",
+ ":", get_buffer);
+out :
+ return ret;
+}
+
+/* This function is used to print snap related information
+ * arg - 0, dict : Response dictionary.
+ * arg - 1, prefix_str : snaplist.snap{0..}.*
+ */
+int
+cli_get_volinfo_in_snap (dict_t *dict, char *keyprefix) {
+
+ char key[PATH_MAX] = "";
+ int i = 0;
+ int volcount = 0;
+ int ret = -1;
+
+ GF_ASSERT (dict);
+ GF_ASSERT (keyprefix);
+
+ ret = snprintf (key, sizeof (key), "%s.vol-count", keyprefix);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, key, &volcount);
+ for (i = 1 ; i <= volcount ; i++) {
+ ret = snprintf (key, sizeof (key),
+ "%s.vol%d", keyprefix, i);
+ if (ret < 0) {
+ goto out;
+ }
+ ret = cli_get_each_volinfo_in_snap (dict, key, _gf_true);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not list "
+ "details of volume in a snap");
+ goto out;
+ }
+ cli_out (" ");
+ }
+
+out :
+ return ret;
+}
+
+int
+cli_get_each_snap_info (dict_t *dict, char *prefix_str,
+ gf_boolean_t snap_driven) {
+ char key_buffer[PATH_MAX] = "";
+ char *get_buffer = NULL;
+ int ret = -1;
+ char indent[5] = "";
+
+ GF_ASSERT (dict);
+ GF_ASSERT (prefix_str);
+
+ if (!snap_driven)
+ strcat (indent, "\t");
+
+ ret = snprintf (key_buffer, sizeof (key_buffer), "%s.snapname",
+ prefix_str);
+ if (ret < 0 ) {
+ goto out;
+ }
+
+ ret = dict_get_str (dict, key_buffer, &get_buffer);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to fetch snapname %s ",
+ key_buffer);
+ goto out;
+ }
+ cli_out ("%s" INDENT_MAIN_HEAD "%s", indent, "Snapshot",
+ ":", get_buffer);
+
+ ret = snprintf (key_buffer, sizeof (key_buffer), "%s.snap-id",
+ prefix_str);
+ if (ret < 0 ) {
+ goto out;
+ }
+
+ ret = dict_get_str (dict, key_buffer, &get_buffer);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to fetch snap-id %s ",
+ key_buffer);
+ goto out;
+ }
+ cli_out ("%s" INDENT_MAIN_HEAD "%s", indent, "Snap UUID",
+ ":", get_buffer);
+
+ ret = snprintf (key_buffer, sizeof (key_buffer), "%s.snap-desc",
+ prefix_str);
+ if (ret < 0 ) {
+ goto out;
+ }
+
+ ret = dict_get_str (dict, key_buffer, &get_buffer);
+ if (!ret) {
+ /* Ignore error for description */
+ cli_out ("%s" INDENT_MAIN_HEAD "%s", indent,
+ "Description", ":", get_buffer);
+ }
+
+ ret = snprintf (key_buffer, sizeof (key_buffer), "%s.snap-time",
+ prefix_str);
+ if (ret < 0 ) {
+ goto out;
+ }
+
+ ret = dict_get_str (dict, key_buffer, &get_buffer);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to fetch snap-time %s ",
+ prefix_str);
+ goto out;
+ }
+ cli_out ("%s" INDENT_MAIN_HEAD "%s", indent, "Created",
+ ":", get_buffer);
+
+ if (snap_driven) {
+ cli_out ("%-12s", "Snap Volumes:\n");
+ ret = cli_get_volinfo_in_snap (dict, prefix_str);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to list details "
+ "of the snaps");
+ goto out;
+ }
+ }
+out :
+ return ret;
+}
+
+/* This is a generic function to print snap related information.
+ * arg - 0, dict : Response Dictionary
+ */
+int
+cli_call_snapshot_info (dict_t *dict, gf_boolean_t bool_snap_driven) {
+ int snap_count = 0;
+ char key[PATH_MAX] = "";
+ int ret = -1;
+ int i = 0;
+
+ GF_ASSERT (dict);
+
+ ret = dict_get_int32 (dict, "snap-count", &snap_count);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to get snap-count");
+ goto out;
+ }
+
+ if (snap_count == 0) {
+ cli_out ("No snapshots present");
+ }
+
+ for (i = 1 ; i <= snap_count ; i++) {
+ ret = snprintf (key, sizeof (key), "snap%d", i);
+ if (ret < 0) {
+ goto out;
+ }
+ ret = cli_get_each_snap_info (dict, key, bool_snap_driven);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Unable to print snap details");
+ goto out;
+ }
+ }
+out :
+ return ret;
+}
+
+int
+cli_get_snaps_in_volume (dict_t *dict) {
+ int ret = -1;
+ int i = 0;
+ int count = 0;
+ int avail = 0;
+ char key[PATH_MAX] = "";
+ char *get_buffer = NULL;
+
+ GF_ASSERT (dict);
+
+ ret = dict_get_str (dict, "origin-volname", &get_buffer);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch origin-volname");
+ goto out;
+ }
+ cli_out (INDENT_MAIN_HEAD "%s", "Volume Name", ":", get_buffer);
+
+ ret = dict_get_int32 (dict, "snap-count", &avail);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch snap-count");
+ goto out;
+ }
+ cli_out (INDENT_MAIN_HEAD "%d", "Snaps Taken", ":", avail);
+
+ ret = dict_get_int32 (dict, "snaps-available", &count);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch snaps-available");
+ goto out;
+ }
+ cli_out (INDENT_MAIN_HEAD "%d", "Snaps Available", ":", count);
+
+ for (i = 1 ; i <= avail ; i++) {
+ snprintf (key, sizeof (key), "snap%d", i);
+ ret = cli_get_each_snap_info (dict, key, _gf_false);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Unable to print snap details");
+ goto out;
+ }
+
+ ret = snprintf (key, sizeof (key), "snap%d.vol1", i);
+ if (ret < 0) {
+ goto out;
+ }
+ ret = cli_get_each_volinfo_in_snap (dict, key, _gf_false);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not get volume "
+ "related information");
+ goto out;
+ }
+
+ cli_out (" ");
+ }
+out :
+ return ret;
+}
+
+int
+cli_snapshot_list (dict_t *dict) {
+ int snapcount = 0;
+ char key[PATH_MAX] = "";
+ int ret = -1;
+ int i = 0;
+ char *get_buffer = NULL;
+
+ GF_ASSERT (dict);
+
+ ret = dict_get_int32 (dict, "snap-count", &snapcount);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch snap count");
+ goto out;
+ }
+
+ if (snapcount == 0) {
+ cli_out ("No snapshots present");
+ }
+
+ for (i = 1 ; i <= snapcount ; i++) {
+ ret = snprintf (key, sizeof (key), "snapname%d",i);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_str (dict, key, &get_buffer);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not get %s ", key);
+ goto out;
+ } else {
+ cli_out ("%s", get_buffer);
+ }
+ }
+out :
+ return ret;
+}
+
+int
+cli_get_snap_volume_status (dict_t *dict, char *key_prefix)
+{
+ int ret = -1;
+ char key[PATH_MAX] = "";
+ char *buffer = NULL;
+ int brickcount = 0;
+ int i = 0;
+ int pid = 0;
+
+ GF_ASSERT (dict);
+ GF_ASSERT (key_prefix);
+
+ ret = snprintf (key, sizeof (key), "%s.brickcount", key_prefix);
+ if (ret < 0) {
+ goto out;
+ }
+ ret = dict_get_int32 (dict, key, &brickcount);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to fetch brickcount");
+ goto out;
+ }
+
+ for ( i = 0 ; i < brickcount ; i++ ) {
+ ret = snprintf (key, sizeof (key), "%s.brick%d.path",
+ key_prefix, i);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_str (dict, key, &buffer);
+ if (ret) {
+ gf_log ("cli", GF_LOG_INFO,
+ "Unable to get Brick Path");
+ continue;
+ }
+ cli_out ("\n\t%-17s %s %s", "Brick Path", ":", buffer);
+
+ ret = snprintf (key, sizeof (key), "%s.brick%d.vgname",
+ key_prefix, i);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_str (dict, key, &buffer);
+ if (ret) {
+ gf_log ("cli", GF_LOG_INFO,
+ "Unable to get Volume Group");
+ cli_out ("\t%-17s %s %s", "Volume Group", ":", "N/A");
+ } else
+ cli_out ("\t%-17s %s %s", "Volume Group", ":", buffer);
+
+ ret = snprintf (key, sizeof (key), "%s.brick%d.status",
+ key_prefix, i);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_str (dict, key, &buffer);
+ if (ret) {
+ gf_log ("cli", GF_LOG_INFO,
+ "Unable to get Brick Running");
+ cli_out ("\t%-17s %s %s", "Brick Running", ":", "N/A");
+ } else
+ cli_out ("\t%-17s %s %s", "Brick Running", ":", buffer);
+
+ ret = snprintf (key, sizeof (key), "%s.brick%d.pid",
+ key_prefix, i);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, key, &pid);
+ if (ret) {
+ gf_log ("cli", GF_LOG_INFO,
+ "Unable to get pid");
+ cli_out ("\t%-17s %s %s", "Brick PID", ":", "N/A");
+ } else
+ cli_out ("\t%-17s %s %d", "Brick PID", ":", pid);
+
+ ret = snprintf (key, sizeof (key), "%s.brick%d.data",
+ key_prefix, i);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_str (dict, key, &buffer);
+ if (ret) {
+ gf_log ("cli", GF_LOG_INFO,
+ "Unable to get Data Percent");
+ cli_out ("\t%-17s %s %s", "Data Percentage", ":", "N/A");
+ } else
+ cli_out ("\t%-17s %s %s", "Data Percentage", ":", buffer);
+
+ ret = snprintf (key, sizeof (key), "%s.brick%d.lvsize",
+ key_prefix, i);
+ if (ret < 0) {
+ goto out;
+ }
+ ret = dict_get_str (dict, key, &buffer);
+ if (ret) {
+ gf_log ("cli", GF_LOG_INFO, "Unable to get LV Size");
+ cli_out ("\t%-17s %s %s", "LV Size", ":", "N/A");
+ } else
+ cli_out ("\t%-17s %s %s", "LV Size", ":", buffer);
+
+ }
+out :
+ return ret;
+}
+
+
+
+int
+cli_get_single_snap_status (dict_t *dict, char *keyprefix)
+{
+ int ret = -1;
+ char key[PATH_MAX] = "";
+ int i = 0;
+ int volcount = 0;
+ char *get_buffer = NULL;
+
+ GF_ASSERT (dict);
+ GF_ASSERT (keyprefix);
+
+ ret = snprintf (key, sizeof (key), "%s.snapname", keyprefix);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_str (dict, key, &get_buffer);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to get snapname");
+ goto out;
+ }
+ cli_out ("\nSnap Name : %s", get_buffer);
+
+ ret = snprintf (key, sizeof (key), "%s.uuid", keyprefix);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_str (dict, key, &get_buffer);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to get snap UUID");
+ goto out;
+ }
+ cli_out ("Snap UUID : %s", get_buffer);
+
+ ret = snprintf (key, sizeof (key), "%s.volcount", keyprefix);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, key, &volcount);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to get volume count");
+ goto out;
+ }
+
+ for (i = 0 ; i < volcount ; i++) {
+ ret = snprintf (key, sizeof (key), "%s.vol%d", keyprefix, i);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = cli_get_snap_volume_status (dict, key);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Could not get snap volume status");
+ goto out;
+ }
+ }
+out :
+ return ret;
+}
+
+int
+cli_snap_status_all (dict_t *dict) {
+ int ret = -1;
+ char key[PATH_MAX] = "";
+ int snapcount = 0;
+ int i = 0;
+
+ GF_ASSERT (dict);
+
+ ret = dict_get_int32 (dict, "status.snapcount", &snapcount);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not get snapcount");
+ goto out;
+ }
+
+ if (snapcount == 0) {
+ cli_out ("No snapshots present");
+ }
+
+ for (i = 0 ; i < snapcount; i++) {
+ ret = snprintf (key, sizeof (key), "status.snap%d",i);
+ if (ret < 0) {
+ goto out;
+ }
+ ret = cli_get_single_snap_status (dict, key);
+ }
+out:
+ return ret;
+}
+
+
+int
+cli_snapshot_status_display (dict_t *dict, gf_cli_rsp *rsp)
+{
+ char key[PATH_MAX] = "";
+ int ret = -1;
+ int status_cmd = -1;
+
+ GF_ASSERT (dict);
+ GF_ASSERT (rsp);
+
+ if (rsp->op_ret) {
+ cli_err ("Snapshot Status : failed: %s",
+ rsp->op_errstr ? rsp->op_errstr :
+ "Please check log file for details");
+ ret = rsp->op_ret;
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, "cmd", &status_cmd);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch status type");
+ goto out;
+ }
+ switch (status_cmd) {
+ case GF_SNAP_STATUS_TYPE_ALL :
+ {
+ ret = cli_snap_status_all (dict);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch "
+ "status of all snap");
+ goto out;
+ }
+ break;
+ }
+
+ case GF_SNAP_STATUS_TYPE_SNAP :
+ {
+ ret = snprintf (key, sizeof (key), "status.snap0");
+ if (ret < 0) {
+ goto out;
+ }
+ ret = cli_get_single_snap_status (dict, key);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch "
+ "status of snap");
+ goto out;
+ }
+ break;
+ }
+
+ case GF_SNAP_STATUS_TYPE_VOL :
+ {
+ ret = cli_snap_status_all (dict);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch "
+ "status of snap in a volume");
+ goto out;
+ }
+ break;
+ }
+ default :
+ break;
+ }
+out :
+ return ret;
+}
+
+int
+gf_cli_snapshot_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ int ret = -1;
+ gf_cli_rsp rsp = {0, };
+ dict_t *dict = NULL;
+ char *snap_name = NULL;
+ int32_t type = 0;
+ call_frame_t *frame = NULL;
+ gf_boolean_t snap_driven = _gf_false;
+
+ if (req->rpc_status == -1) {
+ ret = -1;
+ goto out;
+ }
+
+ frame = myframe;
+
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
+ if (ret < 0) {
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
+ goto out;
+ }
+
+ dict = dict_new ();
+
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_unserialize (rsp.dict.dict_val, rsp.dict.dict_len, &dict);
+
+ if (ret)
+ goto out;
+
+ ret = dict_get_int32 (dict, "type", &type);
+ if (ret) {
+ gf_log (frame->this->name, GF_LOG_ERROR, "failed to get type");
+ goto out;
+ }
+
+ switch (type) {
+ case GF_SNAP_OPTION_TYPE_CREATE:
+ if (rsp.op_ret) {
+ cli_err("snapshot create: failed: %s",
+ rsp.op_errstr ? rsp.op_errstr :
+ "Please check log file for details");
+ ret = rsp.op_ret;
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "snapname", &snap_name);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Failed to get snap name");
+ goto out;
+ }
+ cli_out ("snapshot create: %s: snap created successfully",
+ snap_name);
+ break;
+
+ case GF_SNAP_OPTION_TYPE_RESTORE:
+ /* TODO: Check if rsp.op_ret needs to be checked here. Or is
+ * it ok to check this in the start of the function where we
+ * get rsp.*/
+ if (rsp.op_ret) {
+ cli_err("snapshot restore: failed: %s",
+ rsp.op_errstr ? rsp.op_errstr :
+ "Please check log file for details");
+ ret = rsp.op_ret;
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "snapname", &snap_name);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Failed to get snap name");
+ goto out;
+ }
+
+ cli_out ("Snapshot restore: %s: Snap restored "
+ "successfully", snap_name);
+
+ ret = 0;
+ break;
+
+ case GF_SNAP_OPTION_TYPE_INFO:
+ if (rsp.op_ret) {
+ cli_err ("Snapshot info : failed: %s",
+ rsp.op_errstr ? rsp.op_errstr :
+ "Please check log file for details");
+ ret = rsp.op_ret;
+ goto out;
+ }
+
+ snap_driven = dict_get_str_boolean (dict, "snap-driven",
+ _gf_false);
+ if (snap_driven == _gf_true) {
+ ret = cli_call_snapshot_info (dict, snap_driven);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Snapshot info failed");
+ goto out;
+ }
+ } else if (snap_driven == _gf_false) {
+ ret = cli_get_snaps_in_volume (dict);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Snapshot info failed");
+ goto out;
+ }
+ }
+ break;
+
+ case GF_SNAP_OPTION_TYPE_CONFIG:
+ ret = cli_snapshot_config_display (dict, &rsp);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to display "
+ "snapshot config output.");
+ goto out;
+ }
+ break;
+
+ case GF_SNAP_OPTION_TYPE_LIST:
+ if (rsp.op_ret) {
+ cli_err ("Snapshot list : failed: %s",
+ rsp.op_errstr ? rsp.op_errstr :
+ "Please check log file for details");
+ ret = rsp.op_ret;
+ goto out;
+ }
+
+ ret = cli_snapshot_list (dict);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to display "
+ "snapshot list");
+ goto out;
+ }
+ break;
+
+ case GF_SNAP_OPTION_TYPE_DELETE:
+ ret = cli_snapshot_remove_reply (&rsp, dict, frame);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Failed to delete snap");
+ goto out;
+ }
+ break;
+
+ case GF_SNAP_OPTION_TYPE_STATUS:
+ ret = cli_snapshot_status_display (dict, &rsp);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to display "
+ "snapshot status output.");
+ goto out;
+ }
+ break;
+
+ default:
+ cli_err ("Unknown command executed");
+ ret = -1;
+ goto out;
+ }
+out:
+ if (dict)
+ dict_unref (dict);
+ cli_cmd_broadcast_response (ret);
+
+ free (rsp.dict.dict_val);
+ free (rsp.op_errstr);
+
+ return ret;
+}
+
+int32_t
+gf_cli_snapshot (call_frame_t *frame, xlator_t *this,
+ void *data)
+{
+ gf_cli_req req = {{0,}};
+ dict_t *options = NULL;
+ int ret = -1;
+
+ if (!frame || !this || !data)
+ goto out;
+
+ options = data;
+
+ ret = cli_to_glusterd (&req, frame, gf_cli_snapshot_cbk,
+ (xdrproc_t) xdr_gf_cli_req, options,
+ GLUSTER_CLI_SNAP, this, cli_rpc_prog,
+ NULL);
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+
+ GF_FREE (req.dict.dict_val);
+ return ret;
+}
+
int
cli_to_glusterd (gf_cli_req *req, call_frame_t *frame,
fop_cbk_fn_t cbkfn, xdrproc_t xdrproc, dict_t *dict,
@@ -6097,6 +8626,8 @@ struct rpc_clnt_procedure gluster_cli_actors[GLUSTER_CLI_MAXVALUE] = {
[GLUSTER_CLI_PROBE] = {"PROBE_QUERY", gf_cli_probe},
[GLUSTER_CLI_DEPROBE] = {"DEPROBE_QUERY", gf_cli_deprobe},
[GLUSTER_CLI_LIST_FRIENDS] = {"LIST_FRIENDS", gf_cli_list_friends},
+ [GLUSTER_CLI_UUID_RESET] = {"UUID_RESET", gf_cli3_1_uuid_reset},
+ [GLUSTER_CLI_UUID_GET] = {"UUID_GET", gf_cli3_1_uuid_get},
[GLUSTER_CLI_CREATE_VOLUME] = {"CREATE_VOLUME", gf_cli_create_volume},
[GLUSTER_CLI_DELETE_VOLUME] = {"DELETE_VOLUME", gf_cli_delete_volume},
[GLUSTER_CLI_START_VOLUME] = {"START_VOLUME", gf_cli_start_volume},
@@ -6128,6 +8659,9 @@ struct rpc_clnt_procedure gluster_cli_actors[GLUSTER_CLI_MAXVALUE] = {
[GLUSTER_CLI_STATEDUMP_VOLUME] = {"STATEDUMP_VOLUME", gf_cli_statedump_volume},
[GLUSTER_CLI_LIST_VOLUME] = {"LIST_VOLUME", gf_cli_list_volume},
[GLUSTER_CLI_CLRLOCKS_VOLUME] = {"CLEARLOCKS_VOLUME", gf_cli_clearlocks_volume},
+ [GLUSTER_CLI_COPY_FILE] = {"COPY_FILE", gf_cli_copy_file},
+ [GLUSTER_CLI_SYS_EXEC] = {"SYS_EXEC", gf_cli_sys_exec},
+ [GLUSTER_CLI_SNAP] = {"SNAP", gf_cli_snapshot},
};
struct rpc_clnt_program cli_prog = {
diff --git a/cli/src/cli-xml-output.c b/cli/src/cli-xml-output.c
index 004ded765..d8884d44b 100644
--- a/cli/src/cli-xml-output.c
+++ b/cli/src/cli-xml-output.c
@@ -15,6 +15,39 @@
#include "syscall.h"
+enum gf_task_types {
+ GF_TASK_TYPE_REBALANCE,
+ GF_TASK_TYPE_REMOVE_BRICK
+};
+
+/*
+ * IMPORTANT NOTE:
+ * All exported functions in this file which use libxml need use a
+ * #if (HAVE_LIB_XML), #else, #endif
+ * For eg,
+ * int exported_func () {
+ * #if (HAVE_LIB_XML)
+ * <Stuff using libxml>
+ * #else
+ * return 0;
+ * #endif
+ * }
+ *
+ * All other functions, which are called internally within this file need to be
+ * within #if (HAVE_LIB_XML), #endif statements
+ * For eg,
+ * #if (HAVE_LIB_XML)
+ * int internal_func ()
+ * {
+ * }
+ * #endif
+ *
+ * Following the above formate ensures that all xml related code is compliled
+ * only when libxml2 is present, and also keeps the rest of the codebase free
+ * of #if (HAVE_LIB_XML)
+ */
+
+
#if (HAVE_LIB_XML)
#include <libxml/encoding.h>
@@ -30,18 +63,11 @@
}while (0) \
int
-cli_begin_xml_output (xmlTextWriterPtr *writer, xmlBufferPtr *buf)
+cli_begin_xml_output (xmlTextWriterPtr *writer, xmlDocPtr *doc)
{
int ret = -1;
- *buf = xmlBufferCreateSize (8192);
- if (*buf == NULL) {
- ret = -1;
- goto out;
- }
- xmlBufferSetAllocationScheme (*buf, XML_BUFFER_ALLOC_DOUBLEIT);
-
- *writer = xmlNewTextWriterMemory (*buf, 0);
+ *writer = xmlNewTextWriterDoc (doc, 0);
if (writer == NULL) {
ret = -1;
goto out;
@@ -60,7 +86,7 @@ out:
}
int
-cli_end_xml_output (xmlTextWriterPtr writer, xmlBufferPtr buf)
+cli_end_xml_output (xmlTextWriterPtr writer, xmlDocPtr doc)
{
int ret = -1;
@@ -71,10 +97,12 @@ cli_end_xml_output (xmlTextWriterPtr writer, xmlBufferPtr buf)
ret = xmlTextWriterEndDocument (writer);
XML_RET_CHECK_AND_GOTO (ret, out);
- cli_out ("%s", (const char *)buf->content);
+
+ /* Dump xml document to stdout and pretty format it */
+ xmlSaveFormatFileEnc ("-", doc, "UTF-8", 1);
xmlFreeTextWriter (writer);
- xmlBufferFree (buf);
+ xmlFreeDoc (doc);
out:
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
@@ -96,7 +124,7 @@ cli_xml_output_common (xmlTextWriterPtr writer, int op_ret, int op_errno,
XML_RET_CHECK_AND_GOTO (ret, out);
ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"opErrstr",
- "%s", op_errstr);
+ "%s", op_errstr);
XML_RET_CHECK_AND_GOTO (ret, out);
out:
@@ -112,9 +140,9 @@ cli_xml_output_str (char *op, char *str, int op_ret, int op_errno,
#if (HAVE_LIB_XML)
int ret = -1;
xmlTextWriterPtr writer = NULL;
- xmlBufferPtr buf = NULL;
+ xmlDocPtr doc = NULL;
- ret = cli_begin_xml_output (&writer, &buf);
+ ret = cli_begin_xml_output (&writer, &doc);
if (ret)
goto out;
@@ -122,15 +150,21 @@ cli_xml_output_str (char *op, char *str, int op_ret, int op_errno,
if (ret)
goto out;
- ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"cliOp",
- "%s", op);
- XML_RET_CHECK_AND_GOTO (ret, out);
+ if (op) {
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"cliOp",
+ "%s", op);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
- ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"output",
- "%s", str);
- XML_RET_CHECK_AND_GOTO (ret, out);
+ if (str) {
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"output",
+ "%s", str);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
- ret = cli_end_xml_output (writer, buf);
+ ret = cli_end_xml_output (writer, doc);
out:
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
@@ -164,9 +198,9 @@ cli_xml_output_dict ( char *op, dict_t *dict, int op_ret, int op_errno,
#if (HAVE_LIB_XML)
int ret = -1;
xmlTextWriterPtr writer = NULL;
- xmlBufferPtr buf = NULL;
+ xmlDocPtr doc = NULL;
- ret = cli_begin_xml_output (&writer, &buf);
+ ret = cli_begin_xml_output (&writer, &doc);
if (ret)
goto out;
@@ -185,7 +219,7 @@ cli_xml_output_dict ( char *op, dict_t *dict, int op_ret, int op_errno,
ret = xmlTextWriterEndElement (writer);
XML_RET_CHECK_AND_GOTO (ret, out);
- ret = cli_end_xml_output (writer, buf);
+ ret = cli_end_xml_output (writer, doc);
out:
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
@@ -230,15 +264,6 @@ cli_xml_output_vol_status_common (xmlTextWriterPtr writer, dict_t *dict,
XML_RET_CHECK_AND_GOTO (ret, out);
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "brick%d.port", brick_index);
- ret = dict_get_int32 (dict, key, &port);
- if (ret)
- goto out;
- ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"port",
- "%d", port);
- XML_RET_CHECK_AND_GOTO (ret, out);
-
- memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "brick%d.status", brick_index);
ret = dict_get_int32 (dict, key, &status);
if (ret)
@@ -249,6 +274,27 @@ cli_xml_output_vol_status_common (xmlTextWriterPtr writer, dict_t *dict,
*online = status;
memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.port", brick_index);
+ ret = dict_get_int32 (dict, key, &port);
+ if (ret)
+ goto out;
+
+ /* If the process is either offline or doesn't provide a port (shd)
+ * port = "N/A"
+ * else print the port number of the process.
+ */
+
+ if (*online == 1 && port != 0)
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"port",
+ "%d", port);
+ else
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"port",
+ "%s", "N/A");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "brick%d.pid", brick_index);
ret = dict_get_int32 (dict, key, &pid);
if (ret)
@@ -1294,7 +1340,7 @@ cli_xml_output_vol_status_begin (cli_local_t *local, int op_ret, int op_errno,
#if (HAVE_LIB_XML)
int ret = -1;
- ret = cli_begin_xml_output (&(local->writer), &(local->buf));
+ ret = cli_begin_xml_output (&(local->writer), &(local->doc));
XML_RET_CHECK_AND_GOTO (ret, out);
ret = cli_xml_output_common (local->writer, op_ret, op_errno,
@@ -1332,7 +1378,7 @@ cli_xml_output_vol_status_end (cli_local_t *local)
ret = xmlTextWriterEndElement (local->writer);
XML_RET_CHECK_AND_GOTO(ret, out);
- ret = cli_end_xml_output (local->writer, local->buf);
+ ret = cli_end_xml_output (local->writer, local->doc);
out:
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
@@ -1341,6 +1387,215 @@ out:
#endif
}
+#if (HAVE_LIB_XML)
+int
+cli_xml_output_remove_brick_task_params (xmlTextWriterPtr writer, dict_t *dict,
+ char *prefix)
+{
+ int ret = -1;
+ char key[1024] = {0,};
+ int count = 0;
+ int i = 0;
+ char *brick = NULL;
+
+ /* <params> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"params");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ snprintf (key, sizeof (key), "%s.count", prefix);
+ ret = dict_get_int32 (dict, key, &count);
+ if (ret)
+ goto out;
+
+ for (i = 1; i <= count; i++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.brick%d", prefix, i);
+ ret = dict_get_str (dict, key, &brick);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"brick",
+ "%s", brick);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ brick = NULL;
+ }
+
+ /* </param> */
+ ret = xmlTextWriterEndElement (writer);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+cli_xml_output_replace_brick_task_params (xmlTextWriterPtr writer, dict_t *dict,
+ char *prefix)
+{
+
+ int ret = -1;
+ char key[1024] = {0,};
+ char *brick = NULL;
+
+ /* <params> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"params");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ snprintf (key, sizeof (key), "%s.src-brick", prefix);
+ ret = dict_get_str (dict, key, &brick);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"srcBrick",
+ "%s", brick);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.dst-brick", prefix);
+ ret = dict_get_str (dict, key, &brick);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"dstBrick",
+ "%s", brick);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+
+ /* </param> */
+ ret = xmlTextWriterEndElement (writer);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+cli_xml_output_vol_status_tasks (cli_local_t *local, dict_t *dict) {
+ int ret = -1;
+ char *task_type = NULL;
+ char *task_id_str = NULL;
+ int status = 0;
+ int tasks = 0;
+ char key[1024] = {0,};
+ int i = 0;
+
+ /* <tasks> */
+ ret = xmlTextWriterStartElement (local->writer, (xmlChar *)"tasks");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = dict_get_int32 (dict, "tasks", &tasks);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < tasks; i++) {
+ /* <task> */
+ ret = xmlTextWriterStartElement (local->writer,
+ (xmlChar *)"task");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "task%d.type", i);
+ ret = dict_get_str (dict, key, &task_type);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (local->writer,
+ (xmlChar *)"type",
+ "%s", task_type);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "task%d.id", i);
+ ret = dict_get_str (dict, key, &task_id_str);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (local->writer,
+ (xmlChar *)"id",
+ "%s", task_id_str);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "task%d.status", i);
+ ret = dict_get_int32 (dict, key, &status);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (local->writer,
+ (xmlChar *)"status",
+ "%d", status);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ if (!strcmp (task_type, "Replace brick")) {
+ if (status) {
+ status = GF_DEFRAG_STATUS_COMPLETE;
+ } else {
+ status = GF_DEFRAG_STATUS_STARTED;
+ }
+ }
+
+ ret = xmlTextWriterWriteFormatElement (local->writer,
+ (xmlChar *)"statusStr",
+ "%s",
+ cli_vol_task_status_str[status]);
+
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "task%d", i);
+ if (!strcmp (task_type, "Replace brick")) {
+ ret = cli_xml_output_replace_brick_task_params
+ (local->writer, dict, key);
+ if (ret)
+ goto out;
+ } else if (!strcmp (task_type, "Remove brick")) {
+ ret = cli_xml_output_remove_brick_task_params
+ (local->writer, dict, key);
+ if (ret)
+ goto out;
+ }
+
+
+ /* </task> */
+ ret = xmlTextWriterEndElement (local->writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ /* </tasks> */
+ ret = xmlTextWriterEndElement (local->writer);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+cli_xml_output_vol_status_tasks_detail (cli_local_t *local, dict_t *dict)
+{
+ int ret = -1;
+ char *volname = NULL;
+
+ /*<volume>*/
+ ret = xmlTextWriterStartElement (local->writer, (xmlChar *)"volume");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (local->writer,
+ (xmlChar *)"volName", "%s",
+ volname);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = cli_xml_output_vol_status_tasks (local, dict);
+ if (ret)
+ goto out;
+
+ /* </volume> */
+ ret = xmlTextWriterEndElement (local->writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+out:
+ return ret;
+}
+#endif
+
int
cli_xml_output_vol_status (cli_local_t *local, dict_t *dict)
{
@@ -1458,7 +1713,6 @@ cli_xml_output_vol_status (cli_local_t *local, dict_t *dict)
goto out;
}
break;
-
default:
break;
@@ -1468,6 +1722,16 @@ cli_xml_output_vol_status (cli_local_t *local, dict_t *dict)
XML_RET_CHECK_AND_GOTO (ret, out);
}
+ /* Tasks are only present when a normal volume status call is done on a
+ * single volume or on all volumes
+ */
+ if (((cmd & GF_CLI_STATUS_MASK) == GF_CLI_STATUS_NONE) &&
+ (cmd & (GF_CLI_STATUS_VOL|GF_CLI_STATUS_ALL))) {
+ ret = cli_xml_output_vol_status_tasks (local, dict);
+ if (ret)
+ goto out;
+ }
+
/* </volume> */
ret = xmlTextWriterEndElement (local->writer);
XML_RET_CHECK_AND_GOTO (ret, out);
@@ -1594,7 +1858,7 @@ cli_xml_output_vol_top (dict_t *dict, int op_ret, int op_errno,
#if (HAVE_LIB_XML)
int ret = -1;
xmlTextWriterPtr writer = NULL;
- xmlBufferPtr buf = NULL;
+ xmlDocPtr doc = NULL;
int brick_count = 0;
int top_op = GF_CLI_TOP_NONE;
char *brick_name = NULL;
@@ -1608,7 +1872,7 @@ cli_xml_output_vol_top (dict_t *dict, int op_ret, int op_errno,
int i = 0;
int j = 0;
- ret = cli_begin_xml_output (&writer, &buf);
+ ret = cli_begin_xml_output (&writer, &doc);
if (ret)
goto out;
@@ -1697,8 +1961,6 @@ cli_xml_output_vol_top (dict_t *dict, int op_ret, int op_errno,
case GF_CLI_TOP_WRITE:
case GF_CLI_TOP_OPENDIR:
case GF_CLI_TOP_READDIR:
- if (!members)
- continue;
break;
@@ -1724,9 +1986,6 @@ cli_xml_output_vol_top (dict_t *dict, int op_ret, int op_errno,
"%f", time_taken);
}
- if (!members)
- continue;
-
break;
default:
@@ -1756,7 +2015,7 @@ cli_xml_output_vol_top (dict_t *dict, int op_ret, int op_errno,
/* </volTop> */
ret = xmlTextWriterEndElement (writer);
XML_RET_CHECK_AND_GOTO (ret, out);
- ret = cli_end_xml_output (writer, buf);
+ ret = cli_end_xml_output (writer, doc);
out:
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
@@ -1951,7 +2210,7 @@ cli_xml_output_vol_profile (dict_t *dict, int op_ret, int op_errno,
#if (HAVE_LIB_XML)
int ret = -1;
xmlTextWriterPtr writer = NULL;
- xmlBufferPtr buf = NULL;
+ xmlDocPtr doc = NULL;
char *volname = NULL;
int op = GF_CLI_STATS_NONE;
int brick_count = 0;
@@ -1960,7 +2219,7 @@ cli_xml_output_vol_profile (dict_t *dict, int op_ret, int op_errno,
char key[1024] = {0,};
int i = 0;
- ret = cli_begin_xml_output (&writer, &buf);
+ ret = cli_begin_xml_output (&writer, &doc);
if (ret)
goto out;
@@ -2040,7 +2299,7 @@ cont:
ret = xmlTextWriterEndElement (writer);
XML_RET_CHECK_AND_GOTO (ret, out);
- ret = cli_end_xml_output (writer, buf);
+ ret = cli_end_xml_output (writer, doc);
out:
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
@@ -2056,13 +2315,13 @@ cli_xml_output_vol_list (dict_t *dict, int op_ret, int op_errno,
#if (HAVE_LIB_XML)
int ret = -1;
xmlTextWriterPtr writer = NULL;
- xmlBufferPtr buf = NULL;
+ xmlDocPtr doc = NULL;
int count = 0;
char *volname = NULL;
char key[1024] = {0,};
int i = 0;
- ret = cli_begin_xml_output (&writer, &buf);
+ ret = cli_begin_xml_output (&writer, &doc);
if (ret)
goto out;
@@ -2097,7 +2356,7 @@ cli_xml_output_vol_list (dict_t *dict, int op_ret, int op_errno,
ret = xmlTextWriterEndElement (writer);
XML_RET_CHECK_AND_GOTO (ret, out);
- ret = cli_end_xml_output (writer, buf);
+ ret = cli_end_xml_output (writer, doc);
out:
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
@@ -2151,15 +2410,44 @@ out:
return ret;
}
+struct tmp_xml_option_logger {
+ char *key;
+ xmlTextWriterPtr writer;
+};
+
+static int
+_output_vol_info_option (dict_t *d, char *k, data_t *v,
+ void *data)
+{
+ int ret = 0;
+ char *ptr = NULL;
+ struct tmp_xml_option_logger *tmp = NULL;
+
+ tmp = data;
+
+ ptr = strstr (k, "option.");
+ if (!ptr)
+ goto out;
+
+ if (!v) {
+ ret = -1;
+ goto out;
+ }
+ ret = cli_xml_output_vol_info_option (tmp->writer, tmp->key, k,
+ v->data);
+
+out:
+ return ret;
+}
+
int
cli_xml_output_vol_info_options (xmlTextWriterPtr writer, dict_t *dict,
char *prefix)
{
int ret = -1;
int opt_count = 0;
- data_t *value = 0;
- char *ptr = NULL;
char key[1024] = {0,};
+ struct tmp_xml_option_logger tmp = {0,};
snprintf (key, sizeof (key), "%s.opt_count", prefix);
ret = dict_get_int32 (dict, key, &opt_count);
@@ -2174,26 +2462,9 @@ cli_xml_output_vol_info_options (xmlTextWriterPtr writer, dict_t *dict,
XML_RET_CHECK_AND_GOTO (ret, out);
snprintf (key, sizeof (key), "%s.option.", prefix);
- int _output_vol_info_option (dict_t *d, char *k, data_t *v,
- void *data)
- {
- int ret = 0;
- ptr = strstr (k, "option.");
- if (!ptr)
- goto internal_out;
-
- value = v;
- if (!value) {
- ret = -1;
- goto internal_out;
- }
- ret = cli_xml_output_vol_info_option (writer, key, k,
- v->data);
-
- internal_out:
- return ret;
- }
- ret = dict_foreach (dict, _output_vol_info_option, NULL);
+ tmp.key = key;
+ tmp.writer = writer;
+ ret = dict_foreach (dict, _output_vol_info_option, &tmp);
if (ret)
goto out;
@@ -2214,6 +2485,7 @@ cli_xml_output_vol_info (cli_local_t *local, dict_t *dict)
int count = 0;
char *volname = NULL;
char *volume_id = NULL;
+ char *uuid = NULL;
int type = 0;
int status = 0;
int brick_count = 0;
@@ -2225,7 +2497,9 @@ cli_xml_output_vol_info (cli_local_t *local, dict_t *dict)
char key[1024] = {0,};
int i = 0;
int j = 1;
-
+ char *caps = NULL;
+ int k __attribute__((unused)) = 0;
+ char *snap_volume = NULL;
ret = dict_get_int32 (dict, "count", &count);
if (ret)
@@ -2267,6 +2541,18 @@ cli_xml_output_vol_info (cli_local_t *local, dict_t *dict)
"%d", status);
XML_RET_CHECK_AND_GOTO (ret, out);
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "volume%d.snap_volume", i);
+ ret = dict_get_str (dict, key, &snap_volume);
+ if (ret)
+ goto out;
+ if (snap_volume) {
+ ret = xmlTextWriterWriteFormatElement (local->writer,
+ (xmlChar *)"snapVol",
+ "%s", snap_volume);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
ret =xmlTextWriterWriteFormatElement
(local->writer, (xmlChar *)"statusStr", "%s",
cli_vol_status_str[status]);
@@ -2341,20 +2627,95 @@ cli_xml_output_vol_info (cli_local_t *local, dict_t *dict)
"%d", transport);
XML_RET_CHECK_AND_GOTO (ret, out);
+#ifdef HAVE_BD_XLATOR
+ /* <xlators> */
+ ret = xmlTextWriterStartElement (local->writer,
+ (xmlChar *)"xlators");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ for (k = 0; ; k++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key),"volume%d.xlator%d", i, k);
+ ret = dict_get_str (dict, key, &caps);
+ if (ret)
+ break;
+
+ /* <xlator> */
+ ret = xmlTextWriterStartElement (local->writer,
+ (xmlChar *)"xlator");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement
+ (local->writer, (xmlChar *)"name", "%s", caps);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* <capabilities> */
+ ret = xmlTextWriterStartElement (local->writer,
+ (xmlChar *)
+ "capabilities");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ j = 0;
+ for (j = 0; ;j++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key),
+ "volume%d.xlator%d.caps%d", i, k, j);
+ ret = dict_get_str (dict, key, &caps);
+ if (ret)
+ break;
+ ret = xmlTextWriterWriteFormatElement
+ (local->writer, (xmlChar *)"capability",
+ "%s", caps);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+ /* </capabilities> */
+ ret = xmlTextWriterEndElement (local->writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ /* </xlator> */
+ ret = xmlTextWriterEndElement (local->writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+ ret = xmlTextWriterFullEndElement (local->writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ /* </xlators> */
+#else
+ caps = 0; /* Avoid compiler warnings when BD not enabled */
+#endif
+ j = 1;
+
/* <bricks> */
ret = xmlTextWriterStartElement (local->writer,
(xmlChar *)"bricks");
XML_RET_CHECK_AND_GOTO (ret, out);
while (j <= brick_count) {
+ ret = xmlTextWriterStartElement
+ (local->writer, (xmlChar *)"brick");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "volume%d.brick%d.uuid",
+ i, j);
+ ret = dict_get_str (dict, key, &uuid);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatAttribute
+ (local->writer, (xmlChar *)"uuid", "%s",
+ uuid);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "volume%d.brick%d", i, j);
ret = dict_get_str (dict, key, &brick);
if (ret)
goto out;
- ret = xmlTextWriterWriteFormatElement
- (local->writer, (xmlChar *)"brick", "%s",
- brick);
+ ret = xmlTextWriterWriteFormatString
+ (local->writer, "%s", brick);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* </brick> */
+ ret = xmlTextWriterEndElement (local->writer);
XML_RET_CHECK_AND_GOTO (ret, out);
+
j++;
}
/* </bricks> */
@@ -2372,12 +2733,12 @@ cli_xml_output_vol_info (cli_local_t *local, dict_t *dict)
ret = xmlTextWriterEndElement (local->writer);
XML_RET_CHECK_AND_GOTO (ret, out);
}
- GF_FREE (local->get_vol.volname);
+
if (volname) {
+ GF_FREE (local->get_vol.volname);
local->get_vol.volname = gf_strdup (volname);
local->vol_count += count;
}
-
out:
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
@@ -2395,7 +2756,7 @@ cli_xml_output_vol_info_begin (cli_local_t *local, int op_ret, int op_errno,
GF_ASSERT (local);
- ret = cli_begin_xml_output (&(local->writer), &(local->buf));
+ ret = cli_begin_xml_output (&(local->writer), &(local->doc));
if (ret)
goto out;
@@ -2443,7 +2804,7 @@ cli_xml_output_vol_info_end (cli_local_t *local)
ret = xmlTextWriterEndElement (local->writer);
XML_RET_CHECK_AND_GOTO (ret, out);
- ret = cli_end_xml_output (local->writer, local->buf);
+ ret = cli_end_xml_output (local->writer, local->doc);
out:
gf_log ("cli", GF_LOG_ERROR, "Returning %d", ret);
@@ -2461,7 +2822,7 @@ cli_xml_output_vol_quota_limit_list (char *volname, char *limit_list,
#if (HAVE_LIB_XML)
int ret = -1;
xmlTextWriterPtr writer = NULL;
- xmlBufferPtr buf = NULL;
+ xmlDocPtr doc = NULL;
int64_t size = 0;
int64_t limit_value = 0;
int i = 0;
@@ -2479,7 +2840,7 @@ cli_xml_output_vol_quota_limit_list (char *volname, char *limit_list,
GF_ASSERT (volname);
GF_ASSERT (limit_list);
- ret = cli_begin_xml_output (&writer, &buf);
+ ret = cli_begin_xml_output (&writer, &doc);
if (ret)
goto out;
@@ -2589,7 +2950,7 @@ cont:
ret = xmlTextWriterEndElement (writer);
XML_RET_CHECK_AND_GOTO (ret, out);
- ret = cli_end_xml_output (writer, buf);
+ ret = cli_end_xml_output (writer, doc);
out:
GF_FREE (size_str);
@@ -2607,18 +2968,17 @@ cli_xml_output_peer_status (dict_t *dict, int op_ret, int op_errno,
#if (HAVE_LIB_XML)
int ret = -1;
xmlTextWriterPtr writer = NULL;
- xmlBufferPtr buf = NULL;
+ xmlDocPtr doc = NULL;
int count = 0;
char *uuid = NULL;
char *hostname = NULL;
int connected = 0;
int state_id = 0;
char *state_str = NULL;
- int port = 0;
int i = 1;
char key[1024] = {0,};
- ret = cli_begin_xml_output (&writer, &buf);
+ ret = cli_begin_xml_output (&writer, &doc);
if (ret)
goto out;
@@ -2678,34 +3038,23 @@ cli_xml_output_peer_status (dict_t *dict, int op_ret, int op_errno,
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "friend%d.stateId", i);
ret = dict_get_int32 (dict, key, &state_id);
- if (ret)
- goto out;
+ if (!ret) {
+ /* ignore */
- ret = xmlTextWriterWriteFormatElement (writer,
- (xmlChar *)"state",
- "%d", state_id);
- XML_RET_CHECK_AND_GOTO (ret, out);
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"state", "%d", state_id);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "friend%d.state", i);
ret = dict_get_str (dict, key, &state_str);
- if (ret)
- goto out;
+ if (!ret) {
+ /* ignore */
- ret = xmlTextWriterWriteFormatElement (writer,
- (xmlChar *)"stateStr",
- "%s", state_str);
- XML_RET_CHECK_AND_GOTO (ret, out);
-
- memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "friend%d.port", i);
- ret = dict_get_int32 (dict, key, &port);
- if (port != 0) {
- ret = xmlTextWriterWriteFormatElement
- (writer, (xmlChar *)"port", "%d", port);
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"stateStr", "%s", state_str);
XML_RET_CHECK_AND_GOTO (ret, out);
-
- port = 0;
}
/* </peer> */
@@ -2720,7 +3069,7 @@ cont:
ret = xmlTextWriterEndElement (writer);
XML_RET_CHECK_AND_GOTO (ret, out);
- ret = cli_end_xml_output (writer, buf);
+ ret = cli_end_xml_output (writer, doc);
out:
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
@@ -2733,22 +3082,29 @@ out:
#if (HAVE_LIB_XML)
/* Used for rebalance stop/status, remove-brick status */
int
-cli_xml_output_vol_rebalance_status (xmlTextWriterPtr writer, dict_t *dict)
+cli_xml_output_vol_rebalance_status (xmlTextWriterPtr writer, dict_t *dict,
+ enum gf_task_types task_type)
{
int ret = -1;
int count = 0;
char *node_name = NULL;
+ char *node_uuid = NULL;
uint64_t files = 0;
uint64_t size = 0;
uint64_t lookups = 0;
int status_rcd = 0;
uint64_t failures = 0;
+ uint64_t skipped = 0;
uint64_t total_files = 0;
uint64_t total_size = 0;
uint64_t total_lookups = 0;
uint64_t total_failures = 0;
+ uint64_t total_skipped = 0;
char key[1024] = {0,};
int i = 0;
+ int overall_status = -1;
+ double elapsed = 0;
+ double overall_elapsed = 0;
if (!dict) {
ret = 0;
@@ -2770,13 +3126,22 @@ cli_xml_output_vol_rebalance_status (xmlTextWriterPtr writer, dict_t *dict)
XML_RET_CHECK_AND_GOTO (ret, out);
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "node-uuid-%d", i);
+ snprintf (key, sizeof (key), "node-name-%d", i);
ret = dict_get_str (dict, key, &node_name);
if (ret)
goto out;
ret = xmlTextWriterWriteFormatElement (writer,
(xmlChar *)"nodeName",
"%s", node_name);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "node-uuid-%d", i);
+ ret = dict_get_str (dict, key, &node_uuid);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"id",
+ "%s", node_uuid);
XML_RET_CHECK_AND_GOTO (ret, out);
memset (key, 0, sizeof (key));
@@ -2823,6 +3188,27 @@ cli_xml_output_vol_rebalance_status (xmlTextWriterPtr writer, dict_t *dict)
"%"PRIu64, failures);
XML_RET_CHECK_AND_GOTO (ret, out);
+ /* skipped-%d is not available for remove brick in dict,
+ so using failures as skipped count in case of remove-brick
+ similar to logic used in CLI(non xml output) */
+ if (task_type == GF_TASK_TYPE_REBALANCE) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "skipped-%d", i);
+ }
+ else {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "failures-%d", i);
+ }
+
+ ret = dict_get_uint64 (dict, key, &skipped);
+ if (ret)
+ goto out;
+ total_skipped += skipped;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"skipped",
+ "%"PRIu64, skipped);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "status-%d", i);
ret = dict_get_int32 (dict, key, &status_rcd);
@@ -2833,6 +3219,33 @@ cli_xml_output_vol_rebalance_status (xmlTextWriterPtr writer, dict_t *dict)
"%d", status_rcd);
XML_RET_CHECK_AND_GOTO (ret, out);
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"statusStr",
+ "%s",
+ cli_vol_task_status_str[status_rcd]);
+
+ memset (key, 0, 256);
+ snprintf (key, 256, "run-time-%d", i);
+ ret = dict_get_double (dict, key, &elapsed);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"runtime",
+ "%.2f", elapsed);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ if (elapsed > overall_elapsed) {
+ overall_elapsed = elapsed;
+ }
+
+ if (-1 == overall_status)
+ overall_status = status_rcd;
+ else if ((GF_DEFRAG_STATUS_COMPLETE == overall_status ||
+ status_rcd > overall_status) &&
+ (status_rcd != GF_DEFRAG_STATUS_COMPLETE))
+ overall_status = status_rcd;
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
/* </node> */
ret = xmlTextWriterEndElement (writer);
XML_RET_CHECK_AND_GOTO (ret, out);
@@ -2859,7 +3272,22 @@ cli_xml_output_vol_rebalance_status (xmlTextWriterPtr writer, dict_t *dict)
"%"PRIu64, total_failures);
XML_RET_CHECK_AND_GOTO (ret, out);
- // TODO : Aggregate status
+ ret = xmlTextWriterWriteFormatElement (writer,(xmlChar *)"skipped",
+ "%"PRIu64, total_skipped);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer,(xmlChar *)"status",
+ "%d", overall_status);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer,(xmlChar *)"statusStr",
+ "%s",
+ cli_vol_task_status_str[overall_status]);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer,(xmlChar *)"runtime",
+ "%.2f", overall_elapsed);
+ XML_RET_CHECK_AND_GOTO (ret, out);
/* </aggregate> */
ret = xmlTextWriterEndElement (writer);
@@ -2878,9 +3306,10 @@ cli_xml_output_vol_rebalance (gf_cli_defrag_type op, dict_t *dict, int op_ret,
#if (HAVE_LIB_XML)
int ret = -1;
xmlTextWriterPtr writer = NULL;
- xmlBufferPtr buf = NULL;
+ xmlDocPtr doc = NULL;
+ char *task_id_str = NULL;
- ret = cli_begin_xml_output (&writer, &buf);
+ ret = cli_begin_xml_output (&writer, &doc);
if (ret)
goto out;
@@ -2892,12 +3321,21 @@ cli_xml_output_vol_rebalance (gf_cli_defrag_type op, dict_t *dict, int op_ret,
ret = xmlTextWriterStartElement (writer, (xmlChar *)"volRebalance");
XML_RET_CHECK_AND_GOTO (ret, out);
+ ret = dict_get_str (dict, GF_REBALANCE_TID_KEY, &task_id_str);
+ if (ret == 0) {
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"task-id",
+ "%s", task_id_str);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"op",
"%d", op);
XML_RET_CHECK_AND_GOTO (ret, out);
if ((GF_DEFRAG_CMD_STOP == op) || (GF_DEFRAG_CMD_STATUS == op)) {
- ret = cli_xml_output_vol_rebalance_status (writer, dict);
+ ret = cli_xml_output_vol_rebalance_status (writer, dict,
+ GF_TASK_TYPE_REBALANCE);
if (ret)
goto out;
}
@@ -2907,7 +3345,7 @@ cli_xml_output_vol_rebalance (gf_cli_defrag_type op, dict_t *dict, int op_ret,
XML_RET_CHECK_AND_GOTO (ret, out);
- ret = cli_end_xml_output (writer, buf);
+ ret = cli_end_xml_output (writer, doc);
out:
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
@@ -2924,9 +3362,10 @@ cli_xml_output_vol_remove_brick (gf_boolean_t status_op, dict_t *dict,
#if (HAVE_LIB_XML)
int ret = -1;
xmlTextWriterPtr writer = NULL;
- xmlBufferPtr buf = NULL;
+ xmlDocPtr doc = NULL;
+ char *task_id_str = NULL;
- ret = cli_begin_xml_output (&writer, &buf);
+ ret = cli_begin_xml_output (&writer, &doc);
if (ret)
goto out;
@@ -2938,8 +3377,17 @@ cli_xml_output_vol_remove_brick (gf_boolean_t status_op, dict_t *dict,
ret = xmlTextWriterStartElement (writer, (xmlChar *)"volRemoveBrick");
XML_RET_CHECK_AND_GOTO (ret, out);
+ ret = dict_get_str (dict, GF_REMOVE_BRICK_TID_KEY, &task_id_str);
+ if (ret == 0) {
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"task-id",
+ "%s", task_id_str);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
if (status_op) {
- ret = cli_xml_output_vol_rebalance_status (writer, dict);
+ ret = cli_xml_output_vol_rebalance_status (writer, dict,
+ GF_TASK_TYPE_REMOVE_BRICK);
if (ret)
goto out;
}
@@ -2949,7 +3397,7 @@ cli_xml_output_vol_remove_brick (gf_boolean_t status_op, dict_t *dict,
XML_RET_CHECK_AND_GOTO (ret, out);
- ret = cli_end_xml_output (writer, buf);
+ ret = cli_end_xml_output (writer, doc);
out:
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
@@ -2968,10 +3416,11 @@ cli_xml_output_vol_replace_brick (gf1_cli_replace_op op, dict_t *dict,
int status = 0;
uint64_t files = 0;
char *current_file = 0;
+ char *task_id_str = NULL;
xmlTextWriterPtr writer = NULL;
- xmlBufferPtr buf = NULL;
+ xmlDocPtr doc = NULL;
- ret = cli_begin_xml_output (&writer, &buf);
+ ret = cli_begin_xml_output (&writer, &doc);
if (ret)
goto out;
@@ -2983,6 +3432,14 @@ cli_xml_output_vol_replace_brick (gf1_cli_replace_op op, dict_t *dict,
ret = xmlTextWriterStartElement (writer, (xmlChar *)"volReplaceBrick");
XML_RET_CHECK_AND_GOTO (ret, out);
+ ret = dict_get_str (dict, GF_REPLACE_BRICK_TID_KEY, &task_id_str);
+ if (ret == 0) {
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"task-id",
+ "%s", task_id_str);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"op",
"%d", op);
XML_RET_CHECK_AND_GOTO (ret, out);
@@ -3020,7 +3477,7 @@ cont:
ret = xmlTextWriterEndElement (writer);
XML_RET_CHECK_AND_GOTO (ret, out);
- ret = cli_end_xml_output (writer, buf);
+ ret = cli_end_xml_output (writer, doc);
out:
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
@@ -3037,11 +3494,11 @@ cli_xml_output_vol_create (dict_t *dict, int op_ret, int op_errno,
#if (HAVE_LIB_XML)
int ret = -1;
xmlTextWriterPtr writer = NULL;
- xmlBufferPtr buf = NULL;
+ xmlDocPtr doc = NULL;
char *volname = NULL;
char *volid = NULL;
- ret = cli_begin_xml_output (&writer, &buf);
+ ret = cli_begin_xml_output (&writer, &doc);
if (ret)
goto out;
@@ -3083,7 +3540,7 @@ cli_xml_output_vol_create (dict_t *dict, int op_ret, int op_errno,
XML_RET_CHECK_AND_GOTO (ret, out);
}
- ret = cli_end_xml_output (writer, buf);
+ ret = cli_end_xml_output (writer, doc);
out:
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
@@ -3100,13 +3557,13 @@ cli_xml_output_generic_volume (char *op, dict_t *dict, int op_ret, int op_errno,
#if (HAVE_LIB_XML)
int ret = -1;
xmlTextWriterPtr writer = NULL;
- xmlBufferPtr buf = NULL;
+ xmlDocPtr doc = NULL;
char *volname = NULL;
char *volid = NULL;
GF_ASSERT (op);
- ret = cli_begin_xml_output (&writer, &buf);
+ ret = cli_begin_xml_output (&writer, &doc);
if (ret)
goto out;
@@ -3147,7 +3604,7 @@ cli_xml_output_generic_volume (char *op, dict_t *dict, int op_ret, int op_errno,
XML_RET_CHECK_AND_GOTO (ret, out);
}
- ret = cli_end_xml_output (writer, buf);
+ ret = cli_end_xml_output (writer, doc);
out:
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
@@ -3156,3 +3613,160 @@ out:
return 0;
#endif
}
+
+#if (HAVE_LIB_XML)
+int
+cli_xml_output_vol_gsync_status (dict_t *dict, xmlTextWriterPtr writer)
+{
+ char master_key[PATH_MAX] = "";
+ char slave_key[PATH_MAX] = "";
+ char status_key[PATH_MAX] = "";
+ char node_key[PATH_MAX] = "";
+ char *master = NULL;
+ char *slave = NULL;
+ char *status = NULL;
+ char *node = NULL;
+ int ret = -1;
+ int gsync_count = 0;
+ int i = 1;
+
+ ret = dict_get_int32 (dict, "gsync-count", &gsync_count);
+ if (ret)
+ goto out;
+
+ for (i=1; i <= gsync_count; i++) {
+ snprintf (node_key, sizeof(node_key), "node%d", i);
+ snprintf (master_key, sizeof(master_key), "master%d", i);
+ snprintf (slave_key, sizeof(slave_key), "slave%d", i);
+ snprintf (status_key, sizeof(status_key), "status%d", i);
+
+ ret = dict_get_str (dict, node_key, &node);
+ if (ret)
+ goto out;
+
+ ret = dict_get_str (dict, master_key, &master);
+ if (ret)
+ goto out;
+
+ ret = dict_get_str (dict, slave_key, &slave);
+ if (ret)
+ goto out;
+
+ ret = dict_get_str (dict, status_key, &status);
+ if (ret)
+ goto out;
+
+ /* <pair> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"pair");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"node",
+ "%s", node);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"master",
+ "%s", master);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"slave",
+ "%s", slave);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"status",
+ "%s", status);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* </pair> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ }
+
+out:
+ gf_log ("cli",GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+#endif
+
+int
+cli_xml_output_vol_gsync (dict_t *dict, int op_ret, int op_errno,
+ char *op_errstr)
+{
+#if (HAVE_LIB_XML)
+ int ret = -1;
+ xmlTextWriterPtr writer = NULL;
+ xmlDocPtr doc = NULL;
+ char *master = NULL;
+ char *slave = NULL;
+ int type = 0;
+
+ GF_ASSERT (dict);
+
+ ret = cli_begin_xml_output (&writer, &doc);
+ if (ret)
+ goto out;
+
+ ret = cli_xml_output_common (writer, op_ret, op_errno, op_errstr);
+ if (ret)
+ goto out;
+
+ /* <geoRep> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"geoRep");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = dict_get_int32 (dict, "type", &type);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to get type");
+ goto out;
+ }
+
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"type",
+ "%d", type);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ switch (type) {
+ case GF_GSYNC_OPTION_TYPE_START:
+ case GF_GSYNC_OPTION_TYPE_STOP:
+ if (dict_get_str (dict, "master", &master) != 0)
+ master = "???";
+ if (dict_get_str (dict, "slave", &slave) != 0)
+ slave = "???";
+
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"master",
+ "%s", master);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"slave",
+ "%s", slave);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ break;
+
+ case GF_GSYNC_OPTION_TYPE_CONFIG:
+ break;
+ case GF_GSYNC_OPTION_TYPE_STATUS:
+ ret = cli_xml_output_vol_gsync_status(dict, writer);
+ break;
+ default:
+ ret = 0;
+ break;
+ }
+
+ /* </geoRep> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = cli_end_xml_output (writer, doc);
+out:
+ gf_log ("cli",GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+#else
+ return 0;
+#endif
+}
diff --git a/cli/src/cli.c b/cli/src/cli.c
index 7788ce389..91b315ff1 100644
--- a/cli/src/cli.c
+++ b/cli/src/cli.c
@@ -168,7 +168,8 @@ logging_init (glusterfs_ctx_t *ctx, struct cli_state *state)
char *log_file = state->log_file ? state->log_file :
DEFAULT_CLI_LOG_FILE_DIRECTORY "/cli.log";
- if (gf_log_init (ctx, log_file) == -1) {
+ /* passing ident as NULL means to use default ident for syslog */
+ if (gf_log_init (ctx, log_file, NULL) == -1) {
fprintf (stderr, "ERROR: failed to open logfile %s\n",
log_file);
return -1;
@@ -298,7 +299,17 @@ cli_opt_parse (char *opt, struct cli_state *state)
return 1;
if (strcmp (opt, "version") == 0) {
- puts (argp_program_version);
+ cli_out ("%s", argp_program_version);
+ exit (0);
+ }
+
+ if (strcmp (opt, "print-logdir") == 0) {
+ cli_out ("%s", DEFAULT_LOG_FILE_DIRECTORY);
+ exit (0);
+ }
+
+ if (strcmp (opt, "print-statedumpdir") == 0) {
+ cli_out ("%s", DEFAULT_VAR_RUN_DIRECTORY);
exit (0);
}
@@ -342,6 +353,12 @@ cli_opt_parse (char *opt, struct cli_state *state)
return 0;
}
+ oarg = strtail (opt, "glusterd-sock=");
+ if (oarg) {
+ state->glusterd_sock = oarg;
+ return 0;
+ }
+
return -1;
}
@@ -377,6 +394,8 @@ parse_cmdline (int argc, char *argv[], struct cli_state *state)
}
}
+ state->argv[state->argc] = NULL;
+
return ret;
}
@@ -401,7 +420,6 @@ cli_state_init (struct cli_state *state)
int ret = 0;
- state->remote_host = "localhost";
state->log_level = -1;
tree = &state->tree;
@@ -488,23 +506,46 @@ cli_rpc_init (struct cli_state *state)
if (!options)
goto out;
- ret = dict_set_str (options, "remote-host", state->remote_host);
- if (ret)
- goto out;
+ /* Connect using to glusterd using the specified method, giving
+ * preference to unix socket connection. If nothing is specified connect
+ * to the default glusterd socket
+ */
+ if (state->glusterd_sock) {
+ gf_log ("cli", GF_LOG_INFO, "Connecting to glusterd using "
+ "sockfile %s", state->glusterd_sock);
+ ret = rpc_transport_unix_options_build (&options,
+ state->glusterd_sock,
+ 0);
+ if (ret)
+ goto out;
+ } else if (state->remote_host) {
+ gf_log ("cli", GF_LOG_INFO, "Connecting to remote glusterd at "
+ "%s", state->remote_host);
+ ret = dict_set_str (options, "remote-host", state->remote_host);
+ if (ret)
+ goto out;
- if (state->remote_port)
- port = state->remote_port;
+ if (state->remote_port)
+ port = state->remote_port;
- ret = dict_set_int32 (options, "remote-port", port);
- if (ret)
- goto out;
+ ret = dict_set_int32 (options, "remote-port", port);
+ if (ret)
+ goto out;
- ret = dict_set_str (options, "transport.address-family", "inet");
- if (ret)
- goto out;
+ ret = dict_set_str (options, "transport.address-family",
+ "inet");
+ if (ret)
+ goto out;
+ } else {
+ gf_log ("cli", GF_LOG_DEBUG, "Connecting to glusterd using "
+ "default socket");
+ ret = rpc_transport_unix_options_build
+ (&options, DEFAULT_GLUSTERD_SOCKFILE, 0);
+ if (ret)
+ goto out;
+ }
rpc = rpc_clnt_new (options, this->ctx, this->name, 16);
-
if (!rpc)
goto out;
@@ -514,7 +555,7 @@ cli_rpc_init (struct cli_state *state)
goto out;
}
- rpc_clnt_start (rpc);
+ ret = rpc_clnt_start (rpc);
out:
if (ret) {
if (rpc)
@@ -560,7 +601,9 @@ main (int argc, char *argv[])
if (!ctx)
return ENOMEM;
+#ifdef DEBUG
gf_mem_acct_enable_set (ctx);
+#endif
ret = glusterfs_globals_init (ctx);
if (ret)
diff --git a/cli/src/cli.h b/cli/src/cli.h
index 0221f2e85..8daa4b741 100644
--- a/cli/src/cli.h
+++ b/cli/src/cli.h
@@ -18,6 +18,7 @@
#include "rpc-clnt.h"
#include "glusterfs.h"
#include "protocol-common.h"
+#include "logging.h"
#include "cli1-xdr.h"
@@ -30,9 +31,8 @@
#define CLI_GLUSTERD_PORT 24007
#define CLI_DEFAULT_CONN_TIMEOUT 120
#define CLI_DEFAULT_CMD_TIMEOUT 120
-#define CLI_TOP_CMD_TIMEOUT 600 //Longer timeout for volume top
+#define CLI_TEN_MINUTES_TIMEOUT 600 //Longer timeout for volume top
#define DEFAULT_CLI_LOG_FILE_DIRECTORY DATADIR "/log/glusterfs"
-#define DEFAULT_LOG_FILE_DIRECTORY DATADIR "/log/glusterfs"
#define CLI_VOL_STATUS_BRICK_LEN 55
#define CLI_TAB_LENGTH 8
#define CLI_BRICK_STATUS_LINE_LEN 78
@@ -52,6 +52,7 @@ struct cli_cmd;
extern char *cli_vol_type_str[];
extern char *cli_vol_status_str[];
+extern char *cli_vol_task_status_str[];
typedef int (cli_cmd_cbk_t)(struct cli_state *state,
struct cli_cmd_word *word,
@@ -114,6 +115,8 @@ struct cli_state {
char *log_file;
gf_loglevel_t log_level;
+
+ char *glusterd_sock;
};
struct cli_local {
@@ -128,11 +131,23 @@ struct cli_local {
gf_boolean_t all;
#if (HAVE_LIB_XML)
xmlTextWriterPtr writer;
- xmlBufferPtr buf;
+ xmlDocPtr doc;
int vol_count;
#endif
};
+struct gf_cli_gsync_detailed_status_ {
+ char *node;
+ char *master;
+ char *slave;
+ char *health;
+ char *uptime;
+ char *files_syncd;
+ char *files_pending;
+ char *bytes_pending;
+ char *deletes_pending;
+};
+
struct cli_volume_status {
int port;
int online;
@@ -151,6 +166,13 @@ struct cli_volume_status {
#endif
};
+struct snap_config_opt_vals_ {
+ char *op_name;
+ char *question;
+};
+
+typedef struct gf_cli_gsync_detailed_status_ gf_cli_gsync_status_t;
+
typedef struct cli_volume_status cli_volume_status_t;
typedef struct cli_local cli_local_t;
@@ -215,7 +237,7 @@ cli_cmd_quota_parse (const char **words, int wordcount, dict_t **opt);
int32_t
cli_cmd_volume_set_parse (const char **words, int wordcount,
- dict_t **options);
+ dict_t **options, char **op_errstr);
int32_t
cli_cmd_volume_add_brick_parse (const char **words, int wordcount,
@@ -361,4 +383,18 @@ cli_xml_output_vol_create (dict_t *dict, int op_ret, int op_errno,
int
cli_xml_output_generic_volume (char *op, dict_t *dict, int op_ret, int op_errno,
char *op_errstr);
+
+int
+cli_xml_output_vol_gsync (dict_t *dict, int op_ret, int op_errno,
+ char *op_errstr);
+int
+cli_xml_output_vol_status_tasks_detail (cli_local_t *local, dict_t *dict);
+
+char *
+is_server_debug_xlator (void *myframe);
+
+int32_t
+cli_cmd_snapshot_parse (const char **words, int wordcount, dict_t **options,
+ struct cli_state *state);
+
#endif /* __CLI_H__ */
diff --git a/configure.ac b/configure.ac
index ab5af8fd4..b3d1ed184 100644
--- a/configure.ac
+++ b/configure.ac
@@ -24,13 +24,15 @@ if libtool --help 2>&1 | grep -q quiet; then
AM_LIBTOOLFLAGS="--quiet";
fi
-AM_CONFIG_HEADER([config.h])
+AC_CONFIG_HEADERS([config.h])
AC_CONFIG_FILES([Makefile
- libglusterfs/Makefile
- libglusterfs/src/Makefile
- glusterfsd/Makefile
- glusterfsd/src/Makefile
+ libglusterfs/Makefile
+ libglusterfs/src/Makefile
+ geo-replication/src/peer_gsec_create
+ geo-replication/src/peer_add_secret_pub
+ glusterfsd/Makefile
+ glusterfsd/src/Makefile
rpc/Makefile
rpc/rpc-lib/Makefile
rpc/rpc-lib/src/Makefile
@@ -41,102 +43,137 @@ AC_CONFIG_FILES([Makefile
rpc/rpc-transport/rdma/src/Makefile
rpc/xdr/Makefile
rpc/xdr/src/Makefile
- xlators/Makefile
- xlators/mount/Makefile
- xlators/mount/fuse/Makefile
- xlators/mount/fuse/src/Makefile
- xlators/mount/fuse/utils/mount.glusterfs
- xlators/mount/fuse/utils/mount_glusterfs
- xlators/mount/fuse/utils/Makefile
- xlators/storage/Makefile
- xlators/storage/posix/Makefile
- xlators/storage/posix/src/Makefile
- xlators/cluster/Makefile
- xlators/cluster/afr/Makefile
- xlators/cluster/afr/src/Makefile
- xlators/cluster/stripe/Makefile
- xlators/cluster/stripe/src/Makefile
- xlators/cluster/dht/Makefile
- xlators/cluster/dht/src/Makefile
- xlators/performance/Makefile
- xlators/performance/write-behind/Makefile
- xlators/performance/write-behind/src/Makefile
- xlators/performance/read-ahead/Makefile
- xlators/performance/read-ahead/src/Makefile
- xlators/performance/io-threads/Makefile
- xlators/performance/io-threads/src/Makefile
- xlators/performance/io-cache/Makefile
- xlators/performance/io-cache/src/Makefile
- xlators/performance/symlink-cache/Makefile
- xlators/performance/symlink-cache/src/Makefile
- xlators/performance/quick-read/Makefile
- xlators/performance/quick-read/src/Makefile
+ xlators/Makefile
+ xlators/mount/Makefile
+ xlators/mount/fuse/Makefile
+ xlators/mount/fuse/src/Makefile
+ xlators/mount/fuse/utils/mount.glusterfs
+ xlators/mount/fuse/utils/mount_glusterfs
+ xlators/mount/fuse/utils/Makefile
+ xlators/storage/Makefile
+ xlators/storage/posix/Makefile
+ xlators/storage/posix/src/Makefile
+ xlators/storage/bd/Makefile
+ xlators/storage/bd/src/Makefile
+ xlators/cluster/Makefile
+ xlators/cluster/afr/Makefile
+ xlators/cluster/afr/src/Makefile
+ xlators/cluster/stripe/Makefile
+ xlators/cluster/stripe/src/Makefile
+ xlators/cluster/dht/Makefile
+ xlators/cluster/dht/src/Makefile
+ xlators/performance/Makefile
+ xlators/performance/write-behind/Makefile
+ xlators/performance/write-behind/src/Makefile
+ xlators/performance/read-ahead/Makefile
+ xlators/performance/read-ahead/src/Makefile
+ xlators/performance/readdir-ahead/Makefile
+ xlators/performance/readdir-ahead/src/Makefile
+ xlators/performance/io-threads/Makefile
+ xlators/performance/io-threads/src/Makefile
+ xlators/performance/io-cache/Makefile
+ xlators/performance/io-cache/src/Makefile
+ xlators/performance/symlink-cache/Makefile
+ xlators/performance/symlink-cache/src/Makefile
+ xlators/performance/quick-read/Makefile
+ xlators/performance/quick-read/src/Makefile
+ xlators/performance/open-behind/Makefile
+ xlators/performance/open-behind/src/Makefile
xlators/performance/md-cache/Makefile
xlators/performance/md-cache/src/Makefile
- xlators/debug/Makefile
- xlators/debug/trace/Makefile
- xlators/debug/trace/src/Makefile
- xlators/debug/error-gen/Makefile
- xlators/debug/error-gen/src/Makefile
- xlators/debug/io-stats/Makefile
- xlators/debug/io-stats/src/Makefile
- xlators/protocol/Makefile
- xlators/protocol/auth/Makefile
- xlators/protocol/auth/addr/Makefile
- xlators/protocol/auth/addr/src/Makefile
- xlators/protocol/auth/login/Makefile
- xlators/protocol/auth/login/src/Makefile
- xlators/protocol/client/Makefile
- xlators/protocol/client/src/Makefile
- xlators/protocol/server/Makefile
- xlators/protocol/server/src/Makefile
- xlators/features/Makefile
- xlators/features/locks/Makefile
- xlators/features/locks/src/Makefile
- xlators/features/quota/Makefile
- xlators/features/quota/src/Makefile
+ xlators/debug/Makefile
+ xlators/debug/trace/Makefile
+ xlators/debug/trace/src/Makefile
+ xlators/debug/error-gen/Makefile
+ xlators/debug/error-gen/src/Makefile
+ xlators/debug/io-stats/Makefile
+ xlators/debug/io-stats/src/Makefile
+ xlators/protocol/Makefile
+ xlators/protocol/auth/Makefile
+ xlators/protocol/auth/addr/Makefile
+ xlators/protocol/auth/addr/src/Makefile
+ xlators/protocol/auth/login/Makefile
+ xlators/protocol/auth/login/src/Makefile
+ xlators/protocol/client/Makefile
+ xlators/protocol/client/src/Makefile
+ xlators/protocol/server/Makefile
+ xlators/protocol/server/src/Makefile
+ xlators/features/Makefile
+ xlators/features/changelog/Makefile
+ xlators/features/changelog/src/Makefile
+ xlators/features/changelog/lib/Makefile
+ xlators/features/changelog/lib/src/Makefile
+ xlators/features/glupy/Makefile
+ xlators/features/glupy/src/Makefile
+ xlators/features/locks/Makefile
+ xlators/features/locks/src/Makefile
+ xlators/features/quota/Makefile
+ xlators/features/quota/src/Makefile
xlators/features/marker/Makefile
xlators/features/marker/src/Makefile
- xlators/features/marker/utils/Makefile
- xlators/features/marker/utils/src/Makefile
- xlators/features/marker/utils/syncdaemon/Makefile
- xlators/features/read-only/Makefile
- xlators/features/read-only/src/Makefile
- xlators/features/mac-compat/Makefile
- xlators/features/mac-compat/src/Makefile
- xlators/features/quiesce/Makefile
- xlators/features/quiesce/src/Makefile
+ xlators/features/read-only/Makefile
+ xlators/features/read-only/src/Makefile
+ xlators/features/compress/Makefile
+ xlators/features/compress/src/Makefile
+ xlators/features/mac-compat/Makefile
+ xlators/features/mac-compat/src/Makefile
+ xlators/features/quiesce/Makefile
+ xlators/features/quiesce/src/Makefile
xlators/features/index/Makefile
xlators/features/index/src/Makefile
- xlators/encryption/Makefile
- xlators/encryption/rot-13/Makefile
- xlators/encryption/rot-13/src/Makefile
+ xlators/features/protect/Makefile
+ xlators/features/protect/src/Makefile
+ xlators/features/gfid-access/Makefile
+ xlators/features/gfid-access/src/Makefile
+ xlators/playground/Makefile
+ xlators/playground/template/Makefile
+ xlators/playground/template/src/Makefile
+ xlators/encryption/Makefile
+ xlators/encryption/rot-13/Makefile
+ xlators/encryption/rot-13/src/Makefile
+ xlators/encryption/crypt/Makefile
+ xlators/encryption/crypt/src/Makefile
+ xlators/features/qemu-block/Makefile
+ xlators/features/qemu-block/src/Makefile
xlators/system/Makefile
xlators/system/posix-acl/Makefile
xlators/system/posix-acl/src/Makefile
- cli/Makefile
- cli/src/Makefile
- doc/Makefile
- extras/Makefile
- extras/init.d/Makefile
- extras/init.d/glusterd.plist
- extras/init.d/glusterd-Debian
- extras/init.d/glusterd-Redhat
- extras/init.d/glusterd-SuSE
- extras/benchmarking/Makefile
- extras/hook-scripts/Makefile
- contrib/fuse-util/Makefile
- contrib/uuid/uuid_types.h
xlators/nfs/Makefile
xlators/nfs/server/Makefile
xlators/nfs/server/src/Makefile
xlators/mgmt/Makefile
xlators/mgmt/glusterd/Makefile
xlators/mgmt/glusterd/src/Makefile
- glusterfs-api.pc
- api/Makefile
- api/src/Makefile
- glusterfs.spec])
+ cli/Makefile
+ cli/src/Makefile
+ doc/Makefile
+ extras/Makefile
+ extras/init.d/Makefile
+ extras/init.d/glusterd.plist
+ extras/init.d/glusterd-Debian
+ extras/init.d/glusterd-Redhat
+ extras/init.d/glusterd-SuSE
+ extras/systemd/Makefile
+ extras/systemd/glusterd.service
+ extras/benchmarking/Makefile
+ extras/hook-scripts/Makefile
+ extras/ocf/Makefile
+ extras/ocf/glusterd
+ extras/ocf/volume
+ extras/LinuxRPM/Makefile
+ extras/geo-rep/Makefile
+ contrib/fuse-util/Makefile
+ contrib/uuid/uuid_types.h
+ glusterfs-api.pc
+ libgfchangelog.pc
+ api/Makefile
+ api/src/Makefile
+ api/examples/Makefile
+ api/examples/setup.py
+ geo-replication/Makefile
+ geo-replication/src/Makefile
+ geo-replication/syncdaemon/Makefile
+ glusterfs.spec])
AC_CANONICAL_HOST
@@ -156,6 +193,12 @@ AC_ARG_WITH(mountutildir,
[mountutildir='/sbin'])
AC_SUBST(mountutildir)
+AC_ARG_WITH(systemddir,
+ [ --with-systemddir=DIR systemd service files in DIR @<:@/usr/lib/systemd/system@:>@],
+ [systemddir=$withval],
+ [systemddir='/usr/lib/systemd/system'])
+AC_SUBST(systemddir)
+
AC_ARG_WITH(initdir,
[ --with-initdir=DIR init.d scripts in DIR @<:@/etc/init.d@:>@],
[initdir=$withval],
@@ -168,6 +211,13 @@ AC_ARG_WITH(launchddir,
[launchddir='/Library/LaunchDaemons'])
AC_SUBST(launchddir)
+AC_ARG_WITH([ocf],
+ [AS_HELP_STRING([--without-ocf], [build OCF-compliant cluster resource agents])],
+ ,
+ [OCF_SUBDIR='ocf'],
+ )
+AC_SUBST(OCF_SUBDIR)
+
# LEX needs a check
AC_PROG_LEX
if test "x${LEX}" != "xflex" -a "x${FLEX}" != "xlex"; then
@@ -211,6 +261,8 @@ AC_CHECK_HEADERS([sys/extattr.h])
AC_CHECK_HEADERS([openssl/md5.h])
+AC_CHECK_HEADERS([linux/falloc.h])
+
case $host_os in
darwin*)
if ! test "`/usr/bin/sw_vers | grep ProductVersion: | cut -f 2 | cut -d. -f2`" -ge 5; then
@@ -244,8 +296,8 @@ fi
# FUSE section
AC_ARG_ENABLE([fuse-client],
- AC_HELP_STRING([--disable-fuse-client],
- [Do not build the fuse client. NOTE: you cannot mount glusterfs without the client]))
+ AC_HELP_STRING([--disable-fuse-client],
+ [Do not build the fuse client. NOTE: you cannot mount glusterfs without the client]))
BUILD_FUSE_CLIENT=no
if test "x$enable_fuse_client" != "xno"; then
@@ -253,60 +305,153 @@ if test "x$enable_fuse_client" != "xno"; then
BUILD_FUSE_CLIENT="yes"
fi
+AC_ARG_ENABLE([bd-xlator],
+ AC_HELP_STRING([--enable-bd-xlator], [Build BD xlator]))
+
+if test "x$enable_bd_xlator" != "xno"; then
+ AC_CHECK_LIB([lvm2app],
+ [lvm_init,lvm_lv_from_name],
+ [HAVE_BD_LIB="yes"],
+ [HAVE_BD_LIB="no"])
+
+if test "x$HAVE_BD_LIB" = "xyes"; then
+ # lvm_lv_from_name() has been made public with lvm2-2.02.79
+ AC_CHECK_DECLS(
+ [lvm_lv_from_name],
+ [NEED_LVM_LV_FROM_NAME_DECL="no"],
+ [NEED_LVM_LV_FROM_NAME_DECL="yes"],
+ [[#include <lvm2app.h>]])
+ fi
+fi
+
+if test "x$enable_bd_xlator" = "xyes" -a "x$HAVE_BD_LIB" = "xno"; then
+ echo "BD xlator requested but required lvm2 development library not found."
+ exit 1
+fi
+
+BUILD_BD_XLATOR=no
+if test "x${enable-bd-xlator}" != "xno" -a "x${HAVE_BD_LIB}" = "xyes"; then
+ BUILD_BD_XLATOR=yes
+ AC_DEFINE(HAVE_BD_XLATOR, 1, [define if lvm2app library found and bd xlator
+ enabled])
+ if test "x$NEED_LVM_LV_FROM_NAME_DECL" = "xyes"; then
+ AC_DEFINE(NEED_LVM_LV_FROM_NAME_DECL, 1, [defined if lvm_lv_from_name()
+ was not found in the lvm2app.h header, but can be linked])
+ fi
+fi
+
+AM_CONDITIONAL([ENABLE_BD_XLATOR], [test x$BUILD_BD_XLATOR = xyes])
+
+# start encryption/crypt section
+
+AC_CHECK_HEADERS([openssl/cmac.h], [have_cmac_h=yes], [have_cmac_h=no])
+
+AC_ARG_ENABLE([crypt-xlator],
+ AC_HELP_STRING([--enable-crypt-xlator], [Build crypt encryption xlator]))
+
+if test "x$enable_crypt_xlator" = "xyes" -a "x$have_cmac_h" = "xno"; then
+ echo "Encryption xlator requires OpenSSL with cmac.h"
+ exit 1
+fi
+
+BUILD_CRYPT_XLATOR=no
+if test "x$enable_crypt_xlator" != "xno" -a "x$have_cmac_h" = "xyes"; then
+ BUILD_CRYPT_XLATOR=yes
+ AC_DEFINE(HAVE_CRYPT_XLATOR, 1, [enable building crypt encryption xlator])
+fi
+
+AM_CONDITIONAL([ENABLE_CRYPT_XLATOR], [test x$BUILD_CRYPT_XLATOR = xyes])
+
AC_SUBST(FUSE_CLIENT_SUBDIR)
# end FUSE section
# FUSERMOUNT section
AC_ARG_ENABLE([fusermount],
- AC_HELP_STRING([--enable-fusermount],
- [Build fusermount]))
+ AC_HELP_STRING([--disable-fusermount],
+ [Use system's fusermount]))
-BUILD_FUSERMOUNT="no"
-if test "x$enable_fusermount" = "xyes"; then
- FUSERMOUNT_SUBDIR="contrib/fuse-util"
- BUILD_FUSERMOUNT="yes"
+BUILD_FUSERMOUNT="yes"
+if test "x$enable_fusermount" = "xno"; then
+ BUILD_FUSERMOUNT="no"
+else
AC_DEFINE(GF_FUSERMOUNT, 1, [Use our own fusermount])
+ FUSERMOUNT_SUBDIR="contrib/fuse-util"
fi
AC_SUBST(FUSERMOUNT_SUBDIR)
#end FUSERMOUNT section
+# QEMU_BLOCK section
+
+AC_ARG_ENABLE([qemu-block],
+ AC_HELP_STRING([--enable-qemu-block],
+ [Build QEMU Block formats translator]))
+
+if test "x$enable_qemu_block" != "xno"; then
+ PKG_CHECK_MODULES([GLIB], [glib-2.0],
+ [HAVE_GLIB_2="yes"],
+ [HAVE_GLIB_2="no"])
+fi
+
+if test "x$enable_qemu_block" = "xyes" -a "x$HAVE_GLIB_2" = "xno"; then
+ echo "QEMU Block formats translator requires libglib-2.0, but missing."
+ exit 1
+fi
+
+BUILD_QEMU_BLOCK=no
+if test "x${enable_qemu_block}" != "xno" -a "x${HAVE_GLIB_2}" = "xyes"; then
+ BUILD_QEMU_BLOCK=yes
+ AC_DEFINE(HAVE_QEMU_BLOCK, 1, [define if libglib-2.0 library found and QEMU
+ Block translator enabled])
+fi
+
+AM_CONDITIONAL([ENABLE_QEMU_BLOCK], [test x$BUILD_QEMU_BLOCK = xyes])
+
+# end QEMU_BLOCK section
# EPOLL section
AC_ARG_ENABLE([epoll],
- AC_HELP_STRING([--disable-epoll],
- [Use poll instead of epoll.]))
+ AC_HELP_STRING([--disable-epoll],
+ [Use poll instead of epoll.]))
BUILD_EPOLL=no
if test "x$enable_epoll" != "xno"; then
AC_CHECK_HEADERS([sys/epoll.h],
[BUILD_EPOLL=yes],
- [BUILD_EPOLL=no])
+ [BUILD_EPOLL=no])
fi
# end EPOLL section
# IBVERBS section
AC_ARG_ENABLE([ibverbs],
- AC_HELP_STRING([--disable-ibverbs],
- [Do not build the ibverbs transport]))
+ AC_HELP_STRING([--disable-ibverbs],
+ [Do not build the ibverbs transport]))
if test "x$enable_ibverbs" != "xno"; then
AC_CHECK_LIB([ibverbs],
[ibv_get_device_list],
- [HAVE_LIBIBVERBS="yes"],
- [HAVE_LIBIBVERBS="no"])
+ [HAVE_LIBIBVERBS="yes"],
+ [HAVE_LIBIBVERBS="no"])
+ AC_CHECK_LIB([rdmacm], [rdma_create_id], [HAVE_RDMACM="yes"], [HAVE_RDMACM="no"])
fi
-if test "x$enable_ibverbs" = "xyes" -a "x$HAVE_LIBIBVERBS" = "xno"; then
- echo "ibverbs requested but not found."
- exit 1
+if test "x$enable_ibverbs" = "xyes"; then
+ if test "x$HAVE_LIBIBVERBS" = "xno"; then
+ echo "ibverbs-transport requested, but libibverbs is not present."
+ exit 1
+ fi
+
+ if test "x$HAVE_RDMACM" = "xno"; then
+ echo "ibverbs-transport requested, but librdmacm is not present."
+ exit 1
+ fi
fi
BUILD_RDMA=no
BUILD_IBVERBS=no
-if test "x$enable_ibverbs" != "xno" -a "x$HAVE_LIBIBVERBS" = "xyes"; then
+if test "x$enable_ibverbs" != "xno" -a "x$HAVE_LIBIBVERBS" = "xyes" -a "x$HAVE_RDMACM" = "xyes"; then
IBVERBS_SUBDIR=ib-verbs
BUILD_IBVERBS=yes
RDMA_SUBDIR=rdma
@@ -320,8 +465,8 @@ AC_SUBST(RDMA_SUBDIR)
# SYNCDAEMON section
AC_ARG_ENABLE([georeplication],
- AC_HELP_STRING([--disable-georeplication],
- [Do not install georeplication components]))
+ AC_HELP_STRING([--disable-georeplication],
+ [Do not install georeplication components]))
BUILD_SYNCDAEMON=no
case $host_os in
@@ -333,12 +478,12 @@ case $host_os in
;;
*)
#disabling geo replication for non-linux platforms
- enable_georeplication=no
+ enable_georeplication=no
;;
esac
SYNCDAEMON_COMPILE=0
if test "x$enable_georeplication" != "xno"; then
- SYNCDAEMON_SUBDIR=utils
+ SYNCDAEMON_SUBDIR=geo-replication
SYNCDAEMON_COMPILE=1
BUILD_SYNCDAEMON="yes"
@@ -362,6 +507,17 @@ AC_SUBST(SYNCDAEMON_COMPILE)
AC_SUBST(SYNCDAEMON_SUBDIR)
# end SYNCDAEMON section
+# CDC xlator - check if libz is present if so enable HAVE_LIB_Z
+echo -n "checking if libz is present... "
+
+PKG_CHECK_MODULES([ZLIB], [zlib >= 1.2.0],
+ [echo "yes (features requiring zlib enabled)" AC_DEFINE(HAVE_LIB_Z, 1, [define if zlib is present])],
+ [echo "no"] )
+
+AC_SUBST(LIBZ_CFLAGS)
+AC_SUBST(LIBZ_LIBS)
+# end CDC xlator secion
+
# check for systemtap/dtrace
BUILD_SYSTEMTAP=no
AC_MSG_CHECKING([whether to include systemtap tracing support])
@@ -370,7 +526,7 @@ AC_ARG_ENABLE([systemtap],
[Enable inclusion of systemtap trace support])],
[ENABLE_SYSTEMTAP="${enableval}"], [ENABLE_SYSTEMTAP="def"])
-AM_CONDITIONAL([ENABLE_SYSTEMTAP], [test "x${ENABLE_SYSTEMTAP}" == "xyes"])
+AM_CONDITIONAL([ENABLE_SYSTEMTAP], [test "x${ENABLE_SYSTEMTAP}" = "xyes"])
AC_MSG_RESULT(${ENABLE_SYSTEMTAP})
if test "x${ENABLE_SYSTEMTAP}" != "xno"; then
@@ -379,30 +535,39 @@ if test "x${ENABLE_SYSTEMTAP}" != "xno"; then
[SDT_H_FOUND="no"])
fi
-if test "x${ENABLE_SYSTEMTAP}" == "xyes"; then
- if test "x${DTRACE}" == "xno"; then
+if test "x${ENABLE_SYSTEMTAP}" = "xyes"; then
+ if test "x${DTRACE}" = "xno"; then
AC_MSG_ERROR([dtrace not found])
- elif test "$x{SDT_H_FOUND}" == "xno"; then
+ elif test "$x{SDT_H_FOUND}" = "xno"; then
AC_MSG_ERROR([systemtap support needs sys/sdt.h header])
fi
fi
-if test "x${DTRACE}" == "xyes" -a "x${SDT_H_FOUND}" == "xyes"; then
+if test "x${DTRACE}" = "xyes" -a "x${SDT_H_FOUND}" = "xyes"; then
AC_MSG_CHECKING([x"${DTRACE}"xy"${SDT_H_FOUND}"y])
AC_DEFINE([HAVE_SYSTEMTAP], [1], [Define to 1 if using probes.])
BUILD_SYSTEMTAP=yes
fi
# end of systemtap/dtrace
-#check if libxml is present if so enable HAVE_LIB_XML
-echo -n "checking if libxml2 is present... "
-
-PKG_CHECK_MODULES([LIBXML2], [libxml-2.0 >= 2.6.19],
- [echo "yes (features requiring libxml2 enabled)" AC_DEFINE(HAVE_LIB_XML, 1, [define if libxml2 is present])],
- [echo "no"] )
-
-AC_SUBST(LIBXML2_CFLAGS)
-AC_SUBST(LIBXML2_LIBS)
+# xml-output
+AC_ARG_ENABLE([xml-output],
+ AC_HELP_STRING([--disable-xml-output],
+ [Disable the xml output]))
+BUILD_XML_OUTPUT="yes"
+if test "x$enable_xml_output" != "xno"; then
+ #check if libxml is present if so enable HAVE_LIB_XML
+ m4_ifdef([AM_PATH_XML2],[AM_PATH_XML2([2.6.19])], [no_xml=yes])
+ if test "x${no_xml}" = "x"; then
+ AC_DEFINE([HAVE_LIB_XML], [1], [Define to 1 if using libxml2.])
+ else
+ AC_MSG_WARN([libxml2 devel libraries not found disabling XML support])
+ BUILD_XML_OUTPUT="no"
+ fi
+else
+ BUILD_XML_OUTPUT="no"
+fi
+# end of xml-output
dnl FreeBSD > 5 has execinfo as a Ported library for giving a workaround
dnl solution to GCC backtrace functionality
@@ -428,9 +593,9 @@ AC_CHECK_MEMBERS([struct stat.st_atim.tv_nsec])
dnl FreeBSD, NetBSD
AC_CHECK_MEMBERS([struct stat.st_atimespec.tv_nsec])
case $host_os in
- *netbsd*)
- CFLAGS=-D_INCOMPLETE_XOPEN_C063
- ;;
+ *netbsd*)
+ CFLAGS+=" -D_INCOMPLETE_XOPEN_C063"
+ ;;
esac
AC_CHECK_FUNC([linkat], [have_linkat=yes])
if test "x${have_linkat}" = "xyes"; then
@@ -438,11 +603,15 @@ if test "x${have_linkat}" = "xyes"; then
fi
AC_SUBST(HAVE_LINKAT)
+dnl check for Monotonic clock
+AC_CHECK_FUNC([clock_gettime], [has_monotonic_clock=yes], AC_CHECK_LIB([rt], [clock_gettime], , AC_MSG_WARN([System doesn't have monotonic clock using contrib])))
+
dnl Check for argp
AC_CHECK_HEADER([argp.h], AC_DEFINE(HAVE_ARGP, 1, [have argp]))
AC_CONFIG_SUBDIRS(argp-standalone)
+
BUILD_ARGP_STANDALONE=no
-if test "x${ac_cv_header_argp_h}" = "xno"; then
+if test "x${ac_cv_header_argp_h}" = "xno"; then
BUILD_ARGP_STANDALONE=yes
ARGP_STANDALONE_CPPFLAGS='-I${top_srcdir}/argp-standalone'
ARGP_STANDALONE_LDADD='${top_builddir}/argp-standalone/libargp.a'
@@ -463,7 +632,18 @@ if test "x${have_fdatasync}" = "xyes"; then
AC_DEFINE(HAVE_FDATASYNC, 1, [define if fdatasync exists])
fi
-# Check the distribution where you are compiling glusterfs on
+AC_CHECK_FUNC([fallocate], [have_fallocate=yes])
+if test "x${have_fallocate}" = "xyes"; then
+ AC_DEFINE(HAVE_FALLOCATE, 1, [define if fallocate exists])
+fi
+
+AC_CHECK_FUNC([posix_fallocate], [have_posix_fallocate=yes])
+if test "x${have_posix_fallocate}" = "xyes"; then
+ AC_DEFINE(HAVE_POSIX_FALLOCATE, 1, [define if posix_fallocate exists])
+fi
+
+
+# Check the distribution where you are compiling glusterfs on
GF_DISTRIBUTION=
AC_CHECK_FILE([/etc/debian_version])
@@ -484,63 +664,85 @@ AC_SUBST(GF_DISTRIBUTION)
GF_HOST_OS=""
GF_LDFLAGS="-rdynamic"
-CFLAGS="-g"
+
+# check for gcc -Werror=format-security
+saved_CFLAGS=$CFLAGS
+CFLAGS="-Wformat -Werror=format-security"
+AC_MSG_CHECKING([whether $CC accepts -Werror=format-security])
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM()], [cc_werror_format_security=yes], [cc_werror_format_security=no])
+echo $cc_werror_format_security
+if test "x$cc_werror_format_security" = "xno"; then
+ CFLAGS="$saved_CFLAGS"
+else
+ CFLAGS="$saved_CFLAGS $CFLAGS"
+fi
+
+# check for gcc -Werror=implicit-function-declaration
+saved_CFLAGS=$CFLAGS
+CFLAGS="-Werror=implicit-function-declaration"
+AC_MSG_CHECKING([whether $CC accepts -Werror=implicit-function-declaration])
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM()], [cc_werror_implicit=yes], [cc_werror_implicit=no])
+echo $cc_werror_implicit
+if test "x$cc_werror_implicit" = "xno"; then
+ CFLAGS="$saved_CFLAGS"
+else
+ CFLAGS="$saved_CFLAGS $CFLAGS"
+fi
case $host_os in
linux*)
- dnl GF_LINUX_HOST_OS=1
GF_HOST_OS="GF_LINUX_HOST_OS"
- GF_CFLAGS="${ARGP_STANDALONE_CPPFLAGS} -O0"
- GF_GLUSTERFS_CFLAGS="${GF_CFLAGS}"
- GF_LDADD="${ARGP_STANDALONE_LDADD}"
- GF_FUSE_CFLAGS="-DFUSERMOUNT_DIR=\\\"\$(bindir)\\\""
- ;;
+ GF_CFLAGS="${ARGP_STANDALONE_CPPFLAGS} -O0"
+ GF_GLUSTERFS_CFLAGS="${GF_CFLAGS}"
+ GF_LDADD="${ARGP_STANDALONE_LDADD}"
+ GF_FUSE_CFLAGS="-DFUSERMOUNT_DIR=\\\"\$(bindir)\\\""
+ ;;
solaris*)
GF_HOST_OS="GF_SOLARIS_HOST_OS"
- GF_CFLAGS="${ARGP_STANDALONE_CPPFLAGS} -D_REENTRANT -D_POSIX_PTHREAD_SEMANTICS -O0 -m64"
- GF_LDFLAGS=""
- GF_GLUSTERFS_CFLAGS="${GF_CFLAGS}"
- GF_LDADD="${ARGP_STANDALONE_LDADD}"
- GF_GLUSTERFS_LIBS="-lnsl -lresolv -lsocket"
+ GF_CFLAGS="${ARGP_STANDALONE_CPPFLAGS} -D_REENTRANT -D_POSIX_PTHREAD_SEMANTICS -O0 -m64"
+ GF_LDFLAGS=""
+ GF_GLUSTERFS_CFLAGS="${GF_CFLAGS}"
+ GF_LDADD="${ARGP_STANDALONE_LDADD}"
+ GF_GLUSTERFS_LIBS="-lnsl -lresolv -lsocket"
BUILD_FUSE_CLIENT=no
FUSE_CLIENT_SUBDIR=""
- ;;
+ ;;
*netbsd*)
- GF_HOST_OS="GF_BSD_HOST_OS"
- GF_CFLAGS="${ARGP_STANDALONE_CPPFLAGS} -D_INCOMPLETE_XOPEN_C063"
- GF_CFLAGS="${GF_CFLAGS} -DTHREAD_UNSAFE_BASENAME"
- GF_CFLAGS="${GF_CFLAGS} -DTHREAD_UNSAFE_DIRNAME"
- GF_GLUSTERFS_CFLAGS="${GF_CFLAGS}"
- GF_LDADD="${ARGP_STANDALONE_LDADD}"
- if test "x$ac_cv_header_execinfo_h" = "xyes"; then
- GF_GLUSTERFS_LIBS="-lexecinfo"
- fi
- GF_FUSE_LDADD="-lperfuse"
- BUILD_FUSE_CLIENT=yes
- LEXLIB=""
- ;;
+ GF_HOST_OS="GF_BSD_HOST_OS"
+ GF_CFLAGS="${ARGP_STANDALONE_CPPFLAGS} -D_INCOMPLETE_XOPEN_C063"
+ GF_CFLAGS="${GF_CFLAGS} -DTHREAD_UNSAFE_BASENAME"
+ GF_CFLAGS="${GF_CFLAGS} -DTHREAD_UNSAFE_DIRNAME"
+ GF_GLUSTERFS_CFLAGS="${GF_CFLAGS}"
+ GF_LDADD="${ARGP_STANDALONE_LDADD}"
+ if test "x$ac_cv_header_execinfo_h" = "xyes"; then
+ GF_GLUSTERFS_LIBS="-lexecinfo"
+ fi
+ GF_FUSE_LDADD="-lperfuse"
+ BUILD_FUSE_CLIENT=yes
+ LEXLIB=""
+ ;;
*bsd*)
GF_HOST_OS="GF_BSD_HOST_OS"
- GF_CFLAGS="${ARGP_STANDALONE_CPPFLAGS} -O0"
- GF_CFLAGS="${GF_CFLAGS} -DTHREAD_UNSAFE_BASENAME"
- GF_CFLAGS="${GF_CFLAGS} -DTHREAD_UNSAFE_DIRNAME"
- GF_GLUSTERFS_CFLAGS="${GF_CFLAGS}"
- GF_LDADD="${ARGP_STANDALONE_LDADD}"
- if test "x$ac_cv_header_execinfo_h" = "xyes"; then
- GF_GLUSTERFS_LIBS="-lexecinfo"
- fi
- BUILD_FUSE_CLIENT=no
- ;;
+ GF_CFLAGS="${ARGP_STANDALONE_CPPFLAGS} -O0"
+ GF_CFLAGS="${GF_CFLAGS} -DTHREAD_UNSAFE_BASENAME"
+ GF_CFLAGS="${GF_CFLAGS} -DTHREAD_UNSAFE_DIRNAME"
+ GF_GLUSTERFS_CFLAGS="${GF_CFLAGS}"
+ GF_LDADD="${ARGP_STANDALONE_LDADD}"
+ if test "x$ac_cv_header_execinfo_h" = "xyes"; then
+ GF_GLUSTERFS_LIBS="-lexecinfo"
+ fi
+ BUILD_FUSE_CLIENT=no
+ ;;
darwin*)
GF_HOST_OS="GF_DARWIN_HOST_OS"
- LIBTOOL=glibtool
- GF_CFLAGS="${ARGP_STANDALONE_CPPFLAGS} -D__DARWIN_64_BIT_INO_T -bundle -undefined suppress -flat_namespace -D_XOPEN_SOURCE -O0"
- GF_CFLAGS="${GF_CFLAGS} -DTHREAD_UNSAFE_BASENAME"
- GF_CFLAGS="${GF_CFLAGS} -DTHREAD_UNSAFE_DIRNAME"
- GF_GLUSTERFS_CFLAGS="${ARGP_STANDALONE_CPPFLAGS} -D__DARWIN_64_BIT_INO_T -undefined suppress -flat_namespace -O0"
- GF_LDADD="${ARGP_STANDALONE_LDADD}"
- GF_FUSE_CFLAGS="-I\$(CONTRIBDIR)/macfuse"
- ;;
+ LIBTOOL=glibtool
+ GF_CFLAGS="${ARGP_STANDALONE_CPPFLAGS} -D__DARWIN_64_BIT_INO_T -bundle -undefined suppress -flat_namespace -D_XOPEN_SOURCE -O0"
+ GF_CFLAGS="${GF_CFLAGS} -DTHREAD_UNSAFE_BASENAME"
+ GF_CFLAGS="${GF_CFLAGS} -DTHREAD_UNSAFE_DIRNAME"
+ GF_GLUSTERFS_CFLAGS="${ARGP_STANDALONE_CPPFLAGS} -D__DARWIN_64_BIT_INO_T -undefined suppress -flat_namespace -O0"
+ GF_LDADD="${ARGP_STANDALONE_LDADD}"
+ GF_FUSE_CFLAGS="-I\$(CONTRIBDIR)/macfuse"
+ ;;
esac
# enable debug section
@@ -548,17 +750,30 @@ AC_ARG_ENABLE([debug],
AC_HELP_STRING([--enable-debug],
[Enable debug build options.]))
-DEBUG=no
+BUILD_DEBUG=no
if test "x$enable_debug" = "xyes"; then
- DEBUG=yes
- CFLAGS="-O0 $CFLAGS"
+ BUILD_DEBUG=yes
+ CFLAGS=`echo $CFLAGS | sed -e s/O2/O0/`
else
- CFLAGS="-O2 $CFLAGS"
- DEBUG=no
+ BUILD_DEBUG=no
fi
AC_SUBST(CFLAGS)
# end enable debug section
+# syslog section
+AC_ARG_ENABLE([syslog],
+ AC_HELP_STRING([--disable-syslog],
+ [Disable syslog for logging]))
+
+USE_SYSLOG="yes"
+if test "x$enable_syslog" != "xno"; then
+ AC_DEFINE(GF_USE_SYSLOG, 1, [Use syslog for logging])
+else
+ USE_SYSLOG="no"
+fi
+AM_CONDITIONAL([ENABLE_SYSLOG], [test x$USE_SYSLOG = xyes])
+#end syslog section
+
BUILD_READLINE=no
AC_CHECK_LIB([readline -lcurses],[readline],[RLLIBS="-lreadline -lcurses"])
AC_CHECK_LIB([readline -ltermcap],[readline],[RLLIBS="-lreadline -ltermcap"])
@@ -577,6 +792,55 @@ if test "x$LIBAIO" != "x"; then
BUILD_LIBAIO=yes
fi
+# glupy section
+BUILD_GLUPY=no
+have_python2=no
+have_Python_h=no
+
+AM_PATH_PYTHON()
+if echo $PYTHON_VERSION | grep ^2; then
+ have_python2=yes
+fi
+AC_CHECK_HEADERS([python$PYTHON_VERSION/Python.h],[have_Python_h=yes],[])
+AC_ARG_ENABLE([glupy],
+ AS_HELP_STRING([--enable-glupy],
+ [build glupy]))
+case x$enable_glupy in
+ xyes)
+ if test "x$have_python2" = "xyes" -a "x$have_Python_h" = "xyes"; then
+ BUILD_GLUPY=yes
+ else
+ AC_MSG_ERROR([glupy requires python-devel/python-dev package and python2.x])
+ fi
+ ;;
+ xno)
+ ;;
+ *)
+ if test "x$have_python2" = "xyes" -a "x$have_Python_h" = "xyes"; then
+ BUILD_GLUPY=yes
+ else
+ AC_MSG_WARN([
+ ---------------------------------------------------------------------------------
+ cannot build glupy. python 2.x and python-devel/python-dev package are required.
+ ---------------------------------------------------------------------------------])
+ fi
+ ;;
+esac
+
+if test "x$BUILD_GLUPY" = "xyes"; then
+ BUILD_PYTHON_INC=`$PYTHON -c "from distutils import sysconfig; print sysconfig.get_python_inc()"`
+ BUILD_PYTHON_LIB=python$PYTHON_VERSION
+ GLUPY_SUBDIR=glupy
+ GLUPY_SUBDIR_MAKEFILE=xlators/features/glupy/Makefile
+ GLUPY_SUBDIR_SRC_MAKEFILE=xlators/features/glupy/src/Makefile
+ echo "building glupy with -isystem $BUILD_PYTHON_INC -l $BUILD_PYTHON_LIB"
+ AC_SUBST(BUILD_PYTHON_INC)
+ AC_SUBST(BUILD_PYTHON_LIB)
+ AC_SUBST(GLUPY_SUBDIR)
+ AC_SUBST(GLUPY_SUBDIR_MAKEFILE)
+ AC_SUBST(GLUPY_SUBDIR_SRC_MAKEFILE)
+fi
+# end glupy section
AC_SUBST(GF_HOST_OS)
AC_SUBST([GF_GLUSTERFS_LIBS])
@@ -608,14 +872,20 @@ AC_OUTPUT
echo
echo "GlusterFS configure summary"
echo "==========================="
-echo "FUSE client : $BUILD_FUSE_CLIENT"
-echo "Infiniband verbs : $BUILD_IBVERBS"
-echo "epoll IO multiplex : $BUILD_EPOLL"
-echo "argp-standalone : $BUILD_ARGP_STANDALONE"
-echo "fusermount : $BUILD_FUSERMOUNT"
-echo "readline : $BUILD_READLINE"
-echo "georeplication : $BUILD_SYNCDAEMON"
-echo "Linux-AIO : $BUILD_LIBAIO"
-echo "Enable Debug : $DEBUG"
-echo "systemtap : $BUILD_SYSTEMTAP"
+echo "FUSE client : $BUILD_FUSE_CLIENT"
+echo "Infiniband verbs : $BUILD_IBVERBS"
+echo "epoll IO multiplex : $BUILD_EPOLL"
+echo "argp-standalone : $BUILD_ARGP_STANDALONE"
+echo "fusermount : $BUILD_FUSERMOUNT"
+echo "readline : $BUILD_READLINE"
+echo "georeplication : $BUILD_SYNCDAEMON"
+echo "Linux-AIO : $BUILD_LIBAIO"
+echo "Enable Debug : $BUILD_DEBUG"
+echo "systemtap : $BUILD_SYSTEMTAP"
+echo "Block Device xlator : $BUILD_BD_XLATOR"
+echo "glupy : $BUILD_GLUPY"
+echo "Use syslog : $USE_SYSLOG"
+echo "XML output : $BUILD_XML_OUTPUT"
+echo "QEMU Block formats : $BUILD_QEMU_BLOCK"
+echo "Encryption xlator : $BUILD_CRYPT_XLATOR"
echo
diff --git a/contrib/fuse-include/fuse-mount.h b/contrib/fuse-include/fuse-mount.h
index 7a3756d92..9358ac810 100644
--- a/contrib/fuse-include/fuse-mount.h
+++ b/contrib/fuse-include/fuse-mount.h
@@ -8,5 +8,6 @@
*/
void gf_fuse_unmount (const char *mountpoint, int fd);
-int gf_fuse_mount (const char *mountpoint, char *fsname, char *mnt_param,
+int gf_fuse_mount (const char *mountpoint, char *fsname,
+ unsigned long mountflags, char *mnt_param,
pid_t *mtab_pid, int status_fd);
diff --git a/contrib/fuse-include/fuse_kernel.h b/contrib/fuse-include/fuse_kernel.h
index 9ae25d6f9..60bb2f9f7 100644
--- a/contrib/fuse-include/fuse_kernel.h
+++ b/contrib/fuse-include/fuse_kernel.h
@@ -60,23 +60,75 @@
* 7.13
* - make max number of background requests and congestion threshold
* tunables
+ *
+ * 7.14
+ * - add splice support to fuse device
+ *
+ * 7.15
+ * - add store notify
+ * - add retrieve notify
+ *
+ * 7.16
+ * - add BATCH_FORGET request
+ * - FUSE_IOCTL_UNRESTRICTED shall now return with array of 'struct
+ * fuse_ioctl_iovec' instead of ambiguous 'struct iovec'
+ * - add FUSE_IOCTL_32BIT flag
+ *
+ * 7.17
+ * - add FUSE_FLOCK_LOCKS and FUSE_RELEASE_FLOCK_UNLOCK
+ *
+ * 7.18
+ * - add FUSE_IOCTL_DIR flag
+ * - add FUSE_NOTIFY_DELETE
+ *
+ * 7.19
+ * - add FUSE_FALLOCATE
+ *
+ * 7.20
+ * - add FUSE_AUTO_INVAL_DATA
+ *
+ * 7.21
+ * - add FUSE_READDIRPLUS
+ * - send the requested events in POLL request
+ *
+ * 7.22
+ * - add FUSE_ASYNC_DIO
*/
#ifndef _LINUX_FUSE_H
#define _LINUX_FUSE_H
-#include <sys/types.h>
-#define __u64 uint64_t
-#define __s64 int64_t
-#define __u32 uint32_t
-#define __s32 int32_t
-#define __u16 uint16_t
+#ifdef __KERNEL__
+#include <linux/types.h>
+#else
+#include <stdint.h>
+#endif
+
+/*
+ * Version negotiation:
+ *
+ * Both the kernel and userspace send the version they support in the
+ * INIT request and reply respectively.
+ *
+ * If the major versions match then both shall use the smallest
+ * of the two minor versions for communication.
+ *
+ * If the kernel supports a larger major version, then userspace shall
+ * reply with the major version it supports, ignore the rest of the
+ * INIT message and expect a new INIT message from the kernel with a
+ * matching major version.
+ *
+ * If the library supports a larger major version, then it shall fall
+ * back to the major protocol version sent by the kernel for
+ * communication and reply with that major version (and an arbitrary
+ * supported minor version).
+ */
/** Version number of this interface */
#define FUSE_KERNEL_VERSION 7
/** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 13
+#define FUSE_KERNEL_MINOR_VERSION 22
/** The node ID of the root inode */
#define FUSE_ROOT_ID 1
@@ -85,42 +137,42 @@
userspace works under 64bit kernels */
struct fuse_attr {
- __u64 ino;
- __u64 size;
- __u64 blocks;
- __u64 atime;
- __u64 mtime;
- __u64 ctime;
- __u32 atimensec;
- __u32 mtimensec;
- __u32 ctimensec;
- __u32 mode;
- __u32 nlink;
- __u32 uid;
- __u32 gid;
- __u32 rdev;
- __u32 blksize;
- __u32 padding;
+ uint64_t ino;
+ uint64_t size;
+ uint64_t blocks;
+ uint64_t atime;
+ uint64_t mtime;
+ uint64_t ctime;
+ uint32_t atimensec;
+ uint32_t mtimensec;
+ uint32_t ctimensec;
+ uint32_t mode;
+ uint32_t nlink;
+ uint32_t uid;
+ uint32_t gid;
+ uint32_t rdev;
+ uint32_t blksize;
+ uint32_t padding;
};
struct fuse_kstatfs {
- __u64 blocks;
- __u64 bfree;
- __u64 bavail;
- __u64 files;
- __u64 ffree;
- __u32 bsize;
- __u32 namelen;
- __u32 frsize;
- __u32 padding;
- __u32 spare[6];
+ uint64_t blocks;
+ uint64_t bfree;
+ uint64_t bavail;
+ uint64_t files;
+ uint64_t ffree;
+ uint32_t bsize;
+ uint32_t namelen;
+ uint32_t frsize;
+ uint32_t padding;
+ uint32_t spare[6];
};
struct fuse_file_lock {
- __u64 start;
- __u64 end;
- __u32 type;
- __u32 pid; /* tgid */
+ uint64_t start;
+ uint64_t end;
+ uint32_t type;
+ uint32_t pid; /* tgid */
};
/**
@@ -151,8 +203,22 @@ struct fuse_file_lock {
/**
* INIT request/reply flags
*
+ * FUSE_ASYNC_READ: asynchronous read requests
+ * FUSE_POSIX_LOCKS: remote locking for POSIX file locks
+ * FUSE_FILE_OPS: kernel sends file handle for fstat, etc... (not yet supported)
+ * FUSE_ATOMIC_O_TRUNC: handles the O_TRUNC open flag in the filesystem
* FUSE_EXPORT_SUPPORT: filesystem handles lookups of "." and ".."
+ * FUSE_BIG_WRITES: filesystem can handle write size larger than 4kB
* FUSE_DONT_MASK: don't apply umask to file mode on create operations
+ * FUSE_SPLICE_WRITE: kernel supports splice write on the device
+ * FUSE_SPLICE_MOVE: kernel supports splice move on the device
+ * FUSE_SPLICE_READ: kernel supports splice read on the device
+ * FUSE_FLOCK_LOCKS: remote locking for BSD style file locks
+ * FUSE_HAS_IOCTL_DIR: kernel supports ioctl on directories
+ * FUSE_AUTO_INVAL_DATA: automatically invalidate cached pages
+ * FUSE_DO_READDIRPLUS: do READDIRPLUS (READDIR+LOOKUP in one)
+ * FUSE_READDIRPLUS_AUTO: adaptive readdirplus
+ * FUSE_ASYNC_DIO: asynchronous direct I/O submission
*/
#define FUSE_ASYNC_READ (1 << 0)
#define FUSE_POSIX_LOCKS (1 << 1)
@@ -161,6 +227,15 @@ struct fuse_file_lock {
#define FUSE_EXPORT_SUPPORT (1 << 4)
#define FUSE_BIG_WRITES (1 << 5)
#define FUSE_DONT_MASK (1 << 6)
+#define FUSE_SPLICE_WRITE (1 << 7)
+#define FUSE_SPLICE_MOVE (1 << 8)
+#define FUSE_SPLICE_READ (1 << 9)
+#define FUSE_FLOCK_LOCKS (1 << 10)
+#define FUSE_HAS_IOCTL_DIR (1 << 11)
+#define FUSE_AUTO_INVAL_DATA (1 << 12)
+#define FUSE_DO_READDIRPLUS (1 << 13)
+#define FUSE_READDIRPLUS_AUTO (1 << 14)
+#define FUSE_ASYNC_DIO (1 << 15)
/**
* CUSE INIT request/reply flags
@@ -173,6 +248,7 @@ struct fuse_file_lock {
* Release flags
*/
#define FUSE_RELEASE_FLUSH (1 << 0)
+#define FUSE_RELEASE_FLOCK_UNLOCK (1 << 1)
/**
* Getattr flags
@@ -204,12 +280,16 @@ struct fuse_file_lock {
* FUSE_IOCTL_COMPAT: 32bit compat ioctl on 64bit machine
* FUSE_IOCTL_UNRESTRICTED: not restricted to well-formed ioctls, retry allowed
* FUSE_IOCTL_RETRY: retry with new iovecs
+ * FUSE_IOCTL_32BIT: 32bit ioctl
+ * FUSE_IOCTL_DIR: is a directory
*
* FUSE_IOCTL_MAX_IOV: maximum of in_iovecs + out_iovecs
*/
#define FUSE_IOCTL_COMPAT (1 << 0)
#define FUSE_IOCTL_UNRESTRICTED (1 << 1)
#define FUSE_IOCTL_RETRY (1 << 2)
+#define FUSE_IOCTL_32BIT (1 << 3)
+#define FUSE_IOCTL_DIR (1 << 4)
#define FUSE_IOCTL_MAX_IOV 256
@@ -259,6 +339,10 @@ enum fuse_opcode {
FUSE_DESTROY = 38,
FUSE_IOCTL = 39,
FUSE_POLL = 40,
+ FUSE_NOTIFY_REPLY = 41,
+ FUSE_BATCH_FORGET = 42,
+ FUSE_FALLOCATE = 43,
+ FUSE_READDIRPLUS = 44,
/* CUSE specific operations */
CUSE_INIT = 4096,
@@ -268,6 +352,9 @@ enum fuse_notify_code {
FUSE_NOTIFY_POLL = 1,
FUSE_NOTIFY_INVAL_INODE = 2,
FUSE_NOTIFY_INVAL_ENTRY = 3,
+ FUSE_NOTIFY_STORE = 4,
+ FUSE_NOTIFY_RETRIEVE = 5,
+ FUSE_NOTIFY_DELETE = 6,
FUSE_NOTIFY_CODE_MAX,
};
@@ -277,133 +364,143 @@ enum fuse_notify_code {
#define FUSE_COMPAT_ENTRY_OUT_SIZE 120
struct fuse_entry_out {
- __u64 nodeid; /* Inode ID */
- __u64 generation; /* Inode generation: nodeid:gen must
- be unique for the fs's lifetime */
- __u64 entry_valid; /* Cache timeout for the name */
- __u64 attr_valid; /* Cache timeout for the attributes */
- __u32 entry_valid_nsec;
- __u32 attr_valid_nsec;
+ uint64_t nodeid; /* Inode ID */
+ uint64_t generation; /* Inode generation: nodeid:gen must
+ be unique for the fs's lifetime */
+ uint64_t entry_valid; /* Cache timeout for the name */
+ uint64_t attr_valid; /* Cache timeout for the attributes */
+ uint32_t entry_valid_nsec;
+ uint32_t attr_valid_nsec;
struct fuse_attr attr;
};
struct fuse_forget_in {
- __u64 nlookup;
+ uint64_t nlookup;
+};
+
+struct fuse_forget_one {
+ uint64_t nodeid;
+ uint64_t nlookup;
+};
+
+struct fuse_batch_forget_in {
+ uint32_t count;
+ uint32_t dummy;
};
struct fuse_getattr_in {
- __u32 getattr_flags;
- __u32 dummy;
- __u64 fh;
+ uint32_t getattr_flags;
+ uint32_t dummy;
+ uint64_t fh;
};
#define FUSE_COMPAT_ATTR_OUT_SIZE 96
struct fuse_attr_out {
- __u64 attr_valid; /* Cache timeout for the attributes */
- __u32 attr_valid_nsec;
- __u32 dummy;
+ uint64_t attr_valid; /* Cache timeout for the attributes */
+ uint32_t attr_valid_nsec;
+ uint32_t dummy;
struct fuse_attr attr;
};
#define FUSE_COMPAT_MKNOD_IN_SIZE 8
struct fuse_mknod_in {
- __u32 mode;
- __u32 rdev;
- __u32 umask;
- __u32 padding;
+ uint32_t mode;
+ uint32_t rdev;
+ uint32_t umask;
+ uint32_t padding;
};
struct fuse_mkdir_in {
- __u32 mode;
- __u32 umask;
+ uint32_t mode;
+ uint32_t umask;
};
struct fuse_rename_in {
- __u64 newdir;
+ uint64_t newdir;
};
struct fuse_link_in {
- __u64 oldnodeid;
+ uint64_t oldnodeid;
};
struct fuse_setattr_in {
- __u32 valid;
- __u32 padding;
- __u64 fh;
- __u64 size;
- __u64 lock_owner;
- __u64 atime;
- __u64 mtime;
- __u64 unused2;
- __u32 atimensec;
- __u32 mtimensec;
- __u32 unused3;
- __u32 mode;
- __u32 unused4;
- __u32 uid;
- __u32 gid;
- __u32 unused5;
+ uint32_t valid;
+ uint32_t padding;
+ uint64_t fh;
+ uint64_t size;
+ uint64_t lock_owner;
+ uint64_t atime;
+ uint64_t mtime;
+ uint64_t unused2;
+ uint32_t atimensec;
+ uint32_t mtimensec;
+ uint32_t unused3;
+ uint32_t mode;
+ uint32_t unused4;
+ uint32_t uid;
+ uint32_t gid;
+ uint32_t unused5;
};
struct fuse_open_in {
- __u32 flags;
- __u32 unused;
+ uint32_t flags;
+ uint32_t unused;
};
struct fuse_create_in {
- __u32 flags;
- __u32 mode;
- __u32 umask;
- __u32 padding;
+ uint32_t flags;
+ uint32_t mode;
+ uint32_t umask;
+ uint32_t padding;
};
struct fuse_open_out {
- __u64 fh;
- __u32 open_flags;
- __u32 padding;
+ uint64_t fh;
+ uint32_t open_flags;
+ uint32_t padding;
};
struct fuse_release_in {
- __u64 fh;
- __u32 flags;
- __u32 release_flags;
- __u64 lock_owner;
+ uint64_t fh;
+ uint32_t flags;
+ uint32_t release_flags;
+ uint64_t lock_owner;
};
struct fuse_flush_in {
- __u64 fh;
- __u32 unused;
- __u32 padding;
- __u64 lock_owner;
+ uint64_t fh;
+ uint32_t unused;
+ uint32_t padding;
+ uint64_t lock_owner;
};
struct fuse_read_in {
- __u64 fh;
- __u64 offset;
- __u32 size;
- __u32 read_flags;
- __u64 lock_owner;
- __u32 flags;
- __u32 padding;
+ uint64_t fh;
+ uint64_t offset;
+ uint32_t size;
+ uint32_t read_flags;
+ uint64_t lock_owner;
+ uint32_t flags;
+ uint32_t padding;
};
#define FUSE_COMPAT_WRITE_IN_SIZE 24
struct fuse_write_in {
- __u64 fh;
- __u64 offset;
- __u32 size;
- __u32 write_flags;
- __u64 lock_owner;
- __u32 flags;
- __u32 padding;
+ uint64_t fh;
+ uint64_t offset;
+ uint32_t size;
+ uint32_t write_flags;
+ uint64_t lock_owner;
+ uint32_t flags;
+ uint32_t padding;
};
struct fuse_write_out {
- __u32 size;
- __u32 padding;
+ uint32_t size;
+ uint32_t padding;
};
#define FUSE_COMPAT_STATFS_SIZE 48
@@ -413,32 +510,32 @@ struct fuse_statfs_out {
};
struct fuse_fsync_in {
- __u64 fh;
- __u32 fsync_flags;
- __u32 padding;
+ uint64_t fh;
+ uint32_t fsync_flags;
+ uint32_t padding;
};
struct fuse_setxattr_in {
- __u32 size;
- __u32 flags;
+ uint32_t size;
+ uint32_t flags;
};
struct fuse_getxattr_in {
- __u32 size;
- __u32 padding;
+ uint32_t size;
+ uint32_t padding;
};
struct fuse_getxattr_out {
- __u32 size;
- __u32 padding;
+ uint32_t size;
+ uint32_t padding;
};
struct fuse_lk_in {
- __u64 fh;
- __u64 owner;
+ uint64_t fh;
+ uint64_t owner;
struct fuse_file_lock lk;
- __u32 lk_flags;
- __u32 padding;
+ uint32_t lk_flags;
+ uint32_t padding;
};
struct fuse_lk_out {
@@ -446,134 +543,190 @@ struct fuse_lk_out {
};
struct fuse_access_in {
- __u32 mask;
- __u32 padding;
+ uint32_t mask;
+ uint32_t padding;
};
struct fuse_init_in {
- __u32 major;
- __u32 minor;
- __u32 max_readahead;
- __u32 flags;
+ uint32_t major;
+ uint32_t minor;
+ uint32_t max_readahead;
+ uint32_t flags;
};
struct fuse_init_out {
- __u32 major;
- __u32 minor;
- __u32 max_readahead;
- __u32 flags;
- __u16 max_background;
- __u16 congestion_threshold;
- __u32 max_write;
+ uint32_t major;
+ uint32_t minor;
+ uint32_t max_readahead;
+ uint32_t flags;
+ uint16_t max_background;
+ uint16_t congestion_threshold;
+ uint32_t max_write;
};
#define CUSE_INIT_INFO_MAX 4096
struct cuse_init_in {
- __u32 major;
- __u32 minor;
- __u32 unused;
- __u32 flags;
+ uint32_t major;
+ uint32_t minor;
+ uint32_t unused;
+ uint32_t flags;
};
struct cuse_init_out {
- __u32 major;
- __u32 minor;
- __u32 unused;
- __u32 flags;
- __u32 max_read;
- __u32 max_write;
- __u32 dev_major; /* chardev major */
- __u32 dev_minor; /* chardev minor */
- __u32 spare[10];
+ uint32_t major;
+ uint32_t minor;
+ uint32_t unused;
+ uint32_t flags;
+ uint32_t max_read;
+ uint32_t max_write;
+ uint32_t dev_major; /* chardev major */
+ uint32_t dev_minor; /* chardev minor */
+ uint32_t spare[10];
};
struct fuse_interrupt_in {
- __u64 unique;
+ uint64_t unique;
};
struct fuse_bmap_in {
- __u64 block;
- __u32 blocksize;
- __u32 padding;
+ uint64_t block;
+ uint32_t blocksize;
+ uint32_t padding;
};
struct fuse_bmap_out {
- __u64 block;
+ uint64_t block;
};
struct fuse_ioctl_in {
- __u64 fh;
- __u32 flags;
- __u32 cmd;
- __u64 arg;
- __u32 in_size;
- __u32 out_size;
+ uint64_t fh;
+ uint32_t flags;
+ uint32_t cmd;
+ uint64_t arg;
+ uint32_t in_size;
+ uint32_t out_size;
+};
+
+struct fuse_ioctl_iovec {
+ uint64_t base;
+ uint64_t len;
};
struct fuse_ioctl_out {
- __s32 result;
- __u32 flags;
- __u32 in_iovs;
- __u32 out_iovs;
+ int32_t result;
+ uint32_t flags;
+ uint32_t in_iovs;
+ uint32_t out_iovs;
};
struct fuse_poll_in {
- __u64 fh;
- __u64 kh;
- __u32 flags;
- __u32 padding;
+ uint64_t fh;
+ uint64_t kh;
+ uint32_t flags;
+ uint32_t events;
};
struct fuse_poll_out {
- __u32 revents;
- __u32 padding;
+ uint32_t revents;
+ uint32_t padding;
};
struct fuse_notify_poll_wakeup_out {
- __u64 kh;
+ uint64_t kh;
+};
+
+struct fuse_fallocate_in {
+ uint64_t fh;
+ uint64_t offset;
+ uint64_t length;
+ uint32_t mode;
+ uint32_t padding;
};
struct fuse_in_header {
- __u32 len;
- __u32 opcode;
- __u64 unique;
- __u64 nodeid;
- __u32 uid;
- __u32 gid;
- __u32 pid;
- __u32 padding;
+ uint32_t len;
+ uint32_t opcode;
+ uint64_t unique;
+ uint64_t nodeid;
+ uint32_t uid;
+ uint32_t gid;
+ uint32_t pid;
+ uint32_t padding;
};
struct fuse_out_header {
- __u32 len;
- __s32 error;
- __u64 unique;
+ uint32_t len;
+ int32_t error;
+ uint64_t unique;
};
struct fuse_dirent {
- __u64 ino;
- __u64 off;
- __u32 namelen;
- __u32 type;
- char name[0];
+ uint64_t ino;
+ uint64_t off;
+ uint32_t namelen;
+ uint32_t type;
+ char name[];
};
#define FUSE_NAME_OFFSET offsetof(struct fuse_dirent, name)
-#define FUSE_DIRENT_ALIGN(x) (((x) + sizeof(__u64) - 1) & ~(sizeof(__u64) - 1))
+#define FUSE_DIRENT_ALIGN(x) \
+ (((x) + sizeof(uint64_t) - 1) & ~(sizeof(uint64_t) - 1))
#define FUSE_DIRENT_SIZE(d) \
FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen)
+struct fuse_direntplus {
+ struct fuse_entry_out entry_out;
+ struct fuse_dirent dirent;
+};
+
+#define FUSE_NAME_OFFSET_DIRENTPLUS \
+ offsetof(struct fuse_direntplus, dirent.name)
+#define FUSE_DIRENTPLUS_SIZE(d) \
+ FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET_DIRENTPLUS + (d)->dirent.namelen)
+
struct fuse_notify_inval_inode_out {
- __u64 ino;
- __s64 off;
- __s64 len;
+ uint64_t ino;
+ int64_t off;
+ int64_t len;
};
struct fuse_notify_inval_entry_out {
- __u64 parent;
- __u32 namelen;
- __u32 padding;
+ uint64_t parent;
+ uint32_t namelen;
+ uint32_t padding;
+};
+
+struct fuse_notify_delete_out {
+ uint64_t parent;
+ uint64_t child;
+ uint32_t namelen;
+ uint32_t padding;
+};
+
+struct fuse_notify_store_out {
+ uint64_t nodeid;
+ uint64_t offset;
+ uint32_t size;
+ uint32_t padding;
+};
+
+struct fuse_notify_retrieve_out {
+ uint64_t notify_unique;
+ uint64_t nodeid;
+ uint64_t offset;
+ uint32_t size;
+ uint32_t padding;
+};
+
+/* Matches the size of fuse_write_in */
+struct fuse_notify_retrieve_in {
+ uint64_t dummy1;
+ uint64_t offset;
+ uint32_t size;
+ uint32_t dummy2;
+ uint64_t dummy3;
+ uint64_t dummy4;
};
#endif /* _LINUX_FUSE_H */
diff --git a/contrib/fuse-lib/mount-gluster-compat.h b/contrib/fuse-lib/mount-gluster-compat.h
index 17c11e789..4fc20623b 100644
--- a/contrib/fuse-lib/mount-gluster-compat.h
+++ b/contrib/fuse-lib/mount-gluster-compat.h
@@ -33,6 +33,7 @@
#ifdef __NetBSD__
#include <perfuse.h>
#define umount2(dir, flags) unmount(dir, ((flags) != 0) ? MNT_FORCE : 0)
+#define MS_RDONLY MNT_RDONLY
#endif
#ifdef linux
diff --git a/contrib/fuse-lib/mount.c b/contrib/fuse-lib/mount.c
index f02a835b3..922d9e464 100644
--- a/contrib/fuse-lib/mount.c
+++ b/contrib/fuse-lib/mount.c
@@ -100,7 +100,8 @@ escape (char *s)
}
static int
-fuse_mount_fusermount (const char *mountpoint, char *fsname, char *mnt_param,
+fuse_mount_fusermount (const char *mountpoint, char *fsname,
+ unsigned long mountflags, char *mnt_param,
int fd)
{
int pid = -1;
@@ -124,7 +125,8 @@ fuse_mount_fusermount (const char *mountpoint, char *fsname, char *mnt_param,
return -1;
}
ret = asprintf (&fm_mnt_params,
- "%s,fsname=%s,nonempty,subtype=glusterfs",
+ "%s%s,fsname=%s,nonempty,subtype=glusterfs",
+ (mountflags & MS_RDONLY) ? "ro," : "",
mnt_param, efsname);
FREE (efsname);
if (ret == -1) {
@@ -169,7 +171,8 @@ fuse_mount_fusermount (const char *mountpoint, char *fsname, char *mnt_param,
}
static int
-fuse_mount_sys (const char *mountpoint, char *fsname, char *mnt_param, int fd)
+fuse_mount_sys (const char *mountpoint, char *fsname,
+ unsigned long mountflags, char *mnt_param, int fd)
{
int ret = -1;
unsigned mounted = 0;
@@ -185,7 +188,7 @@ fuse_mount_sys (const char *mountpoint, char *fsname, char *mnt_param, int fd)
goto out;
}
- ret = mount (source, mountpoint, fstype, 0,
+ ret = mount (source, mountpoint, fstype, mountflags,
mnt_param_mnt);
if (ret == -1 && errno == ENODEV) {
/* fs subtype support was added by 79c0b2df aka
@@ -198,7 +201,7 @@ fuse_mount_sys (const char *mountpoint, char *fsname, char *mnt_param, int fd)
goto out;
}
- ret = mount (source, mountpoint, fstype, 0,
+ ret = mount (source, mountpoint, fstype, mountflags,
mnt_param_mnt);
}
if (ret == -1)
@@ -209,6 +212,7 @@ fuse_mount_sys (const char *mountpoint, char *fsname, char *mnt_param, int fd)
#ifndef __NetBSD__
if (geteuid () == 0) {
char *newmnt = fuse_mnt_resolve_path ("fuse", mountpoint);
+ char *mnt_param_mtab = NULL;
if (!newmnt) {
ret = -1;
@@ -216,8 +220,17 @@ fuse_mount_sys (const char *mountpoint, char *fsname, char *mnt_param, int fd)
goto out;
}
- ret = fuse_mnt_add_mount ("fuse", source, newmnt, fstype,
- mnt_param);
+ ret = asprintf (&mnt_param_mtab, "%s%s",
+ mountflags & MS_RDONLY ? "ro," : "",
+ mnt_param);
+ if (ret == -1)
+ GFFUSE_LOGERR ("Out of memory");
+ else {
+ ret = fuse_mnt_add_mount ("fuse", source, newmnt,
+ fstype, mnt_param_mtab);
+ FREE (mnt_param_mtab);
+ }
+
FREE (newmnt);
if (ret == -1) {
GFFUSE_LOGERR ("failed to add mtab entry");
@@ -240,7 +253,8 @@ out:
}
int
-gf_fuse_mount (const char *mountpoint, char *fsname, char *mnt_param,
+gf_fuse_mount (const char *mountpoint, char *fsname,
+ unsigned long mountflags, char *mnt_param,
pid_t *mnt_pid, int status_fd)
{
int fd = -1;
@@ -268,19 +282,20 @@ gf_fuse_mount (const char *mountpoint, char *fsname, char *mnt_param,
exit (pid == -1 ? 1 : 0);
}
- ret = fuse_mount_sys (mountpoint, fsname, mnt_param, fd);
+ ret = fuse_mount_sys (mountpoint, fsname, mountflags, mnt_param, fd);
if (ret == -1) {
gf_log ("glusterfs-fuse", GF_LOG_INFO,
"direct mount failed (%s), "
"retry to mount via fusermount",
strerror (errno));
- ret = fuse_mount_fusermount (mountpoint, fsname,
+ ret = fuse_mount_fusermount (mountpoint, fsname, mountflags,
mnt_param, fd);
}
if (ret == -1)
- GFFUSE_LOGERR ("mount failed");
+ GFFUSE_LOGERR ("mount of %s to %s (%s) failed",
+ fsname, mountpoint, mnt_param);
if (status_fd >= 0)
(void)write (status_fd, &ret, sizeof (ret));
diff --git a/contrib/libgen/basename_r.c b/contrib/libgen/basename_r.c
index d63bce4e5..2c3a87afe 100644
--- a/contrib/libgen/basename_r.c
+++ b/contrib/libgen/basename_r.c
@@ -1,10 +1,10 @@
-#ifdef THREAD_UNSAFE_BASENAME
/*
- * borrowed from glibc-2.12.1/string/basename.c
+ * borrowed from glibc-2.12.1/string/basename.c
* Modified to return "." for NULL or "", as required for SUSv2.
*/
#include <string.h>
#include <stdlib.h>
+#ifdef THREAD_UNSAFE_BASENAME
/* Return the name-within-directory of a file name.
Copyright (C) 1996,97,98,2002 Free Software Foundation, Inc.
diff --git a/contrib/libgen/dirname_r.c b/contrib/libgen/dirname_r.c
index 02981f5ec..131cbcf2a 100644
--- a/contrib/libgen/dirname_r.c
+++ b/contrib/libgen/dirname_r.c
@@ -1,4 +1,3 @@
-#ifdef THREAD_UNSAFE_DIRNAME
/*
* Borrowed from glibc-2.12.1/string/memrchr.c
* Based on strlen implementation by Torbjorn Granlund (tege@sics.se),
@@ -7,6 +6,7 @@
*/
#include <string.h>
#include <stdlib.h>
+#ifdef THREAD_UNSAFE_DIRNAME
/* memrchr -- find the last occurrence of a byte in a memory block
Copyright (C) 1991, 93, 96, 97, 99, 2000 Free Software Foundation, Inc.
diff --git a/contrib/qemu/block.c b/contrib/qemu/block.c
new file mode 100644
index 000000000..b56024113
--- /dev/null
+++ b/contrib/qemu/block.c
@@ -0,0 +1,4604 @@
+/*
+ * QEMU System Emulator block driver
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "config-host.h"
+#include "qemu-common.h"
+#include "trace.h"
+#include "monitor/monitor.h"
+#include "block/block_int.h"
+#include "block/blockjob.h"
+#include "qemu/module.h"
+#include "qapi/qmp/qjson.h"
+#include "sysemu/sysemu.h"
+#include "qemu/notify.h"
+#include "block/coroutine.h"
+#include "qmp-commands.h"
+#include "qemu/timer.h"
+
+#ifdef CONFIG_BSD
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/queue.h>
+#ifndef __DragonFly__
+#include <sys/disk.h>
+#endif
+#endif
+
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
+
+typedef enum {
+ BDRV_REQ_COPY_ON_READ = 0x1,
+ BDRV_REQ_ZERO_WRITE = 0x2,
+} BdrvRequestFlags;
+
+static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
+static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque);
+static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque);
+static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ QEMUIOVector *iov);
+static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ QEMUIOVector *iov);
+static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+ BdrvRequestFlags flags);
+static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+ BdrvRequestFlags flags);
+static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque,
+ bool is_write);
+static void coroutine_fn bdrv_co_do_rw(void *opaque);
+static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors);
+
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+ bool is_write, double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+ double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+ bool is_write, int64_t *wait);
+
+static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
+ QTAILQ_HEAD_INITIALIZER(bdrv_states);
+
+static QLIST_HEAD(, BlockDriver) bdrv_drivers =
+ QLIST_HEAD_INITIALIZER(bdrv_drivers);
+
+/* If non-zero, use only whitelisted block drivers */
+static int use_bdrv_whitelist;
+
+#ifdef _WIN32
+static int is_windows_drive_prefix(const char *filename)
+{
+ return (((filename[0] >= 'a' && filename[0] <= 'z') ||
+ (filename[0] >= 'A' && filename[0] <= 'Z')) &&
+ filename[1] == ':');
+}
+
+int is_windows_drive(const char *filename)
+{
+ if (is_windows_drive_prefix(filename) &&
+ filename[2] == '\0')
+ return 1;
+ if (strstart(filename, "\\\\.\\", NULL) ||
+ strstart(filename, "//./", NULL))
+ return 1;
+ return 0;
+}
+#endif
+
+/* throttling disk I/O limits */
+void bdrv_io_limits_disable(BlockDriverState *bs)
+{
+ bs->io_limits_enabled = false;
+
+ while (qemu_co_queue_next(&bs->throttled_reqs));
+
+ if (bs->block_timer) {
+ qemu_del_timer(bs->block_timer);
+ qemu_free_timer(bs->block_timer);
+ bs->block_timer = NULL;
+ }
+
+ bs->slice_start = 0;
+ bs->slice_end = 0;
+}
+
+static void bdrv_block_timer(void *opaque)
+{
+ BlockDriverState *bs = opaque;
+
+ qemu_co_queue_next(&bs->throttled_reqs);
+}
+
+void bdrv_io_limits_enable(BlockDriverState *bs)
+{
+ qemu_co_queue_init(&bs->throttled_reqs);
+ bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
+ bs->io_limits_enabled = true;
+}
+
+bool bdrv_io_limits_enabled(BlockDriverState *bs)
+{
+ BlockIOLimit *io_limits = &bs->io_limits;
+ return io_limits->bps[BLOCK_IO_LIMIT_READ]
+ || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
+ || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
+ || io_limits->iops[BLOCK_IO_LIMIT_READ]
+ || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
+ || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
+}
+
+static void bdrv_io_limits_intercept(BlockDriverState *bs,
+ bool is_write, int nb_sectors)
+{
+ int64_t wait_time = -1;
+
+ if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
+ qemu_co_queue_wait(&bs->throttled_reqs);
+ }
+
+ /* In fact, we hope to keep each request's timing, in FIFO mode. The next
+ * throttled requests will not be dequeued until the current request is
+ * allowed to be serviced. So if the current request still exceeds the
+ * limits, it will be inserted to the head. All requests followed it will
+ * be still in throttled_reqs queue.
+ */
+
+ while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
+ qemu_mod_timer(bs->block_timer,
+ wait_time + qemu_get_clock_ns(vm_clock));
+ qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
+ }
+
+ qemu_co_queue_next(&bs->throttled_reqs);
+}
+
+/* check if the path starts with "<protocol>:" */
+static int path_has_protocol(const char *path)
+{
+ const char *p;
+
+#ifdef _WIN32
+ if (is_windows_drive(path) ||
+ is_windows_drive_prefix(path)) {
+ return 0;
+ }
+ p = path + strcspn(path, ":/\\");
+#else
+ p = path + strcspn(path, ":/");
+#endif
+
+ return *p == ':';
+}
+
+int path_is_absolute(const char *path)
+{
+#ifdef _WIN32
+ /* specific case for names like: "\\.\d:" */
+ if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
+ return 1;
+ }
+ return (*path == '/' || *path == '\\');
+#else
+ return (*path == '/');
+#endif
+}
+
+/* if filename is absolute, just copy it to dest. Otherwise, build a
+ path to it by considering it is relative to base_path. URL are
+ supported. */
+void path_combine(char *dest, int dest_size,
+ const char *base_path,
+ const char *filename)
+{
+ const char *p, *p1;
+ int len;
+
+ if (dest_size <= 0)
+ return;
+ if (path_is_absolute(filename)) {
+ pstrcpy(dest, dest_size, filename);
+ } else {
+ p = strchr(base_path, ':');
+ if (p)
+ p++;
+ else
+ p = base_path;
+ p1 = strrchr(base_path, '/');
+#ifdef _WIN32
+ {
+ const char *p2;
+ p2 = strrchr(base_path, '\\');
+ if (!p1 || p2 > p1)
+ p1 = p2;
+ }
+#endif
+ if (p1)
+ p1++;
+ else
+ p1 = base_path;
+ if (p1 > p)
+ p = p1;
+ len = p - base_path;
+ if (len > dest_size - 1)
+ len = dest_size - 1;
+ memcpy(dest, base_path, len);
+ dest[len] = '\0';
+ pstrcat(dest, dest_size, filename);
+ }
+}
+
+void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
+{
+ if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
+ pstrcpy(dest, sz, bs->backing_file);
+ } else {
+ path_combine(dest, sz, bs->filename, bs->backing_file);
+ }
+}
+
+void bdrv_register(BlockDriver *bdrv)
+{
+ /* Block drivers without coroutine functions need emulation */
+ if (!bdrv->bdrv_co_readv) {
+ bdrv->bdrv_co_readv = bdrv_co_readv_em;
+ bdrv->bdrv_co_writev = bdrv_co_writev_em;
+
+ /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
+ * the block driver lacks aio we need to emulate that too.
+ */
+ if (!bdrv->bdrv_aio_readv) {
+ /* add AIO emulation layer */
+ bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
+ bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
+ }
+ }
+
+ QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
+}
+
+/* create a new block device (by default it is empty) */
+BlockDriverState *bdrv_new(const char *device_name)
+{
+ BlockDriverState *bs;
+
+ bs = g_malloc0(sizeof(BlockDriverState));
+ pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
+ if (device_name[0] != '\0') {
+ QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
+ }
+ bdrv_iostatus_disable(bs);
+ notifier_list_init(&bs->close_notifiers);
+ notifier_with_return_list_init(&bs->before_write_notifiers);
+
+ return bs;
+}
+
+void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
+{
+ notifier_list_add(&bs->close_notifiers, notify);
+}
+
+BlockDriver *bdrv_find_format(const char *format_name)
+{
+ BlockDriver *drv1;
+ QLIST_FOREACH(drv1, &bdrv_drivers, list) {
+ if (!strcmp(drv1->format_name, format_name)) {
+ return drv1;
+ }
+ }
+ return NULL;
+}
+
+static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
+{
+ static const char *whitelist_rw[] = {
+ CONFIG_BDRV_RW_WHITELIST
+ };
+ static const char *whitelist_ro[] = {
+ CONFIG_BDRV_RO_WHITELIST
+ };
+ const char **p;
+
+ if (!whitelist_rw[0] && !whitelist_ro[0]) {
+ return 1; /* no whitelist, anything goes */
+ }
+
+ for (p = whitelist_rw; *p; p++) {
+ if (!strcmp(drv->format_name, *p)) {
+ return 1;
+ }
+ }
+ if (read_only) {
+ for (p = whitelist_ro; *p; p++) {
+ if (!strcmp(drv->format_name, *p)) {
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
+BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
+ bool read_only)
+{
+ BlockDriver *drv = bdrv_find_format(format_name);
+ return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
+}
+
+typedef struct CreateCo {
+ BlockDriver *drv;
+ char *filename;
+ QEMUOptionParameter *options;
+ int ret;
+} CreateCo;
+
+static void coroutine_fn bdrv_create_co_entry(void *opaque)
+{
+ CreateCo *cco = opaque;
+ assert(cco->drv);
+
+ cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
+}
+
+int bdrv_create(BlockDriver *drv, const char* filename,
+ QEMUOptionParameter *options)
+{
+ int ret;
+
+ Coroutine *co;
+ CreateCo cco = {
+ .drv = drv,
+ .filename = g_strdup(filename),
+ .options = options,
+ .ret = NOT_DONE,
+ };
+
+ if (!drv->bdrv_create) {
+ ret = -ENOTSUP;
+ goto out;
+ }
+
+ if (qemu_in_coroutine()) {
+ /* Fast-path if already in coroutine context */
+ bdrv_create_co_entry(&cco);
+ } else {
+ co = qemu_coroutine_create(bdrv_create_co_entry);
+ qemu_coroutine_enter(co, &cco);
+ while (cco.ret == NOT_DONE) {
+ qemu_aio_wait();
+ }
+ }
+
+ ret = cco.ret;
+
+out:
+ g_free(cco.filename);
+ return ret;
+}
+
+int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
+{
+ BlockDriver *drv;
+
+ drv = bdrv_find_protocol(filename, true);
+ if (drv == NULL) {
+ return -ENOENT;
+ }
+
+ return bdrv_create(drv, filename, options);
+}
+
+/*
+ * Create a uniquely-named empty temporary file.
+ * Return 0 upon success, otherwise a negative errno value.
+ */
+int get_tmp_filename(char *filename, int size)
+{
+#ifdef _WIN32
+ char temp_dir[MAX_PATH];
+ /* GetTempFileName requires that its output buffer (4th param)
+ have length MAX_PATH or greater. */
+ assert(size >= MAX_PATH);
+ return (GetTempPath(MAX_PATH, temp_dir)
+ && GetTempFileName(temp_dir, "qem", 0, filename)
+ ? 0 : -GetLastError());
+#else
+ int fd;
+ const char *tmpdir;
+ tmpdir = getenv("TMPDIR");
+ if (!tmpdir)
+ tmpdir = "/tmp";
+ if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
+ return -EOVERFLOW;
+ }
+ fd = mkstemp(filename);
+ if (fd < 0) {
+ return -errno;
+ }
+ if (close(fd) != 0) {
+ unlink(filename);
+ return -errno;
+ }
+ return 0;
+#endif
+}
+
+/*
+ * Detect host devices. By convention, /dev/cdrom[N] is always
+ * recognized as a host CDROM.
+ */
+static BlockDriver *find_hdev_driver(const char *filename)
+{
+ int score_max = 0, score;
+ BlockDriver *drv = NULL, *d;
+
+ QLIST_FOREACH(d, &bdrv_drivers, list) {
+ if (d->bdrv_probe_device) {
+ score = d->bdrv_probe_device(filename);
+ if (score > score_max) {
+ score_max = score;
+ drv = d;
+ }
+ }
+ }
+
+ return drv;
+}
+
+BlockDriver *bdrv_find_protocol(const char *filename,
+ bool allow_protocol_prefix)
+{
+ BlockDriver *drv1;
+ char protocol[128];
+ int len;
+ const char *p;
+
+ /* TODO Drivers without bdrv_file_open must be specified explicitly */
+
+ /*
+ * XXX(hch): we really should not let host device detection
+ * override an explicit protocol specification, but moving this
+ * later breaks access to device names with colons in them.
+ * Thanks to the brain-dead persistent naming schemes on udev-
+ * based Linux systems those actually are quite common.
+ */
+ drv1 = find_hdev_driver(filename);
+ if (drv1) {
+ return drv1;
+ }
+
+ if (!path_has_protocol(filename) || !allow_protocol_prefix) {
+ return bdrv_find_format("file");
+ }
+
+ p = strchr(filename, ':');
+ assert(p != NULL);
+ len = p - filename;
+ if (len > sizeof(protocol) - 1)
+ len = sizeof(protocol) - 1;
+ memcpy(protocol, filename, len);
+ protocol[len] = '\0';
+ QLIST_FOREACH(drv1, &bdrv_drivers, list) {
+ if (drv1->protocol_name &&
+ !strcmp(drv1->protocol_name, protocol)) {
+ return drv1;
+ }
+ }
+ return NULL;
+}
+
+static int find_image_format(BlockDriverState *bs, const char *filename,
+ BlockDriver **pdrv)
+{
+ int score, score_max;
+ BlockDriver *drv1, *drv;
+ uint8_t buf[2048];
+ int ret = 0;
+
+ /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
+ if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
+ drv = bdrv_find_format("raw");
+ if (!drv) {
+ ret = -ENOENT;
+ }
+ *pdrv = drv;
+ return ret;
+ }
+
+ ret = bdrv_pread(bs, 0, buf, sizeof(buf));
+ if (ret < 0) {
+ *pdrv = NULL;
+ return ret;
+ }
+
+ score_max = 0;
+ drv = NULL;
+ QLIST_FOREACH(drv1, &bdrv_drivers, list) {
+ if (drv1->bdrv_probe) {
+ score = drv1->bdrv_probe(buf, ret, filename);
+ if (score > score_max) {
+ score_max = score;
+ drv = drv1;
+ }
+ }
+ }
+ if (!drv) {
+ ret = -ENOENT;
+ }
+ *pdrv = drv;
+ return ret;
+}
+
+/**
+ * Set the current 'total_sectors' value
+ */
+static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
+{
+ BlockDriver *drv = bs->drv;
+
+ /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
+ if (bs->sg)
+ return 0;
+
+ /* query actual device if possible, otherwise just trust the hint */
+ if (drv->bdrv_getlength) {
+ int64_t length = drv->bdrv_getlength(bs);
+ if (length < 0) {
+ return length;
+ }
+ hint = length >> BDRV_SECTOR_BITS;
+ }
+
+ bs->total_sectors = hint;
+ return 0;
+}
+
+/**
+ * Set open flags for a given discard mode
+ *
+ * Return 0 on success, -1 if the discard mode was invalid.
+ */
+int bdrv_parse_discard_flags(const char *mode, int *flags)
+{
+ *flags &= ~BDRV_O_UNMAP;
+
+ if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
+ /* do nothing */
+ } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
+ *flags |= BDRV_O_UNMAP;
+ } else {
+ return -1;
+ }
+
+ return 0;
+}
+
+/**
+ * Set open flags for a given cache mode
+ *
+ * Return 0 on success, -1 if the cache mode was invalid.
+ */
+int bdrv_parse_cache_flags(const char *mode, int *flags)
+{
+ *flags &= ~BDRV_O_CACHE_MASK;
+
+ if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
+ *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
+ } else if (!strcmp(mode, "directsync")) {
+ *flags |= BDRV_O_NOCACHE;
+ } else if (!strcmp(mode, "writeback")) {
+ *flags |= BDRV_O_CACHE_WB;
+ } else if (!strcmp(mode, "unsafe")) {
+ *flags |= BDRV_O_CACHE_WB;
+ *flags |= BDRV_O_NO_FLUSH;
+ } else if (!strcmp(mode, "writethrough")) {
+ /* this is the default */
+ } else {
+ return -1;
+ }
+
+ return 0;
+}
+
+/**
+ * The copy-on-read flag is actually a reference count so multiple users may
+ * use the feature without worrying about clobbering its previous state.
+ * Copy-on-read stays enabled until all users have called to disable it.
+ */
+void bdrv_enable_copy_on_read(BlockDriverState *bs)
+{
+ bs->copy_on_read++;
+}
+
+void bdrv_disable_copy_on_read(BlockDriverState *bs)
+{
+ assert(bs->copy_on_read > 0);
+ bs->copy_on_read--;
+}
+
+static int bdrv_open_flags(BlockDriverState *bs, int flags)
+{
+ int open_flags = flags | BDRV_O_CACHE_WB;
+
+ /*
+ * Clear flags that are internal to the block layer before opening the
+ * image.
+ */
+ open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
+
+ /*
+ * Snapshots should be writable.
+ */
+ if (bs->is_temporary) {
+ open_flags |= BDRV_O_RDWR;
+ }
+
+ return open_flags;
+}
+
+/*
+ * Common part for opening disk images and files
+ *
+ * Removes all processed options from *options.
+ */
+static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
+ QDict *options, int flags, BlockDriver *drv)
+{
+ int ret, open_flags;
+ const char *filename;
+
+ assert(drv != NULL);
+ assert(bs->file == NULL);
+ assert(options != NULL && bs->options != options);
+
+ if (file != NULL) {
+ filename = file->filename;
+ } else {
+ filename = qdict_get_try_str(options, "filename");
+ }
+
+ trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
+
+ /* bdrv_open() with directly using a protocol as drv. This layer is already
+ * opened, so assign it to bs (while file becomes a closed BlockDriverState)
+ * and return immediately. */
+ if (file != NULL && drv->bdrv_file_open) {
+ bdrv_swap(file, bs);
+ return 0;
+ }
+
+ bs->open_flags = flags;
+ bs->buffer_alignment = 512;
+ open_flags = bdrv_open_flags(bs, flags);
+ bs->read_only = !(open_flags & BDRV_O_RDWR);
+
+ if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
+ return -ENOTSUP;
+ }
+
+ assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
+ if (!bs->read_only && (flags & BDRV_O_COPY_ON_READ)) {
+ bdrv_enable_copy_on_read(bs);
+ }
+
+ if (filename != NULL) {
+ pstrcpy(bs->filename, sizeof(bs->filename), filename);
+ } else {
+ bs->filename[0] = '\0';
+ }
+
+ bs->drv = drv;
+ bs->opaque = g_malloc0(drv->instance_size);
+
+ bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
+
+ /* Open the image, either directly or using a protocol */
+ if (drv->bdrv_file_open) {
+ assert(file == NULL);
+ assert(drv->bdrv_parse_filename || filename != NULL);
+ ret = drv->bdrv_file_open(bs, options, open_flags);
+ } else {
+ if (file == NULL) {
+ qerror_report(ERROR_CLASS_GENERIC_ERROR, "Can't use '%s' as a "
+ "block driver for the protocol level",
+ drv->format_name);
+ ret = -EINVAL;
+ goto free_and_fail;
+ }
+ assert(file != NULL);
+ bs->file = file;
+ ret = drv->bdrv_open(bs, options, open_flags);
+ }
+
+ if (ret < 0) {
+ goto free_and_fail;
+ }
+
+ ret = refresh_total_sectors(bs, bs->total_sectors);
+ if (ret < 0) {
+ goto free_and_fail;
+ }
+
+#ifndef _WIN32
+ if (bs->is_temporary) {
+ assert(filename != NULL);
+ unlink(filename);
+ }
+#endif
+ return 0;
+
+free_and_fail:
+ bs->file = NULL;
+ g_free(bs->opaque);
+ bs->opaque = NULL;
+ bs->drv = NULL;
+ return ret;
+}
+
+/*
+ * Opens a file using a protocol (file, host_device, nbd, ...)
+ *
+ * options is a QDict of options to pass to the block drivers, or NULL for an
+ * empty set of options. The reference to the QDict belongs to the block layer
+ * after the call (even on failure), so if the caller intends to reuse the
+ * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
+ */
+int bdrv_file_open(BlockDriverState **pbs, const char *filename,
+ QDict *options, int flags)
+{
+ BlockDriverState *bs;
+ BlockDriver *drv;
+ const char *drvname;
+ bool allow_protocol_prefix = false;
+ int ret;
+
+ /* NULL means an empty set of options */
+ if (options == NULL) {
+ options = qdict_new();
+ }
+
+ bs = bdrv_new("");
+ bs->options = options;
+ options = qdict_clone_shallow(options);
+
+ /* Fetch the file name from the options QDict if necessary */
+ if (!filename) {
+ filename = qdict_get_try_str(options, "filename");
+ } else if (filename && !qdict_haskey(options, "filename")) {
+ qdict_put(options, "filename", qstring_from_str(filename));
+ allow_protocol_prefix = true;
+ } else {
+ qerror_report(ERROR_CLASS_GENERIC_ERROR, "Can't specify 'file' and "
+ "'filename' options at the same time");
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ /* Find the right block driver */
+ drvname = qdict_get_try_str(options, "driver");
+ if (drvname) {
+ drv = bdrv_find_whitelisted_format(drvname, !(flags & BDRV_O_RDWR));
+ qdict_del(options, "driver");
+ } else if (filename) {
+ drv = bdrv_find_protocol(filename, allow_protocol_prefix);
+ if (!drv) {
+ qerror_report(ERROR_CLASS_GENERIC_ERROR, "Unknown protocol");
+ }
+ } else {
+ qerror_report(ERROR_CLASS_GENERIC_ERROR,
+ "Must specify either driver or file");
+ drv = NULL;
+ }
+
+ if (!drv) {
+ ret = -ENOENT;
+ goto fail;
+ }
+
+ /* Parse the filename and open it */
+ if (drv->bdrv_parse_filename && filename) {
+ Error *local_err = NULL;
+ drv->bdrv_parse_filename(filename, options, &local_err);
+ if (error_is_set(&local_err)) {
+ qerror_report_err(local_err);
+ error_free(local_err);
+ ret = -EINVAL;
+ goto fail;
+ }
+ qdict_del(options, "filename");
+ } else if (!drv->bdrv_parse_filename && !filename) {
+ qerror_report(ERROR_CLASS_GENERIC_ERROR,
+ "The '%s' block driver requires a file name",
+ drv->format_name);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ ret = bdrv_open_common(bs, NULL, options, flags, drv);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* Check if any unknown options were used */
+ if (qdict_size(options) != 0) {
+ const QDictEntry *entry = qdict_first(options);
+ qerror_report(ERROR_CLASS_GENERIC_ERROR, "Block protocol '%s' doesn't "
+ "support the option '%s'",
+ drv->format_name, entry->key);
+ ret = -EINVAL;
+ goto fail;
+ }
+ QDECREF(options);
+
+ bs->growable = 1;
+ *pbs = bs;
+ return 0;
+
+fail:
+ QDECREF(options);
+ if (!bs->drv) {
+ QDECREF(bs->options);
+ }
+ bdrv_delete(bs);
+ return ret;
+}
+
+/*
+ * Opens the backing file for a BlockDriverState if not yet open
+ *
+ * options is a QDict of options to pass to the block drivers, or NULL for an
+ * empty set of options. The reference to the QDict is transferred to this
+ * function (even on failure), so if the caller intends to reuse the dictionary,
+ * it needs to use QINCREF() before calling bdrv_file_open.
+ */
+int bdrv_open_backing_file(BlockDriverState *bs, QDict *options)
+{
+ char backing_filename[PATH_MAX];
+ int back_flags, ret;
+ BlockDriver *back_drv = NULL;
+
+ if (bs->backing_hd != NULL) {
+ QDECREF(options);
+ return 0;
+ }
+
+ /* NULL means an empty set of options */
+ if (options == NULL) {
+ options = qdict_new();
+ }
+
+ bs->open_flags &= ~BDRV_O_NO_BACKING;
+ if (qdict_haskey(options, "file.filename")) {
+ backing_filename[0] = '\0';
+ } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
+ QDECREF(options);
+ return 0;
+ }
+
+ bs->backing_hd = bdrv_new("");
+ bdrv_get_full_backing_filename(bs, backing_filename,
+ sizeof(backing_filename));
+
+ if (bs->backing_format[0] != '\0') {
+ back_drv = bdrv_find_format(bs->backing_format);
+ }
+
+ /* backing files always opened read-only */
+ back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT);
+
+ ret = bdrv_open(bs->backing_hd,
+ *backing_filename ? backing_filename : NULL, options,
+ back_flags, back_drv);
+ if (ret < 0) {
+ bdrv_delete(bs->backing_hd);
+ bs->backing_hd = NULL;
+ bs->open_flags |= BDRV_O_NO_BACKING;
+ return ret;
+ }
+ return 0;
+}
+
+static void extract_subqdict(QDict *src, QDict **dst, const char *start)
+{
+ const QDictEntry *entry, *next;
+ const char *p;
+
+ *dst = qdict_new();
+ entry = qdict_first(src);
+
+ while (entry != NULL) {
+ next = qdict_next(src, entry);
+ if (strstart(entry->key, start, &p)) {
+ qobject_incref(entry->value);
+ qdict_put_obj(*dst, p, entry->value);
+ qdict_del(src, entry->key);
+ }
+ entry = next;
+ }
+}
+
+/*
+ * Opens a disk image (raw, qcow2, vmdk, ...)
+ *
+ * options is a QDict of options to pass to the block drivers, or NULL for an
+ * empty set of options. The reference to the QDict belongs to the block layer
+ * after the call (even on failure), so if the caller intends to reuse the
+ * dictionary, it needs to use QINCREF() before calling bdrv_open.
+ */
+int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
+ int flags, BlockDriver *drv)
+{
+ int ret;
+ /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
+ char tmp_filename[PATH_MAX + 1];
+ BlockDriverState *file = NULL;
+ QDict *file_options = NULL;
+
+ /* NULL means an empty set of options */
+ if (options == NULL) {
+ options = qdict_new();
+ }
+
+ bs->options = options;
+ options = qdict_clone_shallow(options);
+
+ /* For snapshot=on, create a temporary qcow2 overlay */
+ if (flags & BDRV_O_SNAPSHOT) {
+ BlockDriverState *bs1;
+ int64_t total_size;
+ BlockDriver *bdrv_qcow2;
+ QEMUOptionParameter *create_options;
+ char backing_filename[PATH_MAX];
+
+ if (qdict_size(options) != 0) {
+ error_report("Can't use snapshot=on with driver-specific options");
+ ret = -EINVAL;
+ goto fail;
+ }
+ assert(filename != NULL);
+
+ /* if snapshot, we create a temporary backing file and open it
+ instead of opening 'filename' directly */
+
+ /* if there is a backing file, use it */
+ bs1 = bdrv_new("");
+ ret = bdrv_open(bs1, filename, NULL, 0, drv);
+ if (ret < 0) {
+ bdrv_delete(bs1);
+ goto fail;
+ }
+ total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
+
+ bdrv_delete(bs1);
+
+ ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* Real path is meaningless for protocols */
+ if (path_has_protocol(filename)) {
+ snprintf(backing_filename, sizeof(backing_filename),
+ "%s", filename);
+ } else if (!realpath(filename, backing_filename)) {
+ ret = -errno;
+ goto fail;
+ }
+
+ bdrv_qcow2 = bdrv_find_format("qcow2");
+ create_options = parse_option_parameters("", bdrv_qcow2->create_options,
+ NULL);
+
+ set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);
+ set_option_parameter(create_options, BLOCK_OPT_BACKING_FILE,
+ backing_filename);
+ if (drv) {
+ set_option_parameter(create_options, BLOCK_OPT_BACKING_FMT,
+ drv->format_name);
+ }
+
+ ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options);
+ free_option_parameters(create_options);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ filename = tmp_filename;
+ drv = bdrv_qcow2;
+ bs->is_temporary = 1;
+ }
+
+ /* Open image file without format layer */
+ if (flags & BDRV_O_RDWR) {
+ flags |= BDRV_O_ALLOW_RDWR;
+ }
+
+ extract_subqdict(options, &file_options, "file.");
+
+ ret = bdrv_file_open(&file, filename, file_options,
+ bdrv_open_flags(bs, flags | BDRV_O_UNMAP));
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* Find the right image format driver */
+ if (!drv) {
+ ret = find_image_format(file, filename, &drv);
+ }
+
+ if (!drv) {
+ goto unlink_and_fail;
+ }
+
+ /* Open the image */
+ ret = bdrv_open_common(bs, file, options, flags, drv);
+ if (ret < 0) {
+ goto unlink_and_fail;
+ }
+
+ if (bs->file != file) {
+ bdrv_delete(file);
+ file = NULL;
+ }
+
+ /* If there is a backing file, use it */
+ if ((flags & BDRV_O_NO_BACKING) == 0) {
+ QDict *backing_options;
+
+ extract_subqdict(options, &backing_options, "backing.");
+ ret = bdrv_open_backing_file(bs, backing_options);
+ if (ret < 0) {
+ goto close_and_fail;
+ }
+ }
+
+ /* Check if any unknown options were used */
+ if (qdict_size(options) != 0) {
+ const QDictEntry *entry = qdict_first(options);
+ qerror_report(ERROR_CLASS_GENERIC_ERROR, "Block format '%s' used by "
+ "device '%s' doesn't support the option '%s'",
+ drv->format_name, bs->device_name, entry->key);
+
+ ret = -EINVAL;
+ goto close_and_fail;
+ }
+ QDECREF(options);
+
+ if (!bdrv_key_required(bs)) {
+ bdrv_dev_change_media_cb(bs, true);
+ }
+
+ /* throttling disk I/O limits */
+ if (bs->io_limits_enabled) {
+ bdrv_io_limits_enable(bs);
+ }
+
+ return 0;
+
+unlink_and_fail:
+ if (file != NULL) {
+ bdrv_delete(file);
+ }
+ if (bs->is_temporary) {
+ unlink(filename);
+ }
+fail:
+ QDECREF(bs->options);
+ QDECREF(options);
+ bs->options = NULL;
+ return ret;
+
+close_and_fail:
+ bdrv_close(bs);
+ QDECREF(options);
+ return ret;
+}
+
+typedef struct BlockReopenQueueEntry {
+ bool prepared;
+ BDRVReopenState state;
+ QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
+} BlockReopenQueueEntry;
+
+/*
+ * Adds a BlockDriverState to a simple queue for an atomic, transactional
+ * reopen of multiple devices.
+ *
+ * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
+ * already performed, or alternatively may be NULL a new BlockReopenQueue will
+ * be created and initialized. This newly created BlockReopenQueue should be
+ * passed back in for subsequent calls that are intended to be of the same
+ * atomic 'set'.
+ *
+ * bs is the BlockDriverState to add to the reopen queue.
+ *
+ * flags contains the open flags for the associated bs
+ *
+ * returns a pointer to bs_queue, which is either the newly allocated
+ * bs_queue, or the existing bs_queue being used.
+ *
+ */
+BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
+ BlockDriverState *bs, int flags)
+{
+ assert(bs != NULL);
+
+ BlockReopenQueueEntry *bs_entry;
+ if (bs_queue == NULL) {
+ bs_queue = g_new0(BlockReopenQueue, 1);
+ QSIMPLEQ_INIT(bs_queue);
+ }
+
+ if (bs->file) {
+ bdrv_reopen_queue(bs_queue, bs->file, flags);
+ }
+
+ bs_entry = g_new0(BlockReopenQueueEntry, 1);
+ QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
+
+ bs_entry->state.bs = bs;
+ bs_entry->state.flags = flags;
+
+ return bs_queue;
+}
+
+/*
+ * Reopen multiple BlockDriverStates atomically & transactionally.
+ *
+ * The queue passed in (bs_queue) must have been built up previous
+ * via bdrv_reopen_queue().
+ *
+ * Reopens all BDS specified in the queue, with the appropriate
+ * flags. All devices are prepared for reopen, and failure of any
+ * device will cause all device changes to be abandonded, and intermediate
+ * data cleaned up.
+ *
+ * If all devices prepare successfully, then the changes are committed
+ * to all devices.
+ *
+ */
+int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
+{
+ int ret = -1;
+ BlockReopenQueueEntry *bs_entry, *next;
+ Error *local_err = NULL;
+
+ assert(bs_queue != NULL);
+
+ bdrv_drain_all();
+
+ QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
+ if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
+ error_propagate(errp, local_err);
+ goto cleanup;
+ }
+ bs_entry->prepared = true;
+ }
+
+ /* If we reach this point, we have success and just need to apply the
+ * changes
+ */
+ QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
+ bdrv_reopen_commit(&bs_entry->state);
+ }
+
+ ret = 0;
+
+cleanup:
+ QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
+ if (ret && bs_entry->prepared) {
+ bdrv_reopen_abort(&bs_entry->state);
+ }
+ g_free(bs_entry);
+ }
+ g_free(bs_queue);
+ return ret;
+}
+
+
+/* Reopen a single BlockDriverState with the specified flags. */
+int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
+{
+ int ret = -1;
+ Error *local_err = NULL;
+ BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
+
+ ret = bdrv_reopen_multiple(queue, &local_err);
+ if (local_err != NULL) {
+ error_propagate(errp, local_err);
+ }
+ return ret;
+}
+
+
+/*
+ * Prepares a BlockDriverState for reopen. All changes are staged in the
+ * 'opaque' field of the BDRVReopenState, which is used and allocated by
+ * the block driver layer .bdrv_reopen_prepare()
+ *
+ * bs is the BlockDriverState to reopen
+ * flags are the new open flags
+ * queue is the reopen queue
+ *
+ * Returns 0 on success, non-zero on error. On error errp will be set
+ * as well.
+ *
+ * On failure, bdrv_reopen_abort() will be called to clean up any data.
+ * It is the responsibility of the caller to then call the abort() or
+ * commit() for any other BDS that have been left in a prepare() state
+ *
+ */
+int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
+ Error **errp)
+{
+ int ret = -1;
+ Error *local_err = NULL;
+ BlockDriver *drv;
+
+ assert(reopen_state != NULL);
+ assert(reopen_state->bs->drv != NULL);
+ drv = reopen_state->bs->drv;
+
+ /* if we are to stay read-only, do not allow permission change
+ * to r/w */
+ if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
+ reopen_state->flags & BDRV_O_RDWR) {
+ error_set(errp, QERR_DEVICE_IS_READ_ONLY,
+ reopen_state->bs->device_name);
+ goto error;
+ }
+
+
+ ret = bdrv_flush(reopen_state->bs);
+ if (ret) {
+ error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
+ strerror(-ret));
+ goto error;
+ }
+
+ if (drv->bdrv_reopen_prepare) {
+ ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
+ if (ret) {
+ if (local_err != NULL) {
+ error_propagate(errp, local_err);
+ } else {
+ error_setg(errp, "failed while preparing to reopen image '%s'",
+ reopen_state->bs->filename);
+ }
+ goto error;
+ }
+ } else {
+ /* It is currently mandatory to have a bdrv_reopen_prepare()
+ * handler for each supported drv. */
+ error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
+ drv->format_name, reopen_state->bs->device_name,
+ "reopening of file");
+ ret = -1;
+ goto error;
+ }
+
+ ret = 0;
+
+error:
+ return ret;
+}
+
+/*
+ * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
+ * makes them final by swapping the staging BlockDriverState contents into
+ * the active BlockDriverState contents.
+ */
+void bdrv_reopen_commit(BDRVReopenState *reopen_state)
+{
+ BlockDriver *drv;
+
+ assert(reopen_state != NULL);
+ drv = reopen_state->bs->drv;
+ assert(drv != NULL);
+
+ /* If there are any driver level actions to take */
+ if (drv->bdrv_reopen_commit) {
+ drv->bdrv_reopen_commit(reopen_state);
+ }
+
+ /* set BDS specific flags now */
+ reopen_state->bs->open_flags = reopen_state->flags;
+ reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
+ BDRV_O_CACHE_WB);
+ reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
+}
+
+/*
+ * Abort the reopen, and delete and free the staged changes in
+ * reopen_state
+ */
+void bdrv_reopen_abort(BDRVReopenState *reopen_state)
+{
+ BlockDriver *drv;
+
+ assert(reopen_state != NULL);
+ drv = reopen_state->bs->drv;
+ assert(drv != NULL);
+
+ if (drv->bdrv_reopen_abort) {
+ drv->bdrv_reopen_abort(reopen_state);
+ }
+}
+
+
+void bdrv_close(BlockDriverState *bs)
+{
+ if (bs->job) {
+ block_job_cancel_sync(bs->job);
+ }
+ bdrv_drain_all(); /* complete I/O */
+ bdrv_flush(bs);
+ bdrv_drain_all(); /* in case flush left pending I/O */
+ notifier_list_notify(&bs->close_notifiers, bs);
+
+ if (bs->drv) {
+ if (bs->backing_hd) {
+ bdrv_delete(bs->backing_hd);
+ bs->backing_hd = NULL;
+ }
+ bs->drv->bdrv_close(bs);
+ g_free(bs->opaque);
+#ifdef _WIN32
+ if (bs->is_temporary) {
+ unlink(bs->filename);
+ }
+#endif
+ bs->opaque = NULL;
+ bs->drv = NULL;
+ bs->copy_on_read = 0;
+ bs->backing_file[0] = '\0';
+ bs->backing_format[0] = '\0';
+ bs->total_sectors = 0;
+ bs->encrypted = 0;
+ bs->valid_key = 0;
+ bs->sg = 0;
+ bs->growable = 0;
+ QDECREF(bs->options);
+ bs->options = NULL;
+
+ if (bs->file != NULL) {
+ bdrv_delete(bs->file);
+ bs->file = NULL;
+ }
+ }
+
+ bdrv_dev_change_media_cb(bs, false);
+
+ /*throttling disk I/O limits*/
+ if (bs->io_limits_enabled) {
+ bdrv_io_limits_disable(bs);
+ }
+}
+
+void bdrv_close_all(void)
+{
+ BlockDriverState *bs;
+
+ QTAILQ_FOREACH(bs, &bdrv_states, list) {
+ bdrv_close(bs);
+ }
+}
+
+/*
+ * Wait for pending requests to complete across all BlockDriverStates
+ *
+ * This function does not flush data to disk, use bdrv_flush_all() for that
+ * after calling this function.
+ *
+ * Note that completion of an asynchronous I/O operation can trigger any
+ * number of other I/O operations on other devices---for example a coroutine
+ * can be arbitrarily complex and a constant flow of I/O can come until the
+ * coroutine is complete. Because of this, it is not possible to have a
+ * function to drain a single device's I/O queue.
+ */
+void bdrv_drain_all(void)
+{
+ BlockDriverState *bs;
+ bool busy;
+
+ do {
+ busy = qemu_aio_wait();
+
+ /* FIXME: We do not have timer support here, so this is effectively
+ * a busy wait.
+ */
+ QTAILQ_FOREACH(bs, &bdrv_states, list) {
+ if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
+ qemu_co_queue_restart_all(&bs->throttled_reqs);
+ busy = true;
+ }
+ }
+ } while (busy);
+
+ /* If requests are still pending there is a bug somewhere */
+ QTAILQ_FOREACH(bs, &bdrv_states, list) {
+ assert(QLIST_EMPTY(&bs->tracked_requests));
+ assert(qemu_co_queue_empty(&bs->throttled_reqs));
+ }
+}
+
+/* make a BlockDriverState anonymous by removing from bdrv_state list.
+ Also, NULL terminate the device_name to prevent double remove */
+void bdrv_make_anon(BlockDriverState *bs)
+{
+ if (bs->device_name[0] != '\0') {
+ QTAILQ_REMOVE(&bdrv_states, bs, list);
+ }
+ bs->device_name[0] = '\0';
+}
+
+static void bdrv_rebind(BlockDriverState *bs)
+{
+ if (bs->drv && bs->drv->bdrv_rebind) {
+ bs->drv->bdrv_rebind(bs);
+ }
+}
+
+static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
+ BlockDriverState *bs_src)
+{
+ /* move some fields that need to stay attached to the device */
+ bs_dest->open_flags = bs_src->open_flags;
+
+ /* dev info */
+ bs_dest->dev_ops = bs_src->dev_ops;
+ bs_dest->dev_opaque = bs_src->dev_opaque;
+ bs_dest->dev = bs_src->dev;
+ bs_dest->buffer_alignment = bs_src->buffer_alignment;
+ bs_dest->copy_on_read = bs_src->copy_on_read;
+
+ bs_dest->enable_write_cache = bs_src->enable_write_cache;
+
+ /* i/o timing parameters */
+ bs_dest->slice_start = bs_src->slice_start;
+ bs_dest->slice_end = bs_src->slice_end;
+ bs_dest->slice_submitted = bs_src->slice_submitted;
+ bs_dest->io_limits = bs_src->io_limits;
+ bs_dest->throttled_reqs = bs_src->throttled_reqs;
+ bs_dest->block_timer = bs_src->block_timer;
+ bs_dest->io_limits_enabled = bs_src->io_limits_enabled;
+
+ /* r/w error */
+ bs_dest->on_read_error = bs_src->on_read_error;
+ bs_dest->on_write_error = bs_src->on_write_error;
+
+ /* i/o status */
+ bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
+ bs_dest->iostatus = bs_src->iostatus;
+
+ /* dirty bitmap */
+ bs_dest->dirty_bitmap = bs_src->dirty_bitmap;
+
+ /* job */
+ bs_dest->in_use = bs_src->in_use;
+ bs_dest->job = bs_src->job;
+
+ /* keep the same entry in bdrv_states */
+ pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
+ bs_src->device_name);
+ bs_dest->list = bs_src->list;
+}
+
+/*
+ * Swap bs contents for two image chains while they are live,
+ * while keeping required fields on the BlockDriverState that is
+ * actually attached to a device.
+ *
+ * This will modify the BlockDriverState fields, and swap contents
+ * between bs_new and bs_old. Both bs_new and bs_old are modified.
+ *
+ * bs_new is required to be anonymous.
+ *
+ * This function does not create any image files.
+ */
+void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
+{
+ BlockDriverState tmp;
+
+ /* bs_new must be anonymous and shouldn't have anything fancy enabled */
+ assert(bs_new->device_name[0] == '\0');
+ assert(bs_new->dirty_bitmap == NULL);
+ assert(bs_new->job == NULL);
+ assert(bs_new->dev == NULL);
+ assert(bs_new->in_use == 0);
+ assert(bs_new->io_limits_enabled == false);
+ assert(bs_new->block_timer == NULL);
+
+ tmp = *bs_new;
+ *bs_new = *bs_old;
+ *bs_old = tmp;
+
+ /* there are some fields that should not be swapped, move them back */
+ bdrv_move_feature_fields(&tmp, bs_old);
+ bdrv_move_feature_fields(bs_old, bs_new);
+ bdrv_move_feature_fields(bs_new, &tmp);
+
+ /* bs_new shouldn't be in bdrv_states even after the swap! */
+ assert(bs_new->device_name[0] == '\0');
+
+ /* Check a few fields that should remain attached to the device */
+ assert(bs_new->dev == NULL);
+ assert(bs_new->job == NULL);
+ assert(bs_new->in_use == 0);
+ assert(bs_new->io_limits_enabled == false);
+ assert(bs_new->block_timer == NULL);
+
+ bdrv_rebind(bs_new);
+ bdrv_rebind(bs_old);
+}
+
+/*
+ * Add new bs contents at the top of an image chain while the chain is
+ * live, while keeping required fields on the top layer.
+ *
+ * This will modify the BlockDriverState fields, and swap contents
+ * between bs_new and bs_top. Both bs_new and bs_top are modified.
+ *
+ * bs_new is required to be anonymous.
+ *
+ * This function does not create any image files.
+ */
+void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
+{
+ bdrv_swap(bs_new, bs_top);
+
+ /* The contents of 'tmp' will become bs_top, as we are
+ * swapping bs_new and bs_top contents. */
+ bs_top->backing_hd = bs_new;
+ bs_top->open_flags &= ~BDRV_O_NO_BACKING;
+ pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
+ bs_new->filename);
+ pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
+ bs_new->drv ? bs_new->drv->format_name : "");
+}
+
+void bdrv_delete(BlockDriverState *bs)
+{
+ assert(!bs->dev);
+ assert(!bs->job);
+ assert(!bs->in_use);
+
+ /* remove from list, if necessary */
+ bdrv_make_anon(bs);
+
+ bdrv_close(bs);
+
+ g_free(bs);
+}
+
+int bdrv_attach_dev(BlockDriverState *bs, void *dev)
+/* TODO change to DeviceState *dev when all users are qdevified */
+{
+ if (bs->dev) {
+ return -EBUSY;
+ }
+ bs->dev = dev;
+ bdrv_iostatus_reset(bs);
+ return 0;
+}
+
+/* TODO qdevified devices don't use this, remove when devices are qdevified */
+void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
+{
+ if (bdrv_attach_dev(bs, dev) < 0) {
+ abort();
+ }
+}
+
+void bdrv_detach_dev(BlockDriverState *bs, void *dev)
+/* TODO change to DeviceState *dev when all users are qdevified */
+{
+ assert(bs->dev == dev);
+ bs->dev = NULL;
+ bs->dev_ops = NULL;
+ bs->dev_opaque = NULL;
+ bs->buffer_alignment = 512;
+}
+
+/* TODO change to return DeviceState * when all users are qdevified */
+void *bdrv_get_attached_dev(BlockDriverState *bs)
+{
+ return bs->dev;
+}
+
+void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
+ void *opaque)
+{
+ bs->dev_ops = ops;
+ bs->dev_opaque = opaque;
+}
+
+void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
+ enum MonitorEvent ev,
+ BlockErrorAction action, bool is_read)
+{
+ QObject *data;
+ const char *action_str;
+
+ switch (action) {
+ case BDRV_ACTION_REPORT:
+ action_str = "report";
+ break;
+ case BDRV_ACTION_IGNORE:
+ action_str = "ignore";
+ break;
+ case BDRV_ACTION_STOP:
+ action_str = "stop";
+ break;
+ default:
+ abort();
+ }
+
+ data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
+ bdrv->device_name,
+ action_str,
+ is_read ? "read" : "write");
+ monitor_protocol_event(ev, data);
+
+ qobject_decref(data);
+}
+
+static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
+{
+ QObject *data;
+
+ data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
+ bdrv_get_device_name(bs), ejected);
+ monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
+
+ qobject_decref(data);
+}
+
+static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
+{
+ if (bs->dev_ops && bs->dev_ops->change_media_cb) {
+ bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
+ bs->dev_ops->change_media_cb(bs->dev_opaque, load);
+ if (tray_was_closed) {
+ /* tray open */
+ bdrv_emit_qmp_eject_event(bs, true);
+ }
+ if (load) {
+ /* tray close */
+ bdrv_emit_qmp_eject_event(bs, false);
+ }
+ }
+}
+
+bool bdrv_dev_has_removable_media(BlockDriverState *bs)
+{
+ return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
+}
+
+void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
+{
+ if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
+ bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
+ }
+}
+
+bool bdrv_dev_is_tray_open(BlockDriverState *bs)
+{
+ if (bs->dev_ops && bs->dev_ops->is_tray_open) {
+ return bs->dev_ops->is_tray_open(bs->dev_opaque);
+ }
+ return false;
+}
+
+static void bdrv_dev_resize_cb(BlockDriverState *bs)
+{
+ if (bs->dev_ops && bs->dev_ops->resize_cb) {
+ bs->dev_ops->resize_cb(bs->dev_opaque);
+ }
+}
+
+bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
+{
+ if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
+ return bs->dev_ops->is_medium_locked(bs->dev_opaque);
+ }
+ return false;
+}
+
+/*
+ * Run consistency checks on an image
+ *
+ * Returns 0 if the check could be completed (it doesn't mean that the image is
+ * free of errors) or -errno when an internal error occurred. The results of the
+ * check are stored in res.
+ */
+int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
+{
+ if (bs->drv->bdrv_check == NULL) {
+ return -ENOTSUP;
+ }
+
+ memset(res, 0, sizeof(*res));
+ return bs->drv->bdrv_check(bs, res, fix);
+}
+
+#define COMMIT_BUF_SECTORS 2048
+
+/* commit COW file into the raw image */
+int bdrv_commit(BlockDriverState *bs)
+{
+ BlockDriver *drv = bs->drv;
+ int64_t sector, total_sectors;
+ int n, ro, open_flags;
+ int ret = 0;
+ uint8_t *buf;
+ char filename[PATH_MAX];
+
+ if (!drv)
+ return -ENOMEDIUM;
+
+ if (!bs->backing_hd) {
+ return -ENOTSUP;
+ }
+
+ if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
+ return -EBUSY;
+ }
+
+ ro = bs->backing_hd->read_only;
+ /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
+ pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
+ open_flags = bs->backing_hd->open_flags;
+
+ if (ro) {
+ if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
+ return -EACCES;
+ }
+ }
+
+ total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
+ buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
+
+ for (sector = 0; sector < total_sectors; sector += n) {
+ if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
+
+ if (bdrv_read(bs, sector, buf, n) != 0) {
+ ret = -EIO;
+ goto ro_cleanup;
+ }
+
+ if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
+ ret = -EIO;
+ goto ro_cleanup;
+ }
+ }
+ }
+
+ if (drv->bdrv_make_empty) {
+ ret = drv->bdrv_make_empty(bs);
+ bdrv_flush(bs);
+ }
+
+ /*
+ * Make sure all data we wrote to the backing device is actually
+ * stable on disk.
+ */
+ if (bs->backing_hd)
+ bdrv_flush(bs->backing_hd);
+
+ro_cleanup:
+ g_free(buf);
+
+ if (ro) {
+ /* ignoring error return here */
+ bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
+ }
+
+ return ret;
+}
+
+int bdrv_commit_all(void)
+{
+ BlockDriverState *bs;
+
+ QTAILQ_FOREACH(bs, &bdrv_states, list) {
+ if (bs->drv && bs->backing_hd) {
+ int ret = bdrv_commit(bs);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+ }
+ return 0;
+}
+
+/**
+ * Remove an active request from the tracked requests list
+ *
+ * This function should be called when a tracked request is completing.
+ */
+static void tracked_request_end(BdrvTrackedRequest *req)
+{
+ QLIST_REMOVE(req, list);
+ qemu_co_queue_restart_all(&req->wait_queue);
+}
+
+/**
+ * Add an active request to the tracked requests list
+ */
+static void tracked_request_begin(BdrvTrackedRequest *req,
+ BlockDriverState *bs,
+ int64_t sector_num,
+ int nb_sectors, bool is_write)
+{
+ *req = (BdrvTrackedRequest){
+ .bs = bs,
+ .sector_num = sector_num,
+ .nb_sectors = nb_sectors,
+ .is_write = is_write,
+ .co = qemu_coroutine_self(),
+ };
+
+ qemu_co_queue_init(&req->wait_queue);
+
+ QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
+}
+
+/**
+ * Round a region to cluster boundaries
+ */
+void bdrv_round_to_clusters(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ int64_t *cluster_sector_num,
+ int *cluster_nb_sectors)
+{
+ BlockDriverInfo bdi;
+
+ if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
+ *cluster_sector_num = sector_num;
+ *cluster_nb_sectors = nb_sectors;
+ } else {
+ int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
+ *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
+ *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
+ nb_sectors, c);
+ }
+}
+
+static bool tracked_request_overlaps(BdrvTrackedRequest *req,
+ int64_t sector_num, int nb_sectors) {
+ /* aaaa bbbb */
+ if (sector_num >= req->sector_num + req->nb_sectors) {
+ return false;
+ }
+ /* bbbb aaaa */
+ if (req->sector_num >= sector_num + nb_sectors) {
+ return false;
+ }
+ return true;
+}
+
+static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors)
+{
+ BdrvTrackedRequest *req;
+ int64_t cluster_sector_num;
+ int cluster_nb_sectors;
+ bool retry;
+
+ /* If we touch the same cluster it counts as an overlap. This guarantees
+ * that allocating writes will be serialized and not race with each other
+ * for the same cluster. For example, in copy-on-read it ensures that the
+ * CoR read and write operations are atomic and guest writes cannot
+ * interleave between them.
+ */
+ bdrv_round_to_clusters(bs, sector_num, nb_sectors,
+ &cluster_sector_num, &cluster_nb_sectors);
+
+ do {
+ retry = false;
+ QLIST_FOREACH(req, &bs->tracked_requests, list) {
+ if (tracked_request_overlaps(req, cluster_sector_num,
+ cluster_nb_sectors)) {
+ /* Hitting this means there was a reentrant request, for
+ * example, a block driver issuing nested requests. This must
+ * never happen since it means deadlock.
+ */
+ assert(qemu_coroutine_self() != req->co);
+
+ qemu_co_queue_wait(&req->wait_queue);
+ retry = true;
+ break;
+ }
+ }
+ } while (retry);
+}
+
+/*
+ * Return values:
+ * 0 - success
+ * -EINVAL - backing format specified, but no file
+ * -ENOSPC - can't update the backing file because no space is left in the
+ * image file header
+ * -ENOTSUP - format driver doesn't support changing the backing file
+ */
+int bdrv_change_backing_file(BlockDriverState *bs,
+ const char *backing_file, const char *backing_fmt)
+{
+ BlockDriver *drv = bs->drv;
+ int ret;
+
+ /* Backing file format doesn't make sense without a backing file */
+ if (backing_fmt && !backing_file) {
+ return -EINVAL;
+ }
+
+ if (drv->bdrv_change_backing_file != NULL) {
+ ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
+ } else {
+ ret = -ENOTSUP;
+ }
+
+ if (ret == 0) {
+ pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
+ pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
+ }
+ return ret;
+}
+
+/*
+ * Finds the image layer in the chain that has 'bs' as its backing file.
+ *
+ * active is the current topmost image.
+ *
+ * Returns NULL if bs is not found in active's image chain,
+ * or if active == bs.
+ */
+BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
+ BlockDriverState *bs)
+{
+ BlockDriverState *overlay = NULL;
+ BlockDriverState *intermediate;
+
+ assert(active != NULL);
+ assert(bs != NULL);
+
+ /* if bs is the same as active, then by definition it has no overlay
+ */
+ if (active == bs) {
+ return NULL;
+ }
+
+ intermediate = active;
+ while (intermediate->backing_hd) {
+ if (intermediate->backing_hd == bs) {
+ overlay = intermediate;
+ break;
+ }
+ intermediate = intermediate->backing_hd;
+ }
+
+ return overlay;
+}
+
+typedef struct BlkIntermediateStates {
+ BlockDriverState *bs;
+ QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
+} BlkIntermediateStates;
+
+
+/*
+ * Drops images above 'base' up to and including 'top', and sets the image
+ * above 'top' to have base as its backing file.
+ *
+ * Requires that the overlay to 'top' is opened r/w, so that the backing file
+ * information in 'bs' can be properly updated.
+ *
+ * E.g., this will convert the following chain:
+ * bottom <- base <- intermediate <- top <- active
+ *
+ * to
+ *
+ * bottom <- base <- active
+ *
+ * It is allowed for bottom==base, in which case it converts:
+ *
+ * base <- intermediate <- top <- active
+ *
+ * to
+ *
+ * base <- active
+ *
+ * Error conditions:
+ * if active == top, that is considered an error
+ *
+ */
+int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
+ BlockDriverState *base)
+{
+ BlockDriverState *intermediate;
+ BlockDriverState *base_bs = NULL;
+ BlockDriverState *new_top_bs = NULL;
+ BlkIntermediateStates *intermediate_state, *next;
+ int ret = -EIO;
+
+ QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
+ QSIMPLEQ_INIT(&states_to_delete);
+
+ if (!top->drv || !base->drv) {
+ goto exit;
+ }
+
+ new_top_bs = bdrv_find_overlay(active, top);
+
+ if (new_top_bs == NULL) {
+ /* we could not find the image above 'top', this is an error */
+ goto exit;
+ }
+
+ /* special case of new_top_bs->backing_hd already pointing to base - nothing
+ * to do, no intermediate images */
+ if (new_top_bs->backing_hd == base) {
+ ret = 0;
+ goto exit;
+ }
+
+ intermediate = top;
+
+ /* now we will go down through the list, and add each BDS we find
+ * into our deletion queue, until we hit the 'base'
+ */
+ while (intermediate) {
+ intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
+ intermediate_state->bs = intermediate;
+ QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
+
+ if (intermediate->backing_hd == base) {
+ base_bs = intermediate->backing_hd;
+ break;
+ }
+ intermediate = intermediate->backing_hd;
+ }
+ if (base_bs == NULL) {
+ /* something went wrong, we did not end at the base. safely
+ * unravel everything, and exit with error */
+ goto exit;
+ }
+
+ /* success - we can delete the intermediate states, and link top->base */
+ ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
+ base_bs->drv ? base_bs->drv->format_name : "");
+ if (ret) {
+ goto exit;
+ }
+ new_top_bs->backing_hd = base_bs;
+
+
+ QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
+ /* so that bdrv_close() does not recursively close the chain */
+ intermediate_state->bs->backing_hd = NULL;
+ bdrv_delete(intermediate_state->bs);
+ }
+ ret = 0;
+
+exit:
+ QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
+ g_free(intermediate_state);
+ }
+ return ret;
+}
+
+
+static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
+ size_t size)
+{
+ int64_t len;
+
+ if (!bdrv_is_inserted(bs))
+ return -ENOMEDIUM;
+
+ if (bs->growable)
+ return 0;
+
+ len = bdrv_getlength(bs);
+
+ if (offset < 0)
+ return -EIO;
+
+ if ((offset > len) || (len - offset < size))
+ return -EIO;
+
+ return 0;
+}
+
+static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors)
+{
+ return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
+ nb_sectors * BDRV_SECTOR_SIZE);
+}
+
+typedef struct RwCo {
+ BlockDriverState *bs;
+ int64_t sector_num;
+ int nb_sectors;
+ QEMUIOVector *qiov;
+ bool is_write;
+ int ret;
+} RwCo;
+
+static void coroutine_fn bdrv_rw_co_entry(void *opaque)
+{
+ RwCo *rwco = opaque;
+
+ if (!rwco->is_write) {
+ rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
+ rwco->nb_sectors, rwco->qiov, 0);
+ } else {
+ rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
+ rwco->nb_sectors, rwco->qiov, 0);
+ }
+}
+
+/*
+ * Process a vectored synchronous request using coroutines
+ */
+static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num,
+ QEMUIOVector *qiov, bool is_write)
+{
+ Coroutine *co;
+ RwCo rwco = {
+ .bs = bs,
+ .sector_num = sector_num,
+ .nb_sectors = qiov->size >> BDRV_SECTOR_BITS,
+ .qiov = qiov,
+ .is_write = is_write,
+ .ret = NOT_DONE,
+ };
+ assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0);
+
+ /**
+ * In sync call context, when the vcpu is blocked, this throttling timer
+ * will not fire; so the I/O throttling function has to be disabled here
+ * if it has been enabled.
+ */
+ if (bs->io_limits_enabled) {
+ fprintf(stderr, "Disabling I/O throttling on '%s' due "
+ "to synchronous I/O.\n", bdrv_get_device_name(bs));
+ bdrv_io_limits_disable(bs);
+ }
+
+ if (qemu_in_coroutine()) {
+ /* Fast-path if already in coroutine context */
+ bdrv_rw_co_entry(&rwco);
+ } else {
+ co = qemu_coroutine_create(bdrv_rw_co_entry);
+ qemu_coroutine_enter(co, &rwco);
+ while (rwco.ret == NOT_DONE) {
+ qemu_aio_wait();
+ }
+ }
+ return rwco.ret;
+}
+
+/*
+ * Process a synchronous request using coroutines
+ */
+static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
+ int nb_sectors, bool is_write)
+{
+ QEMUIOVector qiov;
+ struct iovec iov = {
+ .iov_base = (void *)buf,
+ .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
+ };
+
+ qemu_iovec_init_external(&qiov, &iov, 1);
+ return bdrv_rwv_co(bs, sector_num, &qiov, is_write);
+}
+
+/* return < 0 if error. See bdrv_write() for the return codes */
+int bdrv_read(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+{
+ return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
+}
+
+/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
+int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+{
+ bool enabled;
+ int ret;
+
+ enabled = bs->io_limits_enabled;
+ bs->io_limits_enabled = false;
+ ret = bdrv_read(bs, 0, buf, 1);
+ bs->io_limits_enabled = enabled;
+ return ret;
+}
+
+/* Return < 0 if error. Important errors are:
+ -EIO generic I/O error (may happen for all errors)
+ -ENOMEDIUM No media inserted.
+ -EINVAL Invalid sector number or nb_sectors
+ -EACCES Trying to write a read-only device
+*/
+int bdrv_write(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors)
+{
+ return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
+}
+
+int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov)
+{
+ return bdrv_rwv_co(bs, sector_num, qiov, true);
+}
+
+int bdrv_pread(BlockDriverState *bs, int64_t offset,
+ void *buf, int count1)
+{
+ uint8_t tmp_buf[BDRV_SECTOR_SIZE];
+ int len, nb_sectors, count;
+ int64_t sector_num;
+ int ret;
+
+ count = count1;
+ /* first read to align to sector start */
+ len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
+ if (len > count)
+ len = count;
+ sector_num = offset >> BDRV_SECTOR_BITS;
+ if (len > 0) {
+ if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
+ return ret;
+ memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
+ count -= len;
+ if (count == 0)
+ return count1;
+ sector_num++;
+ buf += len;
+ }
+
+ /* read the sectors "in place" */
+ nb_sectors = count >> BDRV_SECTOR_BITS;
+ if (nb_sectors > 0) {
+ if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
+ return ret;
+ sector_num += nb_sectors;
+ len = nb_sectors << BDRV_SECTOR_BITS;
+ buf += len;
+ count -= len;
+ }
+
+ /* add data from the last sector */
+ if (count > 0) {
+ if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
+ return ret;
+ memcpy(buf, tmp_buf, count);
+ }
+ return count1;
+}
+
+int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
+{
+ uint8_t tmp_buf[BDRV_SECTOR_SIZE];
+ int len, nb_sectors, count;
+ int64_t sector_num;
+ int ret;
+
+ count = qiov->size;
+
+ /* first write to align to sector start */
+ len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
+ if (len > count)
+ len = count;
+ sector_num = offset >> BDRV_SECTOR_BITS;
+ if (len > 0) {
+ if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
+ return ret;
+ qemu_iovec_to_buf(qiov, 0, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)),
+ len);
+ if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
+ return ret;
+ count -= len;
+ if (count == 0)
+ return qiov->size;
+ sector_num++;
+ }
+
+ /* write the sectors "in place" */
+ nb_sectors = count >> BDRV_SECTOR_BITS;
+ if (nb_sectors > 0) {
+ QEMUIOVector qiov_inplace;
+
+ qemu_iovec_init(&qiov_inplace, qiov->niov);
+ qemu_iovec_concat(&qiov_inplace, qiov, len,
+ nb_sectors << BDRV_SECTOR_BITS);
+ ret = bdrv_writev(bs, sector_num, &qiov_inplace);
+ qemu_iovec_destroy(&qiov_inplace);
+ if (ret < 0) {
+ return ret;
+ }
+
+ sector_num += nb_sectors;
+ len = nb_sectors << BDRV_SECTOR_BITS;
+ count -= len;
+ }
+
+ /* add data from the last sector */
+ if (count > 0) {
+ if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
+ return ret;
+ qemu_iovec_to_buf(qiov, qiov->size - count, tmp_buf, count);
+ if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
+ return ret;
+ }
+ return qiov->size;
+}
+
+int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
+ const void *buf, int count1)
+{
+ QEMUIOVector qiov;
+ struct iovec iov = {
+ .iov_base = (void *) buf,
+ .iov_len = count1,
+ };
+
+ qemu_iovec_init_external(&qiov, &iov, 1);
+ return bdrv_pwritev(bs, offset, &qiov);
+}
+
+/*
+ * Writes to the file and ensures that no writes are reordered across this
+ * request (acts as a barrier)
+ *
+ * Returns 0 on success, -errno in error cases.
+ */
+int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
+ const void *buf, int count)
+{
+ int ret;
+
+ ret = bdrv_pwrite(bs, offset, buf, count);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* No flush needed for cache modes that already do it */
+ if (bs->enable_write_cache) {
+ bdrv_flush(bs);
+ }
+
+ return 0;
+}
+
+static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+{
+ /* Perform I/O through a temporary buffer so that users who scribble over
+ * their read buffer while the operation is in progress do not end up
+ * modifying the image file. This is critical for zero-copy guest I/O
+ * where anything might happen inside guest memory.
+ */
+ void *bounce_buffer;
+
+ BlockDriver *drv = bs->drv;
+ struct iovec iov;
+ QEMUIOVector bounce_qiov;
+ int64_t cluster_sector_num;
+ int cluster_nb_sectors;
+ size_t skip_bytes;
+ int ret;
+
+ /* Cover entire cluster so no additional backing file I/O is required when
+ * allocating cluster in the image file.
+ */
+ bdrv_round_to_clusters(bs, sector_num, nb_sectors,
+ &cluster_sector_num, &cluster_nb_sectors);
+
+ trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
+ cluster_sector_num, cluster_nb_sectors);
+
+ iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
+ iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
+ qemu_iovec_init_external(&bounce_qiov, &iov, 1);
+
+ ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
+ &bounce_qiov);
+ if (ret < 0) {
+ goto err;
+ }
+
+ if (drv->bdrv_co_write_zeroes &&
+ buffer_is_zero(bounce_buffer, iov.iov_len)) {
+ ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
+ cluster_nb_sectors);
+ } else {
+ /* This does not change the data on the disk, it is not necessary
+ * to flush even in cache=writethrough mode.
+ */
+ ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
+ &bounce_qiov);
+ }
+
+ if (ret < 0) {
+ /* It might be okay to ignore write errors for guest requests. If this
+ * is a deliberate copy-on-read then we don't want to ignore the error.
+ * Simply report it in all cases.
+ */
+ goto err;
+ }
+
+ skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
+ qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
+ nb_sectors * BDRV_SECTOR_SIZE);
+
+err:
+ qemu_vfree(bounce_buffer);
+ return ret;
+}
+
+/*
+ * Handle a read request in coroutine context
+ */
+static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+ BdrvRequestFlags flags)
+{
+ BlockDriver *drv = bs->drv;
+ BdrvTrackedRequest req;
+ int ret;
+
+ if (!drv) {
+ return -ENOMEDIUM;
+ }
+ if (bdrv_check_request(bs, sector_num, nb_sectors)) {
+ return -EIO;
+ }
+
+ /* throttling disk read I/O */
+ if (bs->io_limits_enabled) {
+ bdrv_io_limits_intercept(bs, false, nb_sectors);
+ }
+
+ if (bs->copy_on_read) {
+ flags |= BDRV_REQ_COPY_ON_READ;
+ }
+ if (flags & BDRV_REQ_COPY_ON_READ) {
+ bs->copy_on_read_in_flight++;
+ }
+
+ if (bs->copy_on_read_in_flight) {
+ wait_for_overlapping_requests(bs, sector_num, nb_sectors);
+ }
+
+ tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
+
+ if (flags & BDRV_REQ_COPY_ON_READ) {
+ int pnum;
+
+ ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
+ if (ret < 0) {
+ goto out;
+ }
+
+ if (!ret || pnum != nb_sectors) {
+ ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
+ goto out;
+ }
+ }
+
+ ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
+
+out:
+ tracked_request_end(&req);
+
+ if (flags & BDRV_REQ_COPY_ON_READ) {
+ bs->copy_on_read_in_flight--;
+ }
+
+ return ret;
+}
+
+int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
+{
+ trace_bdrv_co_readv(bs, sector_num, nb_sectors);
+
+ return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
+}
+
+int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+{
+ trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
+
+ return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
+ BDRV_REQ_COPY_ON_READ);
+}
+
+static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors)
+{
+ BlockDriver *drv = bs->drv;
+ QEMUIOVector qiov;
+ struct iovec iov;
+ int ret;
+
+ /* TODO Emulate only part of misaligned requests instead of letting block
+ * drivers return -ENOTSUP and emulate everything */
+
+ /* First try the efficient write zeroes operation */
+ if (drv->bdrv_co_write_zeroes) {
+ ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
+ if (ret != -ENOTSUP) {
+ return ret;
+ }
+ }
+
+ /* Fall back to bounce buffer if write zeroes is unsupported */
+ iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
+ iov.iov_base = qemu_blockalign(bs, iov.iov_len);
+ memset(iov.iov_base, 0, iov.iov_len);
+ qemu_iovec_init_external(&qiov, &iov, 1);
+
+ ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);
+
+ qemu_vfree(iov.iov_base);
+ return ret;
+}
+
+/*
+ * Handle a write request in coroutine context
+ */
+static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+ BdrvRequestFlags flags)
+{
+ BlockDriver *drv = bs->drv;
+ BdrvTrackedRequest req;
+ int ret;
+
+ if (!bs->drv) {
+ return -ENOMEDIUM;
+ }
+ if (bs->read_only) {
+ return -EACCES;
+ }
+ if (bdrv_check_request(bs, sector_num, nb_sectors)) {
+ return -EIO;
+ }
+
+ /* throttling disk write I/O */
+ if (bs->io_limits_enabled) {
+ bdrv_io_limits_intercept(bs, true, nb_sectors);
+ }
+
+ if (bs->copy_on_read_in_flight) {
+ wait_for_overlapping_requests(bs, sector_num, nb_sectors);
+ }
+
+ tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
+
+ ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
+
+ if (ret < 0) {
+ /* Do nothing, write notifier decided to fail this request */
+ } else if (flags & BDRV_REQ_ZERO_WRITE) {
+ ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
+ } else {
+ ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
+ }
+
+ if (ret == 0 && !bs->enable_write_cache) {
+ ret = bdrv_co_flush(bs);
+ }
+
+ if (bs->dirty_bitmap) {
+ bdrv_set_dirty(bs, sector_num, nb_sectors);
+ }
+
+ if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
+ bs->wr_highest_sector = sector_num + nb_sectors - 1;
+ }
+
+ tracked_request_end(&req);
+
+ return ret;
+}
+
+int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
+{
+ trace_bdrv_co_writev(bs, sector_num, nb_sectors);
+
+ return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
+}
+
+int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors)
+{
+ trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
+
+ return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
+ BDRV_REQ_ZERO_WRITE);
+}
+
+/**
+ * Truncate file to 'offset' bytes (needed only for file protocols)
+ */
+int bdrv_truncate(BlockDriverState *bs, int64_t offset)
+{
+ BlockDriver *drv = bs->drv;
+ int ret;
+ if (!drv)
+ return -ENOMEDIUM;
+ if (!drv->bdrv_truncate)
+ return -ENOTSUP;
+ if (bs->read_only)
+ return -EACCES;
+ if (bdrv_in_use(bs))
+ return -EBUSY;
+ ret = drv->bdrv_truncate(bs, offset);
+ if (ret == 0) {
+ ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
+ bdrv_dev_resize_cb(bs);
+ }
+ return ret;
+}
+
+/**
+ * Length of a allocated file in bytes. Sparse files are counted by actual
+ * allocated space. Return < 0 if error or unknown.
+ */
+int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
+{
+ BlockDriver *drv = bs->drv;
+ if (!drv) {
+ return -ENOMEDIUM;
+ }
+ if (drv->bdrv_get_allocated_file_size) {
+ return drv->bdrv_get_allocated_file_size(bs);
+ }
+ if (bs->file) {
+ return bdrv_get_allocated_file_size(bs->file);
+ }
+ return -ENOTSUP;
+}
+
+/**
+ * Length of a file in bytes. Return < 0 if error or unknown.
+ */
+int64_t bdrv_getlength(BlockDriverState *bs)
+{
+ BlockDriver *drv = bs->drv;
+ if (!drv)
+ return -ENOMEDIUM;
+
+ if (bs->growable || bdrv_dev_has_removable_media(bs)) {
+ if (drv->bdrv_getlength) {
+ return drv->bdrv_getlength(bs);
+ }
+ }
+ return bs->total_sectors * BDRV_SECTOR_SIZE;
+}
+
+/* return 0 as number of sectors if no device present or error */
+void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
+{
+ int64_t length;
+ length = bdrv_getlength(bs);
+ if (length < 0)
+ length = 0;
+ else
+ length = length >> BDRV_SECTOR_BITS;
+ *nb_sectors_ptr = length;
+}
+
+/* throttling disk io limits */
+void bdrv_set_io_limits(BlockDriverState *bs,
+ BlockIOLimit *io_limits)
+{
+ bs->io_limits = *io_limits;
+ bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
+}
+
+void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
+ BlockdevOnError on_write_error)
+{
+ bs->on_read_error = on_read_error;
+ bs->on_write_error = on_write_error;
+}
+
+BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
+{
+ return is_read ? bs->on_read_error : bs->on_write_error;
+}
+
+BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
+{
+ BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
+
+ switch (on_err) {
+ case BLOCKDEV_ON_ERROR_ENOSPC:
+ return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
+ case BLOCKDEV_ON_ERROR_STOP:
+ return BDRV_ACTION_STOP;
+ case BLOCKDEV_ON_ERROR_REPORT:
+ return BDRV_ACTION_REPORT;
+ case BLOCKDEV_ON_ERROR_IGNORE:
+ return BDRV_ACTION_IGNORE;
+ default:
+ abort();
+ }
+}
+
+/* This is done by device models because, while the block layer knows
+ * about the error, it does not know whether an operation comes from
+ * the device or the block layer (from a job, for example).
+ */
+void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
+ bool is_read, int error)
+{
+ assert(error >= 0);
+ bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
+ if (action == BDRV_ACTION_STOP) {
+ vm_stop(RUN_STATE_IO_ERROR);
+ bdrv_iostatus_set_err(bs, error);
+ }
+}
+
+int bdrv_is_read_only(BlockDriverState *bs)
+{
+ return bs->read_only;
+}
+
+int bdrv_is_sg(BlockDriverState *bs)
+{
+ return bs->sg;
+}
+
+int bdrv_enable_write_cache(BlockDriverState *bs)
+{
+ return bs->enable_write_cache;
+}
+
+void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
+{
+ bs->enable_write_cache = wce;
+
+ /* so a reopen() will preserve wce */
+ if (wce) {
+ bs->open_flags |= BDRV_O_CACHE_WB;
+ } else {
+ bs->open_flags &= ~BDRV_O_CACHE_WB;
+ }
+}
+
+int bdrv_is_encrypted(BlockDriverState *bs)
+{
+ if (bs->backing_hd && bs->backing_hd->encrypted)
+ return 1;
+ return bs->encrypted;
+}
+
+int bdrv_key_required(BlockDriverState *bs)
+{
+ BlockDriverState *backing_hd = bs->backing_hd;
+
+ if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
+ return 1;
+ return (bs->encrypted && !bs->valid_key);
+}
+
+int bdrv_set_key(BlockDriverState *bs, const char *key)
+{
+ int ret;
+ if (bs->backing_hd && bs->backing_hd->encrypted) {
+ ret = bdrv_set_key(bs->backing_hd, key);
+ if (ret < 0)
+ return ret;
+ if (!bs->encrypted)
+ return 0;
+ }
+ if (!bs->encrypted) {
+ return -EINVAL;
+ } else if (!bs->drv || !bs->drv->bdrv_set_key) {
+ return -ENOMEDIUM;
+ }
+ ret = bs->drv->bdrv_set_key(bs, key);
+ if (ret < 0) {
+ bs->valid_key = 0;
+ } else if (!bs->valid_key) {
+ bs->valid_key = 1;
+ /* call the change callback now, we skipped it on open */
+ bdrv_dev_change_media_cb(bs, true);
+ }
+ return ret;
+}
+
+const char *bdrv_get_format_name(BlockDriverState *bs)
+{
+ return bs->drv ? bs->drv->format_name : NULL;
+}
+
+void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
+ void *opaque)
+{
+ BlockDriver *drv;
+
+ QLIST_FOREACH(drv, &bdrv_drivers, list) {
+ it(opaque, drv->format_name);
+ }
+}
+
+BlockDriverState *bdrv_find(const char *name)
+{
+ BlockDriverState *bs;
+
+ QTAILQ_FOREACH(bs, &bdrv_states, list) {
+ if (!strcmp(name, bs->device_name)) {
+ return bs;
+ }
+ }
+ return NULL;
+}
+
+BlockDriverState *bdrv_next(BlockDriverState *bs)
+{
+ if (!bs) {
+ return QTAILQ_FIRST(&bdrv_states);
+ }
+ return QTAILQ_NEXT(bs, list);
+}
+
+void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
+{
+ BlockDriverState *bs;
+
+ QTAILQ_FOREACH(bs, &bdrv_states, list) {
+ it(opaque, bs);
+ }
+}
+
+const char *bdrv_get_device_name(BlockDriverState *bs)
+{
+ return bs->device_name;
+}
+
+int bdrv_get_flags(BlockDriverState *bs)
+{
+ return bs->open_flags;
+}
+
+int bdrv_flush_all(void)
+{
+ BlockDriverState *bs;
+ int result = 0;
+
+ QTAILQ_FOREACH(bs, &bdrv_states, list) {
+ int ret = bdrv_flush(bs);
+ if (ret < 0 && !result) {
+ result = ret;
+ }
+ }
+
+ return result;
+}
+
+int bdrv_has_zero_init_1(BlockDriverState *bs)
+{
+ return 1;
+}
+
+int bdrv_has_zero_init(BlockDriverState *bs)
+{
+ assert(bs->drv);
+
+ if (bs->drv->bdrv_has_zero_init) {
+ return bs->drv->bdrv_has_zero_init(bs);
+ }
+
+ /* safe default */
+ return 0;
+}
+
+typedef struct BdrvCoIsAllocatedData {
+ BlockDriverState *bs;
+ BlockDriverState *base;
+ int64_t sector_num;
+ int nb_sectors;
+ int *pnum;
+ int ret;
+ bool done;
+} BdrvCoIsAllocatedData;
+
+/*
+ * Returns true iff the specified sector is present in the disk image. Drivers
+ * not implementing the functionality are assumed to not support backing files,
+ * hence all their sectors are reported as allocated.
+ *
+ * If 'sector_num' is beyond the end of the disk image the return value is 0
+ * and 'pnum' is set to 0.
+ *
+ * 'pnum' is set to the number of sectors (including and immediately following
+ * the specified sector) that are known to be in the same
+ * allocated/unallocated state.
+ *
+ * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
+ * beyond the end of the disk image it will be clamped.
+ */
+int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, int *pnum)
+{
+ int64_t n;
+
+ if (sector_num >= bs->total_sectors) {
+ *pnum = 0;
+ return 0;
+ }
+
+ n = bs->total_sectors - sector_num;
+ if (n < nb_sectors) {
+ nb_sectors = n;
+ }
+
+ if (!bs->drv->bdrv_co_is_allocated) {
+ *pnum = nb_sectors;
+ return 1;
+ }
+
+ return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
+}
+
+/* Coroutine wrapper for bdrv_is_allocated() */
+static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
+{
+ BdrvCoIsAllocatedData *data = opaque;
+ BlockDriverState *bs = data->bs;
+
+ data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
+ data->pnum);
+ data->done = true;
+}
+
+/*
+ * Synchronous wrapper around bdrv_co_is_allocated().
+ *
+ * See bdrv_co_is_allocated() for details.
+ */
+int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
+ int *pnum)
+{
+ Coroutine *co;
+ BdrvCoIsAllocatedData data = {
+ .bs = bs,
+ .sector_num = sector_num,
+ .nb_sectors = nb_sectors,
+ .pnum = pnum,
+ .done = false,
+ };
+
+ co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
+ qemu_coroutine_enter(co, &data);
+ while (!data.done) {
+ qemu_aio_wait();
+ }
+ return data.ret;
+}
+
+/*
+ * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
+ *
+ * Return true if the given sector is allocated in any image between
+ * BASE and TOP (inclusive). BASE can be NULL to check if the given
+ * sector is allocated in any image of the chain. Return false otherwise.
+ *
+ * 'pnum' is set to the number of sectors (including and immediately following
+ * the specified sector) that are known to be in the same
+ * allocated/unallocated state.
+ *
+ */
+int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
+ BlockDriverState *base,
+ int64_t sector_num,
+ int nb_sectors, int *pnum)
+{
+ BlockDriverState *intermediate;
+ int ret, n = nb_sectors;
+
+ intermediate = top;
+ while (intermediate && intermediate != base) {
+ int pnum_inter;
+ ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
+ &pnum_inter);
+ if (ret < 0) {
+ return ret;
+ } else if (ret) {
+ *pnum = pnum_inter;
+ return 1;
+ }
+
+ /*
+ * [sector_num, nb_sectors] is unallocated on top but intermediate
+ * might have
+ *
+ * [sector_num+x, nr_sectors] allocated.
+ */
+ if (n > pnum_inter &&
+ (intermediate == top ||
+ sector_num + pnum_inter < intermediate->total_sectors)) {
+ n = pnum_inter;
+ }
+
+ intermediate = intermediate->backing_hd;
+ }
+
+ *pnum = n;
+ return 0;
+}
+
+/* Coroutine wrapper for bdrv_is_allocated_above() */
+static void coroutine_fn bdrv_is_allocated_above_co_entry(void *opaque)
+{
+ BdrvCoIsAllocatedData *data = opaque;
+ BlockDriverState *top = data->bs;
+ BlockDriverState *base = data->base;
+
+ data->ret = bdrv_co_is_allocated_above(top, base, data->sector_num,
+ data->nb_sectors, data->pnum);
+ data->done = true;
+}
+
+/*
+ * Synchronous wrapper around bdrv_co_is_allocated_above().
+ *
+ * See bdrv_co_is_allocated_above() for details.
+ */
+int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base,
+ int64_t sector_num, int nb_sectors, int *pnum)
+{
+ Coroutine *co;
+ BdrvCoIsAllocatedData data = {
+ .bs = top,
+ .base = base,
+ .sector_num = sector_num,
+ .nb_sectors = nb_sectors,
+ .pnum = pnum,
+ .done = false,
+ };
+
+ co = qemu_coroutine_create(bdrv_is_allocated_above_co_entry);
+ qemu_coroutine_enter(co, &data);
+ while (!data.done) {
+ qemu_aio_wait();
+ }
+ return data.ret;
+}
+
+const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
+{
+ if (bs->backing_hd && bs->backing_hd->encrypted)
+ return bs->backing_file;
+ else if (bs->encrypted)
+ return bs->filename;
+ else
+ return NULL;
+}
+
+void bdrv_get_backing_filename(BlockDriverState *bs,
+ char *filename, int filename_size)
+{
+ pstrcpy(filename, filename_size, bs->backing_file);
+}
+
+int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors)
+{
+ BlockDriver *drv = bs->drv;
+ if (!drv)
+ return -ENOMEDIUM;
+ if (!drv->bdrv_write_compressed)
+ return -ENOTSUP;
+ if (bdrv_check_request(bs, sector_num, nb_sectors))
+ return -EIO;
+
+ assert(!bs->dirty_bitmap);
+
+ return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
+}
+
+int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+ BlockDriver *drv = bs->drv;
+ if (!drv)
+ return -ENOMEDIUM;
+ if (!drv->bdrv_get_info)
+ return -ENOTSUP;
+ memset(bdi, 0, sizeof(*bdi));
+ return drv->bdrv_get_info(bs, bdi);
+}
+
+int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
+ int64_t pos, int size)
+{
+ QEMUIOVector qiov;
+ struct iovec iov = {
+ .iov_base = (void *) buf,
+ .iov_len = size,
+ };
+
+ qemu_iovec_init_external(&qiov, &iov, 1);
+ return bdrv_writev_vmstate(bs, &qiov, pos);
+}
+
+int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
+{
+ BlockDriver *drv = bs->drv;
+
+ if (!drv) {
+ return -ENOMEDIUM;
+ } else if (drv->bdrv_save_vmstate) {
+ return drv->bdrv_save_vmstate(bs, qiov, pos);
+ } else if (bs->file) {
+ return bdrv_writev_vmstate(bs->file, qiov, pos);
+ }
+
+ return -ENOTSUP;
+}
+
+int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
+ int64_t pos, int size)
+{
+ BlockDriver *drv = bs->drv;
+ if (!drv)
+ return -ENOMEDIUM;
+ if (drv->bdrv_load_vmstate)
+ return drv->bdrv_load_vmstate(bs, buf, pos, size);
+ if (bs->file)
+ return bdrv_load_vmstate(bs->file, buf, pos, size);
+ return -ENOTSUP;
+}
+
+void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
+{
+ if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
+ return;
+ }
+
+ bs->drv->bdrv_debug_event(bs, event);
+}
+
+int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
+ const char *tag)
+{
+ while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
+ bs = bs->file;
+ }
+
+ if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
+ return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
+ }
+
+ return -ENOTSUP;
+}
+
+int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
+{
+ while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
+ bs = bs->file;
+ }
+
+ if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
+ return bs->drv->bdrv_debug_resume(bs, tag);
+ }
+
+ return -ENOTSUP;
+}
+
+bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
+{
+ while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
+ bs = bs->file;
+ }
+
+ if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
+ return bs->drv->bdrv_debug_is_suspended(bs, tag);
+ }
+
+ return false;
+}
+
+int bdrv_is_snapshot(BlockDriverState *bs)
+{
+ return !!(bs->open_flags & BDRV_O_SNAPSHOT);
+}
+
+/* backing_file can either be relative, or absolute, or a protocol. If it is
+ * relative, it must be relative to the chain. So, passing in bs->filename
+ * from a BDS as backing_file should not be done, as that may be relative to
+ * the CWD rather than the chain. */
+BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
+ const char *backing_file)
+{
+ char *filename_full = NULL;
+ char *backing_file_full = NULL;
+ char *filename_tmp = NULL;
+ int is_protocol = 0;
+ BlockDriverState *curr_bs = NULL;
+ BlockDriverState *retval = NULL;
+
+ if (!bs || !bs->drv || !backing_file) {
+ return NULL;
+ }
+
+ filename_full = g_malloc(PATH_MAX);
+ backing_file_full = g_malloc(PATH_MAX);
+ filename_tmp = g_malloc(PATH_MAX);
+
+ is_protocol = path_has_protocol(backing_file);
+
+ for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
+
+ /* If either of the filename paths is actually a protocol, then
+ * compare unmodified paths; otherwise make paths relative */
+ if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
+ if (strcmp(backing_file, curr_bs->backing_file) == 0) {
+ retval = curr_bs->backing_hd;
+ break;
+ }
+ } else {
+ /* If not an absolute filename path, make it relative to the current
+ * image's filename path */
+ path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
+ backing_file);
+
+ /* We are going to compare absolute pathnames */
+ if (!realpath(filename_tmp, filename_full)) {
+ continue;
+ }
+
+ /* We need to make sure the backing filename we are comparing against
+ * is relative to the current image filename (or absolute) */
+ path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
+ curr_bs->backing_file);
+
+ if (!realpath(filename_tmp, backing_file_full)) {
+ continue;
+ }
+
+ if (strcmp(backing_file_full, filename_full) == 0) {
+ retval = curr_bs->backing_hd;
+ break;
+ }
+ }
+ }
+
+ g_free(filename_full);
+ g_free(backing_file_full);
+ g_free(filename_tmp);
+ return retval;
+}
+
+int bdrv_get_backing_file_depth(BlockDriverState *bs)
+{
+ if (!bs->drv) {
+ return 0;
+ }
+
+ if (!bs->backing_hd) {
+ return 0;
+ }
+
+ return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
+}
+
+BlockDriverState *bdrv_find_base(BlockDriverState *bs)
+{
+ BlockDriverState *curr_bs = NULL;
+
+ if (!bs) {
+ return NULL;
+ }
+
+ curr_bs = bs;
+
+ while (curr_bs->backing_hd) {
+ curr_bs = curr_bs->backing_hd;
+ }
+ return curr_bs;
+}
+
+/**************************************************************/
+/* async I/Os */
+
+BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
+ QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
+
+ return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
+ cb, opaque, false);
+}
+
+BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
+ QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
+
+ return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
+ cb, opaque, true);
+}
+
+
+typedef struct MultiwriteCB {
+ int error;
+ int num_requests;
+ int num_callbacks;
+ struct {
+ BlockDriverCompletionFunc *cb;
+ void *opaque;
+ QEMUIOVector *free_qiov;
+ } callbacks[];
+} MultiwriteCB;
+
+static void multiwrite_user_cb(MultiwriteCB *mcb)
+{
+ int i;
+
+ for (i = 0; i < mcb->num_callbacks; i++) {
+ mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
+ if (mcb->callbacks[i].free_qiov) {
+ qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
+ }
+ g_free(mcb->callbacks[i].free_qiov);
+ }
+}
+
+static void multiwrite_cb(void *opaque, int ret)
+{
+ MultiwriteCB *mcb = opaque;
+
+ trace_multiwrite_cb(mcb, ret);
+
+ if (ret < 0 && !mcb->error) {
+ mcb->error = ret;
+ }
+
+ mcb->num_requests--;
+ if (mcb->num_requests == 0) {
+ multiwrite_user_cb(mcb);
+ g_free(mcb);
+ }
+}
+
+static int multiwrite_req_compare(const void *a, const void *b)
+{
+ const BlockRequest *req1 = a, *req2 = b;
+
+ /*
+ * Note that we can't simply subtract req2->sector from req1->sector
+ * here as that could overflow the return value.
+ */
+ if (req1->sector > req2->sector) {
+ return 1;
+ } else if (req1->sector < req2->sector) {
+ return -1;
+ } else {
+ return 0;
+ }
+}
+
+/*
+ * Takes a bunch of requests and tries to merge them. Returns the number of
+ * requests that remain after merging.
+ */
+static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
+ int num_reqs, MultiwriteCB *mcb)
+{
+ int i, outidx;
+
+ // Sort requests by start sector
+ qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
+
+ // Check if adjacent requests touch the same clusters. If so, combine them,
+ // filling up gaps with zero sectors.
+ outidx = 0;
+ for (i = 1; i < num_reqs; i++) {
+ int merge = 0;
+ int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
+
+ // Handle exactly sequential writes and overlapping writes.
+ if (reqs[i].sector <= oldreq_last) {
+ merge = 1;
+ }
+
+ if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
+ merge = 0;
+ }
+
+ if (merge) {
+ size_t size;
+ QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
+ qemu_iovec_init(qiov,
+ reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
+
+ // Add the first request to the merged one. If the requests are
+ // overlapping, drop the last sectors of the first request.
+ size = (reqs[i].sector - reqs[outidx].sector) << 9;
+ qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
+
+ // We should need to add any zeros between the two requests
+ assert (reqs[i].sector <= oldreq_last);
+
+ // Add the second request
+ qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
+
+ reqs[outidx].nb_sectors = qiov->size >> 9;
+ reqs[outidx].qiov = qiov;
+
+ mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
+ } else {
+ outidx++;
+ reqs[outidx].sector = reqs[i].sector;
+ reqs[outidx].nb_sectors = reqs[i].nb_sectors;
+ reqs[outidx].qiov = reqs[i].qiov;
+ }
+ }
+
+ return outidx + 1;
+}
+
+/*
+ * Submit multiple AIO write requests at once.
+ *
+ * On success, the function returns 0 and all requests in the reqs array have
+ * been submitted. In error case this function returns -1, and any of the
+ * requests may or may not be submitted yet. In particular, this means that the
+ * callback will be called for some of the requests, for others it won't. The
+ * caller must check the error field of the BlockRequest to wait for the right
+ * callbacks (if error != 0, no callback will be called).
+ *
+ * The implementation may modify the contents of the reqs array, e.g. to merge
+ * requests. However, the fields opaque and error are left unmodified as they
+ * are used to signal failure for a single request to the caller.
+ */
+int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
+{
+ MultiwriteCB *mcb;
+ int i;
+
+ /* don't submit writes if we don't have a medium */
+ if (bs->drv == NULL) {
+ for (i = 0; i < num_reqs; i++) {
+ reqs[i].error = -ENOMEDIUM;
+ }
+ return -1;
+ }
+
+ if (num_reqs == 0) {
+ return 0;
+ }
+
+ // Create MultiwriteCB structure
+ mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
+ mcb->num_requests = 0;
+ mcb->num_callbacks = num_reqs;
+
+ for (i = 0; i < num_reqs; i++) {
+ mcb->callbacks[i].cb = reqs[i].cb;
+ mcb->callbacks[i].opaque = reqs[i].opaque;
+ }
+
+ // Check for mergable requests
+ num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
+
+ trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
+
+ /* Run the aio requests. */
+ mcb->num_requests = num_reqs;
+ for (i = 0; i < num_reqs; i++) {
+ bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
+ reqs[i].nb_sectors, multiwrite_cb, mcb);
+ }
+
+ return 0;
+}
+
+void bdrv_aio_cancel(BlockDriverAIOCB *acb)
+{
+ acb->aiocb_info->cancel(acb);
+}
+
+/* block I/O throttling */
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+ bool is_write, double elapsed_time, uint64_t *wait)
+{
+ uint64_t bps_limit = 0;
+ uint64_t extension;
+ double bytes_limit, bytes_base, bytes_res;
+ double slice_time, wait_time;
+
+ if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+ bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
+ } else if (bs->io_limits.bps[is_write]) {
+ bps_limit = bs->io_limits.bps[is_write];
+ } else {
+ if (wait) {
+ *wait = 0;
+ }
+
+ return false;
+ }
+
+ slice_time = bs->slice_end - bs->slice_start;
+ slice_time /= (NANOSECONDS_PER_SECOND);
+ bytes_limit = bps_limit * slice_time;
+ bytes_base = bs->slice_submitted.bytes[is_write];
+ if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+ bytes_base += bs->slice_submitted.bytes[!is_write];
+ }
+
+ /* bytes_base: the bytes of data which have been read/written; and
+ * it is obtained from the history statistic info.
+ * bytes_res: the remaining bytes of data which need to be read/written.
+ * (bytes_base + bytes_res) / bps_limit: used to calcuate
+ * the total time for completing reading/writting all data.
+ */
+ bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+
+ if (bytes_base + bytes_res <= bytes_limit) {
+ if (wait) {
+ *wait = 0;
+ }
+
+ return false;
+ }
+
+ /* Calc approx time to dispatch */
+ wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
+
+ /* When the I/O rate at runtime exceeds the limits,
+ * bs->slice_end need to be extended in order that the current statistic
+ * info can be kept until the timer fire, so it is increased and tuned
+ * based on the result of experiment.
+ */
+ extension = wait_time * NANOSECONDS_PER_SECOND;
+ extension = DIV_ROUND_UP(extension, BLOCK_IO_SLICE_TIME) *
+ BLOCK_IO_SLICE_TIME;
+ bs->slice_end += extension;
+ if (wait) {
+ *wait = wait_time * NANOSECONDS_PER_SECOND;
+ }
+
+ return true;
+}
+
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+ double elapsed_time, uint64_t *wait)
+{
+ uint64_t iops_limit = 0;
+ double ios_limit, ios_base;
+ double slice_time, wait_time;
+
+ if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+ iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
+ } else if (bs->io_limits.iops[is_write]) {
+ iops_limit = bs->io_limits.iops[is_write];
+ } else {
+ if (wait) {
+ *wait = 0;
+ }
+
+ return false;
+ }
+
+ slice_time = bs->slice_end - bs->slice_start;
+ slice_time /= (NANOSECONDS_PER_SECOND);
+ ios_limit = iops_limit * slice_time;
+ ios_base = bs->slice_submitted.ios[is_write];
+ if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+ ios_base += bs->slice_submitted.ios[!is_write];
+ }
+
+ if (ios_base + 1 <= ios_limit) {
+ if (wait) {
+ *wait = 0;
+ }
+
+ return false;
+ }
+
+ /* Calc approx time to dispatch, in seconds */
+ wait_time = (ios_base + 1) / iops_limit;
+ if (wait_time > elapsed_time) {
+ wait_time = wait_time - elapsed_time;
+ } else {
+ wait_time = 0;
+ }
+
+ /* Exceeded current slice, extend it by another slice time */
+ bs->slice_end += BLOCK_IO_SLICE_TIME;
+ if (wait) {
+ *wait = wait_time * NANOSECONDS_PER_SECOND;
+ }
+
+ return true;
+}
+
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+ bool is_write, int64_t *wait)
+{
+ int64_t now, max_wait;
+ uint64_t bps_wait = 0, iops_wait = 0;
+ double elapsed_time;
+ int bps_ret, iops_ret;
+
+ now = qemu_get_clock_ns(vm_clock);
+ if (now > bs->slice_end) {
+ bs->slice_start = now;
+ bs->slice_end = now + BLOCK_IO_SLICE_TIME;
+ memset(&bs->slice_submitted, 0, sizeof(bs->slice_submitted));
+ }
+
+ elapsed_time = now - bs->slice_start;
+ elapsed_time /= (NANOSECONDS_PER_SECOND);
+
+ bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors,
+ is_write, elapsed_time, &bps_wait);
+ iops_ret = bdrv_exceed_iops_limits(bs, is_write,
+ elapsed_time, &iops_wait);
+ if (bps_ret || iops_ret) {
+ max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
+ if (wait) {
+ *wait = max_wait;
+ }
+
+ now = qemu_get_clock_ns(vm_clock);
+ if (bs->slice_end < now + max_wait) {
+ bs->slice_end = now + max_wait;
+ }
+
+ return true;
+ }
+
+ if (wait) {
+ *wait = 0;
+ }
+
+ bs->slice_submitted.bytes[is_write] += (int64_t)nb_sectors *
+ BDRV_SECTOR_SIZE;
+ bs->slice_submitted.ios[is_write]++;
+
+ return false;
+}
+
+/**************************************************************/
+/* async block device emulation */
+
+typedef struct BlockDriverAIOCBSync {
+ BlockDriverAIOCB common;
+ QEMUBH *bh;
+ int ret;
+ /* vector translation state */
+ QEMUIOVector *qiov;
+ uint8_t *bounce;
+ int is_write;
+} BlockDriverAIOCBSync;
+
+static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
+{
+ BlockDriverAIOCBSync *acb =
+ container_of(blockacb, BlockDriverAIOCBSync, common);
+ qemu_bh_delete(acb->bh);
+ acb->bh = NULL;
+ qemu_aio_release(acb);
+}
+
+static const AIOCBInfo bdrv_em_aiocb_info = {
+ .aiocb_size = sizeof(BlockDriverAIOCBSync),
+ .cancel = bdrv_aio_cancel_em,
+};
+
+static void bdrv_aio_bh_cb(void *opaque)
+{
+ BlockDriverAIOCBSync *acb = opaque;
+
+ if (!acb->is_write)
+ qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
+ qemu_vfree(acb->bounce);
+ acb->common.cb(acb->common.opaque, acb->ret);
+ qemu_bh_delete(acb->bh);
+ acb->bh = NULL;
+ qemu_aio_release(acb);
+}
+
+static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque,
+ int is_write)
+
+{
+ BlockDriverAIOCBSync *acb;
+
+ acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
+ acb->is_write = is_write;
+ acb->qiov = qiov;
+ acb->bounce = qemu_blockalign(bs, qiov->size);
+ acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
+
+ if (is_write) {
+ qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
+ acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
+ } else {
+ acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
+ }
+
+ qemu_bh_schedule(acb->bh);
+
+ return &acb->common;
+}
+
+static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
+}
+
+static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
+}
+
+
+typedef struct BlockDriverAIOCBCoroutine {
+ BlockDriverAIOCB common;
+ BlockRequest req;
+ bool is_write;
+ bool *done;
+ QEMUBH* bh;
+} BlockDriverAIOCBCoroutine;
+
+static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
+{
+ BlockDriverAIOCBCoroutine *acb =
+ container_of(blockacb, BlockDriverAIOCBCoroutine, common);
+ bool done = false;
+
+ acb->done = &done;
+ while (!done) {
+ qemu_aio_wait();
+ }
+}
+
+static const AIOCBInfo bdrv_em_co_aiocb_info = {
+ .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
+ .cancel = bdrv_aio_co_cancel_em,
+};
+
+static void bdrv_co_em_bh(void *opaque)
+{
+ BlockDriverAIOCBCoroutine *acb = opaque;
+
+ acb->common.cb(acb->common.opaque, acb->req.error);
+
+ if (acb->done) {
+ *acb->done = true;
+ }
+
+ qemu_bh_delete(acb->bh);
+ qemu_aio_release(acb);
+}
+
+/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
+static void coroutine_fn bdrv_co_do_rw(void *opaque)
+{
+ BlockDriverAIOCBCoroutine *acb = opaque;
+ BlockDriverState *bs = acb->common.bs;
+
+ if (!acb->is_write) {
+ acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
+ acb->req.nb_sectors, acb->req.qiov, 0);
+ } else {
+ acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
+ acb->req.nb_sectors, acb->req.qiov, 0);
+ }
+
+ acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
+ qemu_bh_schedule(acb->bh);
+}
+
+static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque,
+ bool is_write)
+{
+ Coroutine *co;
+ BlockDriverAIOCBCoroutine *acb;
+
+ acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
+ acb->req.sector = sector_num;
+ acb->req.nb_sectors = nb_sectors;
+ acb->req.qiov = qiov;
+ acb->is_write = is_write;
+ acb->done = NULL;
+
+ co = qemu_coroutine_create(bdrv_co_do_rw);
+ qemu_coroutine_enter(co, acb);
+
+ return &acb->common;
+}
+
+static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
+{
+ BlockDriverAIOCBCoroutine *acb = opaque;
+ BlockDriverState *bs = acb->common.bs;
+
+ acb->req.error = bdrv_co_flush(bs);
+ acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
+ qemu_bh_schedule(acb->bh);
+}
+
+BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ trace_bdrv_aio_flush(bs, opaque);
+
+ Coroutine *co;
+ BlockDriverAIOCBCoroutine *acb;
+
+ acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
+ acb->done = NULL;
+
+ co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
+ qemu_coroutine_enter(co, acb);
+
+ return &acb->common;
+}
+
+static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
+{
+ BlockDriverAIOCBCoroutine *acb = opaque;
+ BlockDriverState *bs = acb->common.bs;
+
+ acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
+ acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
+ qemu_bh_schedule(acb->bh);
+}
+
+BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ Coroutine *co;
+ BlockDriverAIOCBCoroutine *acb;
+
+ trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
+
+ acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
+ acb->req.sector = sector_num;
+ acb->req.nb_sectors = nb_sectors;
+ acb->done = NULL;
+ co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
+ qemu_coroutine_enter(co, acb);
+
+ return &acb->common;
+}
+
+void bdrv_init(void)
+{
+ module_call_init(MODULE_INIT_BLOCK);
+}
+
+void bdrv_init_with_whitelist(void)
+{
+ use_bdrv_whitelist = 1;
+ bdrv_init();
+}
+
+void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BlockDriverAIOCB *acb;
+
+ acb = g_slice_alloc(aiocb_info->aiocb_size);
+ acb->aiocb_info = aiocb_info;
+ acb->bs = bs;
+ acb->cb = cb;
+ acb->opaque = opaque;
+ return acb;
+}
+
+void qemu_aio_release(void *p)
+{
+ BlockDriverAIOCB *acb = p;
+ g_slice_free1(acb->aiocb_info->aiocb_size, acb);
+}
+
+/**************************************************************/
+/* Coroutine block device emulation */
+
+typedef struct CoroutineIOCompletion {
+ Coroutine *coroutine;
+ int ret;
+} CoroutineIOCompletion;
+
+static void bdrv_co_io_em_complete(void *opaque, int ret)
+{
+ CoroutineIOCompletion *co = opaque;
+
+ co->ret = ret;
+ qemu_coroutine_enter(co->coroutine, NULL);
+}
+
+static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *iov,
+ bool is_write)
+{
+ CoroutineIOCompletion co = {
+ .coroutine = qemu_coroutine_self(),
+ };
+ BlockDriverAIOCB *acb;
+
+ if (is_write) {
+ acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
+ bdrv_co_io_em_complete, &co);
+ } else {
+ acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
+ bdrv_co_io_em_complete, &co);
+ }
+
+ trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
+ if (!acb) {
+ return -EIO;
+ }
+ qemu_coroutine_yield();
+
+ return co.ret;
+}
+
+static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ QEMUIOVector *iov)
+{
+ return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
+}
+
+static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ QEMUIOVector *iov)
+{
+ return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
+}
+
+static void coroutine_fn bdrv_flush_co_entry(void *opaque)
+{
+ RwCo *rwco = opaque;
+
+ rwco->ret = bdrv_co_flush(rwco->bs);
+}
+
+int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
+{
+ int ret;
+
+ if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
+ return 0;
+ }
+
+ /* Write back cached data to the OS even with cache=unsafe */
+ BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
+ if (bs->drv->bdrv_co_flush_to_os) {
+ ret = bs->drv->bdrv_co_flush_to_os(bs);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ /* But don't actually force it to the disk with cache=unsafe */
+ if (bs->open_flags & BDRV_O_NO_FLUSH) {
+ goto flush_parent;
+ }
+
+ BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
+ if (bs->drv->bdrv_co_flush_to_disk) {
+ ret = bs->drv->bdrv_co_flush_to_disk(bs);
+ } else if (bs->drv->bdrv_aio_flush) {
+ BlockDriverAIOCB *acb;
+ CoroutineIOCompletion co = {
+ .coroutine = qemu_coroutine_self(),
+ };
+
+ acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
+ if (acb == NULL) {
+ ret = -EIO;
+ } else {
+ qemu_coroutine_yield();
+ ret = co.ret;
+ }
+ } else {
+ /*
+ * Some block drivers always operate in either writethrough or unsafe
+ * mode and don't support bdrv_flush therefore. Usually qemu doesn't
+ * know how the server works (because the behaviour is hardcoded or
+ * depends on server-side configuration), so we can't ensure that
+ * everything is safe on disk. Returning an error doesn't work because
+ * that would break guests even if the server operates in writethrough
+ * mode.
+ *
+ * Let's hope the user knows what he's doing.
+ */
+ ret = 0;
+ }
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
+ * in the case of cache=unsafe, so there are no useless flushes.
+ */
+flush_parent:
+ return bdrv_co_flush(bs->file);
+}
+
+void bdrv_invalidate_cache(BlockDriverState *bs)
+{
+ if (bs->drv && bs->drv->bdrv_invalidate_cache) {
+ bs->drv->bdrv_invalidate_cache(bs);
+ }
+}
+
+void bdrv_invalidate_cache_all(void)
+{
+ BlockDriverState *bs;
+
+ QTAILQ_FOREACH(bs, &bdrv_states, list) {
+ bdrv_invalidate_cache(bs);
+ }
+}
+
+void bdrv_clear_incoming_migration_all(void)
+{
+ BlockDriverState *bs;
+
+ QTAILQ_FOREACH(bs, &bdrv_states, list) {
+ bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
+ }
+}
+
+int bdrv_flush(BlockDriverState *bs)
+{
+ Coroutine *co;
+ RwCo rwco = {
+ .bs = bs,
+ .ret = NOT_DONE,
+ };
+
+ if (qemu_in_coroutine()) {
+ /* Fast-path if already in coroutine context */
+ bdrv_flush_co_entry(&rwco);
+ } else {
+ co = qemu_coroutine_create(bdrv_flush_co_entry);
+ qemu_coroutine_enter(co, &rwco);
+ while (rwco.ret == NOT_DONE) {
+ qemu_aio_wait();
+ }
+ }
+
+ return rwco.ret;
+}
+
+static void coroutine_fn bdrv_discard_co_entry(void *opaque)
+{
+ RwCo *rwco = opaque;
+
+ rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
+}
+
+int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors)
+{
+ if (!bs->drv) {
+ return -ENOMEDIUM;
+ } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
+ return -EIO;
+ } else if (bs->read_only) {
+ return -EROFS;
+ }
+
+ if (bs->dirty_bitmap) {
+ bdrv_reset_dirty(bs, sector_num, nb_sectors);
+ }
+
+ /* Do nothing if disabled. */
+ if (!(bs->open_flags & BDRV_O_UNMAP)) {
+ return 0;
+ }
+
+ if (bs->drv->bdrv_co_discard) {
+ return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
+ } else if (bs->drv->bdrv_aio_discard) {
+ BlockDriverAIOCB *acb;
+ CoroutineIOCompletion co = {
+ .coroutine = qemu_coroutine_self(),
+ };
+
+ acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
+ bdrv_co_io_em_complete, &co);
+ if (acb == NULL) {
+ return -EIO;
+ } else {
+ qemu_coroutine_yield();
+ return co.ret;
+ }
+ } else {
+ return 0;
+ }
+}
+
+int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
+{
+ Coroutine *co;
+ RwCo rwco = {
+ .bs = bs,
+ .sector_num = sector_num,
+ .nb_sectors = nb_sectors,
+ .ret = NOT_DONE,
+ };
+
+ if (qemu_in_coroutine()) {
+ /* Fast-path if already in coroutine context */
+ bdrv_discard_co_entry(&rwco);
+ } else {
+ co = qemu_coroutine_create(bdrv_discard_co_entry);
+ qemu_coroutine_enter(co, &rwco);
+ while (rwco.ret == NOT_DONE) {
+ qemu_aio_wait();
+ }
+ }
+
+ return rwco.ret;
+}
+
+/**************************************************************/
+/* removable device support */
+
+/**
+ * Return TRUE if the media is present
+ */
+int bdrv_is_inserted(BlockDriverState *bs)
+{
+ BlockDriver *drv = bs->drv;
+
+ if (!drv)
+ return 0;
+ if (!drv->bdrv_is_inserted)
+ return 1;
+ return drv->bdrv_is_inserted(bs);
+}
+
+/**
+ * Return whether the media changed since the last call to this
+ * function, or -ENOTSUP if we don't know. Most drivers don't know.
+ */
+int bdrv_media_changed(BlockDriverState *bs)
+{
+ BlockDriver *drv = bs->drv;
+
+ if (drv && drv->bdrv_media_changed) {
+ return drv->bdrv_media_changed(bs);
+ }
+ return -ENOTSUP;
+}
+
+/**
+ * If eject_flag is TRUE, eject the media. Otherwise, close the tray
+ */
+void bdrv_eject(BlockDriverState *bs, bool eject_flag)
+{
+ BlockDriver *drv = bs->drv;
+
+ if (drv && drv->bdrv_eject) {
+ drv->bdrv_eject(bs, eject_flag);
+ }
+
+ if (bs->device_name[0] != '\0') {
+ bdrv_emit_qmp_eject_event(bs, eject_flag);
+ }
+}
+
+/**
+ * Lock or unlock the media (if it is locked, the user won't be able
+ * to eject it manually).
+ */
+void bdrv_lock_medium(BlockDriverState *bs, bool locked)
+{
+ BlockDriver *drv = bs->drv;
+
+ trace_bdrv_lock_medium(bs, locked);
+
+ if (drv && drv->bdrv_lock_medium) {
+ drv->bdrv_lock_medium(bs, locked);
+ }
+}
+
+/* needed for generic scsi interface */
+
+int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+{
+ BlockDriver *drv = bs->drv;
+
+ if (drv && drv->bdrv_ioctl)
+ return drv->bdrv_ioctl(bs, req, buf);
+ return -ENOTSUP;
+}
+
+BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
+ unsigned long int req, void *buf,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BlockDriver *drv = bs->drv;
+
+ if (drv && drv->bdrv_aio_ioctl)
+ return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
+ return NULL;
+}
+
+void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
+{
+ bs->buffer_alignment = align;
+}
+
+void *qemu_blockalign(BlockDriverState *bs, size_t size)
+{
+ return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
+}
+
+/*
+ * Check if all memory in this vector is sector aligned.
+ */
+bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
+{
+ int i;
+
+ for (i = 0; i < qiov->niov; i++) {
+ if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity)
+{
+ int64_t bitmap_size;
+
+ assert((granularity & (granularity - 1)) == 0);
+
+ if (granularity) {
+ granularity >>= BDRV_SECTOR_BITS;
+ assert(!bs->dirty_bitmap);
+ bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
+ bs->dirty_bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
+ } else {
+ if (bs->dirty_bitmap) {
+ hbitmap_free(bs->dirty_bitmap);
+ bs->dirty_bitmap = NULL;
+ }
+ }
+}
+
+int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
+{
+ if (bs->dirty_bitmap) {
+ return hbitmap_get(bs->dirty_bitmap, sector);
+ } else {
+ return 0;
+ }
+}
+
+void bdrv_dirty_iter_init(BlockDriverState *bs, HBitmapIter *hbi)
+{
+ hbitmap_iter_init(hbi, bs->dirty_bitmap, 0);
+}
+
+void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
+ int nr_sectors)
+{
+ hbitmap_set(bs->dirty_bitmap, cur_sector, nr_sectors);
+}
+
+void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
+ int nr_sectors)
+{
+ hbitmap_reset(bs->dirty_bitmap, cur_sector, nr_sectors);
+}
+
+int64_t bdrv_get_dirty_count(BlockDriverState *bs)
+{
+ if (bs->dirty_bitmap) {
+ return hbitmap_count(bs->dirty_bitmap);
+ } else {
+ return 0;
+ }
+}
+
+void bdrv_set_in_use(BlockDriverState *bs, int in_use)
+{
+ assert(bs->in_use != in_use);
+ bs->in_use = in_use;
+}
+
+int bdrv_in_use(BlockDriverState *bs)
+{
+ return bs->in_use;
+}
+
+void bdrv_iostatus_enable(BlockDriverState *bs)
+{
+ bs->iostatus_enabled = true;
+ bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
+}
+
+/* The I/O status is only enabled if the drive explicitly
+ * enables it _and_ the VM is configured to stop on errors */
+bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
+{
+ return (bs->iostatus_enabled &&
+ (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
+ bs->on_write_error == BLOCKDEV_ON_ERROR_STOP ||
+ bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
+}
+
+void bdrv_iostatus_disable(BlockDriverState *bs)
+{
+ bs->iostatus_enabled = false;
+}
+
+void bdrv_iostatus_reset(BlockDriverState *bs)
+{
+ if (bdrv_iostatus_is_enabled(bs)) {
+ bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
+ if (bs->job) {
+ block_job_iostatus_reset(bs->job);
+ }
+ }
+}
+
+void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
+{
+ assert(bdrv_iostatus_is_enabled(bs));
+ if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
+ bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
+ BLOCK_DEVICE_IO_STATUS_FAILED;
+ }
+}
+
+void
+bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
+ enum BlockAcctType type)
+{
+ assert(type < BDRV_MAX_IOTYPE);
+
+ cookie->bytes = bytes;
+ cookie->start_time_ns = get_clock();
+ cookie->type = type;
+}
+
+void
+bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
+{
+ assert(cookie->type < BDRV_MAX_IOTYPE);
+
+ bs->nr_bytes[cookie->type] += cookie->bytes;
+ bs->nr_ops[cookie->type]++;
+ bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
+}
+
+void bdrv_img_create(const char *filename, const char *fmt,
+ const char *base_filename, const char *base_fmt,
+ char *options, uint64_t img_size, int flags,
+ Error **errp, bool quiet)
+{
+ QEMUOptionParameter *param = NULL, *create_options = NULL;
+ QEMUOptionParameter *backing_fmt, *backing_file, *size;
+ BlockDriverState *bs = NULL;
+ BlockDriver *drv, *proto_drv;
+ BlockDriver *backing_drv = NULL;
+ int ret = 0;
+
+ /* Find driver and parse its options */
+ drv = bdrv_find_format(fmt);
+ if (!drv) {
+ error_setg(errp, "Unknown file format '%s'", fmt);
+ return;
+ }
+
+ proto_drv = bdrv_find_protocol(filename, true);
+ if (!proto_drv) {
+ error_setg(errp, "Unknown protocol '%s'", filename);
+ return;
+ }
+
+ create_options = append_option_parameters(create_options,
+ drv->create_options);
+ create_options = append_option_parameters(create_options,
+ proto_drv->create_options);
+
+ /* Create parameter list with default values */
+ param = parse_option_parameters("", create_options, param);
+
+ set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
+
+ /* Parse -o options */
+ if (options) {
+ param = parse_option_parameters(options, create_options, param);
+ if (param == NULL) {
+ error_setg(errp, "Invalid options for file format '%s'.", fmt);
+ goto out;
+ }
+ }
+
+ if (base_filename) {
+ if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
+ base_filename)) {
+ error_setg(errp, "Backing file not supported for file format '%s'",
+ fmt);
+ goto out;
+ }
+ }
+
+ if (base_fmt) {
+ if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
+ error_setg(errp, "Backing file format not supported for file "
+ "format '%s'", fmt);
+ goto out;
+ }
+ }
+
+ backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
+ if (backing_file && backing_file->value.s) {
+ if (!strcmp(filename, backing_file->value.s)) {
+ error_setg(errp, "Error: Trying to create an image with the "
+ "same filename as the backing file");
+ goto out;
+ }
+ }
+
+ backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
+ if (backing_fmt && backing_fmt->value.s) {
+ backing_drv = bdrv_find_format(backing_fmt->value.s);
+ if (!backing_drv) {
+ error_setg(errp, "Unknown backing file format '%s'",
+ backing_fmt->value.s);
+ goto out;
+ }
+ }
+
+ // The size for the image must always be specified, with one exception:
+ // If we are using a backing file, we can obtain the size from there
+ size = get_option_parameter(param, BLOCK_OPT_SIZE);
+ if (size && size->value.n == -1) {
+ if (backing_file && backing_file->value.s) {
+ uint64_t size;
+ char buf[32];
+ int back_flags;
+
+ /* backing files always opened read-only */
+ back_flags =
+ flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
+
+ bs = bdrv_new("");
+
+ ret = bdrv_open(bs, backing_file->value.s, NULL, back_flags,
+ backing_drv);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "Could not open '%s'",
+ backing_file->value.s);
+ goto out;
+ }
+ bdrv_get_geometry(bs, &size);
+ size *= 512;
+
+ snprintf(buf, sizeof(buf), "%" PRId64, size);
+ set_option_parameter(param, BLOCK_OPT_SIZE, buf);
+ } else {
+ error_setg(errp, "Image creation needs a size parameter");
+ goto out;
+ }
+ }
+
+ if (!quiet) {
+ printf("Formatting '%s', fmt=%s ", filename, fmt);
+ print_option_parameters(param);
+ puts("");
+ }
+ ret = bdrv_create(drv, filename, param);
+ if (ret < 0) {
+ if (ret == -ENOTSUP) {
+ error_setg(errp,"Formatting or formatting option not supported for "
+ "file format '%s'", fmt);
+ } else if (ret == -EFBIG) {
+ const char *cluster_size_hint = "";
+ if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
+ cluster_size_hint = " (try using a larger cluster size)";
+ }
+ error_setg(errp, "The image size is too large for file format '%s'%s",
+ fmt, cluster_size_hint);
+ } else {
+ error_setg(errp, "%s: error while creating %s: %s", filename, fmt,
+ strerror(-ret));
+ }
+ }
+
+out:
+ free_option_parameters(create_options);
+ free_option_parameters(param);
+
+ if (bs) {
+ bdrv_delete(bs);
+ }
+}
+
+AioContext *bdrv_get_aio_context(BlockDriverState *bs)
+{
+ /* Currently BlockDriverState always uses the main loop AioContext */
+ return qemu_get_aio_context();
+}
+
+void bdrv_add_before_write_notifier(BlockDriverState *bs,
+ NotifierWithReturn *notifier)
+{
+ notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
+}
diff --git a/contrib/qemu/block/qcow.c b/contrib/qemu/block/qcow.c
new file mode 100644
index 000000000..5239bd68f
--- /dev/null
+++ b/contrib/qemu/block/qcow.c
@@ -0,0 +1,914 @@
+/*
+ * Block driver for the QCOW format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include <zlib.h>
+#include "qemu/aes.h"
+#include "migration/migration.h"
+
+/**************************************************************/
+/* QEMU COW block driver with compression and encryption support */
+
+#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
+#define QCOW_VERSION 1
+
+#define QCOW_CRYPT_NONE 0
+#define QCOW_CRYPT_AES 1
+
+#define QCOW_OFLAG_COMPRESSED (1LL << 63)
+
+typedef struct QCowHeader {
+ uint32_t magic;
+ uint32_t version;
+ uint64_t backing_file_offset;
+ uint32_t backing_file_size;
+ uint32_t mtime;
+ uint64_t size; /* in bytes */
+ uint8_t cluster_bits;
+ uint8_t l2_bits;
+ uint32_t crypt_method;
+ uint64_t l1_table_offset;
+} QCowHeader;
+
+#define L2_CACHE_SIZE 16
+
+typedef struct BDRVQcowState {
+ int cluster_bits;
+ int cluster_size;
+ int cluster_sectors;
+ int l2_bits;
+ int l2_size;
+ int l1_size;
+ uint64_t cluster_offset_mask;
+ uint64_t l1_table_offset;
+ uint64_t *l1_table;
+ uint64_t *l2_cache;
+ uint64_t l2_cache_offsets[L2_CACHE_SIZE];
+ uint32_t l2_cache_counts[L2_CACHE_SIZE];
+ uint8_t *cluster_cache;
+ uint8_t *cluster_data;
+ uint64_t cluster_cache_offset;
+ uint32_t crypt_method; /* current crypt method, 0 if no key yet */
+ uint32_t crypt_method_header;
+ AES_KEY aes_encrypt_key;
+ AES_KEY aes_decrypt_key;
+ CoMutex lock;
+ Error *migration_blocker;
+} BDRVQcowState;
+
+static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
+
+static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+ const QCowHeader *cow_header = (const void *)buf;
+
+ if (buf_size >= sizeof(QCowHeader) &&
+ be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
+ be32_to_cpu(cow_header->version) == QCOW_VERSION)
+ return 100;
+ else
+ return 0;
+}
+
+static int qcow_open(BlockDriverState *bs, QDict *options, int flags)
+{
+ BDRVQcowState *s = bs->opaque;
+ int len, i, shift, ret;
+ QCowHeader header;
+
+ ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
+ if (ret < 0) {
+ goto fail;
+ }
+ be32_to_cpus(&header.magic);
+ be32_to_cpus(&header.version);
+ be64_to_cpus(&header.backing_file_offset);
+ be32_to_cpus(&header.backing_file_size);
+ be32_to_cpus(&header.mtime);
+ be64_to_cpus(&header.size);
+ be32_to_cpus(&header.crypt_method);
+ be64_to_cpus(&header.l1_table_offset);
+
+ if (header.magic != QCOW_MAGIC) {
+ ret = -EMEDIUMTYPE;
+ goto fail;
+ }
+ if (header.version != QCOW_VERSION) {
+ char version[64];
+ snprintf(version, sizeof(version), "QCOW version %d", header.version);
+ qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
+ bs->device_name, "qcow", version);
+ ret = -ENOTSUP;
+ goto fail;
+ }
+
+ if (header.size <= 1 || header.cluster_bits < 9) {
+ ret = -EINVAL;
+ goto fail;
+ }
+ if (header.crypt_method > QCOW_CRYPT_AES) {
+ ret = -EINVAL;
+ goto fail;
+ }
+ s->crypt_method_header = header.crypt_method;
+ if (s->crypt_method_header) {
+ bs->encrypted = 1;
+ }
+ s->cluster_bits = header.cluster_bits;
+ s->cluster_size = 1 << s->cluster_bits;
+ s->cluster_sectors = 1 << (s->cluster_bits - 9);
+ s->l2_bits = header.l2_bits;
+ s->l2_size = 1 << s->l2_bits;
+ bs->total_sectors = header.size / 512;
+ s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;
+
+ /* read the level 1 table */
+ shift = s->cluster_bits + s->l2_bits;
+ s->l1_size = (header.size + (1LL << shift) - 1) >> shift;
+
+ s->l1_table_offset = header.l1_table_offset;
+ s->l1_table = g_malloc(s->l1_size * sizeof(uint64_t));
+
+ ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
+ s->l1_size * sizeof(uint64_t));
+ if (ret < 0) {
+ goto fail;
+ }
+
+ for(i = 0;i < s->l1_size; i++) {
+ be64_to_cpus(&s->l1_table[i]);
+ }
+ /* alloc L2 cache */
+ s->l2_cache = g_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+ s->cluster_cache = g_malloc(s->cluster_size);
+ s->cluster_data = g_malloc(s->cluster_size);
+ s->cluster_cache_offset = -1;
+
+ /* read the backing file name */
+ if (header.backing_file_offset != 0) {
+ len = header.backing_file_size;
+ if (len > 1023) {
+ len = 1023;
+ }
+ ret = bdrv_pread(bs->file, header.backing_file_offset,
+ bs->backing_file, len);
+ if (ret < 0) {
+ goto fail;
+ }
+ bs->backing_file[len] = '\0';
+ }
+
+ /* Disable migration when qcow images are used */
+ error_set(&s->migration_blocker,
+ QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
+ "qcow", bs->device_name, "live migration");
+ migrate_add_blocker(s->migration_blocker);
+
+ qemu_co_mutex_init(&s->lock);
+ return 0;
+
+ fail:
+ g_free(s->l1_table);
+ g_free(s->l2_cache);
+ g_free(s->cluster_cache);
+ g_free(s->cluster_data);
+ return ret;
+}
+
+
+/* We have nothing to do for QCOW reopen, stubs just return
+ * success */
+static int qcow_reopen_prepare(BDRVReopenState *state,
+ BlockReopenQueue *queue, Error **errp)
+{
+ return 0;
+}
+
+static int qcow_set_key(BlockDriverState *bs, const char *key)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint8_t keybuf[16];
+ int len, i;
+
+ memset(keybuf, 0, 16);
+ len = strlen(key);
+ if (len > 16)
+ len = 16;
+ /* XXX: we could compress the chars to 7 bits to increase
+ entropy */
+ for(i = 0;i < len;i++) {
+ keybuf[i] = key[i];
+ }
+ s->crypt_method = s->crypt_method_header;
+
+ if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
+ return -1;
+ if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
+ return -1;
+ return 0;
+}
+
+/* The crypt function is compatible with the linux cryptoloop
+ algorithm for < 4 GB images. NOTE: out_buf == in_buf is
+ supported */
+static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
+ uint8_t *out_buf, const uint8_t *in_buf,
+ int nb_sectors, int enc,
+ const AES_KEY *key)
+{
+ union {
+ uint64_t ll[2];
+ uint8_t b[16];
+ } ivec;
+ int i;
+
+ for(i = 0; i < nb_sectors; i++) {
+ ivec.ll[0] = cpu_to_le64(sector_num);
+ ivec.ll[1] = 0;
+ AES_cbc_encrypt(in_buf, out_buf, 512, key,
+ ivec.b, enc);
+ sector_num++;
+ in_buf += 512;
+ out_buf += 512;
+ }
+}
+
+/* 'allocate' is:
+ *
+ * 0 to not allocate.
+ *
+ * 1 to allocate a normal cluster (for sector indexes 'n_start' to
+ * 'n_end')
+ *
+ * 2 to allocate a compressed cluster of size
+ * 'compressed_size'. 'compressed_size' must be > 0 and <
+ * cluster_size
+ *
+ * return 0 if not allocated.
+ */
+static uint64_t get_cluster_offset(BlockDriverState *bs,
+ uint64_t offset, int allocate,
+ int compressed_size,
+ int n_start, int n_end)
+{
+ BDRVQcowState *s = bs->opaque;
+ int min_index, i, j, l1_index, l2_index;
+ uint64_t l2_offset, *l2_table, cluster_offset, tmp;
+ uint32_t min_count;
+ int new_l2_table;
+
+ l1_index = offset >> (s->l2_bits + s->cluster_bits);
+ l2_offset = s->l1_table[l1_index];
+ new_l2_table = 0;
+ if (!l2_offset) {
+ if (!allocate)
+ return 0;
+ /* allocate a new l2 entry */
+ l2_offset = bdrv_getlength(bs->file);
+ /* round to cluster size */
+ l2_offset = (l2_offset + s->cluster_size - 1) & ~(s->cluster_size - 1);
+ /* update the L1 entry */
+ s->l1_table[l1_index] = l2_offset;
+ tmp = cpu_to_be64(l2_offset);
+ if (bdrv_pwrite_sync(bs->file,
+ s->l1_table_offset + l1_index * sizeof(tmp),
+ &tmp, sizeof(tmp)) < 0)
+ return 0;
+ new_l2_table = 1;
+ }
+ for(i = 0; i < L2_CACHE_SIZE; i++) {
+ if (l2_offset == s->l2_cache_offsets[i]) {
+ /* increment the hit count */
+ if (++s->l2_cache_counts[i] == 0xffffffff) {
+ for(j = 0; j < L2_CACHE_SIZE; j++) {
+ s->l2_cache_counts[j] >>= 1;
+ }
+ }
+ l2_table = s->l2_cache + (i << s->l2_bits);
+ goto found;
+ }
+ }
+ /* not found: load a new entry in the least used one */
+ min_index = 0;
+ min_count = 0xffffffff;
+ for(i = 0; i < L2_CACHE_SIZE; i++) {
+ if (s->l2_cache_counts[i] < min_count) {
+ min_count = s->l2_cache_counts[i];
+ min_index = i;
+ }
+ }
+ l2_table = s->l2_cache + (min_index << s->l2_bits);
+ if (new_l2_table) {
+ memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
+ if (bdrv_pwrite_sync(bs->file, l2_offset, l2_table,
+ s->l2_size * sizeof(uint64_t)) < 0)
+ return 0;
+ } else {
+ if (bdrv_pread(bs->file, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
+ s->l2_size * sizeof(uint64_t))
+ return 0;
+ }
+ s->l2_cache_offsets[min_index] = l2_offset;
+ s->l2_cache_counts[min_index] = 1;
+ found:
+ l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+ cluster_offset = be64_to_cpu(l2_table[l2_index]);
+ if (!cluster_offset ||
+ ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) {
+ if (!allocate)
+ return 0;
+ /* allocate a new cluster */
+ if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
+ (n_end - n_start) < s->cluster_sectors) {
+ /* if the cluster is already compressed, we must
+ decompress it in the case it is not completely
+ overwritten */
+ if (decompress_cluster(bs, cluster_offset) < 0)
+ return 0;
+ cluster_offset = bdrv_getlength(bs->file);
+ cluster_offset = (cluster_offset + s->cluster_size - 1) &
+ ~(s->cluster_size - 1);
+ /* write the cluster content */
+ if (bdrv_pwrite(bs->file, cluster_offset, s->cluster_cache, s->cluster_size) !=
+ s->cluster_size)
+ return -1;
+ } else {
+ cluster_offset = bdrv_getlength(bs->file);
+ if (allocate == 1) {
+ /* round to cluster size */
+ cluster_offset = (cluster_offset + s->cluster_size - 1) &
+ ~(s->cluster_size - 1);
+ bdrv_truncate(bs->file, cluster_offset + s->cluster_size);
+ /* if encrypted, we must initialize the cluster
+ content which won't be written */
+ if (s->crypt_method &&
+ (n_end - n_start) < s->cluster_sectors) {
+ uint64_t start_sect;
+ start_sect = (offset & ~(s->cluster_size - 1)) >> 9;
+ memset(s->cluster_data + 512, 0x00, 512);
+ for(i = 0; i < s->cluster_sectors; i++) {
+ if (i < n_start || i >= n_end) {
+ encrypt_sectors(s, start_sect + i,
+ s->cluster_data,
+ s->cluster_data + 512, 1, 1,
+ &s->aes_encrypt_key);
+ if (bdrv_pwrite(bs->file, cluster_offset + i * 512,
+ s->cluster_data, 512) != 512)
+ return -1;
+ }
+ }
+ }
+ } else if (allocate == 2) {
+ cluster_offset |= QCOW_OFLAG_COMPRESSED |
+ (uint64_t)compressed_size << (63 - s->cluster_bits);
+ }
+ }
+ /* update L2 table */
+ tmp = cpu_to_be64(cluster_offset);
+ l2_table[l2_index] = tmp;
+ if (bdrv_pwrite_sync(bs->file, l2_offset + l2_index * sizeof(tmp),
+ &tmp, sizeof(tmp)) < 0)
+ return 0;
+ }
+ return cluster_offset;
+}
+
+static int coroutine_fn qcow_co_is_allocated(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, int *pnum)
+{
+ BDRVQcowState *s = bs->opaque;
+ int index_in_cluster, n;
+ uint64_t cluster_offset;
+
+ qemu_co_mutex_lock(&s->lock);
+ cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
+ qemu_co_mutex_unlock(&s->lock);
+ index_in_cluster = sector_num & (s->cluster_sectors - 1);
+ n = s->cluster_sectors - index_in_cluster;
+ if (n > nb_sectors)
+ n = nb_sectors;
+ *pnum = n;
+ return (cluster_offset != 0);
+}
+
+static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
+ const uint8_t *buf, int buf_size)
+{
+ z_stream strm1, *strm = &strm1;
+ int ret, out_len;
+
+ memset(strm, 0, sizeof(*strm));
+
+ strm->next_in = (uint8_t *)buf;
+ strm->avail_in = buf_size;
+ strm->next_out = out_buf;
+ strm->avail_out = out_buf_size;
+
+ ret = inflateInit2(strm, -12);
+ if (ret != Z_OK)
+ return -1;
+ ret = inflate(strm, Z_FINISH);
+ out_len = strm->next_out - out_buf;
+ if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
+ out_len != out_buf_size) {
+ inflateEnd(strm);
+ return -1;
+ }
+ inflateEnd(strm);
+ return 0;
+}
+
+static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret, csize;
+ uint64_t coffset;
+
+ coffset = cluster_offset & s->cluster_offset_mask;
+ if (s->cluster_cache_offset != coffset) {
+ csize = cluster_offset >> (63 - s->cluster_bits);
+ csize &= (s->cluster_size - 1);
+ ret = bdrv_pread(bs->file, coffset, s->cluster_data, csize);
+ if (ret != csize)
+ return -1;
+ if (decompress_buffer(s->cluster_cache, s->cluster_size,
+ s->cluster_data, csize) < 0) {
+ return -1;
+ }
+ s->cluster_cache_offset = coffset;
+ }
+ return 0;
+}
+
+static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
+{
+ BDRVQcowState *s = bs->opaque;
+ int index_in_cluster;
+ int ret = 0, n;
+ uint64_t cluster_offset;
+ struct iovec hd_iov;
+ QEMUIOVector hd_qiov;
+ uint8_t *buf;
+ void *orig_buf;
+
+ if (qiov->niov > 1) {
+ buf = orig_buf = qemu_blockalign(bs, qiov->size);
+ } else {
+ orig_buf = NULL;
+ buf = (uint8_t *)qiov->iov->iov_base;
+ }
+
+ qemu_co_mutex_lock(&s->lock);
+
+ while (nb_sectors != 0) {
+ /* prepare next request */
+ cluster_offset = get_cluster_offset(bs, sector_num << 9,
+ 0, 0, 0, 0);
+ index_in_cluster = sector_num & (s->cluster_sectors - 1);
+ n = s->cluster_sectors - index_in_cluster;
+ if (n > nb_sectors) {
+ n = nb_sectors;
+ }
+
+ if (!cluster_offset) {
+ if (bs->backing_hd) {
+ /* read from the base image */
+ hd_iov.iov_base = (void *)buf;
+ hd_iov.iov_len = n * 512;
+ qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
+ qemu_co_mutex_unlock(&s->lock);
+ ret = bdrv_co_readv(bs->backing_hd, sector_num,
+ n, &hd_qiov);
+ qemu_co_mutex_lock(&s->lock);
+ if (ret < 0) {
+ goto fail;
+ }
+ } else {
+ /* Note: in this case, no need to wait */
+ memset(buf, 0, 512 * n);
+ }
+ } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
+ /* add AIO support for compressed blocks ? */
+ if (decompress_cluster(bs, cluster_offset) < 0) {
+ goto fail;
+ }
+ memcpy(buf,
+ s->cluster_cache + index_in_cluster * 512, 512 * n);
+ } else {
+ if ((cluster_offset & 511) != 0) {
+ goto fail;
+ }
+ hd_iov.iov_base = (void *)buf;
+ hd_iov.iov_len = n * 512;
+ qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
+ qemu_co_mutex_unlock(&s->lock);
+ ret = bdrv_co_readv(bs->file,
+ (cluster_offset >> 9) + index_in_cluster,
+ n, &hd_qiov);
+ qemu_co_mutex_lock(&s->lock);
+ if (ret < 0) {
+ break;
+ }
+ if (s->crypt_method) {
+ encrypt_sectors(s, sector_num, buf, buf,
+ n, 0,
+ &s->aes_decrypt_key);
+ }
+ }
+ ret = 0;
+
+ nb_sectors -= n;
+ sector_num += n;
+ buf += n * 512;
+ }
+
+done:
+ qemu_co_mutex_unlock(&s->lock);
+
+ if (qiov->niov > 1) {
+ qemu_iovec_from_buf(qiov, 0, orig_buf, qiov->size);
+ qemu_vfree(orig_buf);
+ }
+
+ return ret;
+
+fail:
+ ret = -EIO;
+ goto done;
+}
+
+static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
+{
+ BDRVQcowState *s = bs->opaque;
+ int index_in_cluster;
+ uint64_t cluster_offset;
+ const uint8_t *src_buf;
+ int ret = 0, n;
+ uint8_t *cluster_data = NULL;
+ struct iovec hd_iov;
+ QEMUIOVector hd_qiov;
+ uint8_t *buf;
+ void *orig_buf;
+
+ s->cluster_cache_offset = -1; /* disable compressed cache */
+
+ if (qiov->niov > 1) {
+ buf = orig_buf = qemu_blockalign(bs, qiov->size);
+ qemu_iovec_to_buf(qiov, 0, buf, qiov->size);
+ } else {
+ orig_buf = NULL;
+ buf = (uint8_t *)qiov->iov->iov_base;
+ }
+
+ qemu_co_mutex_lock(&s->lock);
+
+ while (nb_sectors != 0) {
+
+ index_in_cluster = sector_num & (s->cluster_sectors - 1);
+ n = s->cluster_sectors - index_in_cluster;
+ if (n > nb_sectors) {
+ n = nb_sectors;
+ }
+ cluster_offset = get_cluster_offset(bs, sector_num << 9, 1, 0,
+ index_in_cluster,
+ index_in_cluster + n);
+ if (!cluster_offset || (cluster_offset & 511) != 0) {
+ ret = -EIO;
+ break;
+ }
+ if (s->crypt_method) {
+ if (!cluster_data) {
+ cluster_data = g_malloc0(s->cluster_size);
+ }
+ encrypt_sectors(s, sector_num, cluster_data, buf,
+ n, 1, &s->aes_encrypt_key);
+ src_buf = cluster_data;
+ } else {
+ src_buf = buf;
+ }
+
+ hd_iov.iov_base = (void *)src_buf;
+ hd_iov.iov_len = n * 512;
+ qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
+ qemu_co_mutex_unlock(&s->lock);
+ ret = bdrv_co_writev(bs->file,
+ (cluster_offset >> 9) + index_in_cluster,
+ n, &hd_qiov);
+ qemu_co_mutex_lock(&s->lock);
+ if (ret < 0) {
+ break;
+ }
+ ret = 0;
+
+ nb_sectors -= n;
+ sector_num += n;
+ buf += n * 512;
+ }
+ qemu_co_mutex_unlock(&s->lock);
+
+ if (qiov->niov > 1) {
+ qemu_vfree(orig_buf);
+ }
+ g_free(cluster_data);
+
+ return ret;
+}
+
+static void qcow_close(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+
+ g_free(s->l1_table);
+ g_free(s->l2_cache);
+ g_free(s->cluster_cache);
+ g_free(s->cluster_data);
+
+ migrate_del_blocker(s->migration_blocker);
+ error_free(s->migration_blocker);
+}
+
+static int qcow_create(const char *filename, QEMUOptionParameter *options)
+{
+ int header_size, backing_filename_len, l1_size, shift, i;
+ QCowHeader header;
+ uint8_t *tmp;
+ int64_t total_size = 0;
+ const char *backing_file = NULL;
+ int flags = 0;
+ int ret;
+ BlockDriverState *qcow_bs;
+
+ /* Read out options */
+ while (options && options->name) {
+ if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+ total_size = options->value.n / 512;
+ } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
+ backing_file = options->value.s;
+ } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) {
+ flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0;
+ }
+ options++;
+ }
+
+ ret = bdrv_create_file(filename, options);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = bdrv_file_open(&qcow_bs, filename, NULL, BDRV_O_RDWR);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = bdrv_truncate(qcow_bs, 0);
+ if (ret < 0) {
+ goto exit;
+ }
+
+ memset(&header, 0, sizeof(header));
+ header.magic = cpu_to_be32(QCOW_MAGIC);
+ header.version = cpu_to_be32(QCOW_VERSION);
+ header.size = cpu_to_be64(total_size * 512);
+ header_size = sizeof(header);
+ backing_filename_len = 0;
+ if (backing_file) {
+ if (strcmp(backing_file, "fat:")) {
+ header.backing_file_offset = cpu_to_be64(header_size);
+ backing_filename_len = strlen(backing_file);
+ header.backing_file_size = cpu_to_be32(backing_filename_len);
+ header_size += backing_filename_len;
+ } else {
+ /* special backing file for vvfat */
+ backing_file = NULL;
+ }
+ header.cluster_bits = 9; /* 512 byte cluster to avoid copying
+ unmodifyed sectors */
+ header.l2_bits = 12; /* 32 KB L2 tables */
+ } else {
+ header.cluster_bits = 12; /* 4 KB clusters */
+ header.l2_bits = 9; /* 4 KB L2 tables */
+ }
+ header_size = (header_size + 7) & ~7;
+ shift = header.cluster_bits + header.l2_bits;
+ l1_size = ((total_size * 512) + (1LL << shift) - 1) >> shift;
+
+ header.l1_table_offset = cpu_to_be64(header_size);
+ if (flags & BLOCK_FLAG_ENCRYPT) {
+ header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
+ } else {
+ header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
+ }
+
+ /* write all the data */
+ ret = bdrv_pwrite(qcow_bs, 0, &header, sizeof(header));
+ if (ret != sizeof(header)) {
+ goto exit;
+ }
+
+ if (backing_file) {
+ ret = bdrv_pwrite(qcow_bs, sizeof(header),
+ backing_file, backing_filename_len);
+ if (ret != backing_filename_len) {
+ goto exit;
+ }
+ }
+
+ tmp = g_malloc0(BDRV_SECTOR_SIZE);
+ for (i = 0; i < ((sizeof(uint64_t)*l1_size + BDRV_SECTOR_SIZE - 1)/
+ BDRV_SECTOR_SIZE); i++) {
+ ret = bdrv_pwrite(qcow_bs, header_size +
+ BDRV_SECTOR_SIZE*i, tmp, BDRV_SECTOR_SIZE);
+ if (ret != BDRV_SECTOR_SIZE) {
+ g_free(tmp);
+ goto exit;
+ }
+ }
+
+ g_free(tmp);
+ ret = 0;
+exit:
+ bdrv_delete(qcow_bs);
+ return ret;
+}
+
+static int qcow_make_empty(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint32_t l1_length = s->l1_size * sizeof(uint64_t);
+ int ret;
+
+ memset(s->l1_table, 0, l1_length);
+ if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, s->l1_table,
+ l1_length) < 0)
+ return -1;
+ ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length);
+ if (ret < 0)
+ return ret;
+
+ memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+ memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
+ memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
+
+ return 0;
+}
+
+/* XXX: put compressed sectors first, then all the cluster aligned
+ tables to avoid losing bytes in alignment */
+static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors)
+{
+ BDRVQcowState *s = bs->opaque;
+ z_stream strm;
+ int ret, out_len;
+ uint8_t *out_buf;
+ uint64_t cluster_offset;
+
+ if (nb_sectors != s->cluster_sectors) {
+ ret = -EINVAL;
+
+ /* Zero-pad last write if image size is not cluster aligned */
+ if (sector_num + nb_sectors == bs->total_sectors &&
+ nb_sectors < s->cluster_sectors) {
+ uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size);
+ memset(pad_buf, 0, s->cluster_size);
+ memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE);
+ ret = qcow_write_compressed(bs, sector_num,
+ pad_buf, s->cluster_sectors);
+ qemu_vfree(pad_buf);
+ }
+ return ret;
+ }
+
+ out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
+
+ /* best compression, small window, no zlib header */
+ memset(&strm, 0, sizeof(strm));
+ ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
+ Z_DEFLATED, -12,
+ 9, Z_DEFAULT_STRATEGY);
+ if (ret != 0) {
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ strm.avail_in = s->cluster_size;
+ strm.next_in = (uint8_t *)buf;
+ strm.avail_out = s->cluster_size;
+ strm.next_out = out_buf;
+
+ ret = deflate(&strm, Z_FINISH);
+ if (ret != Z_STREAM_END && ret != Z_OK) {
+ deflateEnd(&strm);
+ ret = -EINVAL;
+ goto fail;
+ }
+ out_len = strm.next_out - out_buf;
+
+ deflateEnd(&strm);
+
+ if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
+ /* could not compress: write normal cluster */
+ ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors);
+ if (ret < 0) {
+ goto fail;
+ }
+ } else {
+ cluster_offset = get_cluster_offset(bs, sector_num << 9, 2,
+ out_len, 0, 0);
+ if (cluster_offset == 0) {
+ ret = -EIO;
+ goto fail;
+ }
+
+ cluster_offset &= s->cluster_offset_mask;
+ ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+
+ ret = 0;
+fail:
+ g_free(out_buf);
+ return ret;
+}
+
+static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+ BDRVQcowState *s = bs->opaque;
+ bdi->cluster_size = s->cluster_size;
+ return 0;
+}
+
+
+static QEMUOptionParameter qcow_create_options[] = {
+ {
+ .name = BLOCK_OPT_SIZE,
+ .type = OPT_SIZE,
+ .help = "Virtual disk size"
+ },
+ {
+ .name = BLOCK_OPT_BACKING_FILE,
+ .type = OPT_STRING,
+ .help = "File name of a base image"
+ },
+ {
+ .name = BLOCK_OPT_ENCRYPT,
+ .type = OPT_FLAG,
+ .help = "Encrypt the image"
+ },
+ { NULL }
+};
+
+static BlockDriver bdrv_qcow = {
+ .format_name = "qcow",
+ .instance_size = sizeof(BDRVQcowState),
+ .bdrv_probe = qcow_probe,
+ .bdrv_open = qcow_open,
+ .bdrv_close = qcow_close,
+ .bdrv_reopen_prepare = qcow_reopen_prepare,
+ .bdrv_create = qcow_create,
+ .bdrv_has_zero_init = bdrv_has_zero_init_1,
+
+ .bdrv_co_readv = qcow_co_readv,
+ .bdrv_co_writev = qcow_co_writev,
+ .bdrv_co_is_allocated = qcow_co_is_allocated,
+
+ .bdrv_set_key = qcow_set_key,
+ .bdrv_make_empty = qcow_make_empty,
+ .bdrv_write_compressed = qcow_write_compressed,
+ .bdrv_get_info = qcow_get_info,
+
+ .create_options = qcow_create_options,
+};
+
+static void bdrv_qcow_init(void)
+{
+ bdrv_register(&bdrv_qcow);
+}
+
+block_init(bdrv_qcow_init);
diff --git a/contrib/qemu/block/qcow2-cache.c b/contrib/qemu/block/qcow2-cache.c
new file mode 100644
index 000000000..2f3114ecc
--- /dev/null
+++ b/contrib/qemu/block/qcow2-cache.c
@@ -0,0 +1,323 @@
+/*
+ * L2/refcount table cache for the QCOW2 format
+ *
+ * Copyright (c) 2010 Kevin Wolf <kwolf@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "block/block_int.h"
+#include "qemu-common.h"
+#include "qcow2.h"
+#include "trace.h"
+
+typedef struct Qcow2CachedTable {
+ void* table;
+ int64_t offset;
+ bool dirty;
+ int cache_hits;
+ int ref;
+} Qcow2CachedTable;
+
+struct Qcow2Cache {
+ Qcow2CachedTable* entries;
+ struct Qcow2Cache* depends;
+ int size;
+ bool depends_on_flush;
+};
+
+Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables)
+{
+ BDRVQcowState *s = bs->opaque;
+ Qcow2Cache *c;
+ int i;
+
+ c = g_malloc0(sizeof(*c));
+ c->size = num_tables;
+ c->entries = g_malloc0(sizeof(*c->entries) * num_tables);
+
+ for (i = 0; i < c->size; i++) {
+ c->entries[i].table = qemu_blockalign(bs, s->cluster_size);
+ }
+
+ return c;
+}
+
+int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c)
+{
+ int i;
+
+ for (i = 0; i < c->size; i++) {
+ assert(c->entries[i].ref == 0);
+ qemu_vfree(c->entries[i].table);
+ }
+
+ g_free(c->entries);
+ g_free(c);
+
+ return 0;
+}
+
+static int qcow2_cache_flush_dependency(BlockDriverState *bs, Qcow2Cache *c)
+{
+ int ret;
+
+ ret = qcow2_cache_flush(bs, c->depends);
+ if (ret < 0) {
+ return ret;
+ }
+
+ c->depends = NULL;
+ c->depends_on_flush = false;
+
+ return 0;
+}
+
+static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret = 0;
+
+ if (!c->entries[i].dirty || !c->entries[i].offset) {
+ return 0;
+ }
+
+ trace_qcow2_cache_entry_flush(qemu_coroutine_self(),
+ c == s->l2_table_cache, i);
+
+ if (c->depends) {
+ ret = qcow2_cache_flush_dependency(bs, c);
+ } else if (c->depends_on_flush) {
+ ret = bdrv_flush(bs->file);
+ if (ret >= 0) {
+ c->depends_on_flush = false;
+ }
+ }
+
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (c == s->refcount_block_cache) {
+ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART);
+ } else if (c == s->l2_table_cache) {
+ BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE);
+ }
+
+ ret = bdrv_pwrite(bs->file, c->entries[i].offset, c->entries[i].table,
+ s->cluster_size);
+ if (ret < 0) {
+ return ret;
+ }
+
+ c->entries[i].dirty = false;
+
+ return 0;
+}
+
+int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c)
+{
+ BDRVQcowState *s = bs->opaque;
+ int result = 0;
+ int ret;
+ int i;
+
+ trace_qcow2_cache_flush(qemu_coroutine_self(), c == s->l2_table_cache);
+
+ for (i = 0; i < c->size; i++) {
+ ret = qcow2_cache_entry_flush(bs, c, i);
+ if (ret < 0 && result != -ENOSPC) {
+ result = ret;
+ }
+ }
+
+ if (result == 0) {
+ ret = bdrv_flush(bs->file);
+ if (ret < 0) {
+ result = ret;
+ }
+ }
+
+ return result;
+}
+
+int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c,
+ Qcow2Cache *dependency)
+{
+ int ret;
+
+ if (dependency->depends) {
+ ret = qcow2_cache_flush_dependency(bs, dependency);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ if (c->depends && (c->depends != dependency)) {
+ ret = qcow2_cache_flush_dependency(bs, c);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ c->depends = dependency;
+ return 0;
+}
+
+void qcow2_cache_depends_on_flush(Qcow2Cache *c)
+{
+ c->depends_on_flush = true;
+}
+
+static int qcow2_cache_find_entry_to_replace(Qcow2Cache *c)
+{
+ int i;
+ int min_count = INT_MAX;
+ int min_index = -1;
+
+
+ for (i = 0; i < c->size; i++) {
+ if (c->entries[i].ref) {
+ continue;
+ }
+
+ if (c->entries[i].cache_hits < min_count) {
+ min_index = i;
+ min_count = c->entries[i].cache_hits;
+ }
+
+ /* Give newer hits priority */
+ /* TODO Check how to optimize the replacement strategy */
+ c->entries[i].cache_hits /= 2;
+ }
+
+ if (min_index == -1) {
+ /* This can't happen in current synchronous code, but leave the check
+ * here as a reminder for whoever starts using AIO with the cache */
+ abort();
+ }
+ return min_index;
+}
+
+static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c,
+ uint64_t offset, void **table, bool read_from_disk)
+{
+ BDRVQcowState *s = bs->opaque;
+ int i;
+ int ret;
+
+ trace_qcow2_cache_get(qemu_coroutine_self(), c == s->l2_table_cache,
+ offset, read_from_disk);
+
+ /* Check if the table is already cached */
+ for (i = 0; i < c->size; i++) {
+ if (c->entries[i].offset == offset) {
+ goto found;
+ }
+ }
+
+ /* If not, write a table back and replace it */
+ i = qcow2_cache_find_entry_to_replace(c);
+ trace_qcow2_cache_get_replace_entry(qemu_coroutine_self(),
+ c == s->l2_table_cache, i);
+ if (i < 0) {
+ return i;
+ }
+
+ ret = qcow2_cache_entry_flush(bs, c, i);
+ if (ret < 0) {
+ return ret;
+ }
+
+ trace_qcow2_cache_get_read(qemu_coroutine_self(),
+ c == s->l2_table_cache, i);
+ c->entries[i].offset = 0;
+ if (read_from_disk) {
+ if (c == s->l2_table_cache) {
+ BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD);
+ }
+
+ ret = bdrv_pread(bs->file, offset, c->entries[i].table, s->cluster_size);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ /* Give the table some hits for the start so that it won't be replaced
+ * immediately. The number 32 is completely arbitrary. */
+ c->entries[i].cache_hits = 32;
+ c->entries[i].offset = offset;
+
+ /* And return the right table */
+found:
+ c->entries[i].cache_hits++;
+ c->entries[i].ref++;
+ *table = c->entries[i].table;
+
+ trace_qcow2_cache_get_done(qemu_coroutine_self(),
+ c == s->l2_table_cache, i);
+
+ return 0;
+}
+
+int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
+ void **table)
+{
+ return qcow2_cache_do_get(bs, c, offset, table, true);
+}
+
+int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
+ void **table)
+{
+ return qcow2_cache_do_get(bs, c, offset, table, false);
+}
+
+int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table)
+{
+ int i;
+
+ for (i = 0; i < c->size; i++) {
+ if (c->entries[i].table == *table) {
+ goto found;
+ }
+ }
+ return -ENOENT;
+
+found:
+ c->entries[i].ref--;
+ *table = NULL;
+
+ assert(c->entries[i].ref >= 0);
+ return 0;
+}
+
+void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table)
+{
+ int i;
+
+ for (i = 0; i < c->size; i++) {
+ if (c->entries[i].table == table) {
+ goto found;
+ }
+ }
+ abort();
+
+found:
+ c->entries[i].dirty = true;
+}
diff --git a/contrib/qemu/block/qcow2-cluster.c b/contrib/qemu/block/qcow2-cluster.c
new file mode 100644
index 000000000..cca76d4fc
--- /dev/null
+++ b/contrib/qemu/block/qcow2-cluster.c
@@ -0,0 +1,1478 @@
+/*
+ * Block driver for the QCOW version 2 format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <zlib.h>
+
+#include "qemu-common.h"
+#include "block/block_int.h"
+#include "block/qcow2.h"
+#include "trace.h"
+
+int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
+ bool exact_size)
+{
+ BDRVQcowState *s = bs->opaque;
+ int new_l1_size2, ret, i;
+ uint64_t *new_l1_table;
+ int64_t new_l1_table_offset, new_l1_size;
+ uint8_t data[12];
+
+ if (min_size <= s->l1_size)
+ return 0;
+
+ if (exact_size) {
+ new_l1_size = min_size;
+ } else {
+ /* Bump size up to reduce the number of times we have to grow */
+ new_l1_size = s->l1_size;
+ if (new_l1_size == 0) {
+ new_l1_size = 1;
+ }
+ while (min_size > new_l1_size) {
+ new_l1_size = (new_l1_size * 3 + 1) / 2;
+ }
+ }
+
+ if (new_l1_size > INT_MAX) {
+ return -EFBIG;
+ }
+
+#ifdef DEBUG_ALLOC2
+ fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n",
+ s->l1_size, new_l1_size);
+#endif
+
+ new_l1_size2 = sizeof(uint64_t) * new_l1_size;
+ new_l1_table = g_malloc0(align_offset(new_l1_size2, 512));
+ memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));
+
+ /* write new table (align to cluster) */
+ BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE);
+ new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
+ if (new_l1_table_offset < 0) {
+ g_free(new_l1_table);
+ return new_l1_table_offset;
+ }
+
+ ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
+ for(i = 0; i < s->l1_size; i++)
+ new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
+ ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset, new_l1_table, new_l1_size2);
+ if (ret < 0)
+ goto fail;
+ for(i = 0; i < s->l1_size; i++)
+ new_l1_table[i] = be64_to_cpu(new_l1_table[i]);
+
+ /* set new table */
+ BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE);
+ cpu_to_be32w((uint32_t*)data, new_l1_size);
+ cpu_to_be64wu((uint64_t*)(data + 4), new_l1_table_offset);
+ ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size), data,sizeof(data));
+ if (ret < 0) {
+ goto fail;
+ }
+ g_free(s->l1_table);
+ qcow2_free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t),
+ QCOW2_DISCARD_OTHER);
+ s->l1_table_offset = new_l1_table_offset;
+ s->l1_table = new_l1_table;
+ s->l1_size = new_l1_size;
+ return 0;
+ fail:
+ g_free(new_l1_table);
+ qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2,
+ QCOW2_DISCARD_OTHER);
+ return ret;
+}
+
+/*
+ * l2_load
+ *
+ * Loads a L2 table into memory. If the table is in the cache, the cache
+ * is used; otherwise the L2 table is loaded from the image file.
+ *
+ * Returns a pointer to the L2 table on success, or NULL if the read from
+ * the image file failed.
+ */
+
+static int l2_load(BlockDriverState *bs, uint64_t l2_offset,
+ uint64_t **l2_table)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret;
+
+ ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, (void**) l2_table);
+
+ return ret;
+}
+
+/*
+ * Writes one sector of the L1 table to the disk (can't update single entries
+ * and we really don't want bdrv_pread to perform a read-modify-write)
+ */
+#define L1_ENTRIES_PER_SECTOR (512 / 8)
+static int write_l1_entry(BlockDriverState *bs, int l1_index)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t buf[L1_ENTRIES_PER_SECTOR];
+ int l1_start_index;
+ int i, ret;
+
+ l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1);
+ for (i = 0; i < L1_ENTRIES_PER_SECTOR; i++) {
+ buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]);
+ }
+
+ BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
+ ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset + 8 * l1_start_index,
+ buf, sizeof(buf));
+ if (ret < 0) {
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * l2_allocate
+ *
+ * Allocate a new l2 entry in the file. If l1_index points to an already
+ * used entry in the L2 table (i.e. we are doing a copy on write for the L2
+ * table) copy the contents of the old L2 table into the newly allocated one.
+ * Otherwise the new table is initialized with zeros.
+ *
+ */
+
+static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t old_l2_offset;
+ uint64_t *l2_table;
+ int64_t l2_offset;
+ int ret;
+
+ old_l2_offset = s->l1_table[l1_index];
+
+ trace_qcow2_l2_allocate(bs, l1_index);
+
+ /* allocate a new l2 entry */
+
+ l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
+ if (l2_offset < 0) {
+ return l2_offset;
+ }
+
+ ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* allocate a new entry in the l2 cache */
+
+ trace_qcow2_l2_allocate_get_empty(bs, l1_index);
+ ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table);
+ if (ret < 0) {
+ return ret;
+ }
+
+ l2_table = *table;
+
+ if ((old_l2_offset & L1E_OFFSET_MASK) == 0) {
+ /* if there was no old l2 table, clear the new table */
+ memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
+ } else {
+ uint64_t* old_table;
+
+ /* if there was an old l2 table, read it from the disk */
+ BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
+ ret = qcow2_cache_get(bs, s->l2_table_cache,
+ old_l2_offset & L1E_OFFSET_MASK,
+ (void**) &old_table);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ memcpy(l2_table, old_table, s->cluster_size);
+
+ ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &old_table);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+
+ /* write the l2 table to the file */
+ BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);
+
+ trace_qcow2_l2_allocate_write_l2(bs, l1_index);
+ qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+ ret = qcow2_cache_flush(bs, s->l2_table_cache);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* update the L1 entry */
+ trace_qcow2_l2_allocate_write_l1(bs, l1_index);
+ s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
+ ret = write_l1_entry(bs, l1_index);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ *table = l2_table;
+ trace_qcow2_l2_allocate_done(bs, l1_index, 0);
+ return 0;
+
+fail:
+ trace_qcow2_l2_allocate_done(bs, l1_index, ret);
+ qcow2_cache_put(bs, s->l2_table_cache, (void**) table);
+ s->l1_table[l1_index] = old_l2_offset;
+ return ret;
+}
+
+/*
+ * Checks how many clusters in a given L2 table are contiguous in the image
+ * file. As soon as one of the flags in the bitmask stop_flags changes compared
+ * to the first cluster, the search is stopped and the cluster is not counted
+ * as contiguous. (This allows it, for example, to stop at the first compressed
+ * cluster which may require a different handling)
+ */
+static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size,
+ uint64_t *l2_table, uint64_t start, uint64_t stop_flags)
+{
+ int i;
+ uint64_t mask = stop_flags | L2E_OFFSET_MASK;
+ uint64_t offset = be64_to_cpu(l2_table[0]) & mask;
+
+ if (!offset)
+ return 0;
+
+ for (i = start; i < start + nb_clusters; i++) {
+ uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask;
+ if (offset + (uint64_t) i * cluster_size != l2_entry) {
+ break;
+ }
+ }
+
+ return (i - start);
+}
+
+static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table)
+{
+ int i;
+
+ for (i = 0; i < nb_clusters; i++) {
+ int type = qcow2_get_cluster_type(be64_to_cpu(l2_table[i]));
+
+ if (type != QCOW2_CLUSTER_UNALLOCATED) {
+ break;
+ }
+ }
+
+ return i;
+}
+
+/* The crypt function is compatible with the linux cryptoloop
+ algorithm for < 4 GB images. NOTE: out_buf == in_buf is
+ supported */
+void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
+ uint8_t *out_buf, const uint8_t *in_buf,
+ int nb_sectors, int enc,
+ const AES_KEY *key)
+{
+ union {
+ uint64_t ll[2];
+ uint8_t b[16];
+ } ivec;
+ int i;
+
+ for(i = 0; i < nb_sectors; i++) {
+ ivec.ll[0] = cpu_to_le64(sector_num);
+ ivec.ll[1] = 0;
+ AES_cbc_encrypt(in_buf, out_buf, 512, key,
+ ivec.b, enc);
+ sector_num++;
+ in_buf += 512;
+ out_buf += 512;
+ }
+}
+
+static int coroutine_fn copy_sectors(BlockDriverState *bs,
+ uint64_t start_sect,
+ uint64_t cluster_offset,
+ int n_start, int n_end)
+{
+ BDRVQcowState *s = bs->opaque;
+ QEMUIOVector qiov;
+ struct iovec iov;
+ int n, ret;
+
+ /*
+ * If this is the last cluster and it is only partially used, we must only
+ * copy until the end of the image, or bdrv_check_request will fail for the
+ * bdrv_read/write calls below.
+ */
+ if (start_sect + n_end > bs->total_sectors) {
+ n_end = bs->total_sectors - start_sect;
+ }
+
+ n = n_end - n_start;
+ if (n <= 0) {
+ return 0;
+ }
+
+ iov.iov_len = n * BDRV_SECTOR_SIZE;
+ iov.iov_base = qemu_blockalign(bs, iov.iov_len);
+
+ qemu_iovec_init_external(&qiov, &iov, 1);
+
+ BLKDBG_EVENT(bs->file, BLKDBG_COW_READ);
+
+ /* Call .bdrv_co_readv() directly instead of using the public block-layer
+ * interface. This avoids double I/O throttling and request tracking,
+ * which can lead to deadlock when block layer copy-on-read is enabled.
+ */
+ ret = bs->drv->bdrv_co_readv(bs, start_sect + n_start, n, &qiov);
+ if (ret < 0) {
+ goto out;
+ }
+
+ if (s->crypt_method) {
+ qcow2_encrypt_sectors(s, start_sect + n_start,
+ iov.iov_base, iov.iov_base, n, 1,
+ &s->aes_encrypt_key);
+ }
+
+ BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
+ ret = bdrv_co_writev(bs->file, (cluster_offset >> 9) + n_start, n, &qiov);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = 0;
+out:
+ qemu_vfree(iov.iov_base);
+ return ret;
+}
+
+
+/*
+ * get_cluster_offset
+ *
+ * For a given offset of the disk image, find the cluster offset in
+ * qcow2 file. The offset is stored in *cluster_offset.
+ *
+ * on entry, *num is the number of contiguous sectors we'd like to
+ * access following offset.
+ *
+ * on exit, *num is the number of contiguous sectors we can read.
+ *
+ * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error
+ * cases.
+ */
+int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
+ int *num, uint64_t *cluster_offset)
+{
+ BDRVQcowState *s = bs->opaque;
+ unsigned int l2_index;
+ uint64_t l1_index, l2_offset, *l2_table;
+ int l1_bits, c;
+ unsigned int index_in_cluster, nb_clusters;
+ uint64_t nb_available, nb_needed;
+ int ret;
+
+ index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1);
+ nb_needed = *num + index_in_cluster;
+
+ l1_bits = s->l2_bits + s->cluster_bits;
+
+ /* compute how many bytes there are between the offset and
+ * the end of the l1 entry
+ */
+
+ nb_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1));
+
+ /* compute the number of available sectors */
+
+ nb_available = (nb_available >> 9) + index_in_cluster;
+
+ if (nb_needed > nb_available) {
+ nb_needed = nb_available;
+ }
+
+ *cluster_offset = 0;
+
+ /* seek the the l2 offset in the l1 table */
+
+ l1_index = offset >> l1_bits;
+ if (l1_index >= s->l1_size) {
+ ret = QCOW2_CLUSTER_UNALLOCATED;
+ goto out;
+ }
+
+ l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
+ if (!l2_offset) {
+ ret = QCOW2_CLUSTER_UNALLOCATED;
+ goto out;
+ }
+
+ /* load the l2 table in memory */
+
+ ret = l2_load(bs, l2_offset, &l2_table);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* find the cluster offset for the given disk offset */
+
+ l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+ *cluster_offset = be64_to_cpu(l2_table[l2_index]);
+ nb_clusters = size_to_clusters(s, nb_needed << 9);
+
+ ret = qcow2_get_cluster_type(*cluster_offset);
+ switch (ret) {
+ case QCOW2_CLUSTER_COMPRESSED:
+ /* Compressed clusters can only be processed one by one */
+ c = 1;
+ *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK;
+ break;
+ case QCOW2_CLUSTER_ZERO:
+ if (s->qcow_version < 3) {
+ return -EIO;
+ }
+ c = count_contiguous_clusters(nb_clusters, s->cluster_size,
+ &l2_table[l2_index], 0,
+ QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO);
+ *cluster_offset = 0;
+ break;
+ case QCOW2_CLUSTER_UNALLOCATED:
+ /* how many empty clusters ? */
+ c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]);
+ *cluster_offset = 0;
+ break;
+ case QCOW2_CLUSTER_NORMAL:
+ /* how many allocated clusters ? */
+ c = count_contiguous_clusters(nb_clusters, s->cluster_size,
+ &l2_table[l2_index], 0,
+ QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO);
+ *cluster_offset &= L2E_OFFSET_MASK;
+ break;
+ default:
+ abort();
+ }
+
+ qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+
+ nb_available = (c * s->cluster_sectors);
+
+out:
+ if (nb_available > nb_needed)
+ nb_available = nb_needed;
+
+ *num = nb_available - index_in_cluster;
+
+ return ret;
+}
+
+/*
+ * get_cluster_table
+ *
+ * for a given disk offset, load (and allocate if needed)
+ * the l2 table.
+ *
+ * the l2 table offset in the qcow2 file and the cluster index
+ * in the l2 table are given to the caller.
+ *
+ * Returns 0 on success, -errno in failure case
+ */
+static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
+ uint64_t **new_l2_table,
+ int *new_l2_index)
+{
+ BDRVQcowState *s = bs->opaque;
+ unsigned int l2_index;
+ uint64_t l1_index, l2_offset;
+ uint64_t *l2_table = NULL;
+ int ret;
+
+ /* seek the the l2 offset in the l1 table */
+
+ l1_index = offset >> (s->l2_bits + s->cluster_bits);
+ if (l1_index >= s->l1_size) {
+ ret = qcow2_grow_l1_table(bs, l1_index + 1, false);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ assert(l1_index < s->l1_size);
+ l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
+
+ /* seek the l2 table of the given l2 offset */
+
+ if (s->l1_table[l1_index] & QCOW_OFLAG_COPIED) {
+ /* load the l2 table in memory */
+ ret = l2_load(bs, l2_offset, &l2_table);
+ if (ret < 0) {
+ return ret;
+ }
+ } else {
+ /* First allocate a new L2 table (and do COW if needed) */
+ ret = l2_allocate(bs, l1_index, &l2_table);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Then decrease the refcount of the old table */
+ if (l2_offset) {
+ qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
+ QCOW2_DISCARD_OTHER);
+ }
+ }
+
+ /* find the cluster offset for the given disk offset */
+
+ l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+
+ *new_l2_table = l2_table;
+ *new_l2_index = l2_index;
+
+ return 0;
+}
+
+/*
+ * alloc_compressed_cluster_offset
+ *
+ * For a given offset of the disk image, return cluster offset in
+ * qcow2 file.
+ *
+ * If the offset is not found, allocate a new compressed cluster.
+ *
+ * Return the cluster offset if successful,
+ * Return 0, otherwise.
+ *
+ */
+
+uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
+ uint64_t offset,
+ int compressed_size)
+{
+ BDRVQcowState *s = bs->opaque;
+ int l2_index, ret;
+ uint64_t *l2_table;
+ int64_t cluster_offset;
+ int nb_csectors;
+
+ ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
+ if (ret < 0) {
+ return 0;
+ }
+
+ /* Compression can't overwrite anything. Fail if the cluster was already
+ * allocated. */
+ cluster_offset = be64_to_cpu(l2_table[l2_index]);
+ if (cluster_offset & L2E_OFFSET_MASK) {
+ qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ return 0;
+ }
+
+ cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
+ if (cluster_offset < 0) {
+ qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ return 0;
+ }
+
+ nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
+ (cluster_offset >> 9);
+
+ cluster_offset |= QCOW_OFLAG_COMPRESSED |
+ ((uint64_t)nb_csectors << s->csize_shift);
+
+ /* update L2 table */
+
+ /* compressed clusters never have the copied flag */
+
+ BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
+ qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+ l2_table[l2_index] = cpu_to_be64(cluster_offset);
+ ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ if (ret < 0) {
+ return 0;
+ }
+
+ return cluster_offset;
+}
+
+static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret;
+
+ if (r->nb_sectors == 0) {
+ return 0;
+ }
+
+ qemu_co_mutex_unlock(&s->lock);
+ ret = copy_sectors(bs, m->offset / BDRV_SECTOR_SIZE, m->alloc_offset,
+ r->offset / BDRV_SECTOR_SIZE,
+ r->offset / BDRV_SECTOR_SIZE + r->nb_sectors);
+ qemu_co_mutex_lock(&s->lock);
+
+ if (ret < 0) {
+ return ret;
+ }
+
+ /*
+ * Before we update the L2 table to actually point to the new cluster, we
+ * need to be sure that the refcounts have been increased and COW was
+ * handled.
+ */
+ qcow2_cache_depends_on_flush(s->l2_table_cache);
+
+ return 0;
+}
+
+int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
+{
+ BDRVQcowState *s = bs->opaque;
+ int i, j = 0, l2_index, ret;
+ uint64_t *old_cluster, *l2_table;
+ uint64_t cluster_offset = m->alloc_offset;
+
+ trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
+ assert(m->nb_clusters > 0);
+
+ old_cluster = g_malloc(m->nb_clusters * sizeof(uint64_t));
+
+ /* copy content of unmodified sectors */
+ ret = perform_cow(bs, m, &m->cow_start);
+ if (ret < 0) {
+ goto err;
+ }
+
+ ret = perform_cow(bs, m, &m->cow_end);
+ if (ret < 0) {
+ goto err;
+ }
+
+ /* Update L2 table. */
+ if (s->use_lazy_refcounts) {
+ qcow2_mark_dirty(bs);
+ }
+ if (qcow2_need_accurate_refcounts(s)) {
+ qcow2_cache_set_dependency(bs, s->l2_table_cache,
+ s->refcount_block_cache);
+ }
+
+ ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index);
+ if (ret < 0) {
+ goto err;
+ }
+ qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+
+ for (i = 0; i < m->nb_clusters; i++) {
+ /* if two concurrent writes happen to the same unallocated cluster
+ * each write allocates separate cluster and writes data concurrently.
+ * The first one to complete updates l2 table with pointer to its
+ * cluster the second one has to do RMW (which is done above by
+ * copy_sectors()), update l2 table with its cluster pointer and free
+ * old cluster. This is what this loop does */
+ if(l2_table[l2_index + i] != 0)
+ old_cluster[j++] = l2_table[l2_index + i];
+
+ l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
+ (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
+ }
+
+
+ ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ if (ret < 0) {
+ goto err;
+ }
+
+ /*
+ * If this was a COW, we need to decrease the refcount of the old cluster.
+ * Also flush bs->file to get the right order for L2 and refcount update.
+ *
+ * Don't discard clusters that reach a refcount of 0 (e.g. compressed
+ * clusters), the next write will reuse them anyway.
+ */
+ if (j != 0) {
+ for (i = 0; i < j; i++) {
+ qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1,
+ QCOW2_DISCARD_NEVER);
+ }
+ }
+
+ ret = 0;
+err:
+ g_free(old_cluster);
+ return ret;
+ }
+
+/*
+ * Returns the number of contiguous clusters that can be used for an allocating
+ * write, but require COW to be performed (this includes yet unallocated space,
+ * which must copy from the backing file)
+ */
+static int count_cow_clusters(BDRVQcowState *s, int nb_clusters,
+ uint64_t *l2_table, int l2_index)
+{
+ int i;
+
+ for (i = 0; i < nb_clusters; i++) {
+ uint64_t l2_entry = be64_to_cpu(l2_table[l2_index + i]);
+ int cluster_type = qcow2_get_cluster_type(l2_entry);
+
+ switch(cluster_type) {
+ case QCOW2_CLUSTER_NORMAL:
+ if (l2_entry & QCOW_OFLAG_COPIED) {
+ goto out;
+ }
+ break;
+ case QCOW2_CLUSTER_UNALLOCATED:
+ case QCOW2_CLUSTER_COMPRESSED:
+ case QCOW2_CLUSTER_ZERO:
+ break;
+ default:
+ abort();
+ }
+ }
+
+out:
+ assert(i <= nb_clusters);
+ return i;
+}
+
+/*
+ * Check if there already is an AIO write request in flight which allocates
+ * the same cluster. In this case we need to wait until the previous
+ * request has completed and updated the L2 table accordingly.
+ *
+ * Returns:
+ * 0 if there was no dependency. *cur_bytes indicates the number of
+ * bytes from guest_offset that can be read before the next
+ * dependency must be processed (or the request is complete)
+ *
+ * -EAGAIN if we had to wait for another request, previously gathered
+ * information on cluster allocation may be invalid now. The caller
+ * must start over anyway, so consider *cur_bytes undefined.
+ */
+static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
+ uint64_t *cur_bytes, QCowL2Meta **m)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowL2Meta *old_alloc;
+ uint64_t bytes = *cur_bytes;
+
+ QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {
+
+ uint64_t start = guest_offset;
+ uint64_t end = start + bytes;
+ uint64_t old_start = l2meta_cow_start(old_alloc);
+ uint64_t old_end = l2meta_cow_end(old_alloc);
+
+ if (end <= old_start || start >= old_end) {
+ /* No intersection */
+ } else {
+ if (start < old_start) {
+ /* Stop at the start of a running allocation */
+ bytes = old_start - start;
+ } else {
+ bytes = 0;
+ }
+
+ /* Stop if already an l2meta exists. After yielding, it wouldn't
+ * be valid any more, so we'd have to clean up the old L2Metas
+ * and deal with requests depending on them before starting to
+ * gather new ones. Not worth the trouble. */
+ if (bytes == 0 && *m) {
+ *cur_bytes = 0;
+ return 0;
+ }
+
+ if (bytes == 0) {
+ /* Wait for the dependency to complete. We need to recheck
+ * the free/allocated clusters when we continue. */
+ qemu_co_mutex_unlock(&s->lock);
+ qemu_co_queue_wait(&old_alloc->dependent_requests);
+ qemu_co_mutex_lock(&s->lock);
+ return -EAGAIN;
+ }
+ }
+ }
+
+ /* Make sure that existing clusters and new allocations are only used up to
+ * the next dependency if we shortened the request above */
+ *cur_bytes = bytes;
+
+ return 0;
+}
+
+/*
+ * Checks how many already allocated clusters that don't require a copy on
+ * write there are at the given guest_offset (up to *bytes). If
+ * *host_offset is not zero, only physically contiguous clusters beginning at
+ * this host offset are counted.
+ *
+ * Note that guest_offset may not be cluster aligned. In this case, the
+ * returned *host_offset points to exact byte referenced by guest_offset and
+ * therefore isn't cluster aligned as well.
+ *
+ * Returns:
+ * 0: if no allocated clusters are available at the given offset.
+ * *bytes is normally unchanged. It is set to 0 if the cluster
+ * is allocated and doesn't need COW, but doesn't have the right
+ * physical offset.
+ *
+ * 1: if allocated clusters that don't require a COW are available at
+ * the requested offset. *bytes may have decreased and describes
+ * the length of the area that can be written to.
+ *
+ * -errno: in error cases
+ */
+static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
+ uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
+{
+ BDRVQcowState *s = bs->opaque;
+ int l2_index;
+ uint64_t cluster_offset;
+ uint64_t *l2_table;
+ unsigned int nb_clusters;
+ unsigned int keep_clusters;
+ int ret, pret;
+
+ trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset,
+ *bytes);
+
+ assert(*host_offset == 0 || offset_into_cluster(s, guest_offset)
+ == offset_into_cluster(s, *host_offset));
+
+ /*
+ * Calculate the number of clusters to look for. We stop at L2 table
+ * boundaries to keep things simple.
+ */
+ nb_clusters =
+ size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
+
+ l2_index = offset_to_l2_index(s, guest_offset);
+ nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+
+ /* Find L2 entry for the first involved cluster */
+ ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
+ if (ret < 0) {
+ return ret;
+ }
+
+ cluster_offset = be64_to_cpu(l2_table[l2_index]);
+
+ /* Check how many clusters are already allocated and don't need COW */
+ if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL
+ && (cluster_offset & QCOW_OFLAG_COPIED))
+ {
+ /* If a specific host_offset is required, check it */
+ bool offset_matches =
+ (cluster_offset & L2E_OFFSET_MASK) == *host_offset;
+
+ if (*host_offset != 0 && !offset_matches) {
+ *bytes = 0;
+ ret = 0;
+ goto out;
+ }
+
+ /* We keep all QCOW_OFLAG_COPIED clusters */
+ keep_clusters =
+ count_contiguous_clusters(nb_clusters, s->cluster_size,
+ &l2_table[l2_index], 0,
+ QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO);
+ assert(keep_clusters <= nb_clusters);
+
+ *bytes = MIN(*bytes,
+ keep_clusters * s->cluster_size
+ - offset_into_cluster(s, guest_offset));
+
+ ret = 1;
+ } else {
+ ret = 0;
+ }
+
+ /* Cleanup */
+out:
+ pret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ if (pret < 0) {
+ return pret;
+ }
+
+ /* Only return a host offset if we actually made progress. Otherwise we
+ * would make requirements for handle_alloc() that it can't fulfill */
+ if (ret) {
+ *host_offset = (cluster_offset & L2E_OFFSET_MASK)
+ + offset_into_cluster(s, guest_offset);
+ }
+
+ return ret;
+}
+
+/*
+ * Allocates new clusters for the given guest_offset.
+ *
+ * At most *nb_clusters are allocated, and on return *nb_clusters is updated to
+ * contain the number of clusters that have been allocated and are contiguous
+ * in the image file.
+ *
+ * If *host_offset is non-zero, it specifies the offset in the image file at
+ * which the new clusters must start. *nb_clusters can be 0 on return in this
+ * case if the cluster at host_offset is already in use. If *host_offset is
+ * zero, the clusters can be allocated anywhere in the image file.
+ *
+ * *host_offset is updated to contain the offset into the image file at which
+ * the first allocated cluster starts.
+ *
+ * Return 0 on success and -errno in error cases. -EAGAIN means that the
+ * function has been waiting for another request and the allocation must be
+ * restarted, but the whole request should not be failed.
+ */
+static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
+ uint64_t *host_offset, unsigned int *nb_clusters)
+{
+ BDRVQcowState *s = bs->opaque;
+
+ trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
+ *host_offset, *nb_clusters);
+
+ /* Allocate new clusters */
+ trace_qcow2_cluster_alloc_phys(qemu_coroutine_self());
+ if (*host_offset == 0) {
+ int64_t cluster_offset =
+ qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size);
+ if (cluster_offset < 0) {
+ return cluster_offset;
+ }
+ *host_offset = cluster_offset;
+ return 0;
+ } else {
+ int ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters);
+ if (ret < 0) {
+ return ret;
+ }
+ *nb_clusters = ret;
+ return 0;
+ }
+}
+
+/*
+ * Allocates new clusters for an area that either is yet unallocated or needs a
+ * copy on write. If *host_offset is non-zero, clusters are only allocated if
+ * the new allocation can match the specified host offset.
+ *
+ * Note that guest_offset may not be cluster aligned. In this case, the
+ * returned *host_offset points to exact byte referenced by guest_offset and
+ * therefore isn't cluster aligned as well.
+ *
+ * Returns:
+ * 0: if no clusters could be allocated. *bytes is set to 0,
+ * *host_offset is left unchanged.
+ *
+ * 1: if new clusters were allocated. *bytes may be decreased if the
+ * new allocation doesn't cover all of the requested area.
+ * *host_offset is updated to contain the host offset of the first
+ * newly allocated cluster.
+ *
+ * -errno: in error cases
+ */
+static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
+ uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
+{
+ BDRVQcowState *s = bs->opaque;
+ int l2_index;
+ uint64_t *l2_table;
+ uint64_t entry;
+ unsigned int nb_clusters;
+ int ret;
+
+ uint64_t alloc_cluster_offset;
+
+ trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset,
+ *bytes);
+ assert(*bytes > 0);
+
+ /*
+ * Calculate the number of clusters to look for. We stop at L2 table
+ * boundaries to keep things simple.
+ */
+ nb_clusters =
+ size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
+
+ l2_index = offset_to_l2_index(s, guest_offset);
+ nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+
+ /* Find L2 entry for the first involved cluster */
+ ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
+ if (ret < 0) {
+ return ret;
+ }
+
+ entry = be64_to_cpu(l2_table[l2_index]);
+
+ /* For the moment, overwrite compressed clusters one by one */
+ if (entry & QCOW_OFLAG_COMPRESSED) {
+ nb_clusters = 1;
+ } else {
+ nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, l2_index);
+ }
+
+ /* This function is only called when there were no non-COW clusters, so if
+ * we can't find any unallocated or COW clusters either, something is
+ * wrong with our code. */
+ assert(nb_clusters > 0);
+
+ ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Allocate, if necessary at a given offset in the image file */
+ alloc_cluster_offset = start_of_cluster(s, *host_offset);
+ ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
+ &nb_clusters);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* Can't extend contiguous allocation */
+ if (nb_clusters == 0) {
+ *bytes = 0;
+ return 0;
+ }
+
+ /*
+ * Save info needed for meta data update.
+ *
+ * requested_sectors: Number of sectors from the start of the first
+ * newly allocated cluster to the end of the (possibly shortened
+ * before) write request.
+ *
+ * avail_sectors: Number of sectors from the start of the first
+ * newly allocated to the end of the last newly allocated cluster.
+ *
+ * nb_sectors: The number of sectors from the start of the first
+ * newly allocated cluster to the end of the area that the write
+ * request actually writes to (excluding COW at the end)
+ */
+ int requested_sectors =
+ (*bytes + offset_into_cluster(s, guest_offset))
+ >> BDRV_SECTOR_BITS;
+ int avail_sectors = nb_clusters
+ << (s->cluster_bits - BDRV_SECTOR_BITS);
+ int alloc_n_start = offset_into_cluster(s, guest_offset)
+ >> BDRV_SECTOR_BITS;
+ int nb_sectors = MIN(requested_sectors, avail_sectors);
+ QCowL2Meta *old_m = *m;
+
+ *m = g_malloc0(sizeof(**m));
+
+ **m = (QCowL2Meta) {
+ .next = old_m,
+
+ .alloc_offset = alloc_cluster_offset,
+ .offset = start_of_cluster(s, guest_offset),
+ .nb_clusters = nb_clusters,
+ .nb_available = nb_sectors,
+
+ .cow_start = {
+ .offset = 0,
+ .nb_sectors = alloc_n_start,
+ },
+ .cow_end = {
+ .offset = nb_sectors * BDRV_SECTOR_SIZE,
+ .nb_sectors = avail_sectors - nb_sectors,
+ },
+ };
+ qemu_co_queue_init(&(*m)->dependent_requests);
+ QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
+
+ *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset);
+ *bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE)
+ - offset_into_cluster(s, guest_offset));
+ assert(*bytes != 0);
+
+ return 1;
+
+fail:
+ if (*m && (*m)->nb_clusters > 0) {
+ QLIST_REMOVE(*m, next_in_flight);
+ }
+ return ret;
+}
+
+/*
+ * alloc_cluster_offset
+ *
+ * For a given offset on the virtual disk, find the cluster offset in qcow2
+ * file. If the offset is not found, allocate a new cluster.
+ *
+ * If the cluster was already allocated, m->nb_clusters is set to 0 and
+ * other fields in m are meaningless.
+ *
+ * If the cluster is newly allocated, m->nb_clusters is set to the number of
+ * contiguous clusters that have been allocated. In this case, the other
+ * fields of m are valid and contain information about the first allocated
+ * cluster.
+ *
+ * If the request conflicts with another write request in flight, the coroutine
+ * is queued and will be reentered when the dependency has completed.
+ *
+ * Return 0 on success and -errno in error cases
+ */
+int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
+ int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t start, remaining;
+ uint64_t cluster_offset;
+ uint64_t cur_bytes;
+ int ret;
+
+ trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset,
+ n_start, n_end);
+
+ assert(n_start * BDRV_SECTOR_SIZE == offset_into_cluster(s, offset));
+ offset = start_of_cluster(s, offset);
+
+again:
+ start = offset + (n_start << BDRV_SECTOR_BITS);
+ remaining = (n_end - n_start) << BDRV_SECTOR_BITS;
+ cluster_offset = 0;
+ *host_offset = 0;
+ cur_bytes = 0;
+ *m = NULL;
+
+ while (true) {
+
+ if (!*host_offset) {
+ *host_offset = start_of_cluster(s, cluster_offset);
+ }
+
+ assert(remaining >= cur_bytes);
+
+ start += cur_bytes;
+ remaining -= cur_bytes;
+ cluster_offset += cur_bytes;
+
+ if (remaining == 0) {
+ break;
+ }
+
+ cur_bytes = remaining;
+
+ /*
+ * Now start gathering as many contiguous clusters as possible:
+ *
+ * 1. Check for overlaps with in-flight allocations
+ *
+ * a) Overlap not in the first cluster -> shorten this request and
+ * let the caller handle the rest in its next loop iteration.
+ *
+ * b) Real overlaps of two requests. Yield and restart the search
+ * for contiguous clusters (the situation could have changed
+ * while we were sleeping)
+ *
+ * c) TODO: Request starts in the same cluster as the in-flight
+ * allocation ends. Shorten the COW of the in-fight allocation,
+ * set cluster_offset to write to the same cluster and set up
+ * the right synchronisation between the in-flight request and
+ * the new one.
+ */
+ ret = handle_dependencies(bs, start, &cur_bytes, m);
+ if (ret == -EAGAIN) {
+ /* Currently handle_dependencies() doesn't yield if we already had
+ * an allocation. If it did, we would have to clean up the L2Meta
+ * structs before starting over. */
+ assert(*m == NULL);
+ goto again;
+ } else if (ret < 0) {
+ return ret;
+ } else if (cur_bytes == 0) {
+ break;
+ } else {
+ /* handle_dependencies() may have decreased cur_bytes (shortened
+ * the allocations below) so that the next dependency is processed
+ * correctly during the next loop iteration. */
+ }
+
+ /*
+ * 2. Count contiguous COPIED clusters.
+ */
+ ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m);
+ if (ret < 0) {
+ return ret;
+ } else if (ret) {
+ continue;
+ } else if (cur_bytes == 0) {
+ break;
+ }
+
+ /*
+ * 3. If the request still hasn't completed, allocate new clusters,
+ * considering any cluster_offset of steps 1c or 2.
+ */
+ ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m);
+ if (ret < 0) {
+ return ret;
+ } else if (ret) {
+ continue;
+ } else {
+ assert(cur_bytes == 0);
+ break;
+ }
+ }
+
+ *num = (n_end - n_start) - (remaining >> BDRV_SECTOR_BITS);
+ assert(*num > 0);
+ assert(*host_offset != 0);
+
+ return 0;
+}
+
+static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
+ const uint8_t *buf, int buf_size)
+{
+ z_stream strm1, *strm = &strm1;
+ int ret, out_len;
+
+ memset(strm, 0, sizeof(*strm));
+
+ strm->next_in = (uint8_t *)buf;
+ strm->avail_in = buf_size;
+ strm->next_out = out_buf;
+ strm->avail_out = out_buf_size;
+
+ ret = inflateInit2(strm, -12);
+ if (ret != Z_OK)
+ return -1;
+ ret = inflate(strm, Z_FINISH);
+ out_len = strm->next_out - out_buf;
+ if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
+ out_len != out_buf_size) {
+ inflateEnd(strm);
+ return -1;
+ }
+ inflateEnd(strm);
+ return 0;
+}
+
+int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret, csize, nb_csectors, sector_offset;
+ uint64_t coffset;
+
+ coffset = cluster_offset & s->cluster_offset_mask;
+ if (s->cluster_cache_offset != coffset) {
+ nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
+ sector_offset = coffset & 511;
+ csize = nb_csectors * 512 - sector_offset;
+ BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
+ ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data, nb_csectors);
+ if (ret < 0) {
+ return ret;
+ }
+ if (decompress_buffer(s->cluster_cache, s->cluster_size,
+ s->cluster_data + sector_offset, csize) < 0) {
+ return -EIO;
+ }
+ s->cluster_cache_offset = coffset;
+ }
+ return 0;
+}
+
+/*
+ * This discards as many clusters of nb_clusters as possible at once (i.e.
+ * all clusters in the same L2 table) and returns the number of discarded
+ * clusters.
+ */
+static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
+ unsigned int nb_clusters)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t *l2_table;
+ int l2_index;
+ int ret;
+ int i;
+
+ ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Limit nb_clusters to one L2 table */
+ nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+
+ for (i = 0; i < nb_clusters; i++) {
+ uint64_t old_offset;
+
+ old_offset = be64_to_cpu(l2_table[l2_index + i]);
+ if ((old_offset & L2E_OFFSET_MASK) == 0) {
+ continue;
+ }
+
+ /* First remove L2 entries */
+ qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+ l2_table[l2_index + i] = cpu_to_be64(0);
+
+ /* Then decrease the refcount */
+ qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
+ }
+
+ ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return nb_clusters;
+}
+
+int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
+ int nb_sectors)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t end_offset;
+ unsigned int nb_clusters;
+ int ret;
+
+ end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS);
+
+ /* Round start up and end down */
+ offset = align_offset(offset, s->cluster_size);
+ end_offset &= ~(s->cluster_size - 1);
+
+ if (offset > end_offset) {
+ return 0;
+ }
+
+ nb_clusters = size_to_clusters(s, end_offset - offset);
+
+ s->cache_discards = true;
+
+ /* Each L2 table is handled by its own loop iteration */
+ while (nb_clusters > 0) {
+ ret = discard_single_l2(bs, offset, nb_clusters);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ nb_clusters -= ret;
+ offset += (ret * s->cluster_size);
+ }
+
+ ret = 0;
+fail:
+ s->cache_discards = false;
+ qcow2_process_discards(bs, ret);
+
+ return ret;
+}
+
+/*
+ * This zeroes as many clusters of nb_clusters as possible at once (i.e.
+ * all clusters in the same L2 table) and returns the number of zeroed
+ * clusters.
+ */
+static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
+ unsigned int nb_clusters)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t *l2_table;
+ int l2_index;
+ int ret;
+ int i;
+
+ ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Limit nb_clusters to one L2 table */
+ nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+
+ for (i = 0; i < nb_clusters; i++) {
+ uint64_t old_offset;
+
+ old_offset = be64_to_cpu(l2_table[l2_index + i]);
+
+ /* Update L2 entries */
+ qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+ if (old_offset & QCOW_OFLAG_COMPRESSED) {
+ l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
+ qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
+ } else {
+ l2_table[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO);
+ }
+ }
+
+ ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return nb_clusters;
+}
+
+int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors)
+{
+ BDRVQcowState *s = bs->opaque;
+ unsigned int nb_clusters;
+ int ret;
+
+ /* The zero flag is only supported by version 3 and newer */
+ if (s->qcow_version < 3) {
+ return -ENOTSUP;
+ }
+
+ /* Each L2 table is handled by its own loop iteration */
+ nb_clusters = size_to_clusters(s, nb_sectors << BDRV_SECTOR_BITS);
+
+ s->cache_discards = true;
+
+ while (nb_clusters > 0) {
+ ret = zero_single_l2(bs, offset, nb_clusters);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ nb_clusters -= ret;
+ offset += (ret * s->cluster_size);
+ }
+
+ ret = 0;
+fail:
+ s->cache_discards = false;
+ qcow2_process_discards(bs, ret);
+
+ return ret;
+}
diff --git a/contrib/qemu/block/qcow2-refcount.c b/contrib/qemu/block/qcow2-refcount.c
new file mode 100644
index 000000000..1244693f3
--- /dev/null
+++ b/contrib/qemu/block/qcow2-refcount.c
@@ -0,0 +1,1374 @@
+/*
+ * Block driver for the QCOW version 2 format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu-common.h"
+#include "block/block_int.h"
+#include "block/qcow2.h"
+
+static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size);
+static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
+ int64_t offset, int64_t length,
+ int addend, enum qcow2_discard_type type);
+
+
+/*********************************************************/
+/* refcount handling */
+
+int qcow2_refcount_init(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret, refcount_table_size2, i;
+
+ refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
+ s->refcount_table = g_malloc(refcount_table_size2);
+ if (s->refcount_table_size > 0) {
+ BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
+ ret = bdrv_pread(bs->file, s->refcount_table_offset,
+ s->refcount_table, refcount_table_size2);
+ if (ret != refcount_table_size2)
+ goto fail;
+ for(i = 0; i < s->refcount_table_size; i++)
+ be64_to_cpus(&s->refcount_table[i]);
+ }
+ return 0;
+ fail:
+ return -ENOMEM;
+}
+
+void qcow2_refcount_close(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ g_free(s->refcount_table);
+}
+
+
+static int load_refcount_block(BlockDriverState *bs,
+ int64_t refcount_block_offset,
+ void **refcount_block)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret;
+
+ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
+ ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
+ refcount_block);
+
+ return ret;
+}
+
+/*
+ * Returns the refcount of the cluster given by its index. Any non-negative
+ * return value is the refcount of the cluster, negative values are -errno
+ * and indicate an error.
+ */
+static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
+{
+ BDRVQcowState *s = bs->opaque;
+ int refcount_table_index, block_index;
+ int64_t refcount_block_offset;
+ int ret;
+ uint16_t *refcount_block;
+ uint16_t refcount;
+
+ refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
+ if (refcount_table_index >= s->refcount_table_size)
+ return 0;
+ refcount_block_offset = s->refcount_table[refcount_table_index];
+ if (!refcount_block_offset)
+ return 0;
+
+ ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
+ (void**) &refcount_block);
+ if (ret < 0) {
+ return ret;
+ }
+
+ block_index = cluster_index &
+ ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
+ refcount = be16_to_cpu(refcount_block[block_index]);
+
+ ret = qcow2_cache_put(bs, s->refcount_block_cache,
+ (void**) &refcount_block);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return refcount;
+}
+
+/*
+ * Rounds the refcount table size up to avoid growing the table for each single
+ * refcount block that is allocated.
+ */
+static unsigned int next_refcount_table_size(BDRVQcowState *s,
+ unsigned int min_size)
+{
+ unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1;
+ unsigned int refcount_table_clusters =
+ MAX(1, s->refcount_table_size >> (s->cluster_bits - 3));
+
+ while (min_clusters > refcount_table_clusters) {
+ refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
+ }
+
+ return refcount_table_clusters << (s->cluster_bits - 3);
+}
+
+
+/* Checks if two offsets are described by the same refcount block */
+static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a,
+ uint64_t offset_b)
+{
+ uint64_t block_a = offset_a >> (2 * s->cluster_bits - REFCOUNT_SHIFT);
+ uint64_t block_b = offset_b >> (2 * s->cluster_bits - REFCOUNT_SHIFT);
+
+ return (block_a == block_b);
+}
+
+/*
+ * Loads a refcount block. If it doesn't exist yet, it is allocated first
+ * (including growing the refcount table if needed).
+ *
+ * Returns 0 on success or -errno in error case
+ */
+static int alloc_refcount_block(BlockDriverState *bs,
+ int64_t cluster_index, uint16_t **refcount_block)
+{
+ BDRVQcowState *s = bs->opaque;
+ unsigned int refcount_table_index;
+ int ret;
+
+ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
+
+ /* Find the refcount block for the given cluster */
+ refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
+
+ if (refcount_table_index < s->refcount_table_size) {
+
+ uint64_t refcount_block_offset =
+ s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
+
+ /* If it's already there, we're done */
+ if (refcount_block_offset) {
+ return load_refcount_block(bs, refcount_block_offset,
+ (void**) refcount_block);
+ }
+ }
+
+ /*
+ * If we came here, we need to allocate something. Something is at least
+ * a cluster for the new refcount block. It may also include a new refcount
+ * table if the old refcount table is too small.
+ *
+ * Note that allocating clusters here needs some special care:
+ *
+ * - We can't use the normal qcow2_alloc_clusters(), it would try to
+ * increase the refcount and very likely we would end up with an endless
+ * recursion. Instead we must place the refcount blocks in a way that
+ * they can describe them themselves.
+ *
+ * - We need to consider that at this point we are inside update_refcounts
+ * and doing the initial refcount increase. This means that some clusters
+ * have already been allocated by the caller, but their refcount isn't
+ * accurate yet. free_cluster_index tells us where this allocation ends
+ * as long as we don't overwrite it by freeing clusters.
+ *
+ * - alloc_clusters_noref and qcow2_free_clusters may load a different
+ * refcount block into the cache
+ */
+
+ *refcount_block = NULL;
+
+ /* We write to the refcount table, so we might depend on L2 tables */
+ ret = qcow2_cache_flush(bs, s->l2_table_cache);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Allocate the refcount block itself and mark it as used */
+ int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
+ if (new_block < 0) {
+ return new_block;
+ }
+
+#ifdef DEBUG_ALLOC2
+ fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
+ " at %" PRIx64 "\n",
+ refcount_table_index, cluster_index << s->cluster_bits, new_block);
+#endif
+
+ if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
+ /* Zero the new refcount block before updating it */
+ ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
+ (void**) refcount_block);
+ if (ret < 0) {
+ goto fail_block;
+ }
+
+ memset(*refcount_block, 0, s->cluster_size);
+
+ /* The block describes itself, need to update the cache */
+ int block_index = (new_block >> s->cluster_bits) &
+ ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
+ (*refcount_block)[block_index] = cpu_to_be16(1);
+ } else {
+ /* Described somewhere else. This can recurse at most twice before we
+ * arrive at a block that describes itself. */
+ ret = update_refcount(bs, new_block, s->cluster_size, 1,
+ QCOW2_DISCARD_NEVER);
+ if (ret < 0) {
+ goto fail_block;
+ }
+
+ ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+ if (ret < 0) {
+ goto fail_block;
+ }
+
+ /* Initialize the new refcount block only after updating its refcount,
+ * update_refcount uses the refcount cache itself */
+ ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
+ (void**) refcount_block);
+ if (ret < 0) {
+ goto fail_block;
+ }
+
+ memset(*refcount_block, 0, s->cluster_size);
+ }
+
+ /* Now the new refcount block needs to be written to disk */
+ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
+ qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
+ ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+ if (ret < 0) {
+ goto fail_block;
+ }
+
+ /* If the refcount table is big enough, just hook the block up there */
+ if (refcount_table_index < s->refcount_table_size) {
+ uint64_t data64 = cpu_to_be64(new_block);
+ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
+ ret = bdrv_pwrite_sync(bs->file,
+ s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
+ &data64, sizeof(data64));
+ if (ret < 0) {
+ goto fail_block;
+ }
+
+ s->refcount_table[refcount_table_index] = new_block;
+ return 0;
+ }
+
+ ret = qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
+ if (ret < 0) {
+ goto fail_block;
+ }
+
+ /*
+ * If we come here, we need to grow the refcount table. Again, a new
+ * refcount table needs some space and we can't simply allocate to avoid
+ * endless recursion.
+ *
+ * Therefore let's grab new refcount blocks at the end of the image, which
+ * will describe themselves and the new refcount table. This way we can
+ * reference them only in the new table and do the switch to the new
+ * refcount table at once without producing an inconsistent state in
+ * between.
+ */
+ BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);
+
+ /* Calculate the number of refcount blocks needed so far */
+ uint64_t refcount_block_clusters = 1 << (s->cluster_bits - REFCOUNT_SHIFT);
+ uint64_t blocks_used = (s->free_cluster_index +
+ refcount_block_clusters - 1) / refcount_block_clusters;
+
+ /* And now we need at least one block more for the new metadata */
+ uint64_t table_size = next_refcount_table_size(s, blocks_used + 1);
+ uint64_t last_table_size;
+ uint64_t blocks_clusters;
+ do {
+ uint64_t table_clusters =
+ size_to_clusters(s, table_size * sizeof(uint64_t));
+ blocks_clusters = 1 +
+ ((table_clusters + refcount_block_clusters - 1)
+ / refcount_block_clusters);
+ uint64_t meta_clusters = table_clusters + blocks_clusters;
+
+ last_table_size = table_size;
+ table_size = next_refcount_table_size(s, blocks_used +
+ ((meta_clusters + refcount_block_clusters - 1)
+ / refcount_block_clusters));
+
+ } while (last_table_size != table_size);
+
+#ifdef DEBUG_ALLOC2
+ fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n",
+ s->refcount_table_size, table_size);
+#endif
+
+ /* Create the new refcount table and blocks */
+ uint64_t meta_offset = (blocks_used * refcount_block_clusters) *
+ s->cluster_size;
+ uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size;
+ uint16_t *new_blocks = g_malloc0(blocks_clusters * s->cluster_size);
+ uint64_t *new_table = g_malloc0(table_size * sizeof(uint64_t));
+
+ assert(meta_offset >= (s->free_cluster_index * s->cluster_size));
+
+ /* Fill the new refcount table */
+ memcpy(new_table, s->refcount_table,
+ s->refcount_table_size * sizeof(uint64_t));
+ new_table[refcount_table_index] = new_block;
+
+ int i;
+ for (i = 0; i < blocks_clusters; i++) {
+ new_table[blocks_used + i] = meta_offset + (i * s->cluster_size);
+ }
+
+ /* Fill the refcount blocks */
+ uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t));
+ int block = 0;
+ for (i = 0; i < table_clusters + blocks_clusters; i++) {
+ new_blocks[block++] = cpu_to_be16(1);
+ }
+
+ /* Write refcount blocks to disk */
+ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
+ ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks,
+ blocks_clusters * s->cluster_size);
+ g_free(new_blocks);
+ if (ret < 0) {
+ goto fail_table;
+ }
+
+ /* Write refcount table to disk */
+ for(i = 0; i < table_size; i++) {
+ cpu_to_be64s(&new_table[i]);
+ }
+
+ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
+ ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
+ table_size * sizeof(uint64_t));
+ if (ret < 0) {
+ goto fail_table;
+ }
+
+ for(i = 0; i < table_size; i++) {
+ be64_to_cpus(&new_table[i]);
+ }
+
+ /* Hook up the new refcount table in the qcow2 header */
+ uint8_t data[12];
+ cpu_to_be64w((uint64_t*)data, table_offset);
+ cpu_to_be32w((uint32_t*)(data + 8), table_clusters);
+ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
+ ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, refcount_table_offset),
+ data, sizeof(data));
+ if (ret < 0) {
+ goto fail_table;
+ }
+
+ /* And switch it in memory */
+ uint64_t old_table_offset = s->refcount_table_offset;
+ uint64_t old_table_size = s->refcount_table_size;
+
+ g_free(s->refcount_table);
+ s->refcount_table = new_table;
+ s->refcount_table_size = table_size;
+ s->refcount_table_offset = table_offset;
+
+ /* Free old table. Remember, we must not change free_cluster_index */
+ uint64_t old_free_cluster_index = s->free_cluster_index;
+ qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
+ QCOW2_DISCARD_OTHER);
+ s->free_cluster_index = old_free_cluster_index;
+
+ ret = load_refcount_block(bs, new_block, (void**) refcount_block);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return 0;
+
+fail_table:
+ g_free(new_table);
+fail_block:
+ if (*refcount_block != NULL) {
+ qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
+ }
+ return ret;
+}
+
+void qcow2_process_discards(BlockDriverState *bs, int ret)
+{
+ BDRVQcowState *s = bs->opaque;
+ Qcow2DiscardRegion *d, *next;
+
+ QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
+ QTAILQ_REMOVE(&s->discards, d, next);
+
+ /* Discard is optional, ignore the return value */
+ if (ret >= 0) {
+ bdrv_discard(bs->file,
+ d->offset >> BDRV_SECTOR_BITS,
+ d->bytes >> BDRV_SECTOR_BITS);
+ }
+
+ g_free(d);
+ }
+}
+
+static void update_refcount_discard(BlockDriverState *bs,
+ uint64_t offset, uint64_t length)
+{
+ BDRVQcowState *s = bs->opaque;
+ Qcow2DiscardRegion *d, *p, *next;
+
+ QTAILQ_FOREACH(d, &s->discards, next) {
+ uint64_t new_start = MIN(offset, d->offset);
+ uint64_t new_end = MAX(offset + length, d->offset + d->bytes);
+
+ if (new_end - new_start <= length + d->bytes) {
+ /* There can't be any overlap, areas ending up here have no
+ * references any more and therefore shouldn't get freed another
+ * time. */
+ assert(d->bytes + length == new_end - new_start);
+ d->offset = new_start;
+ d->bytes = new_end - new_start;
+ goto found;
+ }
+ }
+
+ d = g_malloc(sizeof(*d));
+ *d = (Qcow2DiscardRegion) {
+ .bs = bs,
+ .offset = offset,
+ .bytes = length,
+ };
+ QTAILQ_INSERT_TAIL(&s->discards, d, next);
+
+found:
+ /* Merge discard requests if they are adjacent now */
+ QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
+ if (p == d
+ || p->offset > d->offset + d->bytes
+ || d->offset > p->offset + p->bytes)
+ {
+ continue;
+ }
+
+ /* Still no overlap possible */
+ assert(p->offset == d->offset + d->bytes
+ || d->offset == p->offset + p->bytes);
+
+ QTAILQ_REMOVE(&s->discards, p, next);
+ d->offset = MIN(d->offset, p->offset);
+ d->bytes += p->bytes;
+ }
+}
+
+/* XXX: cache several refcount block clusters ? */
+static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
+ int64_t offset, int64_t length, int addend, enum qcow2_discard_type type)
+{
+ BDRVQcowState *s = bs->opaque;
+ int64_t start, last, cluster_offset;
+ uint16_t *refcount_block = NULL;
+ int64_t old_table_index = -1;
+ int ret;
+
+#ifdef DEBUG_ALLOC2
+ fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64 " addend=%d\n",
+ offset, length, addend);
+#endif
+ if (length < 0) {
+ return -EINVAL;
+ } else if (length == 0) {
+ return 0;
+ }
+
+ if (addend < 0) {
+ qcow2_cache_set_dependency(bs, s->refcount_block_cache,
+ s->l2_table_cache);
+ }
+
+ start = offset & ~(s->cluster_size - 1);
+ last = (offset + length - 1) & ~(s->cluster_size - 1);
+ for(cluster_offset = start; cluster_offset <= last;
+ cluster_offset += s->cluster_size)
+ {
+ int block_index, refcount;
+ int64_t cluster_index = cluster_offset >> s->cluster_bits;
+ int64_t table_index =
+ cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
+
+ /* Load the refcount block and allocate it if needed */
+ if (table_index != old_table_index) {
+ if (refcount_block) {
+ ret = qcow2_cache_put(bs, s->refcount_block_cache,
+ (void**) &refcount_block);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+
+ ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+ old_table_index = table_index;
+
+ qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);
+
+ /* we can update the count and save it */
+ block_index = cluster_index &
+ ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
+
+ refcount = be16_to_cpu(refcount_block[block_index]);
+ refcount += addend;
+ if (refcount < 0 || refcount > 0xffff) {
+ ret = -EINVAL;
+ goto fail;
+ }
+ if (refcount == 0 && cluster_index < s->free_cluster_index) {
+ s->free_cluster_index = cluster_index;
+ }
+ refcount_block[block_index] = cpu_to_be16(refcount);
+
+ if (refcount == 0 && s->discard_passthrough[type]) {
+ update_refcount_discard(bs, cluster_offset, s->cluster_size);
+ }
+ }
+
+ ret = 0;
+fail:
+ if (!s->cache_discards) {
+ qcow2_process_discards(bs, ret);
+ }
+
+ /* Write last changed block to disk */
+ if (refcount_block) {
+ int wret;
+ wret = qcow2_cache_put(bs, s->refcount_block_cache,
+ (void**) &refcount_block);
+ if (wret < 0) {
+ return ret < 0 ? ret : wret;
+ }
+ }
+
+ /*
+ * Try do undo any updates if an error is returned (This may succeed in
+ * some cases like ENOSPC for allocating a new refcount block)
+ */
+ if (ret < 0) {
+ int dummy;
+ dummy = update_refcount(bs, offset, cluster_offset - offset, -addend,
+ QCOW2_DISCARD_NEVER);
+ (void)dummy;
+ }
+
+ return ret;
+}
+
+/*
+ * Increases or decreases the refcount of a given cluster by one.
+ * addend must be 1 or -1.
+ *
+ * If the return value is non-negative, it is the new refcount of the cluster.
+ * If it is negative, it is -errno and indicates an error.
+ */
+static int update_cluster_refcount(BlockDriverState *bs,
+ int64_t cluster_index,
+ int addend,
+ enum qcow2_discard_type type)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret;
+
+ ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
+ type);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return get_refcount(bs, cluster_index);
+}
+
+
+
+/*********************************************************/
+/* cluster allocation functions */
+
+
+
+/* return < 0 if error */
+static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size)
+{
+ BDRVQcowState *s = bs->opaque;
+ int i, nb_clusters, refcount;
+
+ nb_clusters = size_to_clusters(s, size);
+retry:
+ for(i = 0; i < nb_clusters; i++) {
+ int64_t next_cluster_index = s->free_cluster_index++;
+ refcount = get_refcount(bs, next_cluster_index);
+
+ if (refcount < 0) {
+ return refcount;
+ } else if (refcount != 0) {
+ goto retry;
+ }
+ }
+#ifdef DEBUG_ALLOC2
+ fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
+ size,
+ (s->free_cluster_index - nb_clusters) << s->cluster_bits);
+#endif
+ return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
+}
+
+int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size)
+{
+ int64_t offset;
+ int ret;
+
+ BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
+ offset = alloc_clusters_noref(bs, size);
+ if (offset < 0) {
+ return offset;
+ }
+
+ ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return offset;
+}
+
+int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
+ int nb_clusters)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t cluster_index;
+ uint64_t old_free_cluster_index;
+ int i, refcount, ret;
+
+ /* Check how many clusters there are free */
+ cluster_index = offset >> s->cluster_bits;
+ for(i = 0; i < nb_clusters; i++) {
+ refcount = get_refcount(bs, cluster_index++);
+
+ if (refcount < 0) {
+ return refcount;
+ } else if (refcount != 0) {
+ break;
+ }
+ }
+
+ /* And then allocate them */
+ old_free_cluster_index = s->free_cluster_index;
+ s->free_cluster_index = cluster_index + i;
+
+ ret = update_refcount(bs, offset, i << s->cluster_bits, 1,
+ QCOW2_DISCARD_NEVER);
+ if (ret < 0) {
+ return ret;
+ }
+
+ s->free_cluster_index = old_free_cluster_index;
+
+ return i;
+}
+
+/* only used to allocate compressed sectors. We try to allocate
+ contiguous sectors. size must be <= cluster_size */
+int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
+{
+ BDRVQcowState *s = bs->opaque;
+ int64_t offset, cluster_offset;
+ int free_in_cluster;
+
+ BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
+ assert(size > 0 && size <= s->cluster_size);
+ if (s->free_byte_offset == 0) {
+ offset = qcow2_alloc_clusters(bs, s->cluster_size);
+ if (offset < 0) {
+ return offset;
+ }
+ s->free_byte_offset = offset;
+ }
+ redo:
+ free_in_cluster = s->cluster_size -
+ (s->free_byte_offset & (s->cluster_size - 1));
+ if (size <= free_in_cluster) {
+ /* enough space in current cluster */
+ offset = s->free_byte_offset;
+ s->free_byte_offset += size;
+ free_in_cluster -= size;
+ if (free_in_cluster == 0)
+ s->free_byte_offset = 0;
+ if ((offset & (s->cluster_size - 1)) != 0)
+ update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
+ QCOW2_DISCARD_NEVER);
+ } else {
+ offset = qcow2_alloc_clusters(bs, s->cluster_size);
+ if (offset < 0) {
+ return offset;
+ }
+ cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1);
+ if ((cluster_offset + s->cluster_size) == offset) {
+ /* we are lucky: contiguous data */
+ offset = s->free_byte_offset;
+ update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
+ QCOW2_DISCARD_NEVER);
+ s->free_byte_offset += size;
+ } else {
+ s->free_byte_offset = offset;
+ goto redo;
+ }
+ }
+
+ /* The cluster refcount was incremented, either by qcow2_alloc_clusters()
+ * or explicitly by update_cluster_refcount(). Refcount blocks must be
+ * flushed before the caller's L2 table updates.
+ */
+ qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
+ return offset;
+}
+
+void qcow2_free_clusters(BlockDriverState *bs,
+ int64_t offset, int64_t size,
+ enum qcow2_discard_type type)
+{
+ int ret;
+
+ BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
+ ret = update_refcount(bs, offset, size, -1, type);
+ if (ret < 0) {
+ fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
+ /* TODO Remember the clusters to free them later and avoid leaking */
+ }
+}
+
+/*
+ * Free a cluster using its L2 entry (handles clusters of all types, e.g.
+ * normal cluster, compressed cluster, etc.)
+ */
+void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
+ int nb_clusters, enum qcow2_discard_type type)
+{
+ BDRVQcowState *s = bs->opaque;
+
+ switch (qcow2_get_cluster_type(l2_entry)) {
+ case QCOW2_CLUSTER_COMPRESSED:
+ {
+ int nb_csectors;
+ nb_csectors = ((l2_entry >> s->csize_shift) &
+ s->csize_mask) + 1;
+ qcow2_free_clusters(bs,
+ (l2_entry & s->cluster_offset_mask) & ~511,
+ nb_csectors * 512, type);
+ }
+ break;
+ case QCOW2_CLUSTER_NORMAL:
+ qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
+ nb_clusters << s->cluster_bits, type);
+ break;
+ case QCOW2_CLUSTER_UNALLOCATED:
+ case QCOW2_CLUSTER_ZERO:
+ break;
+ default:
+ abort();
+ }
+}
+
+
+
+/*********************************************************/
+/* snapshots and image creation */
+
+
+
+/* update the refcounts of snapshots and the copied flag */
+int qcow2_update_snapshot_refcount(BlockDriverState *bs,
+ int64_t l1_table_offset, int l1_size, int addend)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated;
+ int64_t old_offset, old_l2_offset;
+ int i, j, l1_modified = 0, nb_csectors, refcount;
+ int ret;
+
+ l2_table = NULL;
+ l1_table = NULL;
+ l1_size2 = l1_size * sizeof(uint64_t);
+
+ s->cache_discards = true;
+
+ /* WARNING: qcow2_snapshot_goto relies on this function not using the
+ * l1_table_offset when it is the current s->l1_table_offset! Be careful
+ * when changing this! */
+ if (l1_table_offset != s->l1_table_offset) {
+ l1_table = g_malloc0(align_offset(l1_size2, 512));
+ l1_allocated = 1;
+
+ ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ for(i = 0;i < l1_size; i++)
+ be64_to_cpus(&l1_table[i]);
+ } else {
+ assert(l1_size == s->l1_size);
+ l1_table = s->l1_table;
+ l1_allocated = 0;
+ }
+
+ for(i = 0; i < l1_size; i++) {
+ l2_offset = l1_table[i];
+ if (l2_offset) {
+ old_l2_offset = l2_offset;
+ l2_offset &= L1E_OFFSET_MASK;
+
+ ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
+ (void**) &l2_table);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ for(j = 0; j < s->l2_size; j++) {
+ offset = be64_to_cpu(l2_table[j]);
+ if (offset != 0) {
+ old_offset = offset;
+ offset &= ~QCOW_OFLAG_COPIED;
+ if (offset & QCOW_OFLAG_COMPRESSED) {
+ nb_csectors = ((offset >> s->csize_shift) &
+ s->csize_mask) + 1;
+ if (addend != 0) {
+ int ret;
+ ret = update_refcount(bs,
+ (offset & s->cluster_offset_mask) & ~511,
+ nb_csectors * 512, addend,
+ QCOW2_DISCARD_SNAPSHOT);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+ /* compressed clusters are never modified */
+ refcount = 2;
+ } else {
+ uint64_t cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits;
+ if (addend != 0) {
+ refcount = update_cluster_refcount(bs, cluster_index, addend,
+ QCOW2_DISCARD_SNAPSHOT);
+ } else {
+ refcount = get_refcount(bs, cluster_index);
+ }
+
+ if (refcount < 0) {
+ ret = refcount;
+ goto fail;
+ }
+ }
+
+ if (refcount == 1) {
+ offset |= QCOW_OFLAG_COPIED;
+ }
+ if (offset != old_offset) {
+ if (addend > 0) {
+ qcow2_cache_set_dependency(bs, s->l2_table_cache,
+ s->refcount_block_cache);
+ }
+ l2_table[j] = cpu_to_be64(offset);
+ qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+ }
+ }
+ }
+
+ ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ if (ret < 0) {
+ goto fail;
+ }
+
+
+ if (addend != 0) {
+ refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend,
+ QCOW2_DISCARD_SNAPSHOT);
+ } else {
+ refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
+ }
+ if (refcount < 0) {
+ ret = refcount;
+ goto fail;
+ } else if (refcount == 1) {
+ l2_offset |= QCOW_OFLAG_COPIED;
+ }
+ if (l2_offset != old_l2_offset) {
+ l1_table[i] = l2_offset;
+ l1_modified = 1;
+ }
+ }
+ }
+
+ ret = bdrv_flush(bs);
+fail:
+ if (l2_table) {
+ qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ }
+
+ s->cache_discards = false;
+ qcow2_process_discards(bs, ret);
+
+ /* Update L1 only if it isn't deleted anyway (addend = -1) */
+ if (ret == 0 && addend >= 0 && l1_modified) {
+ for (i = 0; i < l1_size; i++) {
+ cpu_to_be64s(&l1_table[i]);
+ }
+
+ ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table, l1_size2);
+
+ for (i = 0; i < l1_size; i++) {
+ be64_to_cpus(&l1_table[i]);
+ }
+ }
+ if (l1_allocated)
+ g_free(l1_table);
+ return ret;
+}
+
+
+
+
+/*********************************************************/
+/* refcount checking functions */
+
+
+
+/*
+ * Increases the refcount for a range of clusters in a given refcount table.
+ * This is used to construct a temporary refcount table out of L1 and L2 tables
+ * which can be compared the the refcount table saved in the image.
+ *
+ * Modifies the number of errors in res.
+ */
+static void inc_refcounts(BlockDriverState *bs,
+ BdrvCheckResult *res,
+ uint16_t *refcount_table,
+ int refcount_table_size,
+ int64_t offset, int64_t size)
+{
+ BDRVQcowState *s = bs->opaque;
+ int64_t start, last, cluster_offset;
+ int k;
+
+ if (size <= 0)
+ return;
+
+ start = offset & ~(s->cluster_size - 1);
+ last = (offset + size - 1) & ~(s->cluster_size - 1);
+ for(cluster_offset = start; cluster_offset <= last;
+ cluster_offset += s->cluster_size) {
+ k = cluster_offset >> s->cluster_bits;
+ if (k < 0) {
+ fprintf(stderr, "ERROR: invalid cluster offset=0x%" PRIx64 "\n",
+ cluster_offset);
+ res->corruptions++;
+ } else if (k >= refcount_table_size) {
+ fprintf(stderr, "Warning: cluster offset=0x%" PRIx64 " is after "
+ "the end of the image file, can't properly check refcounts.\n",
+ cluster_offset);
+ res->check_errors++;
+ } else {
+ if (++refcount_table[k] == 0) {
+ fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
+ "\n", cluster_offset);
+ res->corruptions++;
+ }
+ }
+ }
+}
+
+/* Flags for check_refcounts_l1() and check_refcounts_l2() */
+enum {
+ CHECK_OFLAG_COPIED = 0x1, /* check QCOW_OFLAG_COPIED matches refcount */
+ CHECK_FRAG_INFO = 0x2, /* update BlockFragInfo counters */
+};
+
+/*
+ * Increases the refcount in the given refcount table for the all clusters
+ * referenced in the L2 table. While doing so, performs some checks on L2
+ * entries.
+ *
+ * Returns the number of errors found by the checks or -errno if an internal
+ * error occurred.
+ */
+static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
+ uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset,
+ int flags)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t *l2_table, l2_entry;
+ uint64_t next_contiguous_offset = 0;
+ int i, l2_size, nb_csectors, refcount;
+
+ /* Read L2 table from disk */
+ l2_size = s->l2_size * sizeof(uint64_t);
+ l2_table = g_malloc(l2_size);
+
+ if (bdrv_pread(bs->file, l2_offset, l2_table, l2_size) != l2_size)
+ goto fail;
+
+ /* Do the actual checks */
+ for(i = 0; i < s->l2_size; i++) {
+ l2_entry = be64_to_cpu(l2_table[i]);
+
+ switch (qcow2_get_cluster_type(l2_entry)) {
+ case QCOW2_CLUSTER_COMPRESSED:
+ /* Compressed clusters don't have QCOW_OFLAG_COPIED */
+ if (l2_entry & QCOW_OFLAG_COPIED) {
+ fprintf(stderr, "ERROR: cluster %" PRId64 ": "
+ "copied flag must never be set for compressed "
+ "clusters\n", l2_entry >> s->cluster_bits);
+ l2_entry &= ~QCOW_OFLAG_COPIED;
+ res->corruptions++;
+ }
+
+ /* Mark cluster as used */
+ nb_csectors = ((l2_entry >> s->csize_shift) &
+ s->csize_mask) + 1;
+ l2_entry &= s->cluster_offset_mask;
+ inc_refcounts(bs, res, refcount_table, refcount_table_size,
+ l2_entry & ~511, nb_csectors * 512);
+
+ if (flags & CHECK_FRAG_INFO) {
+ res->bfi.allocated_clusters++;
+ res->bfi.compressed_clusters++;
+
+ /* Compressed clusters are fragmented by nature. Since they
+ * take up sub-sector space but we only have sector granularity
+ * I/O we need to re-read the same sectors even for adjacent
+ * compressed clusters.
+ */
+ res->bfi.fragmented_clusters++;
+ }
+ break;
+
+ case QCOW2_CLUSTER_ZERO:
+ if ((l2_entry & L2E_OFFSET_MASK) == 0) {
+ break;
+ }
+ /* fall through */
+
+ case QCOW2_CLUSTER_NORMAL:
+ {
+ /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
+ uint64_t offset = l2_entry & L2E_OFFSET_MASK;
+
+ if (flags & CHECK_OFLAG_COPIED) {
+ refcount = get_refcount(bs, offset >> s->cluster_bits);
+ if (refcount < 0) {
+ fprintf(stderr, "Can't get refcount for offset %"
+ PRIx64 ": %s\n", l2_entry, strerror(-refcount));
+ goto fail;
+ }
+ if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
+ fprintf(stderr, "ERROR OFLAG_COPIED: offset=%"
+ PRIx64 " refcount=%d\n", l2_entry, refcount);
+ res->corruptions++;
+ }
+ }
+
+ if (flags & CHECK_FRAG_INFO) {
+ res->bfi.allocated_clusters++;
+ if (next_contiguous_offset &&
+ offset != next_contiguous_offset) {
+ res->bfi.fragmented_clusters++;
+ }
+ next_contiguous_offset = offset + s->cluster_size;
+ }
+
+ /* Mark cluster as used */
+ inc_refcounts(bs, res, refcount_table,refcount_table_size,
+ offset, s->cluster_size);
+
+ /* Correct offsets are cluster aligned */
+ if (offset & (s->cluster_size - 1)) {
+ fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
+ "properly aligned; L2 entry corrupted.\n", offset);
+ res->corruptions++;
+ }
+ break;
+ }
+
+ case QCOW2_CLUSTER_UNALLOCATED:
+ break;
+
+ default:
+ abort();
+ }
+ }
+
+ g_free(l2_table);
+ return 0;
+
+fail:
+ fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
+ g_free(l2_table);
+ return -EIO;
+}
+
+/*
+ * Increases the refcount for the L1 table, its L2 tables and all referenced
+ * clusters in the given refcount table. While doing so, performs some checks
+ * on L1 and L2 entries.
+ *
+ * Returns the number of errors found by the checks or -errno if an internal
+ * error occurred.
+ */
+static int check_refcounts_l1(BlockDriverState *bs,
+ BdrvCheckResult *res,
+ uint16_t *refcount_table,
+ int refcount_table_size,
+ int64_t l1_table_offset, int l1_size,
+ int flags)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t *l1_table, l2_offset, l1_size2;
+ int i, refcount, ret;
+
+ l1_size2 = l1_size * sizeof(uint64_t);
+
+ /* Mark L1 table as used */
+ inc_refcounts(bs, res, refcount_table, refcount_table_size,
+ l1_table_offset, l1_size2);
+
+ /* Read L1 table entries from disk */
+ if (l1_size2 == 0) {
+ l1_table = NULL;
+ } else {
+ l1_table = g_malloc(l1_size2);
+ if (bdrv_pread(bs->file, l1_table_offset,
+ l1_table, l1_size2) != l1_size2)
+ goto fail;
+ for(i = 0;i < l1_size; i++)
+ be64_to_cpus(&l1_table[i]);
+ }
+
+ /* Do the actual checks */
+ for(i = 0; i < l1_size; i++) {
+ l2_offset = l1_table[i];
+ if (l2_offset) {
+ /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
+ if (flags & CHECK_OFLAG_COPIED) {
+ refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED)
+ >> s->cluster_bits);
+ if (refcount < 0) {
+ fprintf(stderr, "Can't get refcount for l2_offset %"
+ PRIx64 ": %s\n", l2_offset, strerror(-refcount));
+ goto fail;
+ }
+ if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) {
+ fprintf(stderr, "ERROR OFLAG_COPIED: l2_offset=%" PRIx64
+ " refcount=%d\n", l2_offset, refcount);
+ res->corruptions++;
+ }
+ }
+
+ /* Mark L2 table as used */
+ l2_offset &= L1E_OFFSET_MASK;
+ inc_refcounts(bs, res, refcount_table, refcount_table_size,
+ l2_offset, s->cluster_size);
+
+ /* L2 tables are cluster aligned */
+ if (l2_offset & (s->cluster_size - 1)) {
+ fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
+ "cluster aligned; L1 entry corrupted\n", l2_offset);
+ res->corruptions++;
+ }
+
+ /* Process and check L2 entries */
+ ret = check_refcounts_l2(bs, res, refcount_table,
+ refcount_table_size, l2_offset, flags);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+ }
+ g_free(l1_table);
+ return 0;
+
+fail:
+ fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
+ res->check_errors++;
+ g_free(l1_table);
+ return -EIO;
+}
+
+/*
+ * Checks an image for refcount consistency.
+ *
+ * Returns 0 if no errors are found, the number of errors in case the image is
+ * detected as corrupted, and -errno when an internal error occurred.
+ */
+int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
+ BdrvCheckMode fix)
+{
+ BDRVQcowState *s = bs->opaque;
+ int64_t size, i, highest_cluster;
+ int nb_clusters, refcount1, refcount2;
+ QCowSnapshot *sn;
+ uint16_t *refcount_table;
+ int ret;
+
+ size = bdrv_getlength(bs->file);
+ nb_clusters = size_to_clusters(s, size);
+ refcount_table = g_malloc0(nb_clusters * sizeof(uint16_t));
+
+ res->bfi.total_clusters =
+ size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
+
+ /* header */
+ inc_refcounts(bs, res, refcount_table, nb_clusters,
+ 0, s->cluster_size);
+
+ /* current L1 table */
+ ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
+ s->l1_table_offset, s->l1_size,
+ CHECK_OFLAG_COPIED | CHECK_FRAG_INFO);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* snapshots */
+ for(i = 0; i < s->nb_snapshots; i++) {
+ sn = s->snapshots + i;
+ ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
+ sn->l1_table_offset, sn->l1_size, 0);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+ inc_refcounts(bs, res, refcount_table, nb_clusters,
+ s->snapshots_offset, s->snapshots_size);
+
+ /* refcount data */
+ inc_refcounts(bs, res, refcount_table, nb_clusters,
+ s->refcount_table_offset,
+ s->refcount_table_size * sizeof(uint64_t));
+
+ for(i = 0; i < s->refcount_table_size; i++) {
+ uint64_t offset, cluster;
+ offset = s->refcount_table[i];
+ cluster = offset >> s->cluster_bits;
+
+ /* Refcount blocks are cluster aligned */
+ if (offset & (s->cluster_size - 1)) {
+ fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
+ "cluster aligned; refcount table entry corrupted\n", i);
+ res->corruptions++;
+ continue;
+ }
+
+ if (cluster >= nb_clusters) {
+ fprintf(stderr, "ERROR refcount block %" PRId64
+ " is outside image\n", i);
+ res->corruptions++;
+ continue;
+ }
+
+ if (offset != 0) {
+ inc_refcounts(bs, res, refcount_table, nb_clusters,
+ offset, s->cluster_size);
+ if (refcount_table[cluster] != 1) {
+ fprintf(stderr, "ERROR refcount block %" PRId64
+ " refcount=%d\n",
+ i, refcount_table[cluster]);
+ res->corruptions++;
+ }
+ }
+ }
+
+ /* compare ref counts */
+ for (i = 0, highest_cluster = 0; i < nb_clusters; i++) {
+ refcount1 = get_refcount(bs, i);
+ if (refcount1 < 0) {
+ fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
+ i, strerror(-refcount1));
+ res->check_errors++;
+ continue;
+ }
+
+ refcount2 = refcount_table[i];
+
+ if (refcount1 > 0 || refcount2 > 0) {
+ highest_cluster = i;
+ }
+
+ if (refcount1 != refcount2) {
+
+ /* Check if we're allowed to fix the mismatch */
+ int *num_fixed = NULL;
+ if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
+ num_fixed = &res->leaks_fixed;
+ } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
+ num_fixed = &res->corruptions_fixed;
+ }
+
+ fprintf(stderr, "%s cluster %" PRId64 " refcount=%d reference=%d\n",
+ num_fixed != NULL ? "Repairing" :
+ refcount1 < refcount2 ? "ERROR" :
+ "Leaked",
+ i, refcount1, refcount2);
+
+ if (num_fixed) {
+ ret = update_refcount(bs, i << s->cluster_bits, 1,
+ refcount2 - refcount1,
+ QCOW2_DISCARD_ALWAYS);
+ if (ret >= 0) {
+ (*num_fixed)++;
+ continue;
+ }
+ }
+
+ /* And if we couldn't, print an error */
+ if (refcount1 < refcount2) {
+ res->corruptions++;
+ } else {
+ res->leaks++;
+ }
+ }
+ }
+
+ res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
+ ret = 0;
+
+fail:
+ g_free(refcount_table);
+
+ return ret;
+}
+
diff --git a/contrib/qemu/block/qcow2-snapshot.c b/contrib/qemu/block/qcow2-snapshot.c
new file mode 100644
index 000000000..0caac9055
--- /dev/null
+++ b/contrib/qemu/block/qcow2-snapshot.c
@@ -0,0 +1,660 @@
+/*
+ * Block driver for the QCOW version 2 format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu-common.h"
+#include "block/block_int.h"
+#include "block/qcow2.h"
+
+typedef struct QEMU_PACKED QCowSnapshotHeader {
+ /* header is 8 byte aligned */
+ uint64_t l1_table_offset;
+
+ uint32_t l1_size;
+ uint16_t id_str_size;
+ uint16_t name_size;
+
+ uint32_t date_sec;
+ uint32_t date_nsec;
+
+ uint64_t vm_clock_nsec;
+
+ uint32_t vm_state_size;
+ uint32_t extra_data_size; /* for extension */
+ /* extra data follows */
+ /* id_str follows */
+ /* name follows */
+} QCowSnapshotHeader;
+
+typedef struct QEMU_PACKED QCowSnapshotExtraData {
+ uint64_t vm_state_size_large;
+ uint64_t disk_size;
+} QCowSnapshotExtraData;
+
+void qcow2_free_snapshots(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ int i;
+
+ for(i = 0; i < s->nb_snapshots; i++) {
+ g_free(s->snapshots[i].name);
+ g_free(s->snapshots[i].id_str);
+ }
+ g_free(s->snapshots);
+ s->snapshots = NULL;
+ s->nb_snapshots = 0;
+}
+
+int qcow2_read_snapshots(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowSnapshotHeader h;
+ QCowSnapshotExtraData extra;
+ QCowSnapshot *sn;
+ int i, id_str_size, name_size;
+ int64_t offset;
+ uint32_t extra_data_size;
+ int ret;
+
+ if (!s->nb_snapshots) {
+ s->snapshots = NULL;
+ s->snapshots_size = 0;
+ return 0;
+ }
+
+ offset = s->snapshots_offset;
+ s->snapshots = g_malloc0(s->nb_snapshots * sizeof(QCowSnapshot));
+
+ for(i = 0; i < s->nb_snapshots; i++) {
+ /* Read statically sized part of the snapshot header */
+ offset = align_offset(offset, 8);
+ ret = bdrv_pread(bs->file, offset, &h, sizeof(h));
+ if (ret < 0) {
+ goto fail;
+ }
+
+ offset += sizeof(h);
+ sn = s->snapshots + i;
+ sn->l1_table_offset = be64_to_cpu(h.l1_table_offset);
+ sn->l1_size = be32_to_cpu(h.l1_size);
+ sn->vm_state_size = be32_to_cpu(h.vm_state_size);
+ sn->date_sec = be32_to_cpu(h.date_sec);
+ sn->date_nsec = be32_to_cpu(h.date_nsec);
+ sn->vm_clock_nsec = be64_to_cpu(h.vm_clock_nsec);
+ extra_data_size = be32_to_cpu(h.extra_data_size);
+
+ id_str_size = be16_to_cpu(h.id_str_size);
+ name_size = be16_to_cpu(h.name_size);
+
+ /* Read extra data */
+ ret = bdrv_pread(bs->file, offset, &extra,
+ MIN(sizeof(extra), extra_data_size));
+ if (ret < 0) {
+ goto fail;
+ }
+ offset += extra_data_size;
+
+ if (extra_data_size >= 8) {
+ sn->vm_state_size = be64_to_cpu(extra.vm_state_size_large);
+ }
+
+ if (extra_data_size >= 16) {
+ sn->disk_size = be64_to_cpu(extra.disk_size);
+ } else {
+ sn->disk_size = bs->total_sectors * BDRV_SECTOR_SIZE;
+ }
+
+ /* Read snapshot ID */
+ sn->id_str = g_malloc(id_str_size + 1);
+ ret = bdrv_pread(bs->file, offset, sn->id_str, id_str_size);
+ if (ret < 0) {
+ goto fail;
+ }
+ offset += id_str_size;
+ sn->id_str[id_str_size] = '\0';
+
+ /* Read snapshot name */
+ sn->name = g_malloc(name_size + 1);
+ ret = bdrv_pread(bs->file, offset, sn->name, name_size);
+ if (ret < 0) {
+ goto fail;
+ }
+ offset += name_size;
+ sn->name[name_size] = '\0';
+ }
+
+ s->snapshots_size = offset - s->snapshots_offset;
+ return 0;
+
+fail:
+ qcow2_free_snapshots(bs);
+ return ret;
+}
+
+/* add at the end of the file a new list of snapshots */
+static int qcow2_write_snapshots(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowSnapshot *sn;
+ QCowSnapshotHeader h;
+ QCowSnapshotExtraData extra;
+ int i, name_size, id_str_size, snapshots_size;
+ struct {
+ uint32_t nb_snapshots;
+ uint64_t snapshots_offset;
+ } QEMU_PACKED header_data;
+ int64_t offset, snapshots_offset;
+ int ret;
+
+ /* compute the size of the snapshots */
+ offset = 0;
+ for(i = 0; i < s->nb_snapshots; i++) {
+ sn = s->snapshots + i;
+ offset = align_offset(offset, 8);
+ offset += sizeof(h);
+ offset += sizeof(extra);
+ offset += strlen(sn->id_str);
+ offset += strlen(sn->name);
+ }
+ snapshots_size = offset;
+
+ /* Allocate space for the new snapshot list */
+ snapshots_offset = qcow2_alloc_clusters(bs, snapshots_size);
+ offset = snapshots_offset;
+ if (offset < 0) {
+ return offset;
+ }
+ ret = bdrv_flush(bs);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Write all snapshots to the new list */
+ for(i = 0; i < s->nb_snapshots; i++) {
+ sn = s->snapshots + i;
+ memset(&h, 0, sizeof(h));
+ h.l1_table_offset = cpu_to_be64(sn->l1_table_offset);
+ h.l1_size = cpu_to_be32(sn->l1_size);
+ /* If it doesn't fit in 32 bit, older implementations should treat it
+ * as a disk-only snapshot rather than truncate the VM state */
+ if (sn->vm_state_size <= 0xffffffff) {
+ h.vm_state_size = cpu_to_be32(sn->vm_state_size);
+ }
+ h.date_sec = cpu_to_be32(sn->date_sec);
+ h.date_nsec = cpu_to_be32(sn->date_nsec);
+ h.vm_clock_nsec = cpu_to_be64(sn->vm_clock_nsec);
+ h.extra_data_size = cpu_to_be32(sizeof(extra));
+
+ memset(&extra, 0, sizeof(extra));
+ extra.vm_state_size_large = cpu_to_be64(sn->vm_state_size);
+ extra.disk_size = cpu_to_be64(sn->disk_size);
+
+ id_str_size = strlen(sn->id_str);
+ name_size = strlen(sn->name);
+ h.id_str_size = cpu_to_be16(id_str_size);
+ h.name_size = cpu_to_be16(name_size);
+ offset = align_offset(offset, 8);
+
+ ret = bdrv_pwrite(bs->file, offset, &h, sizeof(h));
+ if (ret < 0) {
+ goto fail;
+ }
+ offset += sizeof(h);
+
+ ret = bdrv_pwrite(bs->file, offset, &extra, sizeof(extra));
+ if (ret < 0) {
+ goto fail;
+ }
+ offset += sizeof(extra);
+
+ ret = bdrv_pwrite(bs->file, offset, sn->id_str, id_str_size);
+ if (ret < 0) {
+ goto fail;
+ }
+ offset += id_str_size;
+
+ ret = bdrv_pwrite(bs->file, offset, sn->name, name_size);
+ if (ret < 0) {
+ goto fail;
+ }
+ offset += name_size;
+ }
+
+ /*
+ * Update the header to point to the new snapshot table. This requires the
+ * new table and its refcounts to be stable on disk.
+ */
+ ret = bdrv_flush(bs);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ QEMU_BUILD_BUG_ON(offsetof(QCowHeader, snapshots_offset) !=
+ offsetof(QCowHeader, nb_snapshots) + sizeof(header_data.nb_snapshots));
+
+ header_data.nb_snapshots = cpu_to_be32(s->nb_snapshots);
+ header_data.snapshots_offset = cpu_to_be64(snapshots_offset);
+
+ ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, nb_snapshots),
+ &header_data, sizeof(header_data));
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* free the old snapshot table */
+ qcow2_free_clusters(bs, s->snapshots_offset, s->snapshots_size,
+ QCOW2_DISCARD_SNAPSHOT);
+ s->snapshots_offset = snapshots_offset;
+ s->snapshots_size = snapshots_size;
+ return 0;
+
+fail:
+ return ret;
+}
+
+static void find_new_snapshot_id(BlockDriverState *bs,
+ char *id_str, int id_str_size)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowSnapshot *sn;
+ int i, id, id_max = 0;
+
+ for(i = 0; i < s->nb_snapshots; i++) {
+ sn = s->snapshots + i;
+ id = strtoul(sn->id_str, NULL, 10);
+ if (id > id_max)
+ id_max = id;
+ }
+ snprintf(id_str, id_str_size, "%d", id_max + 1);
+}
+
+static int find_snapshot_by_id(BlockDriverState *bs, const char *id_str)
+{
+ BDRVQcowState *s = bs->opaque;
+ int i;
+
+ for(i = 0; i < s->nb_snapshots; i++) {
+ if (!strcmp(s->snapshots[i].id_str, id_str))
+ return i;
+ }
+ return -1;
+}
+
+static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name)
+{
+ BDRVQcowState *s = bs->opaque;
+ int i, ret;
+
+ ret = find_snapshot_by_id(bs, name);
+ if (ret >= 0)
+ return ret;
+ for(i = 0; i < s->nb_snapshots; i++) {
+ if (!strcmp(s->snapshots[i].name, name))
+ return i;
+ }
+ return -1;
+}
+
+/* if no id is provided, a new one is constructed */
+int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowSnapshot *new_snapshot_list = NULL;
+ QCowSnapshot *old_snapshot_list = NULL;
+ QCowSnapshot sn1, *sn = &sn1;
+ int i, ret;
+ uint64_t *l1_table = NULL;
+ int64_t l1_table_offset;
+
+ memset(sn, 0, sizeof(*sn));
+
+ /* Generate an ID if it wasn't passed */
+ if (sn_info->id_str[0] == '\0') {
+ find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str));
+ }
+
+ /* Check that the ID is unique */
+ if (find_snapshot_by_id(bs, sn_info->id_str) >= 0) {
+ return -EEXIST;
+ }
+
+ /* Populate sn with passed data */
+ sn->id_str = g_strdup(sn_info->id_str);
+ sn->name = g_strdup(sn_info->name);
+
+ sn->disk_size = bs->total_sectors * BDRV_SECTOR_SIZE;
+ sn->vm_state_size = sn_info->vm_state_size;
+ sn->date_sec = sn_info->date_sec;
+ sn->date_nsec = sn_info->date_nsec;
+ sn->vm_clock_nsec = sn_info->vm_clock_nsec;
+
+ /* Allocate the L1 table of the snapshot and copy the current one there. */
+ l1_table_offset = qcow2_alloc_clusters(bs, s->l1_size * sizeof(uint64_t));
+ if (l1_table_offset < 0) {
+ ret = l1_table_offset;
+ goto fail;
+ }
+
+ sn->l1_table_offset = l1_table_offset;
+ sn->l1_size = s->l1_size;
+
+ l1_table = g_malloc(s->l1_size * sizeof(uint64_t));
+ for(i = 0; i < s->l1_size; i++) {
+ l1_table[i] = cpu_to_be64(s->l1_table[i]);
+ }
+
+ ret = bdrv_pwrite(bs->file, sn->l1_table_offset, l1_table,
+ s->l1_size * sizeof(uint64_t));
+ if (ret < 0) {
+ goto fail;
+ }
+
+ g_free(l1_table);
+ l1_table = NULL;
+
+ /*
+ * Increase the refcounts of all clusters and make sure everything is
+ * stable on disk before updating the snapshot table to contain a pointer
+ * to the new L1 table.
+ */
+ ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* Append the new snapshot to the snapshot list */
+ new_snapshot_list = g_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot));
+ if (s->snapshots) {
+ memcpy(new_snapshot_list, s->snapshots,
+ s->nb_snapshots * sizeof(QCowSnapshot));
+ old_snapshot_list = s->snapshots;
+ }
+ s->snapshots = new_snapshot_list;
+ s->snapshots[s->nb_snapshots++] = *sn;
+
+ ret = qcow2_write_snapshots(bs);
+ if (ret < 0) {
+ g_free(s->snapshots);
+ s->snapshots = old_snapshot_list;
+ goto fail;
+ }
+
+ g_free(old_snapshot_list);
+
+#ifdef DEBUG_ALLOC
+ {
+ BdrvCheckResult result = {0};
+ qcow2_check_refcounts(bs, &result, 0);
+ }
+#endif
+ return 0;
+
+fail:
+ g_free(sn->id_str);
+ g_free(sn->name);
+ g_free(l1_table);
+
+ return ret;
+}
+
+/* copy the snapshot 'snapshot_name' into the current disk image */
+int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowSnapshot *sn;
+ int i, snapshot_index;
+ int cur_l1_bytes, sn_l1_bytes;
+ int ret;
+ uint64_t *sn_l1_table = NULL;
+
+ /* Search the snapshot */
+ snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
+ if (snapshot_index < 0) {
+ return -ENOENT;
+ }
+ sn = &s->snapshots[snapshot_index];
+
+ if (sn->disk_size != bs->total_sectors * BDRV_SECTOR_SIZE) {
+ error_report("qcow2: Loading snapshots with different disk "
+ "size is not implemented");
+ ret = -ENOTSUP;
+ goto fail;
+ }
+
+ /*
+ * Make sure that the current L1 table is big enough to contain the whole
+ * L1 table of the snapshot. If the snapshot L1 table is smaller, the
+ * current one must be padded with zeros.
+ */
+ ret = qcow2_grow_l1_table(bs, sn->l1_size, true);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ cur_l1_bytes = s->l1_size * sizeof(uint64_t);
+ sn_l1_bytes = sn->l1_size * sizeof(uint64_t);
+
+ /*
+ * Copy the snapshot L1 table to the current L1 table.
+ *
+ * Before overwriting the old current L1 table on disk, make sure to
+ * increase all refcounts for the clusters referenced by the new one.
+ * Decrease the refcount referenced by the old one only when the L1
+ * table is overwritten.
+ */
+ sn_l1_table = g_malloc0(cur_l1_bytes);
+
+ ret = bdrv_pread(bs->file, sn->l1_table_offset, sn_l1_table, sn_l1_bytes);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ ret = qcow2_update_snapshot_refcount(bs, sn->l1_table_offset,
+ sn->l1_size, 1);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset, sn_l1_table,
+ cur_l1_bytes);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /*
+ * Decrease refcount of clusters of current L1 table.
+ *
+ * At this point, the in-memory s->l1_table points to the old L1 table,
+ * whereas on disk we already have the new one.
+ *
+ * qcow2_update_snapshot_refcount special cases the current L1 table to use
+ * the in-memory data instead of really using the offset to load a new one,
+ * which is why this works.
+ */
+ ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset,
+ s->l1_size, -1);
+
+ /*
+ * Now update the in-memory L1 table to be in sync with the on-disk one. We
+ * need to do this even if updating refcounts failed.
+ */
+ for(i = 0;i < s->l1_size; i++) {
+ s->l1_table[i] = be64_to_cpu(sn_l1_table[i]);
+ }
+
+ if (ret < 0) {
+ goto fail;
+ }
+
+ g_free(sn_l1_table);
+ sn_l1_table = NULL;
+
+ /*
+ * Update QCOW_OFLAG_COPIED in the active L1 table (it may have changed
+ * when we decreased the refcount of the old snapshot.
+ */
+ ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0);
+ if (ret < 0) {
+ goto fail;
+ }
+
+#ifdef DEBUG_ALLOC
+ {
+ BdrvCheckResult result = {0};
+ qcow2_check_refcounts(bs, &result, 0);
+ }
+#endif
+ return 0;
+
+fail:
+ g_free(sn_l1_table);
+ return ret;
+}
+
+int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowSnapshot sn;
+ int snapshot_index, ret;
+
+ /* Search the snapshot */
+ snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
+ if (snapshot_index < 0) {
+ return -ENOENT;
+ }
+ sn = s->snapshots[snapshot_index];
+
+ /* Remove it from the snapshot list */
+ memmove(s->snapshots + snapshot_index,
+ s->snapshots + snapshot_index + 1,
+ (s->nb_snapshots - snapshot_index - 1) * sizeof(sn));
+ s->nb_snapshots--;
+ ret = qcow2_write_snapshots(bs);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /*
+ * The snapshot is now unused, clean up. If we fail after this point, we
+ * won't recover but just leak clusters.
+ */
+ g_free(sn.id_str);
+ g_free(sn.name);
+
+ /*
+ * Now decrease the refcounts of clusters referenced by the snapshot and
+ * free the L1 table.
+ */
+ ret = qcow2_update_snapshot_refcount(bs, sn.l1_table_offset,
+ sn.l1_size, -1);
+ if (ret < 0) {
+ return ret;
+ }
+ qcow2_free_clusters(bs, sn.l1_table_offset, sn.l1_size * sizeof(uint64_t),
+ QCOW2_DISCARD_SNAPSHOT);
+
+ /* must update the copied flag on the current cluster offsets */
+ ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0);
+ if (ret < 0) {
+ return ret;
+ }
+
+#ifdef DEBUG_ALLOC
+ {
+ BdrvCheckResult result = {0};
+ qcow2_check_refcounts(bs, &result, 0);
+ }
+#endif
+ return 0;
+}
+
+int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
+{
+ BDRVQcowState *s = bs->opaque;
+ QEMUSnapshotInfo *sn_tab, *sn_info;
+ QCowSnapshot *sn;
+ int i;
+
+ if (!s->nb_snapshots) {
+ *psn_tab = NULL;
+ return s->nb_snapshots;
+ }
+
+ sn_tab = g_malloc0(s->nb_snapshots * sizeof(QEMUSnapshotInfo));
+ for(i = 0; i < s->nb_snapshots; i++) {
+ sn_info = sn_tab + i;
+ sn = s->snapshots + i;
+ pstrcpy(sn_info->id_str, sizeof(sn_info->id_str),
+ sn->id_str);
+ pstrcpy(sn_info->name, sizeof(sn_info->name),
+ sn->name);
+ sn_info->vm_state_size = sn->vm_state_size;
+ sn_info->date_sec = sn->date_sec;
+ sn_info->date_nsec = sn->date_nsec;
+ sn_info->vm_clock_nsec = sn->vm_clock_nsec;
+ }
+ *psn_tab = sn_tab;
+ return s->nb_snapshots;
+}
+
+int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name)
+{
+ int i, snapshot_index;
+ BDRVQcowState *s = bs->opaque;
+ QCowSnapshot *sn;
+ uint64_t *new_l1_table;
+ int new_l1_bytes;
+ int ret;
+
+ assert(bs->read_only);
+
+ /* Search the snapshot */
+ snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_name);
+ if (snapshot_index < 0) {
+ return -ENOENT;
+ }
+ sn = &s->snapshots[snapshot_index];
+
+ /* Allocate and read in the snapshot's L1 table */
+ new_l1_bytes = s->l1_size * sizeof(uint64_t);
+ new_l1_table = g_malloc0(align_offset(new_l1_bytes, 512));
+
+ ret = bdrv_pread(bs->file, sn->l1_table_offset, new_l1_table, new_l1_bytes);
+ if (ret < 0) {
+ g_free(new_l1_table);
+ return ret;
+ }
+
+ /* Switch the L1 table */
+ g_free(s->l1_table);
+
+ s->l1_size = sn->l1_size;
+ s->l1_table_offset = sn->l1_table_offset;
+ s->l1_table = new_l1_table;
+
+ for(i = 0;i < s->l1_size; i++) {
+ be64_to_cpus(&s->l1_table[i]);
+ }
+
+ return 0;
+}
diff --git a/contrib/qemu/block/qcow2.c b/contrib/qemu/block/qcow2.c
new file mode 100644
index 000000000..0eceefe2c
--- /dev/null
+++ b/contrib/qemu/block/qcow2.c
@@ -0,0 +1,1825 @@
+/*
+ * Block driver for the QCOW version 2 format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include <zlib.h>
+#include "qemu/aes.h"
+#include "block/qcow2.h"
+#include "qemu/error-report.h"
+#include "qapi/qmp/qerror.h"
+#include "qapi/qmp/qbool.h"
+#include "trace.h"
+
+/*
+ Differences with QCOW:
+
+ - Support for multiple incremental snapshots.
+ - Memory management by reference counts.
+ - Clusters which have a reference count of one have the bit
+ QCOW_OFLAG_COPIED to optimize write performance.
+ - Size of compressed clusters is stored in sectors to reduce bit usage
+ in the cluster offsets.
+ - Support for storing additional data (such as the VM state) in the
+ snapshots.
+ - If a backing store is used, the cluster size is not constrained
+ (could be backported to QCOW).
+ - L2 tables have always a size of one cluster.
+*/
+
+
+typedef struct {
+ uint32_t magic;
+ uint32_t len;
+} QCowExtension;
+
+#define QCOW2_EXT_MAGIC_END 0
+#define QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA
+#define QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857
+
+static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+ const QCowHeader *cow_header = (const void *)buf;
+
+ if (buf_size >= sizeof(QCowHeader) &&
+ be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
+ be32_to_cpu(cow_header->version) >= 2)
+ return 100;
+ else
+ return 0;
+}
+
+
+/*
+ * read qcow2 extension and fill bs
+ * start reading from start_offset
+ * finish reading upon magic of value 0 or when end_offset reached
+ * unknown magic is skipped (future extension this version knows nothing about)
+ * return 0 upon success, non-0 otherwise
+ */
+static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
+ uint64_t end_offset, void **p_feature_table)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowExtension ext;
+ uint64_t offset;
+ int ret;
+
+#ifdef DEBUG_EXT
+ printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset);
+#endif
+ offset = start_offset;
+ while (offset < end_offset) {
+
+#ifdef DEBUG_EXT
+ /* Sanity check */
+ if (offset > s->cluster_size)
+ printf("qcow2_read_extension: suspicious offset %lu\n", offset);
+
+ printf("attempting to read extended header in offset %lu\n", offset);
+#endif
+
+ if (bdrv_pread(bs->file, offset, &ext, sizeof(ext)) != sizeof(ext)) {
+ fprintf(stderr, "qcow2_read_extension: ERROR: "
+ "pread fail from offset %" PRIu64 "\n",
+ offset);
+ return 1;
+ }
+ be32_to_cpus(&ext.magic);
+ be32_to_cpus(&ext.len);
+ offset += sizeof(ext);
+#ifdef DEBUG_EXT
+ printf("ext.magic = 0x%x\n", ext.magic);
+#endif
+ if (ext.len > end_offset - offset) {
+ error_report("Header extension too large");
+ return -EINVAL;
+ }
+
+ switch (ext.magic) {
+ case QCOW2_EXT_MAGIC_END:
+ return 0;
+
+ case QCOW2_EXT_MAGIC_BACKING_FORMAT:
+ if (ext.len >= sizeof(bs->backing_format)) {
+ fprintf(stderr, "ERROR: ext_backing_format: len=%u too large"
+ " (>=%zu)\n",
+ ext.len, sizeof(bs->backing_format));
+ return 2;
+ }
+ if (bdrv_pread(bs->file, offset , bs->backing_format,
+ ext.len) != ext.len)
+ return 3;
+ bs->backing_format[ext.len] = '\0';
+#ifdef DEBUG_EXT
+ printf("Qcow2: Got format extension %s\n", bs->backing_format);
+#endif
+ break;
+
+ case QCOW2_EXT_MAGIC_FEATURE_TABLE:
+ if (p_feature_table != NULL) {
+ void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature));
+ ret = bdrv_pread(bs->file, offset , feature_table, ext.len);
+ if (ret < 0) {
+ return ret;
+ }
+
+ *p_feature_table = feature_table;
+ }
+ break;
+
+ default:
+ /* unknown magic - save it in case we need to rewrite the header */
+ {
+ Qcow2UnknownHeaderExtension *uext;
+
+ uext = g_malloc0(sizeof(*uext) + ext.len);
+ uext->magic = ext.magic;
+ uext->len = ext.len;
+ QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next);
+
+ ret = bdrv_pread(bs->file, offset , uext->data, uext->len);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+ break;
+ }
+
+ offset += ((ext.len + 7) & ~7);
+ }
+
+ return 0;
+}
+
+static void cleanup_unknown_header_ext(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ Qcow2UnknownHeaderExtension *uext, *next;
+
+ QLIST_FOREACH_SAFE(uext, &s->unknown_header_ext, next, next) {
+ QLIST_REMOVE(uext, next);
+ g_free(uext);
+ }
+}
+
+static void GCC_FMT_ATTR(2, 3) report_unsupported(BlockDriverState *bs,
+ const char *fmt, ...)
+{
+ char msg[64];
+ va_list ap;
+
+ va_start(ap, fmt);
+ vsnprintf(msg, sizeof(msg), fmt, ap);
+ va_end(ap);
+
+ qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
+ bs->device_name, "qcow2", msg);
+}
+
+static void report_unsupported_feature(BlockDriverState *bs,
+ Qcow2Feature *table, uint64_t mask)
+{
+ while (table && table->name[0] != '\0') {
+ if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE) {
+ if (mask & (1 << table->bit)) {
+ report_unsupported(bs, "%.46s",table->name);
+ mask &= ~(1 << table->bit);
+ }
+ }
+ table++;
+ }
+
+ if (mask) {
+ report_unsupported(bs, "Unknown incompatible feature: %" PRIx64, mask);
+ }
+}
+
+/*
+ * Sets the dirty bit and flushes afterwards if necessary.
+ *
+ * The incompatible_features bit is only set if the image file header was
+ * updated successfully. Therefore it is not required to check the return
+ * value of this function.
+ */
+int qcow2_mark_dirty(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t val;
+ int ret;
+
+ assert(s->qcow_version >= 3);
+
+ if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
+ return 0; /* already dirty */
+ }
+
+ val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY);
+ ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features),
+ &val, sizeof(val));
+ if (ret < 0) {
+ return ret;
+ }
+ ret = bdrv_flush(bs->file);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Only treat image as dirty if the header was updated successfully */
+ s->incompatible_features |= QCOW2_INCOMPAT_DIRTY;
+ return 0;
+}
+
+/*
+ * Clears the dirty bit and flushes before if necessary. Only call this
+ * function when there are no pending requests, it does not guard against
+ * concurrent requests dirtying the image.
+ */
+static int qcow2_mark_clean(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+
+ if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
+ int ret = bdrv_flush(bs);
+ if (ret < 0) {
+ return ret;
+ }
+
+ s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY;
+ return qcow2_update_header(bs);
+ }
+ return 0;
+}
+
+static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result,
+ BdrvCheckMode fix)
+{
+ int ret = qcow2_check_refcounts(bs, result, fix);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (fix && result->check_errors == 0 && result->corruptions == 0) {
+ return qcow2_mark_clean(bs);
+ }
+ return ret;
+}
+
+static QemuOptsList qcow2_runtime_opts = {
+ .name = "qcow2",
+ .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head),
+ .desc = {
+ {
+ .name = "lazy_refcounts",
+ .type = QEMU_OPT_BOOL,
+ .help = "Postpone refcount updates",
+ },
+ {
+ .name = QCOW2_OPT_DISCARD_REQUEST,
+ .type = QEMU_OPT_BOOL,
+ .help = "Pass guest discard requests to the layer below",
+ },
+ {
+ .name = QCOW2_OPT_DISCARD_SNAPSHOT,
+ .type = QEMU_OPT_BOOL,
+ .help = "Generate discard requests when snapshot related space "
+ "is freed",
+ },
+ {
+ .name = QCOW2_OPT_DISCARD_OTHER,
+ .type = QEMU_OPT_BOOL,
+ .help = "Generate discard requests when other clusters are freed",
+ },
+ { /* end of list */ }
+ },
+};
+
+static int qcow2_open(BlockDriverState *bs, QDict *options, int flags)
+{
+ BDRVQcowState *s = bs->opaque;
+ int len, i, ret = 0;
+ QCowHeader header;
+ QemuOpts *opts;
+ Error *local_err = NULL;
+ uint64_t ext_end;
+ uint64_t l1_vm_state_index;
+
+ ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
+ if (ret < 0) {
+ goto fail;
+ }
+ be32_to_cpus(&header.magic);
+ be32_to_cpus(&header.version);
+ be64_to_cpus(&header.backing_file_offset);
+ be32_to_cpus(&header.backing_file_size);
+ be64_to_cpus(&header.size);
+ be32_to_cpus(&header.cluster_bits);
+ be32_to_cpus(&header.crypt_method);
+ be64_to_cpus(&header.l1_table_offset);
+ be32_to_cpus(&header.l1_size);
+ be64_to_cpus(&header.refcount_table_offset);
+ be32_to_cpus(&header.refcount_table_clusters);
+ be64_to_cpus(&header.snapshots_offset);
+ be32_to_cpus(&header.nb_snapshots);
+
+ if (header.magic != QCOW_MAGIC) {
+ ret = -EMEDIUMTYPE;
+ goto fail;
+ }
+ if (header.version < 2 || header.version > 3) {
+ report_unsupported(bs, "QCOW version %d", header.version);
+ ret = -ENOTSUP;
+ goto fail;
+ }
+
+ s->qcow_version = header.version;
+
+ /* Initialise version 3 header fields */
+ if (header.version == 2) {
+ header.incompatible_features = 0;
+ header.compatible_features = 0;
+ header.autoclear_features = 0;
+ header.refcount_order = 4;
+ header.header_length = 72;
+ } else {
+ be64_to_cpus(&header.incompatible_features);
+ be64_to_cpus(&header.compatible_features);
+ be64_to_cpus(&header.autoclear_features);
+ be32_to_cpus(&header.refcount_order);
+ be32_to_cpus(&header.header_length);
+ }
+
+ if (header.header_length > sizeof(header)) {
+ s->unknown_header_fields_size = header.header_length - sizeof(header);
+ s->unknown_header_fields = g_malloc(s->unknown_header_fields_size);
+ ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields,
+ s->unknown_header_fields_size);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+
+ if (header.backing_file_offset) {
+ ext_end = header.backing_file_offset;
+ } else {
+ ext_end = 1 << header.cluster_bits;
+ }
+
+ /* Handle feature bits */
+ s->incompatible_features = header.incompatible_features;
+ s->compatible_features = header.compatible_features;
+ s->autoclear_features = header.autoclear_features;
+
+ if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) {
+ void *feature_table = NULL;
+ qcow2_read_extensions(bs, header.header_length, ext_end,
+ &feature_table);
+ report_unsupported_feature(bs, feature_table,
+ s->incompatible_features &
+ ~QCOW2_INCOMPAT_MASK);
+ ret = -ENOTSUP;
+ goto fail;
+ }
+
+ /* Check support for various header values */
+ if (header.refcount_order != 4) {
+ report_unsupported(bs, "%d bit reference counts",
+ 1 << header.refcount_order);
+ ret = -ENOTSUP;
+ goto fail;
+ }
+
+ if (header.cluster_bits < MIN_CLUSTER_BITS ||
+ header.cluster_bits > MAX_CLUSTER_BITS) {
+ ret = -EINVAL;
+ goto fail;
+ }
+ if (header.crypt_method > QCOW_CRYPT_AES) {
+ ret = -EINVAL;
+ goto fail;
+ }
+ s->crypt_method_header = header.crypt_method;
+ if (s->crypt_method_header) {
+ bs->encrypted = 1;
+ }
+ s->cluster_bits = header.cluster_bits;
+ s->cluster_size = 1 << s->cluster_bits;
+ s->cluster_sectors = 1 << (s->cluster_bits - 9);
+ s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
+ s->l2_size = 1 << s->l2_bits;
+ bs->total_sectors = header.size / 512;
+ s->csize_shift = (62 - (s->cluster_bits - 8));
+ s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
+ s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
+ s->refcount_table_offset = header.refcount_table_offset;
+ s->refcount_table_size =
+ header.refcount_table_clusters << (s->cluster_bits - 3);
+
+ s->snapshots_offset = header.snapshots_offset;
+ s->nb_snapshots = header.nb_snapshots;
+
+ /* read the level 1 table */
+ s->l1_size = header.l1_size;
+
+ l1_vm_state_index = size_to_l1(s, header.size);
+ if (l1_vm_state_index > INT_MAX) {
+ ret = -EFBIG;
+ goto fail;
+ }
+ s->l1_vm_state_index = l1_vm_state_index;
+
+ /* the L1 table must contain at least enough entries to put
+ header.size bytes */
+ if (s->l1_size < s->l1_vm_state_index) {
+ ret = -EINVAL;
+ goto fail;
+ }
+ s->l1_table_offset = header.l1_table_offset;
+ if (s->l1_size > 0) {
+ s->l1_table = g_malloc0(
+ align_offset(s->l1_size * sizeof(uint64_t), 512));
+ ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
+ s->l1_size * sizeof(uint64_t));
+ if (ret < 0) {
+ goto fail;
+ }
+ for(i = 0;i < s->l1_size; i++) {
+ be64_to_cpus(&s->l1_table[i]);
+ }
+ }
+
+ /* alloc L2 table/refcount block cache */
+ s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE);
+ s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE);
+
+ s->cluster_cache = g_malloc(s->cluster_size);
+ /* one more sector for decompressed data alignment */
+ s->cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
+ + 512);
+ s->cluster_cache_offset = -1;
+ s->flags = flags;
+
+ ret = qcow2_refcount_init(bs);
+ if (ret != 0) {
+ goto fail;
+ }
+
+ QLIST_INIT(&s->cluster_allocs);
+ QTAILQ_INIT(&s->discards);
+
+ /* read qcow2 extensions */
+ if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL)) {
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ /* read the backing file name */
+ if (header.backing_file_offset != 0) {
+ len = header.backing_file_size;
+ if (len > 1023) {
+ len = 1023;
+ }
+ ret = bdrv_pread(bs->file, header.backing_file_offset,
+ bs->backing_file, len);
+ if (ret < 0) {
+ goto fail;
+ }
+ bs->backing_file[len] = '\0';
+ }
+
+ ret = qcow2_read_snapshots(bs);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* Clear unknown autoclear feature bits */
+ if (!bs->read_only && s->autoclear_features != 0) {
+ s->autoclear_features = 0;
+ ret = qcow2_update_header(bs);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+
+ /* Initialise locks */
+ qemu_co_mutex_init(&s->lock);
+
+ /* Repair image if dirty */
+ if (!(flags & BDRV_O_CHECK) && !bs->read_only &&
+ (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) {
+ BdrvCheckResult result = {0};
+
+ ret = qcow2_check(bs, &result, BDRV_FIX_ERRORS);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+
+ /* Enable lazy_refcounts according to image and command line options */
+ opts = qemu_opts_create_nofail(&qcow2_runtime_opts);
+ qemu_opts_absorb_qdict(opts, options, &local_err);
+ if (error_is_set(&local_err)) {
+ qerror_report_err(local_err);
+ error_free(local_err);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ s->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS,
+ (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS));
+
+ s->discard_passthrough[QCOW2_DISCARD_NEVER] = false;
+ s->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true;
+ s->discard_passthrough[QCOW2_DISCARD_REQUEST] =
+ qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST,
+ flags & BDRV_O_UNMAP);
+ s->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] =
+ qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true);
+ s->discard_passthrough[QCOW2_DISCARD_OTHER] =
+ qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false);
+
+ qemu_opts_del(opts);
+
+ if (s->use_lazy_refcounts && s->qcow_version < 3) {
+ qerror_report(ERROR_CLASS_GENERIC_ERROR, "Lazy refcounts require "
+ "a qcow2 image with at least qemu 1.1 compatibility level");
+ ret = -EINVAL;
+ goto fail;
+ }
+
+#ifdef DEBUG_ALLOC
+ {
+ BdrvCheckResult result = {0};
+ qcow2_check_refcounts(bs, &result, 0);
+ }
+#endif
+ return ret;
+
+ fail:
+ g_free(s->unknown_header_fields);
+ cleanup_unknown_header_ext(bs);
+ qcow2_free_snapshots(bs);
+ qcow2_refcount_close(bs);
+ g_free(s->l1_table);
+ if (s->l2_table_cache) {
+ qcow2_cache_destroy(bs, s->l2_table_cache);
+ }
+ g_free(s->cluster_cache);
+ qemu_vfree(s->cluster_data);
+ return ret;
+}
+
+static int qcow2_set_key(BlockDriverState *bs, const char *key)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint8_t keybuf[16];
+ int len, i;
+
+ memset(keybuf, 0, 16);
+ len = strlen(key);
+ if (len > 16)
+ len = 16;
+ /* XXX: we could compress the chars to 7 bits to increase
+ entropy */
+ for(i = 0;i < len;i++) {
+ keybuf[i] = key[i];
+ }
+ s->crypt_method = s->crypt_method_header;
+
+ if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
+ return -1;
+ if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
+ return -1;
+#if 0
+ /* test */
+ {
+ uint8_t in[16];
+ uint8_t out[16];
+ uint8_t tmp[16];
+ for(i=0;i<16;i++)
+ in[i] = i;
+ AES_encrypt(in, tmp, &s->aes_encrypt_key);
+ AES_decrypt(tmp, out, &s->aes_decrypt_key);
+ for(i = 0; i < 16; i++)
+ printf(" %02x", tmp[i]);
+ printf("\n");
+ for(i = 0; i < 16; i++)
+ printf(" %02x", out[i]);
+ printf("\n");
+ }
+#endif
+ return 0;
+}
+
+/* We have nothing to do for QCOW2 reopen, stubs just return
+ * success */
+static int qcow2_reopen_prepare(BDRVReopenState *state,
+ BlockReopenQueue *queue, Error **errp)
+{
+ return 0;
+}
+
+static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, int *pnum)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t cluster_offset;
+ int ret;
+
+ *pnum = nb_sectors;
+ /* FIXME We can get errors here, but the bdrv_co_is_allocated interface
+ * can't pass them on today */
+ qemu_co_mutex_lock(&s->lock);
+ ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset);
+ qemu_co_mutex_unlock(&s->lock);
+ if (ret < 0) {
+ *pnum = 0;
+ }
+
+ return (cluster_offset != 0) || (ret == QCOW2_CLUSTER_ZERO);
+}
+
+/* handle reading after the end of the backing file */
+int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
+ int64_t sector_num, int nb_sectors)
+{
+ int n1;
+ if ((sector_num + nb_sectors) <= bs->total_sectors)
+ return nb_sectors;
+ if (sector_num >= bs->total_sectors)
+ n1 = 0;
+ else
+ n1 = bs->total_sectors - sector_num;
+
+ qemu_iovec_memset(qiov, 512 * n1, 0, 512 * (nb_sectors - n1));
+
+ return n1;
+}
+
+static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
+ int remaining_sectors, QEMUIOVector *qiov)
+{
+ BDRVQcowState *s = bs->opaque;
+ int index_in_cluster, n1;
+ int ret;
+ int cur_nr_sectors; /* number of sectors in current iteration */
+ uint64_t cluster_offset = 0;
+ uint64_t bytes_done = 0;
+ QEMUIOVector hd_qiov;
+ uint8_t *cluster_data = NULL;
+
+ qemu_iovec_init(&hd_qiov, qiov->niov);
+
+ qemu_co_mutex_lock(&s->lock);
+
+ while (remaining_sectors != 0) {
+
+ /* prepare next request */
+ cur_nr_sectors = remaining_sectors;
+ if (s->crypt_method) {
+ cur_nr_sectors = MIN(cur_nr_sectors,
+ QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
+ }
+
+ ret = qcow2_get_cluster_offset(bs, sector_num << 9,
+ &cur_nr_sectors, &cluster_offset);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ index_in_cluster = sector_num & (s->cluster_sectors - 1);
+
+ qemu_iovec_reset(&hd_qiov);
+ qemu_iovec_concat(&hd_qiov, qiov, bytes_done,
+ cur_nr_sectors * 512);
+
+ switch (ret) {
+ case QCOW2_CLUSTER_UNALLOCATED:
+
+ if (bs->backing_hd) {
+ /* read from the base image */
+ n1 = qcow2_backing_read1(bs->backing_hd, &hd_qiov,
+ sector_num, cur_nr_sectors);
+ if (n1 > 0) {
+ BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
+ qemu_co_mutex_unlock(&s->lock);
+ ret = bdrv_co_readv(bs->backing_hd, sector_num,
+ n1, &hd_qiov);
+ qemu_co_mutex_lock(&s->lock);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+ } else {
+ /* Note: in this case, no need to wait */
+ qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
+ }
+ break;
+
+ case QCOW2_CLUSTER_ZERO:
+ qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
+ break;
+
+ case QCOW2_CLUSTER_COMPRESSED:
+ /* add AIO support for compressed blocks ? */
+ ret = qcow2_decompress_cluster(bs, cluster_offset);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ qemu_iovec_from_buf(&hd_qiov, 0,
+ s->cluster_cache + index_in_cluster * 512,
+ 512 * cur_nr_sectors);
+ break;
+
+ case QCOW2_CLUSTER_NORMAL:
+ if ((cluster_offset & 511) != 0) {
+ ret = -EIO;
+ goto fail;
+ }
+
+ if (s->crypt_method) {
+ /*
+ * For encrypted images, read everything into a temporary
+ * contiguous buffer on which the AES functions can work.
+ */
+ if (!cluster_data) {
+ cluster_data =
+ qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
+ }
+
+ assert(cur_nr_sectors <=
+ QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
+ qemu_iovec_reset(&hd_qiov);
+ qemu_iovec_add(&hd_qiov, cluster_data,
+ 512 * cur_nr_sectors);
+ }
+
+ BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
+ qemu_co_mutex_unlock(&s->lock);
+ ret = bdrv_co_readv(bs->file,
+ (cluster_offset >> 9) + index_in_cluster,
+ cur_nr_sectors, &hd_qiov);
+ qemu_co_mutex_lock(&s->lock);
+ if (ret < 0) {
+ goto fail;
+ }
+ if (s->crypt_method) {
+ qcow2_encrypt_sectors(s, sector_num, cluster_data,
+ cluster_data, cur_nr_sectors, 0, &s->aes_decrypt_key);
+ qemu_iovec_from_buf(qiov, bytes_done,
+ cluster_data, 512 * cur_nr_sectors);
+ }
+ break;
+
+ default:
+ g_assert_not_reached();
+ ret = -EIO;
+ goto fail;
+ }
+
+ remaining_sectors -= cur_nr_sectors;
+ sector_num += cur_nr_sectors;
+ bytes_done += cur_nr_sectors * 512;
+ }
+ ret = 0;
+
+fail:
+ qemu_co_mutex_unlock(&s->lock);
+
+ qemu_iovec_destroy(&hd_qiov);
+ qemu_vfree(cluster_data);
+
+ return ret;
+}
+
+static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
+ int64_t sector_num,
+ int remaining_sectors,
+ QEMUIOVector *qiov)
+{
+ BDRVQcowState *s = bs->opaque;
+ int index_in_cluster;
+ int n_end;
+ int ret;
+ int cur_nr_sectors; /* number of sectors in current iteration */
+ uint64_t cluster_offset;
+ QEMUIOVector hd_qiov;
+ uint64_t bytes_done = 0;
+ uint8_t *cluster_data = NULL;
+ QCowL2Meta *l2meta = NULL;
+
+ trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num,
+ remaining_sectors);
+
+ qemu_iovec_init(&hd_qiov, qiov->niov);
+
+ s->cluster_cache_offset = -1; /* disable compressed cache */
+
+ qemu_co_mutex_lock(&s->lock);
+
+ while (remaining_sectors != 0) {
+
+ l2meta = NULL;
+
+ trace_qcow2_writev_start_part(qemu_coroutine_self());
+ index_in_cluster = sector_num & (s->cluster_sectors - 1);
+ n_end = index_in_cluster + remaining_sectors;
+ if (s->crypt_method &&
+ n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) {
+ n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
+ }
+
+ ret = qcow2_alloc_cluster_offset(bs, sector_num << 9,
+ index_in_cluster, n_end, &cur_nr_sectors, &cluster_offset, &l2meta);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ assert((cluster_offset & 511) == 0);
+
+ qemu_iovec_reset(&hd_qiov);
+ qemu_iovec_concat(&hd_qiov, qiov, bytes_done,
+ cur_nr_sectors * 512);
+
+ if (s->crypt_method) {
+ if (!cluster_data) {
+ cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS *
+ s->cluster_size);
+ }
+
+ assert(hd_qiov.size <=
+ QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
+ qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size);
+
+ qcow2_encrypt_sectors(s, sector_num, cluster_data,
+ cluster_data, cur_nr_sectors, 1, &s->aes_encrypt_key);
+
+ qemu_iovec_reset(&hd_qiov);
+ qemu_iovec_add(&hd_qiov, cluster_data,
+ cur_nr_sectors * 512);
+ }
+
+ qemu_co_mutex_unlock(&s->lock);
+ BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
+ trace_qcow2_writev_data(qemu_coroutine_self(),
+ (cluster_offset >> 9) + index_in_cluster);
+ ret = bdrv_co_writev(bs->file,
+ (cluster_offset >> 9) + index_in_cluster,
+ cur_nr_sectors, &hd_qiov);
+ qemu_co_mutex_lock(&s->lock);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ while (l2meta != NULL) {
+ QCowL2Meta *next;
+
+ ret = qcow2_alloc_cluster_link_l2(bs, l2meta);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* Take the request off the list of running requests */
+ if (l2meta->nb_clusters != 0) {
+ QLIST_REMOVE(l2meta, next_in_flight);
+ }
+
+ qemu_co_queue_restart_all(&l2meta->dependent_requests);
+
+ next = l2meta->next;
+ g_free(l2meta);
+ l2meta = next;
+ }
+
+ remaining_sectors -= cur_nr_sectors;
+ sector_num += cur_nr_sectors;
+ bytes_done += cur_nr_sectors * 512;
+ trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_nr_sectors);
+ }
+ ret = 0;
+
+fail:
+ qemu_co_mutex_unlock(&s->lock);
+
+ while (l2meta != NULL) {
+ QCowL2Meta *next;
+
+ if (l2meta->nb_clusters != 0) {
+ QLIST_REMOVE(l2meta, next_in_flight);
+ }
+ qemu_co_queue_restart_all(&l2meta->dependent_requests);
+
+ next = l2meta->next;
+ g_free(l2meta);
+ l2meta = next;
+ }
+
+ qemu_iovec_destroy(&hd_qiov);
+ qemu_vfree(cluster_data);
+ trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);
+
+ return ret;
+}
+
+static void qcow2_close(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ g_free(s->l1_table);
+
+ qcow2_cache_flush(bs, s->l2_table_cache);
+ qcow2_cache_flush(bs, s->refcount_block_cache);
+
+ qcow2_mark_clean(bs);
+
+ qcow2_cache_destroy(bs, s->l2_table_cache);
+ qcow2_cache_destroy(bs, s->refcount_block_cache);
+
+ g_free(s->unknown_header_fields);
+ cleanup_unknown_header_ext(bs);
+
+ g_free(s->cluster_cache);
+ qemu_vfree(s->cluster_data);
+ qcow2_refcount_close(bs);
+ qcow2_free_snapshots(bs);
+}
+
+static void qcow2_invalidate_cache(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ int flags = s->flags;
+ AES_KEY aes_encrypt_key;
+ AES_KEY aes_decrypt_key;
+ uint32_t crypt_method = 0;
+ QDict *options;
+
+ /*
+ * Backing files are read-only which makes all of their metadata immutable,
+ * that means we don't have to worry about reopening them here.
+ */
+
+ if (s->crypt_method) {
+ crypt_method = s->crypt_method;
+ memcpy(&aes_encrypt_key, &s->aes_encrypt_key, sizeof(aes_encrypt_key));
+ memcpy(&aes_decrypt_key, &s->aes_decrypt_key, sizeof(aes_decrypt_key));
+ }
+
+ qcow2_close(bs);
+
+ options = qdict_new();
+ qdict_put(options, QCOW2_OPT_LAZY_REFCOUNTS,
+ qbool_from_int(s->use_lazy_refcounts));
+
+ memset(s, 0, sizeof(BDRVQcowState));
+ qcow2_open(bs, options, flags);
+
+ QDECREF(options);
+
+ if (crypt_method) {
+ s->crypt_method = crypt_method;
+ memcpy(&s->aes_encrypt_key, &aes_encrypt_key, sizeof(aes_encrypt_key));
+ memcpy(&s->aes_decrypt_key, &aes_decrypt_key, sizeof(aes_decrypt_key));
+ }
+}
+
+static size_t header_ext_add(char *buf, uint32_t magic, const void *s,
+ size_t len, size_t buflen)
+{
+ QCowExtension *ext_backing_fmt = (QCowExtension*) buf;
+ size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7);
+
+ if (buflen < ext_len) {
+ return -ENOSPC;
+ }
+
+ *ext_backing_fmt = (QCowExtension) {
+ .magic = cpu_to_be32(magic),
+ .len = cpu_to_be32(len),
+ };
+ memcpy(buf + sizeof(QCowExtension), s, len);
+
+ return ext_len;
+}
+
+/*
+ * Updates the qcow2 header, including the variable length parts of it, i.e.
+ * the backing file name and all extensions. qcow2 was not designed to allow
+ * such changes, so if we run out of space (we can only use the first cluster)
+ * this function may fail.
+ *
+ * Returns 0 on success, -errno in error cases.
+ */
+int qcow2_update_header(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowHeader *header;
+ char *buf;
+ size_t buflen = s->cluster_size;
+ int ret;
+ uint64_t total_size;
+ uint32_t refcount_table_clusters;
+ size_t header_length;
+ Qcow2UnknownHeaderExtension *uext;
+
+ buf = qemu_blockalign(bs, buflen);
+
+ /* Header structure */
+ header = (QCowHeader*) buf;
+
+ if (buflen < sizeof(*header)) {
+ ret = -ENOSPC;
+ goto fail;
+ }
+
+ header_length = sizeof(*header) + s->unknown_header_fields_size;
+ total_size = bs->total_sectors * BDRV_SECTOR_SIZE;
+ refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);
+
+ *header = (QCowHeader) {
+ /* Version 2 fields */
+ .magic = cpu_to_be32(QCOW_MAGIC),
+ .version = cpu_to_be32(s->qcow_version),
+ .backing_file_offset = 0,
+ .backing_file_size = 0,
+ .cluster_bits = cpu_to_be32(s->cluster_bits),
+ .size = cpu_to_be64(total_size),
+ .crypt_method = cpu_to_be32(s->crypt_method_header),
+ .l1_size = cpu_to_be32(s->l1_size),
+ .l1_table_offset = cpu_to_be64(s->l1_table_offset),
+ .refcount_table_offset = cpu_to_be64(s->refcount_table_offset),
+ .refcount_table_clusters = cpu_to_be32(refcount_table_clusters),
+ .nb_snapshots = cpu_to_be32(s->nb_snapshots),
+ .snapshots_offset = cpu_to_be64(s->snapshots_offset),
+
+ /* Version 3 fields */
+ .incompatible_features = cpu_to_be64(s->incompatible_features),
+ .compatible_features = cpu_to_be64(s->compatible_features),
+ .autoclear_features = cpu_to_be64(s->autoclear_features),
+ .refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT),
+ .header_length = cpu_to_be32(header_length),
+ };
+
+ /* For older versions, write a shorter header */
+ switch (s->qcow_version) {
+ case 2:
+ ret = offsetof(QCowHeader, incompatible_features);
+ break;
+ case 3:
+ ret = sizeof(*header);
+ break;
+ default:
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ buf += ret;
+ buflen -= ret;
+ memset(buf, 0, buflen);
+
+ /* Preserve any unknown field in the header */
+ if (s->unknown_header_fields_size) {
+ if (buflen < s->unknown_header_fields_size) {
+ ret = -ENOSPC;
+ goto fail;
+ }
+
+ memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size);
+ buf += s->unknown_header_fields_size;
+ buflen -= s->unknown_header_fields_size;
+ }
+
+ /* Backing file format header extension */
+ if (*bs->backing_format) {
+ ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT,
+ bs->backing_format, strlen(bs->backing_format),
+ buflen);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ buf += ret;
+ buflen -= ret;
+ }
+
+ /* Feature table */
+ Qcow2Feature features[] = {
+ {
+ .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
+ .bit = QCOW2_INCOMPAT_DIRTY_BITNR,
+ .name = "dirty bit",
+ },
+ {
+ .type = QCOW2_FEAT_TYPE_COMPATIBLE,
+ .bit = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR,
+ .name = "lazy refcounts",
+ },
+ };
+
+ ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE,
+ features, sizeof(features), buflen);
+ if (ret < 0) {
+ goto fail;
+ }
+ buf += ret;
+ buflen -= ret;
+
+ /* Keep unknown header extensions */
+ QLIST_FOREACH(uext, &s->unknown_header_ext, next) {
+ ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ buf += ret;
+ buflen -= ret;
+ }
+
+ /* End of header extensions */
+ ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ buf += ret;
+ buflen -= ret;
+
+ /* Backing file name */
+ if (*bs->backing_file) {
+ size_t backing_file_len = strlen(bs->backing_file);
+
+ if (buflen < backing_file_len) {
+ ret = -ENOSPC;
+ goto fail;
+ }
+
+ /* Using strncpy is ok here, since buf is not NUL-terminated. */
+ strncpy(buf, bs->backing_file, buflen);
+
+ header->backing_file_offset = cpu_to_be64(buf - ((char*) header));
+ header->backing_file_size = cpu_to_be32(backing_file_len);
+ }
+
+ /* Write the new header */
+ ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ ret = 0;
+fail:
+ qemu_vfree(header);
+ return ret;
+}
+
+static int qcow2_change_backing_file(BlockDriverState *bs,
+ const char *backing_file, const char *backing_fmt)
+{
+ pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
+ pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
+
+ return qcow2_update_header(bs);
+}
+
+static int preallocate(BlockDriverState *bs)
+{
+ uint64_t nb_sectors;
+ uint64_t offset;
+ uint64_t host_offset = 0;
+ int num;
+ int ret;
+ QCowL2Meta *meta;
+
+ nb_sectors = bdrv_getlength(bs) >> 9;
+ offset = 0;
+
+ while (nb_sectors) {
+ num = MIN(nb_sectors, INT_MAX >> 9);
+ ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num,
+ &host_offset, &meta);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = qcow2_alloc_cluster_link_l2(bs, meta);
+ if (ret < 0) {
+ qcow2_free_any_clusters(bs, meta->alloc_offset, meta->nb_clusters,
+ QCOW2_DISCARD_NEVER);
+ return ret;
+ }
+
+ /* There are no dependent requests, but we need to remove our request
+ * from the list of in-flight requests */
+ if (meta != NULL) {
+ QLIST_REMOVE(meta, next_in_flight);
+ }
+
+ /* TODO Preallocate data if requested */
+
+ nb_sectors -= num;
+ offset += num << 9;
+ }
+
+ /*
+ * It is expected that the image file is large enough to actually contain
+ * all of the allocated clusters (otherwise we get failing reads after
+ * EOF). Extend the image to the last allocated sector.
+ */
+ if (host_offset != 0) {
+ uint8_t buf[512];
+ memset(buf, 0, 512);
+ ret = bdrv_write(bs->file, (host_offset >> 9) + num - 1, buf, 1);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static int qcow2_create2(const char *filename, int64_t total_size,
+ const char *backing_file, const char *backing_format,
+ int flags, size_t cluster_size, int prealloc,
+ QEMUOptionParameter *options, int version)
+{
+ /* Calculate cluster_bits */
+ int cluster_bits;
+ cluster_bits = ffs(cluster_size) - 1;
+ if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS ||
+ (1 << cluster_bits) != cluster_size)
+ {
+ error_report(
+ "Cluster size must be a power of two between %d and %dk",
+ 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10));
+ return -EINVAL;
+ }
+
+ /*
+ * Open the image file and write a minimal qcow2 header.
+ *
+ * We keep things simple and start with a zero-sized image. We also
+ * do without refcount blocks or a L1 table for now. We'll fix the
+ * inconsistency later.
+ *
+ * We do need a refcount table because growing the refcount table means
+ * allocating two new refcount blocks - the seconds of which would be at
+ * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file
+ * size for any qcow2 image.
+ */
+ BlockDriverState* bs;
+ QCowHeader header;
+ uint8_t* refcount_table;
+ int ret;
+
+ ret = bdrv_create_file(filename, options);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Write the header */
+ memset(&header, 0, sizeof(header));
+ header.magic = cpu_to_be32(QCOW_MAGIC);
+ header.version = cpu_to_be32(version);
+ header.cluster_bits = cpu_to_be32(cluster_bits);
+ header.size = cpu_to_be64(0);
+ header.l1_table_offset = cpu_to_be64(0);
+ header.l1_size = cpu_to_be32(0);
+ header.refcount_table_offset = cpu_to_be64(cluster_size);
+ header.refcount_table_clusters = cpu_to_be32(1);
+ header.refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT);
+ header.header_length = cpu_to_be32(sizeof(header));
+
+ if (flags & BLOCK_FLAG_ENCRYPT) {
+ header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
+ } else {
+ header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
+ }
+
+ if (flags & BLOCK_FLAG_LAZY_REFCOUNTS) {
+ header.compatible_features |=
+ cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS);
+ }
+
+ ret = bdrv_pwrite(bs, 0, &header, sizeof(header));
+ if (ret < 0) {
+ goto out;
+ }
+
+ /* Write an empty refcount table */
+ refcount_table = g_malloc0(cluster_size);
+ ret = bdrv_pwrite(bs, cluster_size, refcount_table, cluster_size);
+ g_free(refcount_table);
+
+ if (ret < 0) {
+ goto out;
+ }
+
+ bdrv_close(bs);
+
+ /*
+ * And now open the image and make it consistent first (i.e. increase the
+ * refcount of the cluster that is occupied by the header and the refcount
+ * table)
+ */
+ BlockDriver* drv = bdrv_find_format("qcow2");
+ assert(drv != NULL);
+ ret = bdrv_open(bs, filename, NULL,
+ BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, drv);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = qcow2_alloc_clusters(bs, 2 * cluster_size);
+ if (ret < 0) {
+ goto out;
+
+ } else if (ret != 0) {
+ error_report("Huh, first cluster in empty image is already in use?");
+ abort();
+ }
+
+ /* Okay, now that we have a valid image, let's give it the right size */
+ ret = bdrv_truncate(bs, total_size * BDRV_SECTOR_SIZE);
+ if (ret < 0) {
+ goto out;
+ }
+
+ /* Want a backing file? There you go.*/
+ if (backing_file) {
+ ret = bdrv_change_backing_file(bs, backing_file, backing_format);
+ if (ret < 0) {
+ goto out;
+ }
+ }
+
+ /* And if we're supposed to preallocate metadata, do that now */
+ if (prealloc) {
+ BDRVQcowState *s = bs->opaque;
+ qemu_co_mutex_lock(&s->lock);
+ ret = preallocate(bs);
+ qemu_co_mutex_unlock(&s->lock);
+ if (ret < 0) {
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ bdrv_delete(bs);
+ return ret;
+}
+
+static int qcow2_create(const char *filename, QEMUOptionParameter *options)
+{
+ const char *backing_file = NULL;
+ const char *backing_fmt = NULL;
+ uint64_t sectors = 0;
+ int flags = 0;
+ size_t cluster_size = DEFAULT_CLUSTER_SIZE;
+ int prealloc = 0;
+ int version = 2;
+
+ /* Read out options */
+ while (options && options->name) {
+ if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+ sectors = options->value.n / 512;
+ } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
+ backing_file = options->value.s;
+ } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) {
+ backing_fmt = options->value.s;
+ } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) {
+ flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0;
+ } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) {
+ if (options->value.n) {
+ cluster_size = options->value.n;
+ }
+ } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) {
+ if (!options->value.s || !strcmp(options->value.s, "off")) {
+ prealloc = 0;
+ } else if (!strcmp(options->value.s, "metadata")) {
+ prealloc = 1;
+ } else {
+ fprintf(stderr, "Invalid preallocation mode: '%s'\n",
+ options->value.s);
+ return -EINVAL;
+ }
+ } else if (!strcmp(options->name, BLOCK_OPT_COMPAT_LEVEL)) {
+ if (!options->value.s || !strcmp(options->value.s, "0.10")) {
+ version = 2;
+ } else if (!strcmp(options->value.s, "1.1")) {
+ version = 3;
+ } else {
+ fprintf(stderr, "Invalid compatibility level: '%s'\n",
+ options->value.s);
+ return -EINVAL;
+ }
+ } else if (!strcmp(options->name, BLOCK_OPT_LAZY_REFCOUNTS)) {
+ flags |= options->value.n ? BLOCK_FLAG_LAZY_REFCOUNTS : 0;
+ }
+ options++;
+ }
+
+ if (backing_file && prealloc) {
+ fprintf(stderr, "Backing file and preallocation cannot be used at "
+ "the same time\n");
+ return -EINVAL;
+ }
+
+ if (version < 3 && (flags & BLOCK_FLAG_LAZY_REFCOUNTS)) {
+ fprintf(stderr, "Lazy refcounts only supported with compatibility "
+ "level 1.1 and above (use compat=1.1 or greater)\n");
+ return -EINVAL;
+ }
+
+ return qcow2_create2(filename, sectors, backing_file, backing_fmt, flags,
+ cluster_size, prealloc, options, version);
+}
+
+static int qcow2_make_empty(BlockDriverState *bs)
+{
+#if 0
+ /* XXX: not correct */
+ BDRVQcowState *s = bs->opaque;
+ uint32_t l1_length = s->l1_size * sizeof(uint64_t);
+ int ret;
+
+ memset(s->l1_table, 0, l1_length);
+ if (bdrv_pwrite(bs->file, s->l1_table_offset, s->l1_table, l1_length) < 0)
+ return -1;
+ ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length);
+ if (ret < 0)
+ return ret;
+
+ l2_cache_reset(bs);
+#endif
+ return 0;
+}
+
+static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors)
+{
+ int ret;
+ BDRVQcowState *s = bs->opaque;
+
+ /* Emulate misaligned zero writes */
+ if (sector_num % s->cluster_sectors || nb_sectors % s->cluster_sectors) {
+ return -ENOTSUP;
+ }
+
+ /* Whatever is left can use real zero clusters */
+ qemu_co_mutex_lock(&s->lock);
+ ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS,
+ nb_sectors);
+ qemu_co_mutex_unlock(&s->lock);
+
+ return ret;
+}
+
+static coroutine_fn int qcow2_co_discard(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors)
+{
+ int ret;
+ BDRVQcowState *s = bs->opaque;
+
+ qemu_co_mutex_lock(&s->lock);
+ ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS,
+ nb_sectors);
+ qemu_co_mutex_unlock(&s->lock);
+ return ret;
+}
+
+static int qcow2_truncate(BlockDriverState *bs, int64_t offset)
+{
+ BDRVQcowState *s = bs->opaque;
+ int64_t new_l1_size;
+ int ret;
+
+ if (offset & 511) {
+ error_report("The new size must be a multiple of 512");
+ return -EINVAL;
+ }
+
+ /* cannot proceed if image has snapshots */
+ if (s->nb_snapshots) {
+ error_report("Can't resize an image which has snapshots");
+ return -ENOTSUP;
+ }
+
+ /* shrinking is currently not supported */
+ if (offset < bs->total_sectors * 512) {
+ error_report("qcow2 doesn't support shrinking images yet");
+ return -ENOTSUP;
+ }
+
+ new_l1_size = size_to_l1(s, offset);
+ ret = qcow2_grow_l1_table(bs, new_l1_size, true);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* write updated header.size */
+ offset = cpu_to_be64(offset);
+ ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size),
+ &offset, sizeof(uint64_t));
+ if (ret < 0) {
+ return ret;
+ }
+
+ s->l1_vm_state_index = new_l1_size;
+ return 0;
+}
+
+/* XXX: put compressed sectors first, then all the cluster aligned
+ tables to avoid losing bytes in alignment */
+static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors)
+{
+ BDRVQcowState *s = bs->opaque;
+ z_stream strm;
+ int ret, out_len;
+ uint8_t *out_buf;
+ uint64_t cluster_offset;
+
+ if (nb_sectors == 0) {
+ /* align end of file to a sector boundary to ease reading with
+ sector based I/Os */
+ cluster_offset = bdrv_getlength(bs->file);
+ cluster_offset = (cluster_offset + 511) & ~511;
+ bdrv_truncate(bs->file, cluster_offset);
+ return 0;
+ }
+
+ if (nb_sectors != s->cluster_sectors) {
+ ret = -EINVAL;
+
+ /* Zero-pad last write if image size is not cluster aligned */
+ if (sector_num + nb_sectors == bs->total_sectors &&
+ nb_sectors < s->cluster_sectors) {
+ uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size);
+ memset(pad_buf, 0, s->cluster_size);
+ memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE);
+ ret = qcow2_write_compressed(bs, sector_num,
+ pad_buf, s->cluster_sectors);
+ qemu_vfree(pad_buf);
+ }
+ return ret;
+ }
+
+ out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
+
+ /* best compression, small window, no zlib header */
+ memset(&strm, 0, sizeof(strm));
+ ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
+ Z_DEFLATED, -12,
+ 9, Z_DEFAULT_STRATEGY);
+ if (ret != 0) {
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ strm.avail_in = s->cluster_size;
+ strm.next_in = (uint8_t *)buf;
+ strm.avail_out = s->cluster_size;
+ strm.next_out = out_buf;
+
+ ret = deflate(&strm, Z_FINISH);
+ if (ret != Z_STREAM_END && ret != Z_OK) {
+ deflateEnd(&strm);
+ ret = -EINVAL;
+ goto fail;
+ }
+ out_len = strm.next_out - out_buf;
+
+ deflateEnd(&strm);
+
+ if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
+ /* could not compress: write normal cluster */
+ ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors);
+ if (ret < 0) {
+ goto fail;
+ }
+ } else {
+ cluster_offset = qcow2_alloc_compressed_cluster_offset(bs,
+ sector_num << 9, out_len);
+ if (!cluster_offset) {
+ ret = -EIO;
+ goto fail;
+ }
+ cluster_offset &= s->cluster_offset_mask;
+ BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED);
+ ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+
+ ret = 0;
+fail:
+ g_free(out_buf);
+ return ret;
+}
+
+static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret;
+
+ qemu_co_mutex_lock(&s->lock);
+ ret = qcow2_cache_flush(bs, s->l2_table_cache);
+ if (ret < 0) {
+ qemu_co_mutex_unlock(&s->lock);
+ return ret;
+ }
+
+ if (qcow2_need_accurate_refcounts(s)) {
+ ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+ if (ret < 0) {
+ qemu_co_mutex_unlock(&s->lock);
+ return ret;
+ }
+ }
+ qemu_co_mutex_unlock(&s->lock);
+
+ return 0;
+}
+
+static int64_t qcow2_vm_state_offset(BDRVQcowState *s)
+{
+ return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits);
+}
+
+static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+ BDRVQcowState *s = bs->opaque;
+ bdi->cluster_size = s->cluster_size;
+ bdi->vm_state_offset = qcow2_vm_state_offset(s);
+ return 0;
+}
+
+#if 0
+static void dump_refcounts(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ int64_t nb_clusters, k, k1, size;
+ int refcount;
+
+ size = bdrv_getlength(bs->file);
+ nb_clusters = size_to_clusters(s, size);
+ for(k = 0; k < nb_clusters;) {
+ k1 = k;
+ refcount = get_refcount(bs, k);
+ k++;
+ while (k < nb_clusters && get_refcount(bs, k) == refcount)
+ k++;
+ printf("%" PRId64 ": refcount=%d nb=%" PRId64 "\n", k, refcount,
+ k - k1);
+ }
+}
+#endif
+
+static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
+ int64_t pos)
+{
+ BDRVQcowState *s = bs->opaque;
+ int growable = bs->growable;
+ int ret;
+
+ BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
+ bs->growable = 1;
+ ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov);
+ bs->growable = growable;
+
+ return ret;
+}
+
+static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf,
+ int64_t pos, int size)
+{
+ BDRVQcowState *s = bs->opaque;
+ int growable = bs->growable;
+ int ret;
+
+ BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD);
+ bs->growable = 1;
+ ret = bdrv_pread(bs, qcow2_vm_state_offset(s) + pos, buf, size);
+ bs->growable = growable;
+
+ return ret;
+}
+
+static QEMUOptionParameter qcow2_create_options[] = {
+ {
+ .name = BLOCK_OPT_SIZE,
+ .type = OPT_SIZE,
+ .help = "Virtual disk size"
+ },
+ {
+ .name = BLOCK_OPT_COMPAT_LEVEL,
+ .type = OPT_STRING,
+ .help = "Compatibility level (0.10 or 1.1)"
+ },
+ {
+ .name = BLOCK_OPT_BACKING_FILE,
+ .type = OPT_STRING,
+ .help = "File name of a base image"
+ },
+ {
+ .name = BLOCK_OPT_BACKING_FMT,
+ .type = OPT_STRING,
+ .help = "Image format of the base image"
+ },
+ {
+ .name = BLOCK_OPT_ENCRYPT,
+ .type = OPT_FLAG,
+ .help = "Encrypt the image"
+ },
+ {
+ .name = BLOCK_OPT_CLUSTER_SIZE,
+ .type = OPT_SIZE,
+ .help = "qcow2 cluster size",
+ .value = { .n = DEFAULT_CLUSTER_SIZE },
+ },
+ {
+ .name = BLOCK_OPT_PREALLOC,
+ .type = OPT_STRING,
+ .help = "Preallocation mode (allowed values: off, metadata)"
+ },
+ {
+ .name = BLOCK_OPT_LAZY_REFCOUNTS,
+ .type = OPT_FLAG,
+ .help = "Postpone refcount updates",
+ },
+ { NULL }
+};
+
+static BlockDriver bdrv_qcow2 = {
+ .format_name = "qcow2",
+ .instance_size = sizeof(BDRVQcowState),
+ .bdrv_probe = qcow2_probe,
+ .bdrv_open = qcow2_open,
+ .bdrv_close = qcow2_close,
+ .bdrv_reopen_prepare = qcow2_reopen_prepare,
+ .bdrv_create = qcow2_create,
+ .bdrv_has_zero_init = bdrv_has_zero_init_1,
+ .bdrv_co_is_allocated = qcow2_co_is_allocated,
+ .bdrv_set_key = qcow2_set_key,
+ .bdrv_make_empty = qcow2_make_empty,
+
+ .bdrv_co_readv = qcow2_co_readv,
+ .bdrv_co_writev = qcow2_co_writev,
+ .bdrv_co_flush_to_os = qcow2_co_flush_to_os,
+
+ .bdrv_co_write_zeroes = qcow2_co_write_zeroes,
+ .bdrv_co_discard = qcow2_co_discard,
+ .bdrv_truncate = qcow2_truncate,
+ .bdrv_write_compressed = qcow2_write_compressed,
+
+ .bdrv_snapshot_create = qcow2_snapshot_create,
+ .bdrv_snapshot_goto = qcow2_snapshot_goto,
+ .bdrv_snapshot_delete = qcow2_snapshot_delete,
+ .bdrv_snapshot_list = qcow2_snapshot_list,
+ .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp,
+ .bdrv_get_info = qcow2_get_info,
+
+ .bdrv_save_vmstate = qcow2_save_vmstate,
+ .bdrv_load_vmstate = qcow2_load_vmstate,
+
+ .bdrv_change_backing_file = qcow2_change_backing_file,
+
+ .bdrv_invalidate_cache = qcow2_invalidate_cache,
+
+ .create_options = qcow2_create_options,
+ .bdrv_check = qcow2_check,
+};
+
+static void bdrv_qcow2_init(void)
+{
+ bdrv_register(&bdrv_qcow2);
+}
+
+block_init(bdrv_qcow2_init);
diff --git a/contrib/qemu/block/qcow2.h b/contrib/qemu/block/qcow2.h
new file mode 100644
index 000000000..3b2d5cda7
--- /dev/null
+++ b/contrib/qemu/block/qcow2.h
@@ -0,0 +1,437 @@
+/*
+ * Block driver for the QCOW version 2 format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef BLOCK_QCOW2_H
+#define BLOCK_QCOW2_H
+
+#include "qemu/aes.h"
+#include "block/coroutine.h"
+
+//#define DEBUG_ALLOC
+//#define DEBUG_ALLOC2
+//#define DEBUG_EXT
+
+#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
+
+#define QCOW_CRYPT_NONE 0
+#define QCOW_CRYPT_AES 1
+
+#define QCOW_MAX_CRYPT_CLUSTERS 32
+
+/* indicate that the refcount of the referenced cluster is exactly one. */
+#define QCOW_OFLAG_COPIED (1LL << 63)
+/* indicate that the cluster is compressed (they never have the copied flag) */
+#define QCOW_OFLAG_COMPRESSED (1LL << 62)
+/* The cluster reads as all zeros */
+#define QCOW_OFLAG_ZERO (1LL << 0)
+
+#define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */
+
+#define MIN_CLUSTER_BITS 9
+#define MAX_CLUSTER_BITS 21
+
+#define L2_CACHE_SIZE 16
+
+/* Must be at least 4 to cover all cases of refcount table growth */
+#define REFCOUNT_CACHE_SIZE 4
+
+#define DEFAULT_CLUSTER_SIZE 65536
+
+
+#define QCOW2_OPT_LAZY_REFCOUNTS "lazy_refcounts"
+#define QCOW2_OPT_DISCARD_REQUEST "pass_discard_request"
+#define QCOW2_OPT_DISCARD_SNAPSHOT "pass_discard_snapshot"
+#define QCOW2_OPT_DISCARD_OTHER "pass_discard_other"
+
+typedef struct QCowHeader {
+ uint32_t magic;
+ uint32_t version;
+ uint64_t backing_file_offset;
+ uint32_t backing_file_size;
+ uint32_t cluster_bits;
+ uint64_t size; /* in bytes */
+ uint32_t crypt_method;
+ uint32_t l1_size; /* XXX: save number of clusters instead ? */
+ uint64_t l1_table_offset;
+ uint64_t refcount_table_offset;
+ uint32_t refcount_table_clusters;
+ uint32_t nb_snapshots;
+ uint64_t snapshots_offset;
+
+ /* The following fields are only valid for version >= 3 */
+ uint64_t incompatible_features;
+ uint64_t compatible_features;
+ uint64_t autoclear_features;
+
+ uint32_t refcount_order;
+ uint32_t header_length;
+} QCowHeader;
+
+typedef struct QCowSnapshot {
+ uint64_t l1_table_offset;
+ uint32_t l1_size;
+ char *id_str;
+ char *name;
+ uint64_t disk_size;
+ uint64_t vm_state_size;
+ uint32_t date_sec;
+ uint32_t date_nsec;
+ uint64_t vm_clock_nsec;
+} QCowSnapshot;
+
+struct Qcow2Cache;
+typedef struct Qcow2Cache Qcow2Cache;
+
+typedef struct Qcow2UnknownHeaderExtension {
+ uint32_t magic;
+ uint32_t len;
+ QLIST_ENTRY(Qcow2UnknownHeaderExtension) next;
+ uint8_t data[];
+} Qcow2UnknownHeaderExtension;
+
+enum {
+ QCOW2_FEAT_TYPE_INCOMPATIBLE = 0,
+ QCOW2_FEAT_TYPE_COMPATIBLE = 1,
+ QCOW2_FEAT_TYPE_AUTOCLEAR = 2,
+};
+
+/* Incompatible feature bits */
+enum {
+ QCOW2_INCOMPAT_DIRTY_BITNR = 0,
+ QCOW2_INCOMPAT_DIRTY = 1 << QCOW2_INCOMPAT_DIRTY_BITNR,
+
+ QCOW2_INCOMPAT_MASK = QCOW2_INCOMPAT_DIRTY,
+};
+
+/* Compatible feature bits */
+enum {
+ QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR = 0,
+ QCOW2_COMPAT_LAZY_REFCOUNTS = 1 << QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR,
+
+ QCOW2_COMPAT_FEAT_MASK = QCOW2_COMPAT_LAZY_REFCOUNTS,
+};
+
+enum qcow2_discard_type {
+ QCOW2_DISCARD_NEVER = 0,
+ QCOW2_DISCARD_ALWAYS,
+ QCOW2_DISCARD_REQUEST,
+ QCOW2_DISCARD_SNAPSHOT,
+ QCOW2_DISCARD_OTHER,
+ QCOW2_DISCARD_MAX
+};
+
+typedef struct Qcow2Feature {
+ uint8_t type;
+ uint8_t bit;
+ char name[46];
+} QEMU_PACKED Qcow2Feature;
+
+typedef struct Qcow2DiscardRegion {
+ BlockDriverState *bs;
+ uint64_t offset;
+ uint64_t bytes;
+ QTAILQ_ENTRY(Qcow2DiscardRegion) next;
+} Qcow2DiscardRegion;
+
+typedef struct BDRVQcowState {
+ int cluster_bits;
+ int cluster_size;
+ int cluster_sectors;
+ int l2_bits;
+ int l2_size;
+ int l1_size;
+ int l1_vm_state_index;
+ int csize_shift;
+ int csize_mask;
+ uint64_t cluster_offset_mask;
+ uint64_t l1_table_offset;
+ uint64_t *l1_table;
+
+ Qcow2Cache* l2_table_cache;
+ Qcow2Cache* refcount_block_cache;
+
+ uint8_t *cluster_cache;
+ uint8_t *cluster_data;
+ uint64_t cluster_cache_offset;
+ QLIST_HEAD(QCowClusterAlloc, QCowL2Meta) cluster_allocs;
+
+ uint64_t *refcount_table;
+ uint64_t refcount_table_offset;
+ uint32_t refcount_table_size;
+ int64_t free_cluster_index;
+ int64_t free_byte_offset;
+
+ CoMutex lock;
+
+ uint32_t crypt_method; /* current crypt method, 0 if no key yet */
+ uint32_t crypt_method_header;
+ AES_KEY aes_encrypt_key;
+ AES_KEY aes_decrypt_key;
+ uint64_t snapshots_offset;
+ int snapshots_size;
+ int nb_snapshots;
+ QCowSnapshot *snapshots;
+
+ int flags;
+ int qcow_version;
+ bool use_lazy_refcounts;
+
+ bool discard_passthrough[QCOW2_DISCARD_MAX];
+
+ uint64_t incompatible_features;
+ uint64_t compatible_features;
+ uint64_t autoclear_features;
+
+ size_t unknown_header_fields_size;
+ void* unknown_header_fields;
+ QLIST_HEAD(, Qcow2UnknownHeaderExtension) unknown_header_ext;
+ QTAILQ_HEAD (, Qcow2DiscardRegion) discards;
+ bool cache_discards;
+} BDRVQcowState;
+
+/* XXX: use std qcow open function ? */
+typedef struct QCowCreateState {
+ int cluster_size;
+ int cluster_bits;
+ uint16_t *refcount_block;
+ uint64_t *refcount_table;
+ int64_t l1_table_offset;
+ int64_t refcount_table_offset;
+ int64_t refcount_block_offset;
+} QCowCreateState;
+
+struct QCowAIOCB;
+
+typedef struct Qcow2COWRegion {
+ /**
+ * Offset of the COW region in bytes from the start of the first cluster
+ * touched by the request.
+ */
+ uint64_t offset;
+
+ /** Number of sectors to copy */
+ int nb_sectors;
+} Qcow2COWRegion;
+
+/**
+ * Describes an in-flight (part of a) write request that writes to clusters
+ * that are not referenced in their L2 table yet.
+ */
+typedef struct QCowL2Meta
+{
+ /** Guest offset of the first newly allocated cluster */
+ uint64_t offset;
+
+ /** Host offset of the first newly allocated cluster */
+ uint64_t alloc_offset;
+
+ /**
+ * Number of sectors from the start of the first allocated cluster to
+ * the end of the (possibly shortened) request
+ */
+ int nb_available;
+
+ /** Number of newly allocated clusters */
+ int nb_clusters;
+
+ /**
+ * Requests that overlap with this allocation and wait to be restarted
+ * when the allocating request has completed.
+ */
+ CoQueue dependent_requests;
+
+ /**
+ * The COW Region between the start of the first allocated cluster and the
+ * area the guest actually writes to.
+ */
+ Qcow2COWRegion cow_start;
+
+ /**
+ * The COW Region between the area the guest actually writes to and the
+ * end of the last allocated cluster.
+ */
+ Qcow2COWRegion cow_end;
+
+ /** Pointer to next L2Meta of the same write request */
+ struct QCowL2Meta *next;
+
+ QLIST_ENTRY(QCowL2Meta) next_in_flight;
+} QCowL2Meta;
+
+enum {
+ QCOW2_CLUSTER_UNALLOCATED,
+ QCOW2_CLUSTER_NORMAL,
+ QCOW2_CLUSTER_COMPRESSED,
+ QCOW2_CLUSTER_ZERO
+};
+
+#define L1E_OFFSET_MASK 0x00ffffffffffff00ULL
+#define L2E_OFFSET_MASK 0x00ffffffffffff00ULL
+#define L2E_COMPRESSED_OFFSET_SIZE_MASK 0x3fffffffffffffffULL
+
+#define REFT_OFFSET_MASK 0xffffffffffffff00ULL
+
+static inline int64_t start_of_cluster(BDRVQcowState *s, int64_t offset)
+{
+ return offset & ~(s->cluster_size - 1);
+}
+
+static inline int64_t offset_into_cluster(BDRVQcowState *s, int64_t offset)
+{
+ return offset & (s->cluster_size - 1);
+}
+
+static inline int size_to_clusters(BDRVQcowState *s, int64_t size)
+{
+ return (size + (s->cluster_size - 1)) >> s->cluster_bits;
+}
+
+static inline int64_t size_to_l1(BDRVQcowState *s, int64_t size)
+{
+ int shift = s->cluster_bits + s->l2_bits;
+ return (size + (1ULL << shift) - 1) >> shift;
+}
+
+static inline int offset_to_l2_index(BDRVQcowState *s, int64_t offset)
+{
+ return (offset >> s->cluster_bits) & (s->l2_size - 1);
+}
+
+static inline int64_t align_offset(int64_t offset, int n)
+{
+ offset = (offset + n - 1) & ~(n - 1);
+ return offset;
+}
+
+static inline int qcow2_get_cluster_type(uint64_t l2_entry)
+{
+ if (l2_entry & QCOW_OFLAG_COMPRESSED) {
+ return QCOW2_CLUSTER_COMPRESSED;
+ } else if (l2_entry & QCOW_OFLAG_ZERO) {
+ return QCOW2_CLUSTER_ZERO;
+ } else if (!(l2_entry & L2E_OFFSET_MASK)) {
+ return QCOW2_CLUSTER_UNALLOCATED;
+ } else {
+ return QCOW2_CLUSTER_NORMAL;
+ }
+}
+
+/* Check whether refcounts are eager or lazy */
+static inline bool qcow2_need_accurate_refcounts(BDRVQcowState *s)
+{
+ return !(s->incompatible_features & QCOW2_INCOMPAT_DIRTY);
+}
+
+static inline uint64_t l2meta_cow_start(QCowL2Meta *m)
+{
+ return m->offset + m->cow_start.offset;
+}
+
+static inline uint64_t l2meta_cow_end(QCowL2Meta *m)
+{
+ return m->offset + m->cow_end.offset
+ + (m->cow_end.nb_sectors << BDRV_SECTOR_BITS);
+}
+
+// FIXME Need qcow2_ prefix to global functions
+
+/* qcow2.c functions */
+int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
+ int64_t sector_num, int nb_sectors);
+
+int qcow2_mark_dirty(BlockDriverState *bs);
+int qcow2_update_header(BlockDriverState *bs);
+
+/* qcow2-refcount.c functions */
+int qcow2_refcount_init(BlockDriverState *bs);
+void qcow2_refcount_close(BlockDriverState *bs);
+
+int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size);
+int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
+ int nb_clusters);
+int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size);
+void qcow2_free_clusters(BlockDriverState *bs,
+ int64_t offset, int64_t size,
+ enum qcow2_discard_type type);
+void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
+ int nb_clusters, enum qcow2_discard_type type);
+
+int qcow2_update_snapshot_refcount(BlockDriverState *bs,
+ int64_t l1_table_offset, int l1_size, int addend);
+
+int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
+ BdrvCheckMode fix);
+
+void qcow2_process_discards(BlockDriverState *bs, int ret);
+
+/* qcow2-cluster.c functions */
+int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
+ bool exact_size);
+void qcow2_l2_cache_reset(BlockDriverState *bs);
+int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
+void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
+ uint8_t *out_buf, const uint8_t *in_buf,
+ int nb_sectors, int enc,
+ const AES_KEY *key);
+
+int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
+ int *num, uint64_t *cluster_offset);
+int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
+ int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m);
+uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
+ uint64_t offset,
+ int compressed_size);
+
+int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m);
+int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
+ int nb_sectors);
+int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors);
+
+/* qcow2-snapshot.c functions */
+int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info);
+int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id);
+int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id);
+int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab);
+int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name);
+
+void qcow2_free_snapshots(BlockDriverState *bs);
+int qcow2_read_snapshots(BlockDriverState *bs);
+
+/* qcow2-cache.c functions */
+Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables);
+int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c);
+
+void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table);
+int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c);
+int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c,
+ Qcow2Cache *dependency);
+void qcow2_cache_depends_on_flush(Qcow2Cache *c);
+
+int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
+ void **table);
+int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
+ void **table);
+int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table);
+
+#endif
diff --git a/contrib/qemu/block/qed-check.c b/contrib/qemu/block/qed-check.c
new file mode 100644
index 000000000..b473dcd61
--- /dev/null
+++ b/contrib/qemu/block/qed-check.c
@@ -0,0 +1,248 @@
+/*
+ * QEMU Enhanced Disk Format Consistency Check
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qed.h"
+
+typedef struct {
+ BDRVQEDState *s;
+ BdrvCheckResult *result;
+ bool fix; /* whether to fix invalid offsets */
+
+ uint64_t nclusters;
+ uint32_t *used_clusters; /* referenced cluster bitmap */
+
+ QEDRequest request;
+} QEDCheck;
+
+static bool qed_test_bit(uint32_t *bitmap, uint64_t n) {
+ return !!(bitmap[n / 32] & (1 << (n % 32)));
+}
+
+static void qed_set_bit(uint32_t *bitmap, uint64_t n) {
+ bitmap[n / 32] |= 1 << (n % 32);
+}
+
+/**
+ * Set bitmap bits for clusters
+ *
+ * @check: Check structure
+ * @offset: Starting offset in bytes
+ * @n: Number of clusters
+ */
+static bool qed_set_used_clusters(QEDCheck *check, uint64_t offset,
+ unsigned int n)
+{
+ uint64_t cluster = qed_bytes_to_clusters(check->s, offset);
+ unsigned int corruptions = 0;
+
+ while (n-- != 0) {
+ /* Clusters should only be referenced once */
+ if (qed_test_bit(check->used_clusters, cluster)) {
+ corruptions++;
+ }
+
+ qed_set_bit(check->used_clusters, cluster);
+ cluster++;
+ }
+
+ check->result->corruptions += corruptions;
+ return corruptions == 0;
+}
+
+/**
+ * Check an L2 table
+ *
+ * @ret: Number of invalid cluster offsets
+ */
+static unsigned int qed_check_l2_table(QEDCheck *check, QEDTable *table)
+{
+ BDRVQEDState *s = check->s;
+ unsigned int i, num_invalid = 0;
+ uint64_t last_offset = 0;
+
+ for (i = 0; i < s->table_nelems; i++) {
+ uint64_t offset = table->offsets[i];
+
+ if (qed_offset_is_unalloc_cluster(offset) ||
+ qed_offset_is_zero_cluster(offset)) {
+ continue;
+ }
+ check->result->bfi.allocated_clusters++;
+ if (last_offset && (last_offset + s->header.cluster_size != offset)) {
+ check->result->bfi.fragmented_clusters++;
+ }
+ last_offset = offset;
+
+ /* Detect invalid cluster offset */
+ if (!qed_check_cluster_offset(s, offset)) {
+ if (check->fix) {
+ table->offsets[i] = 0;
+ check->result->corruptions_fixed++;
+ } else {
+ check->result->corruptions++;
+ }
+
+ num_invalid++;
+ continue;
+ }
+
+ qed_set_used_clusters(check, offset, 1);
+ }
+
+ return num_invalid;
+}
+
+/**
+ * Descend tables and check each cluster is referenced once only
+ */
+static int qed_check_l1_table(QEDCheck *check, QEDTable *table)
+{
+ BDRVQEDState *s = check->s;
+ unsigned int i, num_invalid_l1 = 0;
+ int ret, last_error = 0;
+
+ /* Mark L1 table clusters used */
+ qed_set_used_clusters(check, s->header.l1_table_offset,
+ s->header.table_size);
+
+ for (i = 0; i < s->table_nelems; i++) {
+ unsigned int num_invalid_l2;
+ uint64_t offset = table->offsets[i];
+
+ if (qed_offset_is_unalloc_cluster(offset)) {
+ continue;
+ }
+
+ /* Detect invalid L2 offset */
+ if (!qed_check_table_offset(s, offset)) {
+ /* Clear invalid offset */
+ if (check->fix) {
+ table->offsets[i] = 0;
+ check->result->corruptions_fixed++;
+ } else {
+ check->result->corruptions++;
+ }
+
+ num_invalid_l1++;
+ continue;
+ }
+
+ if (!qed_set_used_clusters(check, offset, s->header.table_size)) {
+ continue; /* skip an invalid table */
+ }
+
+ ret = qed_read_l2_table_sync(s, &check->request, offset);
+ if (ret) {
+ check->result->check_errors++;
+ last_error = ret;
+ continue;
+ }
+
+ num_invalid_l2 = qed_check_l2_table(check,
+ check->request.l2_table->table);
+
+ /* Write out fixed L2 table */
+ if (num_invalid_l2 > 0 && check->fix) {
+ ret = qed_write_l2_table_sync(s, &check->request, 0,
+ s->table_nelems, false);
+ if (ret) {
+ check->result->check_errors++;
+ last_error = ret;
+ continue;
+ }
+ }
+ }
+
+ /* Drop reference to final table */
+ qed_unref_l2_cache_entry(check->request.l2_table);
+ check->request.l2_table = NULL;
+
+ /* Write out fixed L1 table */
+ if (num_invalid_l1 > 0 && check->fix) {
+ ret = qed_write_l1_table_sync(s, 0, s->table_nelems);
+ if (ret) {
+ check->result->check_errors++;
+ last_error = ret;
+ }
+ }
+
+ return last_error;
+}
+
+/**
+ * Check for unreferenced (leaked) clusters
+ */
+static void qed_check_for_leaks(QEDCheck *check)
+{
+ BDRVQEDState *s = check->s;
+ uint64_t i;
+
+ for (i = s->header.header_size; i < check->nclusters; i++) {
+ if (!qed_test_bit(check->used_clusters, i)) {
+ check->result->leaks++;
+ }
+ }
+}
+
+/**
+ * Mark an image clean once it passes check or has been repaired
+ */
+static void qed_check_mark_clean(BDRVQEDState *s, BdrvCheckResult *result)
+{
+ /* Skip if there were unfixable corruptions or I/O errors */
+ if (result->corruptions > 0 || result->check_errors > 0) {
+ return;
+ }
+
+ /* Skip if image is already marked clean */
+ if (!(s->header.features & QED_F_NEED_CHECK)) {
+ return;
+ }
+
+ /* Ensure fixes reach storage before clearing check bit */
+ bdrv_flush(s->bs);
+
+ s->header.features &= ~QED_F_NEED_CHECK;
+ qed_write_header_sync(s);
+}
+
+int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix)
+{
+ QEDCheck check = {
+ .s = s,
+ .result = result,
+ .nclusters = qed_bytes_to_clusters(s, s->file_size),
+ .request = { .l2_table = NULL },
+ .fix = fix,
+ };
+ int ret;
+
+ check.used_clusters = g_malloc0(((check.nclusters + 31) / 32) *
+ sizeof(check.used_clusters[0]));
+
+ check.result->bfi.total_clusters =
+ (s->header.image_size + s->header.cluster_size - 1) /
+ s->header.cluster_size;
+ ret = qed_check_l1_table(&check, s->l1_table);
+ if (ret == 0) {
+ /* Only check for leaks if entire image was scanned successfully */
+ qed_check_for_leaks(&check);
+
+ if (fix) {
+ qed_check_mark_clean(s, result);
+ }
+ }
+
+ g_free(check.used_clusters);
+ return ret;
+}
diff --git a/contrib/qemu/block/qed-cluster.c b/contrib/qemu/block/qed-cluster.c
new file mode 100644
index 000000000..f64b2af8f
--- /dev/null
+++ b/contrib/qemu/block/qed-cluster.c
@@ -0,0 +1,165 @@
+/*
+ * QEMU Enhanced Disk Format Cluster functions
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qed.h"
+
+/**
+ * Count the number of contiguous data clusters
+ *
+ * @s: QED state
+ * @table: L2 table
+ * @index: First cluster index
+ * @n: Maximum number of clusters
+ * @offset: Set to first cluster offset
+ *
+ * This function scans tables for contiguous clusters. A contiguous run of
+ * clusters may be allocated, unallocated, or zero.
+ */
+static unsigned int qed_count_contiguous_clusters(BDRVQEDState *s,
+ QEDTable *table,
+ unsigned int index,
+ unsigned int n,
+ uint64_t *offset)
+{
+ unsigned int end = MIN(index + n, s->table_nelems);
+ uint64_t last = table->offsets[index];
+ unsigned int i;
+
+ *offset = last;
+
+ for (i = index + 1; i < end; i++) {
+ if (qed_offset_is_unalloc_cluster(last)) {
+ /* Counting unallocated clusters */
+ if (!qed_offset_is_unalloc_cluster(table->offsets[i])) {
+ break;
+ }
+ } else if (qed_offset_is_zero_cluster(last)) {
+ /* Counting zero clusters */
+ if (!qed_offset_is_zero_cluster(table->offsets[i])) {
+ break;
+ }
+ } else {
+ /* Counting allocated clusters */
+ if (table->offsets[i] != last + s->header.cluster_size) {
+ break;
+ }
+ last = table->offsets[i];
+ }
+ }
+ return i - index;
+}
+
+typedef struct {
+ BDRVQEDState *s;
+ uint64_t pos;
+ size_t len;
+
+ QEDRequest *request;
+
+ /* User callback */
+ QEDFindClusterFunc *cb;
+ void *opaque;
+} QEDFindClusterCB;
+
+static void qed_find_cluster_cb(void *opaque, int ret)
+{
+ QEDFindClusterCB *find_cluster_cb = opaque;
+ BDRVQEDState *s = find_cluster_cb->s;
+ QEDRequest *request = find_cluster_cb->request;
+ uint64_t offset = 0;
+ size_t len = 0;
+ unsigned int index;
+ unsigned int n;
+
+ if (ret) {
+ goto out;
+ }
+
+ index = qed_l2_index(s, find_cluster_cb->pos);
+ n = qed_bytes_to_clusters(s,
+ qed_offset_into_cluster(s, find_cluster_cb->pos) +
+ find_cluster_cb->len);
+ n = qed_count_contiguous_clusters(s, request->l2_table->table,
+ index, n, &offset);
+
+ if (qed_offset_is_unalloc_cluster(offset)) {
+ ret = QED_CLUSTER_L2;
+ } else if (qed_offset_is_zero_cluster(offset)) {
+ ret = QED_CLUSTER_ZERO;
+ } else if (qed_check_cluster_offset(s, offset)) {
+ ret = QED_CLUSTER_FOUND;
+ } else {
+ ret = -EINVAL;
+ }
+
+ len = MIN(find_cluster_cb->len, n * s->header.cluster_size -
+ qed_offset_into_cluster(s, find_cluster_cb->pos));
+
+out:
+ find_cluster_cb->cb(find_cluster_cb->opaque, ret, offset, len);
+ g_free(find_cluster_cb);
+}
+
+/**
+ * Find the offset of a data cluster
+ *
+ * @s: QED state
+ * @request: L2 cache entry
+ * @pos: Byte position in device
+ * @len: Number of bytes
+ * @cb: Completion function
+ * @opaque: User data for completion function
+ *
+ * This function translates a position in the block device to an offset in the
+ * image file. It invokes the cb completion callback to report back the
+ * translated offset or unallocated range in the image file.
+ *
+ * If the L2 table exists, request->l2_table points to the L2 table cache entry
+ * and the caller must free the reference when they are finished. The cache
+ * entry is exposed in this way to avoid callers having to read the L2 table
+ * again later during request processing. If request->l2_table is non-NULL it
+ * will be unreferenced before taking on the new cache entry.
+ */
+void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
+ size_t len, QEDFindClusterFunc *cb, void *opaque)
+{
+ QEDFindClusterCB *find_cluster_cb;
+ uint64_t l2_offset;
+
+ /* Limit length to L2 boundary. Requests are broken up at the L2 boundary
+ * so that a request acts on one L2 table at a time.
+ */
+ len = MIN(len, (((pos >> s->l1_shift) + 1) << s->l1_shift) - pos);
+
+ l2_offset = s->l1_table->offsets[qed_l1_index(s, pos)];
+ if (qed_offset_is_unalloc_cluster(l2_offset)) {
+ cb(opaque, QED_CLUSTER_L1, 0, len);
+ return;
+ }
+ if (!qed_check_table_offset(s, l2_offset)) {
+ cb(opaque, -EINVAL, 0, 0);
+ return;
+ }
+
+ find_cluster_cb = g_malloc(sizeof(*find_cluster_cb));
+ find_cluster_cb->s = s;
+ find_cluster_cb->pos = pos;
+ find_cluster_cb->len = len;
+ find_cluster_cb->cb = cb;
+ find_cluster_cb->opaque = opaque;
+ find_cluster_cb->request = request;
+
+ qed_read_l2_table(s, request, l2_offset,
+ qed_find_cluster_cb, find_cluster_cb);
+}
diff --git a/contrib/qemu/block/qed-gencb.c b/contrib/qemu/block/qed-gencb.c
new file mode 100644
index 000000000..7d7ac1ffc
--- /dev/null
+++ b/contrib/qemu/block/qed-gencb.c
@@ -0,0 +1,32 @@
+/*
+ * QEMU Enhanced Disk Format
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qed.h"
+
+void *gencb_alloc(size_t len, BlockDriverCompletionFunc *cb, void *opaque)
+{
+ GenericCB *gencb = g_malloc(len);
+ gencb->cb = cb;
+ gencb->opaque = opaque;
+ return gencb;
+}
+
+void gencb_complete(void *opaque, int ret)
+{
+ GenericCB *gencb = opaque;
+ BlockDriverCompletionFunc *cb = gencb->cb;
+ void *user_opaque = gencb->opaque;
+
+ g_free(gencb);
+ cb(user_opaque, ret);
+}
diff --git a/contrib/qemu/block/qed-l2-cache.c b/contrib/qemu/block/qed-l2-cache.c
new file mode 100644
index 000000000..e9b2aae44
--- /dev/null
+++ b/contrib/qemu/block/qed-l2-cache.c
@@ -0,0 +1,187 @@
+/*
+ * QEMU Enhanced Disk Format L2 Cache
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+/*
+ * L2 table cache usage is as follows:
+ *
+ * An open image has one L2 table cache that is used to avoid accessing the
+ * image file for recently referenced L2 tables.
+ *
+ * Cluster offset lookup translates the logical offset within the block device
+ * to a cluster offset within the image file. This is done by indexing into
+ * the L1 and L2 tables which store cluster offsets. It is here where the L2
+ * table cache serves up recently referenced L2 tables.
+ *
+ * If there is a cache miss, that L2 table is read from the image file and
+ * committed to the cache. Subsequent accesses to that L2 table will be served
+ * from the cache until the table is evicted from the cache.
+ *
+ * L2 tables are also committed to the cache when new L2 tables are allocated
+ * in the image file. Since the L2 table cache is write-through, the new L2
+ * table is first written out to the image file and then committed to the
+ * cache.
+ *
+ * Multiple I/O requests may be using an L2 table cache entry at any given
+ * time. That means an entry may be in use across several requests and
+ * reference counting is needed to free the entry at the correct time. In
+ * particular, an entry evicted from the cache will only be freed once all
+ * references are dropped.
+ *
+ * An in-flight I/O request will hold a reference to a L2 table cache entry for
+ * the period during which it needs to access the L2 table. This includes
+ * cluster offset lookup, L2 table allocation, and L2 table update when a new
+ * data cluster has been allocated.
+ *
+ * An interesting case occurs when two requests need to access an L2 table that
+ * is not in the cache. Since the operation to read the table from the image
+ * file takes some time to complete, both requests may see a cache miss and
+ * start reading the L2 table from the image file. The first to finish will
+ * commit its L2 table into the cache. When the second tries to commit its
+ * table will be deleted in favor of the existing cache entry.
+ */
+
+#include "trace.h"
+#include "qed.h"
+
+/* Each L2 holds 2GB so this let's us fully cache a 100GB disk */
+#define MAX_L2_CACHE_SIZE 50
+
+/**
+ * Initialize the L2 cache
+ */
+void qed_init_l2_cache(L2TableCache *l2_cache)
+{
+ QTAILQ_INIT(&l2_cache->entries);
+ l2_cache->n_entries = 0;
+}
+
+/**
+ * Free the L2 cache
+ */
+void qed_free_l2_cache(L2TableCache *l2_cache)
+{
+ CachedL2Table *entry, *next_entry;
+
+ QTAILQ_FOREACH_SAFE(entry, &l2_cache->entries, node, next_entry) {
+ qemu_vfree(entry->table);
+ g_free(entry);
+ }
+}
+
+/**
+ * Allocate an uninitialized entry from the cache
+ *
+ * The returned entry has a reference count of 1 and is owned by the caller.
+ * The caller must allocate the actual table field for this entry and it must
+ * be freeable using qemu_vfree().
+ */
+CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache)
+{
+ CachedL2Table *entry;
+
+ entry = g_malloc0(sizeof(*entry));
+ entry->ref++;
+
+ trace_qed_alloc_l2_cache_entry(l2_cache, entry);
+
+ return entry;
+}
+
+/**
+ * Decrease an entry's reference count and free if necessary when the reference
+ * count drops to zero.
+ */
+void qed_unref_l2_cache_entry(CachedL2Table *entry)
+{
+ if (!entry) {
+ return;
+ }
+
+ entry->ref--;
+ trace_qed_unref_l2_cache_entry(entry, entry->ref);
+ if (entry->ref == 0) {
+ qemu_vfree(entry->table);
+ g_free(entry);
+ }
+}
+
+/**
+ * Find an entry in the L2 cache. This may return NULL and it's up to the
+ * caller to satisfy the cache miss.
+ *
+ * For a cached entry, this function increases the reference count and returns
+ * the entry.
+ */
+CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset)
+{
+ CachedL2Table *entry;
+
+ QTAILQ_FOREACH(entry, &l2_cache->entries, node) {
+ if (entry->offset == offset) {
+ trace_qed_find_l2_cache_entry(l2_cache, entry, offset, entry->ref);
+ entry->ref++;
+ return entry;
+ }
+ }
+ return NULL;
+}
+
+/**
+ * Commit an L2 cache entry into the cache. This is meant to be used as part of
+ * the process to satisfy a cache miss. A caller would allocate an entry which
+ * is not actually in the L2 cache and then once the entry was valid and
+ * present on disk, the entry can be committed into the cache.
+ *
+ * Since the cache is write-through, it's important that this function is not
+ * called until the entry is present on disk and the L1 has been updated to
+ * point to the entry.
+ *
+ * N.B. This function steals a reference to the l2_table from the caller so the
+ * caller must obtain a new reference by issuing a call to
+ * qed_find_l2_cache_entry().
+ */
+void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table)
+{
+ CachedL2Table *entry;
+
+ entry = qed_find_l2_cache_entry(l2_cache, l2_table->offset);
+ if (entry) {
+ qed_unref_l2_cache_entry(entry);
+ qed_unref_l2_cache_entry(l2_table);
+ return;
+ }
+
+ /* Evict an unused cache entry so we have space. If all entries are in use
+ * we can grow the cache temporarily and we try to shrink back down later.
+ */
+ if (l2_cache->n_entries >= MAX_L2_CACHE_SIZE) {
+ CachedL2Table *next;
+ QTAILQ_FOREACH_SAFE(entry, &l2_cache->entries, node, next) {
+ if (entry->ref > 1) {
+ continue;
+ }
+
+ QTAILQ_REMOVE(&l2_cache->entries, entry, node);
+ l2_cache->n_entries--;
+ qed_unref_l2_cache_entry(entry);
+
+ /* Stop evicting when we've shrunk back to max size */
+ if (l2_cache->n_entries < MAX_L2_CACHE_SIZE) {
+ break;
+ }
+ }
+ }
+
+ l2_cache->n_entries++;
+ QTAILQ_INSERT_TAIL(&l2_cache->entries, l2_table, node);
+}
diff --git a/contrib/qemu/block/qed-table.c b/contrib/qemu/block/qed-table.c
new file mode 100644
index 000000000..76d2dcccf
--- /dev/null
+++ b/contrib/qemu/block/qed-table.c
@@ -0,0 +1,296 @@
+/*
+ * QEMU Enhanced Disk Format Table I/O
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "trace.h"
+#include "qemu/sockets.h" /* for EINPROGRESS on Windows */
+#include "qed.h"
+
+typedef struct {
+ GenericCB gencb;
+ BDRVQEDState *s;
+ QEDTable *table;
+
+ struct iovec iov;
+ QEMUIOVector qiov;
+} QEDReadTableCB;
+
+static void qed_read_table_cb(void *opaque, int ret)
+{
+ QEDReadTableCB *read_table_cb = opaque;
+ QEDTable *table = read_table_cb->table;
+ int noffsets = read_table_cb->qiov.size / sizeof(uint64_t);
+ int i;
+
+ /* Handle I/O error */
+ if (ret) {
+ goto out;
+ }
+
+ /* Byteswap offsets */
+ for (i = 0; i < noffsets; i++) {
+ table->offsets[i] = le64_to_cpu(table->offsets[i]);
+ }
+
+out:
+ /* Completion */
+ trace_qed_read_table_cb(read_table_cb->s, read_table_cb->table, ret);
+ gencb_complete(&read_table_cb->gencb, ret);
+}
+
+static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ QEDReadTableCB *read_table_cb = gencb_alloc(sizeof(*read_table_cb),
+ cb, opaque);
+ QEMUIOVector *qiov = &read_table_cb->qiov;
+
+ trace_qed_read_table(s, offset, table);
+
+ read_table_cb->s = s;
+ read_table_cb->table = table;
+ read_table_cb->iov.iov_base = table->offsets,
+ read_table_cb->iov.iov_len = s->header.cluster_size * s->header.table_size,
+
+ qemu_iovec_init_external(qiov, &read_table_cb->iov, 1);
+ bdrv_aio_readv(s->bs->file, offset / BDRV_SECTOR_SIZE, qiov,
+ qiov->size / BDRV_SECTOR_SIZE,
+ qed_read_table_cb, read_table_cb);
+}
+
+typedef struct {
+ GenericCB gencb;
+ BDRVQEDState *s;
+ QEDTable *orig_table;
+ QEDTable *table;
+ bool flush; /* flush after write? */
+
+ struct iovec iov;
+ QEMUIOVector qiov;
+} QEDWriteTableCB;
+
+static void qed_write_table_cb(void *opaque, int ret)
+{
+ QEDWriteTableCB *write_table_cb = opaque;
+
+ trace_qed_write_table_cb(write_table_cb->s,
+ write_table_cb->orig_table,
+ write_table_cb->flush,
+ ret);
+
+ if (ret) {
+ goto out;
+ }
+
+ if (write_table_cb->flush) {
+ /* We still need to flush first */
+ write_table_cb->flush = false;
+ bdrv_aio_flush(write_table_cb->s->bs, qed_write_table_cb,
+ write_table_cb);
+ return;
+ }
+
+out:
+ qemu_vfree(write_table_cb->table);
+ gencb_complete(&write_table_cb->gencb, ret);
+}
+
+/**
+ * Write out an updated part or all of a table
+ *
+ * @s: QED state
+ * @offset: Offset of table in image file, in bytes
+ * @table: Table
+ * @index: Index of first element
+ * @n: Number of elements
+ * @flush: Whether or not to sync to disk
+ * @cb: Completion function
+ * @opaque: Argument for completion function
+ */
+static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
+ unsigned int index, unsigned int n, bool flush,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ QEDWriteTableCB *write_table_cb;
+ unsigned int sector_mask = BDRV_SECTOR_SIZE / sizeof(uint64_t) - 1;
+ unsigned int start, end, i;
+ size_t len_bytes;
+
+ trace_qed_write_table(s, offset, table, index, n);
+
+ /* Calculate indices of the first and one after last elements */
+ start = index & ~sector_mask;
+ end = (index + n + sector_mask) & ~sector_mask;
+
+ len_bytes = (end - start) * sizeof(uint64_t);
+
+ write_table_cb = gencb_alloc(sizeof(*write_table_cb), cb, opaque);
+ write_table_cb->s = s;
+ write_table_cb->orig_table = table;
+ write_table_cb->flush = flush;
+ write_table_cb->table = qemu_blockalign(s->bs, len_bytes);
+ write_table_cb->iov.iov_base = write_table_cb->table->offsets;
+ write_table_cb->iov.iov_len = len_bytes;
+ qemu_iovec_init_external(&write_table_cb->qiov, &write_table_cb->iov, 1);
+
+ /* Byteswap table */
+ for (i = start; i < end; i++) {
+ uint64_t le_offset = cpu_to_le64(table->offsets[i]);
+ write_table_cb->table->offsets[i - start] = le_offset;
+ }
+
+ /* Adjust for offset into table */
+ offset += start * sizeof(uint64_t);
+
+ bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE,
+ &write_table_cb->qiov,
+ write_table_cb->qiov.size / BDRV_SECTOR_SIZE,
+ qed_write_table_cb, write_table_cb);
+}
+
+/**
+ * Propagate return value from async callback
+ */
+static void qed_sync_cb(void *opaque, int ret)
+{
+ *(int *)opaque = ret;
+}
+
+int qed_read_l1_table_sync(BDRVQEDState *s)
+{
+ int ret = -EINPROGRESS;
+
+ qed_read_table(s, s->header.l1_table_offset,
+ s->l1_table, qed_sync_cb, &ret);
+ while (ret == -EINPROGRESS) {
+ qemu_aio_wait();
+ }
+
+ return ret;
+}
+
+void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BLKDBG_EVENT(s->bs->file, BLKDBG_L1_UPDATE);
+ qed_write_table(s, s->header.l1_table_offset,
+ s->l1_table, index, n, false, cb, opaque);
+}
+
+int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index,
+ unsigned int n)
+{
+ int ret = -EINPROGRESS;
+
+ qed_write_l1_table(s, index, n, qed_sync_cb, &ret);
+ while (ret == -EINPROGRESS) {
+ qemu_aio_wait();
+ }
+
+ return ret;
+}
+
+typedef struct {
+ GenericCB gencb;
+ BDRVQEDState *s;
+ uint64_t l2_offset;
+ QEDRequest *request;
+} QEDReadL2TableCB;
+
+static void qed_read_l2_table_cb(void *opaque, int ret)
+{
+ QEDReadL2TableCB *read_l2_table_cb = opaque;
+ QEDRequest *request = read_l2_table_cb->request;
+ BDRVQEDState *s = read_l2_table_cb->s;
+ CachedL2Table *l2_table = request->l2_table;
+ uint64_t l2_offset = read_l2_table_cb->l2_offset;
+
+ if (ret) {
+ /* can't trust loaded L2 table anymore */
+ qed_unref_l2_cache_entry(l2_table);
+ request->l2_table = NULL;
+ } else {
+ l2_table->offset = l2_offset;
+
+ qed_commit_l2_cache_entry(&s->l2_cache, l2_table);
+
+ /* This is guaranteed to succeed because we just committed the entry
+ * to the cache.
+ */
+ request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
+ assert(request->l2_table != NULL);
+ }
+
+ gencb_complete(&read_l2_table_cb->gencb, ret);
+}
+
+void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ QEDReadL2TableCB *read_l2_table_cb;
+
+ qed_unref_l2_cache_entry(request->l2_table);
+
+ /* Check for cached L2 entry */
+ request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, offset);
+ if (request->l2_table) {
+ cb(opaque, 0);
+ return;
+ }
+
+ request->l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);
+ request->l2_table->table = qed_alloc_table(s);
+
+ read_l2_table_cb = gencb_alloc(sizeof(*read_l2_table_cb), cb, opaque);
+ read_l2_table_cb->s = s;
+ read_l2_table_cb->l2_offset = offset;
+ read_l2_table_cb->request = request;
+
+ BLKDBG_EVENT(s->bs->file, BLKDBG_L2_LOAD);
+ qed_read_table(s, offset, request->l2_table->table,
+ qed_read_l2_table_cb, read_l2_table_cb);
+}
+
+int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset)
+{
+ int ret = -EINPROGRESS;
+
+ qed_read_l2_table(s, request, offset, qed_sync_cb, &ret);
+ while (ret == -EINPROGRESS) {
+ qemu_aio_wait();
+ }
+
+ return ret;
+}
+
+void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
+ unsigned int index, unsigned int n, bool flush,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BLKDBG_EVENT(s->bs->file, BLKDBG_L2_UPDATE);
+ qed_write_table(s, request->l2_table->offset,
+ request->l2_table->table, index, n, flush, cb, opaque);
+}
+
+int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
+ unsigned int index, unsigned int n, bool flush)
+{
+ int ret = -EINPROGRESS;
+
+ qed_write_l2_table(s, request, index, n, flush, qed_sync_cb, &ret);
+ while (ret == -EINPROGRESS) {
+ qemu_aio_wait();
+ }
+
+ return ret;
+}
diff --git a/contrib/qemu/block/qed.c b/contrib/qemu/block/qed.c
new file mode 100644
index 000000000..f767b0528
--- /dev/null
+++ b/contrib/qemu/block/qed.c
@@ -0,0 +1,1596 @@
+/*
+ * QEMU Enhanced Disk Format
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qemu/timer.h"
+#include "trace.h"
+#include "qed.h"
+#include "qapi/qmp/qerror.h"
+#include "migration/migration.h"
+
+static void qed_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+ QEDAIOCB *acb = (QEDAIOCB *)blockacb;
+ bool finished = false;
+
+ /* Wait for the request to finish */
+ acb->finished = &finished;
+ while (!finished) {
+ qemu_aio_wait();
+ }
+}
+
+static const AIOCBInfo qed_aiocb_info = {
+ .aiocb_size = sizeof(QEDAIOCB),
+ .cancel = qed_aio_cancel,
+};
+
+static int bdrv_qed_probe(const uint8_t *buf, int buf_size,
+ const char *filename)
+{
+ const QEDHeader *header = (const QEDHeader *)buf;
+
+ if (buf_size < sizeof(*header)) {
+ return 0;
+ }
+ if (le32_to_cpu(header->magic) != QED_MAGIC) {
+ return 0;
+ }
+ return 100;
+}
+
+/**
+ * Check whether an image format is raw
+ *
+ * @fmt: Backing file format, may be NULL
+ */
+static bool qed_fmt_is_raw(const char *fmt)
+{
+ return fmt && strcmp(fmt, "raw") == 0;
+}
+
+static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu)
+{
+ cpu->magic = le32_to_cpu(le->magic);
+ cpu->cluster_size = le32_to_cpu(le->cluster_size);
+ cpu->table_size = le32_to_cpu(le->table_size);
+ cpu->header_size = le32_to_cpu(le->header_size);
+ cpu->features = le64_to_cpu(le->features);
+ cpu->compat_features = le64_to_cpu(le->compat_features);
+ cpu->autoclear_features = le64_to_cpu(le->autoclear_features);
+ cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset);
+ cpu->image_size = le64_to_cpu(le->image_size);
+ cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset);
+ cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size);
+}
+
+static void qed_header_cpu_to_le(const QEDHeader *cpu, QEDHeader *le)
+{
+ le->magic = cpu_to_le32(cpu->magic);
+ le->cluster_size = cpu_to_le32(cpu->cluster_size);
+ le->table_size = cpu_to_le32(cpu->table_size);
+ le->header_size = cpu_to_le32(cpu->header_size);
+ le->features = cpu_to_le64(cpu->features);
+ le->compat_features = cpu_to_le64(cpu->compat_features);
+ le->autoclear_features = cpu_to_le64(cpu->autoclear_features);
+ le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset);
+ le->image_size = cpu_to_le64(cpu->image_size);
+ le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset);
+ le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size);
+}
+
+int qed_write_header_sync(BDRVQEDState *s)
+{
+ QEDHeader le;
+ int ret;
+
+ qed_header_cpu_to_le(&s->header, &le);
+ ret = bdrv_pwrite(s->bs->file, 0, &le, sizeof(le));
+ if (ret != sizeof(le)) {
+ return ret;
+ }
+ return 0;
+}
+
+typedef struct {
+ GenericCB gencb;
+ BDRVQEDState *s;
+ struct iovec iov;
+ QEMUIOVector qiov;
+ int nsectors;
+ uint8_t *buf;
+} QEDWriteHeaderCB;
+
+static void qed_write_header_cb(void *opaque, int ret)
+{
+ QEDWriteHeaderCB *write_header_cb = opaque;
+
+ qemu_vfree(write_header_cb->buf);
+ gencb_complete(write_header_cb, ret);
+}
+
+static void qed_write_header_read_cb(void *opaque, int ret)
+{
+ QEDWriteHeaderCB *write_header_cb = opaque;
+ BDRVQEDState *s = write_header_cb->s;
+
+ if (ret) {
+ qed_write_header_cb(write_header_cb, ret);
+ return;
+ }
+
+ /* Update header */
+ qed_header_cpu_to_le(&s->header, (QEDHeader *)write_header_cb->buf);
+
+ bdrv_aio_writev(s->bs->file, 0, &write_header_cb->qiov,
+ write_header_cb->nsectors, qed_write_header_cb,
+ write_header_cb);
+}
+
+/**
+ * Update header in-place (does not rewrite backing filename or other strings)
+ *
+ * This function only updates known header fields in-place and does not affect
+ * extra data after the QED header.
+ */
+static void qed_write_header(BDRVQEDState *s, BlockDriverCompletionFunc cb,
+ void *opaque)
+{
+ /* We must write full sectors for O_DIRECT but cannot necessarily generate
+ * the data following the header if an unrecognized compat feature is
+ * active. Therefore, first read the sectors containing the header, update
+ * them, and write back.
+ */
+
+ int nsectors = (sizeof(QEDHeader) + BDRV_SECTOR_SIZE - 1) /
+ BDRV_SECTOR_SIZE;
+ size_t len = nsectors * BDRV_SECTOR_SIZE;
+ QEDWriteHeaderCB *write_header_cb = gencb_alloc(sizeof(*write_header_cb),
+ cb, opaque);
+
+ write_header_cb->s = s;
+ write_header_cb->nsectors = nsectors;
+ write_header_cb->buf = qemu_blockalign(s->bs, len);
+ write_header_cb->iov.iov_base = write_header_cb->buf;
+ write_header_cb->iov.iov_len = len;
+ qemu_iovec_init_external(&write_header_cb->qiov, &write_header_cb->iov, 1);
+
+ bdrv_aio_readv(s->bs->file, 0, &write_header_cb->qiov, nsectors,
+ qed_write_header_read_cb, write_header_cb);
+}
+
+static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size)
+{
+ uint64_t table_entries;
+ uint64_t l2_size;
+
+ table_entries = (table_size * cluster_size) / sizeof(uint64_t);
+ l2_size = table_entries * cluster_size;
+
+ return l2_size * table_entries;
+}
+
+static bool qed_is_cluster_size_valid(uint32_t cluster_size)
+{
+ if (cluster_size < QED_MIN_CLUSTER_SIZE ||
+ cluster_size > QED_MAX_CLUSTER_SIZE) {
+ return false;
+ }
+ if (cluster_size & (cluster_size - 1)) {
+ return false; /* not power of 2 */
+ }
+ return true;
+}
+
+static bool qed_is_table_size_valid(uint32_t table_size)
+{
+ if (table_size < QED_MIN_TABLE_SIZE ||
+ table_size > QED_MAX_TABLE_SIZE) {
+ return false;
+ }
+ if (table_size & (table_size - 1)) {
+ return false; /* not power of 2 */
+ }
+ return true;
+}
+
+static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size,
+ uint32_t table_size)
+{
+ if (image_size % BDRV_SECTOR_SIZE != 0) {
+ return false; /* not multiple of sector size */
+ }
+ if (image_size > qed_max_image_size(cluster_size, table_size)) {
+ return false; /* image is too large */
+ }
+ return true;
+}
+
+/**
+ * Read a string of known length from the image file
+ *
+ * @file: Image file
+ * @offset: File offset to start of string, in bytes
+ * @n: String length in bytes
+ * @buf: Destination buffer
+ * @buflen: Destination buffer length in bytes
+ * @ret: 0 on success, -errno on failure
+ *
+ * The string is NUL-terminated.
+ */
+static int qed_read_string(BlockDriverState *file, uint64_t offset, size_t n,
+ char *buf, size_t buflen)
+{
+ int ret;
+ if (n >= buflen) {
+ return -EINVAL;
+ }
+ ret = bdrv_pread(file, offset, buf, n);
+ if (ret < 0) {
+ return ret;
+ }
+ buf[n] = '\0';
+ return 0;
+}
+
+/**
+ * Allocate new clusters
+ *
+ * @s: QED state
+ * @n: Number of contiguous clusters to allocate
+ * @ret: Offset of first allocated cluster
+ *
+ * This function only produces the offset where the new clusters should be
+ * written. It updates BDRVQEDState but does not make any changes to the image
+ * file.
+ */
+static uint64_t qed_alloc_clusters(BDRVQEDState *s, unsigned int n)
+{
+ uint64_t offset = s->file_size;
+ s->file_size += n * s->header.cluster_size;
+ return offset;
+}
+
+QEDTable *qed_alloc_table(BDRVQEDState *s)
+{
+ /* Honor O_DIRECT memory alignment requirements */
+ return qemu_blockalign(s->bs,
+ s->header.cluster_size * s->header.table_size);
+}
+
+/**
+ * Allocate a new zeroed L2 table
+ */
+static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
+{
+ CachedL2Table *l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);
+
+ l2_table->table = qed_alloc_table(s);
+ l2_table->offset = qed_alloc_clusters(s, s->header.table_size);
+
+ memset(l2_table->table->offsets, 0,
+ s->header.cluster_size * s->header.table_size);
+ return l2_table;
+}
+
+static void qed_aio_next_io(void *opaque, int ret);
+
+static void qed_plug_allocating_write_reqs(BDRVQEDState *s)
+{
+ assert(!s->allocating_write_reqs_plugged);
+
+ s->allocating_write_reqs_plugged = true;
+}
+
+static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)
+{
+ QEDAIOCB *acb;
+
+ assert(s->allocating_write_reqs_plugged);
+
+ s->allocating_write_reqs_plugged = false;
+
+ acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
+ if (acb) {
+ qed_aio_next_io(acb, 0);
+ }
+}
+
+static void qed_finish_clear_need_check(void *opaque, int ret)
+{
+ /* Do nothing */
+}
+
+static void qed_flush_after_clear_need_check(void *opaque, int ret)
+{
+ BDRVQEDState *s = opaque;
+
+ bdrv_aio_flush(s->bs, qed_finish_clear_need_check, s);
+
+ /* No need to wait until flush completes */
+ qed_unplug_allocating_write_reqs(s);
+}
+
+static void qed_clear_need_check(void *opaque, int ret)
+{
+ BDRVQEDState *s = opaque;
+
+ if (ret) {
+ qed_unplug_allocating_write_reqs(s);
+ return;
+ }
+
+ s->header.features &= ~QED_F_NEED_CHECK;
+ qed_write_header(s, qed_flush_after_clear_need_check, s);
+}
+
+static void qed_need_check_timer_cb(void *opaque)
+{
+ BDRVQEDState *s = opaque;
+
+ /* The timer should only fire when allocating writes have drained */
+ assert(!QSIMPLEQ_FIRST(&s->allocating_write_reqs));
+
+ trace_qed_need_check_timer_cb(s);
+
+ qed_plug_allocating_write_reqs(s);
+
+ /* Ensure writes are on disk before clearing flag */
+ bdrv_aio_flush(s->bs, qed_clear_need_check, s);
+}
+
+static void qed_start_need_check_timer(BDRVQEDState *s)
+{
+ trace_qed_start_need_check_timer(s);
+
+ /* Use vm_clock so we don't alter the image file while suspended for
+ * migration.
+ */
+ qemu_mod_timer(s->need_check_timer, qemu_get_clock_ns(vm_clock) +
+ get_ticks_per_sec() * QED_NEED_CHECK_TIMEOUT);
+}
+
+/* It's okay to call this multiple times or when no timer is started */
+static void qed_cancel_need_check_timer(BDRVQEDState *s)
+{
+ trace_qed_cancel_need_check_timer(s);
+ qemu_del_timer(s->need_check_timer);
+}
+
+static void bdrv_qed_rebind(BlockDriverState *bs)
+{
+ BDRVQEDState *s = bs->opaque;
+ s->bs = bs;
+}
+
+static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags)
+{
+ BDRVQEDState *s = bs->opaque;
+ QEDHeader le_header;
+ int64_t file_size;
+ int ret;
+
+ s->bs = bs;
+ QSIMPLEQ_INIT(&s->allocating_write_reqs);
+
+ ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header));
+ if (ret < 0) {
+ return ret;
+ }
+ qed_header_le_to_cpu(&le_header, &s->header);
+
+ if (s->header.magic != QED_MAGIC) {
+ return -EMEDIUMTYPE;
+ }
+ if (s->header.features & ~QED_FEATURE_MASK) {
+ /* image uses unsupported feature bits */
+ char buf[64];
+ snprintf(buf, sizeof(buf), "%" PRIx64,
+ s->header.features & ~QED_FEATURE_MASK);
+ qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
+ bs->device_name, "QED", buf);
+ return -ENOTSUP;
+ }
+ if (!qed_is_cluster_size_valid(s->header.cluster_size)) {
+ return -EINVAL;
+ }
+
+ /* Round down file size to the last cluster */
+ file_size = bdrv_getlength(bs->file);
+ if (file_size < 0) {
+ return file_size;
+ }
+ s->file_size = qed_start_of_cluster(s, file_size);
+
+ if (!qed_is_table_size_valid(s->header.table_size)) {
+ return -EINVAL;
+ }
+ if (!qed_is_image_size_valid(s->header.image_size,
+ s->header.cluster_size,
+ s->header.table_size)) {
+ return -EINVAL;
+ }
+ if (!qed_check_table_offset(s, s->header.l1_table_offset)) {
+ return -EINVAL;
+ }
+
+ s->table_nelems = (s->header.cluster_size * s->header.table_size) /
+ sizeof(uint64_t);
+ s->l2_shift = ffs(s->header.cluster_size) - 1;
+ s->l2_mask = s->table_nelems - 1;
+ s->l1_shift = s->l2_shift + ffs(s->table_nelems) - 1;
+
+ if ((s->header.features & QED_F_BACKING_FILE)) {
+ if ((uint64_t)s->header.backing_filename_offset +
+ s->header.backing_filename_size >
+ s->header.cluster_size * s->header.header_size) {
+ return -EINVAL;
+ }
+
+ ret = qed_read_string(bs->file, s->header.backing_filename_offset,
+ s->header.backing_filename_size, bs->backing_file,
+ sizeof(bs->backing_file));
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) {
+ pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw");
+ }
+ }
+
+ /* Reset unknown autoclear feature bits. This is a backwards
+ * compatibility mechanism that allows images to be opened by older
+ * programs, which "knock out" unknown feature bits. When an image is
+ * opened by a newer program again it can detect that the autoclear
+ * feature is no longer valid.
+ */
+ if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 &&
+ !bdrv_is_read_only(bs->file) && !(flags & BDRV_O_INCOMING)) {
+ s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK;
+
+ ret = qed_write_header_sync(s);
+ if (ret) {
+ return ret;
+ }
+
+ /* From here on only known autoclear feature bits are valid */
+ bdrv_flush(bs->file);
+ }
+
+ s->l1_table = qed_alloc_table(s);
+ qed_init_l2_cache(&s->l2_cache);
+
+ ret = qed_read_l1_table_sync(s);
+ if (ret) {
+ goto out;
+ }
+
+ /* If image was not closed cleanly, check consistency */
+ if (!(flags & BDRV_O_CHECK) && (s->header.features & QED_F_NEED_CHECK)) {
+ /* Read-only images cannot be fixed. There is no risk of corruption
+ * since write operations are not possible. Therefore, allow
+ * potentially inconsistent images to be opened read-only. This can
+ * aid data recovery from an otherwise inconsistent image.
+ */
+ if (!bdrv_is_read_only(bs->file) &&
+ !(flags & BDRV_O_INCOMING)) {
+ BdrvCheckResult result = {0};
+
+ ret = qed_check(s, &result, true);
+ if (ret) {
+ goto out;
+ }
+ }
+ }
+
+ s->need_check_timer = qemu_new_timer_ns(vm_clock,
+ qed_need_check_timer_cb, s);
+
+out:
+ if (ret) {
+ qed_free_l2_cache(&s->l2_cache);
+ qemu_vfree(s->l1_table);
+ }
+ return ret;
+}
+
+/* We have nothing to do for QED reopen, stubs just return
+ * success */
+static int bdrv_qed_reopen_prepare(BDRVReopenState *state,
+ BlockReopenQueue *queue, Error **errp)
+{
+ return 0;
+}
+
+static void bdrv_qed_close(BlockDriverState *bs)
+{
+ BDRVQEDState *s = bs->opaque;
+
+ qed_cancel_need_check_timer(s);
+ qemu_free_timer(s->need_check_timer);
+
+ /* Ensure writes reach stable storage */
+ bdrv_flush(bs->file);
+
+ /* Clean shutdown, no check required on next open */
+ if (s->header.features & QED_F_NEED_CHECK) {
+ s->header.features &= ~QED_F_NEED_CHECK;
+ qed_write_header_sync(s);
+ }
+
+ qed_free_l2_cache(&s->l2_cache);
+ qemu_vfree(s->l1_table);
+}
+
+static int qed_create(const char *filename, uint32_t cluster_size,
+ uint64_t image_size, uint32_t table_size,
+ const char *backing_file, const char *backing_fmt)
+{
+ QEDHeader header = {
+ .magic = QED_MAGIC,
+ .cluster_size = cluster_size,
+ .table_size = table_size,
+ .header_size = 1,
+ .features = 0,
+ .compat_features = 0,
+ .l1_table_offset = cluster_size,
+ .image_size = image_size,
+ };
+ QEDHeader le_header;
+ uint8_t *l1_table = NULL;
+ size_t l1_size = header.cluster_size * header.table_size;
+ int ret = 0;
+ BlockDriverState *bs = NULL;
+
+ ret = bdrv_create_file(filename, NULL);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR | BDRV_O_CACHE_WB);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* File must start empty and grow, check truncate is supported */
+ ret = bdrv_truncate(bs, 0);
+ if (ret < 0) {
+ goto out;
+ }
+
+ if (backing_file) {
+ header.features |= QED_F_BACKING_FILE;
+ header.backing_filename_offset = sizeof(le_header);
+ header.backing_filename_size = strlen(backing_file);
+
+ if (qed_fmt_is_raw(backing_fmt)) {
+ header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
+ }
+ }
+
+ qed_header_cpu_to_le(&header, &le_header);
+ ret = bdrv_pwrite(bs, 0, &le_header, sizeof(le_header));
+ if (ret < 0) {
+ goto out;
+ }
+ ret = bdrv_pwrite(bs, sizeof(le_header), backing_file,
+ header.backing_filename_size);
+ if (ret < 0) {
+ goto out;
+ }
+
+ l1_table = g_malloc0(l1_size);
+ ret = bdrv_pwrite(bs, header.l1_table_offset, l1_table, l1_size);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = 0; /* success */
+out:
+ g_free(l1_table);
+ bdrv_delete(bs);
+ return ret;
+}
+
+static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options)
+{
+ uint64_t image_size = 0;
+ uint32_t cluster_size = QED_DEFAULT_CLUSTER_SIZE;
+ uint32_t table_size = QED_DEFAULT_TABLE_SIZE;
+ const char *backing_file = NULL;
+ const char *backing_fmt = NULL;
+
+ while (options && options->name) {
+ if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+ image_size = options->value.n;
+ } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
+ backing_file = options->value.s;
+ } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) {
+ backing_fmt = options->value.s;
+ } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) {
+ if (options->value.n) {
+ cluster_size = options->value.n;
+ }
+ } else if (!strcmp(options->name, BLOCK_OPT_TABLE_SIZE)) {
+ if (options->value.n) {
+ table_size = options->value.n;
+ }
+ }
+ options++;
+ }
+
+ if (!qed_is_cluster_size_valid(cluster_size)) {
+ fprintf(stderr, "QED cluster size must be within range [%u, %u] and power of 2\n",
+ QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE);
+ return -EINVAL;
+ }
+ if (!qed_is_table_size_valid(table_size)) {
+ fprintf(stderr, "QED table size must be within range [%u, %u] and power of 2\n",
+ QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE);
+ return -EINVAL;
+ }
+ if (!qed_is_image_size_valid(image_size, cluster_size, table_size)) {
+ fprintf(stderr, "QED image size must be a non-zero multiple of "
+ "cluster size and less than %" PRIu64 " bytes\n",
+ qed_max_image_size(cluster_size, table_size));
+ return -EINVAL;
+ }
+
+ return qed_create(filename, cluster_size, image_size, table_size,
+ backing_file, backing_fmt);
+}
+
+typedef struct {
+ Coroutine *co;
+ int is_allocated;
+ int *pnum;
+} QEDIsAllocatedCB;
+
+static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len)
+{
+ QEDIsAllocatedCB *cb = opaque;
+ *cb->pnum = len / BDRV_SECTOR_SIZE;
+ cb->is_allocated = (ret == QED_CLUSTER_FOUND || ret == QED_CLUSTER_ZERO);
+ if (cb->co) {
+ qemu_coroutine_enter(cb->co, NULL);
+ }
+}
+
+static int coroutine_fn bdrv_qed_co_is_allocated(BlockDriverState *bs,
+ int64_t sector_num,
+ int nb_sectors, int *pnum)
+{
+ BDRVQEDState *s = bs->opaque;
+ uint64_t pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE;
+ size_t len = (size_t)nb_sectors * BDRV_SECTOR_SIZE;
+ QEDIsAllocatedCB cb = {
+ .is_allocated = -1,
+ .pnum = pnum,
+ };
+ QEDRequest request = { .l2_table = NULL };
+
+ qed_find_cluster(s, &request, pos, len, qed_is_allocated_cb, &cb);
+
+ /* Now sleep if the callback wasn't invoked immediately */
+ while (cb.is_allocated == -1) {
+ cb.co = qemu_coroutine_self();
+ qemu_coroutine_yield();
+ }
+
+ qed_unref_l2_cache_entry(request.l2_table);
+
+ return cb.is_allocated;
+}
+
+static int bdrv_qed_make_empty(BlockDriverState *bs)
+{
+ return -ENOTSUP;
+}
+
+static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
+{
+ return acb->common.bs->opaque;
+}
+
+/**
+ * Read from the backing file or zero-fill if no backing file
+ *
+ * @s: QED state
+ * @pos: Byte position in device
+ * @qiov: Destination I/O vector
+ * @cb: Completion function
+ * @opaque: User data for completion function
+ *
+ * This function reads qiov->size bytes starting at pos from the backing file.
+ * If there is no backing file then zeroes are read.
+ */
+static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
+ QEMUIOVector *qiov,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ uint64_t backing_length = 0;
+ size_t size;
+
+ /* If there is a backing file, get its length. Treat the absence of a
+ * backing file like a zero length backing file.
+ */
+ if (s->bs->backing_hd) {
+ int64_t l = bdrv_getlength(s->bs->backing_hd);
+ if (l < 0) {
+ cb(opaque, l);
+ return;
+ }
+ backing_length = l;
+ }
+
+ /* Zero all sectors if reading beyond the end of the backing file */
+ if (pos >= backing_length ||
+ pos + qiov->size > backing_length) {
+ qemu_iovec_memset(qiov, 0, 0, qiov->size);
+ }
+
+ /* Complete now if there are no backing file sectors to read */
+ if (pos >= backing_length) {
+ cb(opaque, 0);
+ return;
+ }
+
+ /* If the read straddles the end of the backing file, shorten it */
+ size = MIN((uint64_t)backing_length - pos, qiov->size);
+
+ BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO);
+ bdrv_aio_readv(s->bs->backing_hd, pos / BDRV_SECTOR_SIZE,
+ qiov, size / BDRV_SECTOR_SIZE, cb, opaque);
+}
+
+typedef struct {
+ GenericCB gencb;
+ BDRVQEDState *s;
+ QEMUIOVector qiov;
+ struct iovec iov;
+ uint64_t offset;
+} CopyFromBackingFileCB;
+
+static void qed_copy_from_backing_file_cb(void *opaque, int ret)
+{
+ CopyFromBackingFileCB *copy_cb = opaque;
+ qemu_vfree(copy_cb->iov.iov_base);
+ gencb_complete(&copy_cb->gencb, ret);
+}
+
+static void qed_copy_from_backing_file_write(void *opaque, int ret)
+{
+ CopyFromBackingFileCB *copy_cb = opaque;
+ BDRVQEDState *s = copy_cb->s;
+
+ if (ret) {
+ qed_copy_from_backing_file_cb(copy_cb, ret);
+ return;
+ }
+
+ BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE);
+ bdrv_aio_writev(s->bs->file, copy_cb->offset / BDRV_SECTOR_SIZE,
+ &copy_cb->qiov, copy_cb->qiov.size / BDRV_SECTOR_SIZE,
+ qed_copy_from_backing_file_cb, copy_cb);
+}
+
+/**
+ * Copy data from backing file into the image
+ *
+ * @s: QED state
+ * @pos: Byte position in device
+ * @len: Number of bytes
+ * @offset: Byte offset in image file
+ * @cb: Completion function
+ * @opaque: User data for completion function
+ */
+static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos,
+ uint64_t len, uint64_t offset,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ CopyFromBackingFileCB *copy_cb;
+
+ /* Skip copy entirely if there is no work to do */
+ if (len == 0) {
+ cb(opaque, 0);
+ return;
+ }
+
+ copy_cb = gencb_alloc(sizeof(*copy_cb), cb, opaque);
+ copy_cb->s = s;
+ copy_cb->offset = offset;
+ copy_cb->iov.iov_base = qemu_blockalign(s->bs, len);
+ copy_cb->iov.iov_len = len;
+ qemu_iovec_init_external(&copy_cb->qiov, &copy_cb->iov, 1);
+
+ qed_read_backing_file(s, pos, &copy_cb->qiov,
+ qed_copy_from_backing_file_write, copy_cb);
+}
+
+/**
+ * Link one or more contiguous clusters into a table
+ *
+ * @s: QED state
+ * @table: L2 table
+ * @index: First cluster index
+ * @n: Number of contiguous clusters
+ * @cluster: First cluster offset
+ *
+ * The cluster offset may be an allocated byte offset in the image file, the
+ * zero cluster marker, or the unallocated cluster marker.
+ */
+static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index,
+ unsigned int n, uint64_t cluster)
+{
+ int i;
+ for (i = index; i < index + n; i++) {
+ table->offsets[i] = cluster;
+ if (!qed_offset_is_unalloc_cluster(cluster) &&
+ !qed_offset_is_zero_cluster(cluster)) {
+ cluster += s->header.cluster_size;
+ }
+ }
+}
+
+static void qed_aio_complete_bh(void *opaque)
+{
+ QEDAIOCB *acb = opaque;
+ BlockDriverCompletionFunc *cb = acb->common.cb;
+ void *user_opaque = acb->common.opaque;
+ int ret = acb->bh_ret;
+ bool *finished = acb->finished;
+
+ qemu_bh_delete(acb->bh);
+ qemu_aio_release(acb);
+
+ /* Invoke callback */
+ cb(user_opaque, ret);
+
+ /* Signal cancel completion */
+ if (finished) {
+ *finished = true;
+ }
+}
+
+static void qed_aio_complete(QEDAIOCB *acb, int ret)
+{
+ BDRVQEDState *s = acb_to_s(acb);
+
+ trace_qed_aio_complete(s, acb, ret);
+
+ /* Free resources */
+ qemu_iovec_destroy(&acb->cur_qiov);
+ qed_unref_l2_cache_entry(acb->request.l2_table);
+
+ /* Free the buffer we may have allocated for zero writes */
+ if (acb->flags & QED_AIOCB_ZERO) {
+ qemu_vfree(acb->qiov->iov[0].iov_base);
+ acb->qiov->iov[0].iov_base = NULL;
+ }
+
+ /* Arrange for a bh to invoke the completion function */
+ acb->bh_ret = ret;
+ acb->bh = qemu_bh_new(qed_aio_complete_bh, acb);
+ qemu_bh_schedule(acb->bh);
+
+ /* Start next allocating write request waiting behind this one. Note that
+ * requests enqueue themselves when they first hit an unallocated cluster
+ * but they wait until the entire request is finished before waking up the
+ * next request in the queue. This ensures that we don't cycle through
+ * requests multiple times but rather finish one at a time completely.
+ */
+ if (acb == QSIMPLEQ_FIRST(&s->allocating_write_reqs)) {
+ QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next);
+ acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
+ if (acb) {
+ qed_aio_next_io(acb, 0);
+ } else if (s->header.features & QED_F_NEED_CHECK) {
+ qed_start_need_check_timer(s);
+ }
+ }
+}
+
+/**
+ * Commit the current L2 table to the cache
+ */
+static void qed_commit_l2_update(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+ BDRVQEDState *s = acb_to_s(acb);
+ CachedL2Table *l2_table = acb->request.l2_table;
+ uint64_t l2_offset = l2_table->offset;
+
+ qed_commit_l2_cache_entry(&s->l2_cache, l2_table);
+
+ /* This is guaranteed to succeed because we just committed the entry to the
+ * cache.
+ */
+ acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
+ assert(acb->request.l2_table != NULL);
+
+ qed_aio_next_io(opaque, ret);
+}
+
+/**
+ * Update L1 table with new L2 table offset and write it out
+ */
+static void qed_aio_write_l1_update(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+ BDRVQEDState *s = acb_to_s(acb);
+ int index;
+
+ if (ret) {
+ qed_aio_complete(acb, ret);
+ return;
+ }
+
+ index = qed_l1_index(s, acb->cur_pos);
+ s->l1_table->offsets[index] = acb->request.l2_table->offset;
+
+ qed_write_l1_table(s, index, 1, qed_commit_l2_update, acb);
+}
+
+/**
+ * Update L2 table with new cluster offsets and write them out
+ */
+static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
+{
+ BDRVQEDState *s = acb_to_s(acb);
+ bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1;
+ int index;
+
+ if (ret) {
+ goto err;
+ }
+
+ if (need_alloc) {
+ qed_unref_l2_cache_entry(acb->request.l2_table);
+ acb->request.l2_table = qed_new_l2_table(s);
+ }
+
+ index = qed_l2_index(s, acb->cur_pos);
+ qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters,
+ offset);
+
+ if (need_alloc) {
+ /* Write out the whole new L2 table */
+ qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true,
+ qed_aio_write_l1_update, acb);
+ } else {
+ /* Write out only the updated part of the L2 table */
+ qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false,
+ qed_aio_next_io, acb);
+ }
+ return;
+
+err:
+ qed_aio_complete(acb, ret);
+}
+
+static void qed_aio_write_l2_update_cb(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+ qed_aio_write_l2_update(acb, ret, acb->cur_cluster);
+}
+
+/**
+ * Flush new data clusters before updating the L2 table
+ *
+ * This flush is necessary when a backing file is in use. A crash during an
+ * allocating write could result in empty clusters in the image. If the write
+ * only touched a subregion of the cluster, then backing image sectors have
+ * been lost in the untouched region. The solution is to flush after writing a
+ * new data cluster and before updating the L2 table.
+ */
+static void qed_aio_write_flush_before_l2_update(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+ BDRVQEDState *s = acb_to_s(acb);
+
+ if (!bdrv_aio_flush(s->bs->file, qed_aio_write_l2_update_cb, opaque)) {
+ qed_aio_complete(acb, -EIO);
+ }
+}
+
+/**
+ * Write data to the image file
+ */
+static void qed_aio_write_main(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+ BDRVQEDState *s = acb_to_s(acb);
+ uint64_t offset = acb->cur_cluster +
+ qed_offset_into_cluster(s, acb->cur_pos);
+ BlockDriverCompletionFunc *next_fn;
+
+ trace_qed_aio_write_main(s, acb, ret, offset, acb->cur_qiov.size);
+
+ if (ret) {
+ qed_aio_complete(acb, ret);
+ return;
+ }
+
+ if (acb->find_cluster_ret == QED_CLUSTER_FOUND) {
+ next_fn = qed_aio_next_io;
+ } else {
+ if (s->bs->backing_hd) {
+ next_fn = qed_aio_write_flush_before_l2_update;
+ } else {
+ next_fn = qed_aio_write_l2_update_cb;
+ }
+ }
+
+ BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
+ bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE,
+ &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
+ next_fn, acb);
+}
+
+/**
+ * Populate back untouched region of new data cluster
+ */
+static void qed_aio_write_postfill(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+ BDRVQEDState *s = acb_to_s(acb);
+ uint64_t start = acb->cur_pos + acb->cur_qiov.size;
+ uint64_t len =
+ qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start;
+ uint64_t offset = acb->cur_cluster +
+ qed_offset_into_cluster(s, acb->cur_pos) +
+ acb->cur_qiov.size;
+
+ if (ret) {
+ qed_aio_complete(acb, ret);
+ return;
+ }
+
+ trace_qed_aio_write_postfill(s, acb, start, len, offset);
+ qed_copy_from_backing_file(s, start, len, offset,
+ qed_aio_write_main, acb);
+}
+
+/**
+ * Populate front untouched region of new data cluster
+ */
+static void qed_aio_write_prefill(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+ BDRVQEDState *s = acb_to_s(acb);
+ uint64_t start = qed_start_of_cluster(s, acb->cur_pos);
+ uint64_t len = qed_offset_into_cluster(s, acb->cur_pos);
+
+ trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster);
+ qed_copy_from_backing_file(s, start, len, acb->cur_cluster,
+ qed_aio_write_postfill, acb);
+}
+
+/**
+ * Check if the QED_F_NEED_CHECK bit should be set during allocating write
+ */
+static bool qed_should_set_need_check(BDRVQEDState *s)
+{
+ /* The flush before L2 update path ensures consistency */
+ if (s->bs->backing_hd) {
+ return false;
+ }
+
+ return !(s->header.features & QED_F_NEED_CHECK);
+}
+
+static void qed_aio_write_zero_cluster(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+
+ if (ret) {
+ qed_aio_complete(acb, ret);
+ return;
+ }
+
+ qed_aio_write_l2_update(acb, 0, 1);
+}
+
+/**
+ * Write new data cluster
+ *
+ * @acb: Write request
+ * @len: Length in bytes
+ *
+ * This path is taken when writing to previously unallocated clusters.
+ */
+static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
+{
+ BDRVQEDState *s = acb_to_s(acb);
+ BlockDriverCompletionFunc *cb;
+
+ /* Cancel timer when the first allocating request comes in */
+ if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) {
+ qed_cancel_need_check_timer(s);
+ }
+
+ /* Freeze this request if another allocating write is in progress */
+ if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs)) {
+ QSIMPLEQ_INSERT_TAIL(&s->allocating_write_reqs, acb, next);
+ }
+ if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs) ||
+ s->allocating_write_reqs_plugged) {
+ return; /* wait for existing request to finish */
+ }
+
+ acb->cur_nclusters = qed_bytes_to_clusters(s,
+ qed_offset_into_cluster(s, acb->cur_pos) + len);
+ qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
+
+ if (acb->flags & QED_AIOCB_ZERO) {
+ /* Skip ahead if the clusters are already zero */
+ if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
+ qed_aio_next_io(acb, 0);
+ return;
+ }
+
+ cb = qed_aio_write_zero_cluster;
+ } else {
+ cb = qed_aio_write_prefill;
+ acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
+ }
+
+ if (qed_should_set_need_check(s)) {
+ s->header.features |= QED_F_NEED_CHECK;
+ qed_write_header(s, cb, acb);
+ } else {
+ cb(acb, 0);
+ }
+}
+
+/**
+ * Write data cluster in place
+ *
+ * @acb: Write request
+ * @offset: Cluster offset in bytes
+ * @len: Length in bytes
+ *
+ * This path is taken when writing to already allocated clusters.
+ */
+static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
+{
+ /* Allocate buffer for zero writes */
+ if (acb->flags & QED_AIOCB_ZERO) {
+ struct iovec *iov = acb->qiov->iov;
+
+ if (!iov->iov_base) {
+ iov->iov_base = qemu_blockalign(acb->common.bs, iov->iov_len);
+ memset(iov->iov_base, 0, iov->iov_len);
+ }
+ }
+
+ /* Calculate the I/O vector */
+ acb->cur_cluster = offset;
+ qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
+
+ /* Do the actual write */
+ qed_aio_write_main(acb, 0);
+}
+
+/**
+ * Write data cluster
+ *
+ * @opaque: Write request
+ * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1,
+ * or -errno
+ * @offset: Cluster offset in bytes
+ * @len: Length in bytes
+ *
+ * Callback from qed_find_cluster().
+ */
+static void qed_aio_write_data(void *opaque, int ret,
+ uint64_t offset, size_t len)
+{
+ QEDAIOCB *acb = opaque;
+
+ trace_qed_aio_write_data(acb_to_s(acb), acb, ret, offset, len);
+
+ acb->find_cluster_ret = ret;
+
+ switch (ret) {
+ case QED_CLUSTER_FOUND:
+ qed_aio_write_inplace(acb, offset, len);
+ break;
+
+ case QED_CLUSTER_L2:
+ case QED_CLUSTER_L1:
+ case QED_CLUSTER_ZERO:
+ qed_aio_write_alloc(acb, len);
+ break;
+
+ default:
+ qed_aio_complete(acb, ret);
+ break;
+ }
+}
+
+/**
+ * Read data cluster
+ *
+ * @opaque: Read request
+ * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1,
+ * or -errno
+ * @offset: Cluster offset in bytes
+ * @len: Length in bytes
+ *
+ * Callback from qed_find_cluster().
+ */
+static void qed_aio_read_data(void *opaque, int ret,
+ uint64_t offset, size_t len)
+{
+ QEDAIOCB *acb = opaque;
+ BDRVQEDState *s = acb_to_s(acb);
+ BlockDriverState *bs = acb->common.bs;
+
+ /* Adjust offset into cluster */
+ offset += qed_offset_into_cluster(s, acb->cur_pos);
+
+ trace_qed_aio_read_data(s, acb, ret, offset, len);
+
+ if (ret < 0) {
+ goto err;
+ }
+
+ qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
+
+ /* Handle zero cluster and backing file reads */
+ if (ret == QED_CLUSTER_ZERO) {
+ qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size);
+ qed_aio_next_io(acb, 0);
+ return;
+ } else if (ret != QED_CLUSTER_FOUND) {
+ qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov,
+ qed_aio_next_io, acb);
+ return;
+ }
+
+ BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
+ bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE,
+ &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
+ qed_aio_next_io, acb);
+ return;
+
+err:
+ qed_aio_complete(acb, ret);
+}
+
+/**
+ * Begin next I/O or complete the request
+ */
+static void qed_aio_next_io(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+ BDRVQEDState *s = acb_to_s(acb);
+ QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ?
+ qed_aio_write_data : qed_aio_read_data;
+
+ trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size);
+
+ /* Handle I/O error */
+ if (ret) {
+ qed_aio_complete(acb, ret);
+ return;
+ }
+
+ acb->qiov_offset += acb->cur_qiov.size;
+ acb->cur_pos += acb->cur_qiov.size;
+ qemu_iovec_reset(&acb->cur_qiov);
+
+ /* Complete request */
+ if (acb->cur_pos >= acb->end_pos) {
+ qed_aio_complete(acb, 0);
+ return;
+ }
+
+ /* Find next cluster and start I/O */
+ qed_find_cluster(s, &acb->request,
+ acb->cur_pos, acb->end_pos - acb->cur_pos,
+ io_fn, acb);
+}
+
+static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque, int flags)
+{
+ QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, cb, opaque);
+
+ trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors,
+ opaque, flags);
+
+ acb->flags = flags;
+ acb->finished = NULL;
+ acb->qiov = qiov;
+ acb->qiov_offset = 0;
+ acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE;
+ acb->end_pos = acb->cur_pos + nb_sectors * BDRV_SECTOR_SIZE;
+ acb->request.l2_table = NULL;
+ qemu_iovec_init(&acb->cur_qiov, qiov->niov);
+
+ /* Start request */
+ qed_aio_next_io(acb, 0);
+ return &acb->common;
+}
+
+static BlockDriverAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
+}
+
+static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb,
+ opaque, QED_AIOCB_WRITE);
+}
+
+typedef struct {
+ Coroutine *co;
+ int ret;
+ bool done;
+} QEDWriteZeroesCB;
+
+static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret)
+{
+ QEDWriteZeroesCB *cb = opaque;
+
+ cb->done = true;
+ cb->ret = ret;
+ if (cb->co) {
+ qemu_coroutine_enter(cb->co, NULL);
+ }
+}
+
+static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs,
+ int64_t sector_num,
+ int nb_sectors)
+{
+ BlockDriverAIOCB *blockacb;
+ BDRVQEDState *s = bs->opaque;
+ QEDWriteZeroesCB cb = { .done = false };
+ QEMUIOVector qiov;
+ struct iovec iov;
+
+ /* Refuse if there are untouched backing file sectors */
+ if (bs->backing_hd) {
+ if (qed_offset_into_cluster(s, sector_num * BDRV_SECTOR_SIZE) != 0) {
+ return -ENOTSUP;
+ }
+ if (qed_offset_into_cluster(s, nb_sectors * BDRV_SECTOR_SIZE) != 0) {
+ return -ENOTSUP;
+ }
+ }
+
+ /* Zero writes start without an I/O buffer. If a buffer becomes necessary
+ * then it will be allocated during request processing.
+ */
+ iov.iov_base = NULL,
+ iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE,
+
+ qemu_iovec_init_external(&qiov, &iov, 1);
+ blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors,
+ qed_co_write_zeroes_cb, &cb,
+ QED_AIOCB_WRITE | QED_AIOCB_ZERO);
+ if (!blockacb) {
+ return -EIO;
+ }
+ if (!cb.done) {
+ cb.co = qemu_coroutine_self();
+ qemu_coroutine_yield();
+ }
+ assert(cb.done);
+ return cb.ret;
+}
+
+static int bdrv_qed_truncate(BlockDriverState *bs, int64_t offset)
+{
+ BDRVQEDState *s = bs->opaque;
+ uint64_t old_image_size;
+ int ret;
+
+ if (!qed_is_image_size_valid(offset, s->header.cluster_size,
+ s->header.table_size)) {
+ re