Blame - src/kernel/linux/v4.14/drivers/nvdimm/pmem.c - T103

blob: 80f8bbf83742c76fca26176aa0d0b2be4cdb1f1f [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/*
				2	* Persistent Memory Driver
				3	*
				4	* Copyright (c) 2014-2015, Intel Corporation.
				5	* Copyright (c) 2015, Christoph Hellwig <hch@lst.de>.
				6	* Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>.
				7	*
				8	* This program is free software; you can redistribute it and/or modify it
				9	* under the terms and conditions of the GNU General Public License,
				10	* version 2, as published by the Free Software Foundation.
				11	*
				12	* This program is distributed in the hope it will be useful, but WITHOUT
				13	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
				14	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
				15	* more details.
				16	*/
				17
				18	#include <asm/cacheflush.h>
				19	#include <linux/blkdev.h>
				20	#include <linux/hdreg.h>
				21	#include <linux/init.h>
				22	#include <linux/platform_device.h>
				23	#include <linux/module.h>
				24	#include <linux/moduleparam.h>
				25	#include <linux/badblocks.h>
				26	#include <linux/memremap.h>
				27	#include <linux/vmalloc.h>
				28	#include <linux/blk-mq.h>
				29	#include <linux/pfn_t.h>
				30	#include <linux/slab.h>
				31	#include <linux/uio.h>
				32	#include <linux/dax.h>
				33	#include <linux/nd.h>
				34	#include "pmem.h"
				35	#include "pfn.h"
				36	#include "nd.h"
				37
				38	static struct device to_dev(struct pmem_device pmem)
				39	{
				40	/*
				41	* nvdimm bus services need a 'dev' parameter, and we record the device
				42	* at init in bb.dev.
				43	*/
				44	return pmem->bb.dev;
				45	}
				46
				47	static struct nd_region to_region(struct pmem_device pmem)
				48	{
				49	return to_nd_region(to_dev(pmem)->parent);
				50	}
				51
				52	static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
				53	phys_addr_t offset, unsigned int len)
				54	{
				55	struct device *dev = to_dev(pmem);
				56	sector_t sector;
				57	long cleared;
				58	blk_status_t rc = BLK_STS_OK;
				59
				60	sector = (offset - pmem->data_offset) / 512;
				61
				62	cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len);
				63	if (cleared < len)
				64	rc = BLK_STS_IOERR;
				65	if (cleared > 0 && cleared / 512) {
				66	cleared /= 512;
				67	dev_dbg(dev, "%s: %#llx clear %ld sector%s\n", __func__,
				68	(unsigned long long) sector, cleared,
				69	cleared > 1 ? "s" : "");
				70	badblocks_clear(&pmem->bb, sector, cleared);
				71	if (pmem->bb_state)
				72	sysfs_notify_dirent(pmem->bb_state);
				73	}
				74
				75	arch_invalidate_pmem(pmem->virt_addr + offset, len);
				76
				77	return rc;
				78	}
				79
				80	static void write_pmem(void pmem_addr, struct page page,
				81	unsigned int off, unsigned int len)
				82	{
				83	unsigned int chunk;
				84	void *mem;
				85
				86	while (len) {
				87	mem = kmap_atomic(page);
				88	chunk = min_t(unsigned int, len, PAGE_SIZE);
				89	memcpy_flushcache(pmem_addr, mem + off, chunk);
				90	kunmap_atomic(mem);
				91	len -= chunk;
				92	off = 0;
				93	page++;
				94	pmem_addr += PAGE_SIZE;
				95	}
				96	}
				97
				98	static blk_status_t read_pmem(struct page *page, unsigned int off,
				99	void *pmem_addr, unsigned int len)
				100	{
				101	unsigned int chunk;
				102	int rc;
				103	void *mem;
				104
				105	while (len) {
				106	mem = kmap_atomic(page);
				107	chunk = min_t(unsigned int, len, PAGE_SIZE);
				108	rc = memcpy_mcsafe(mem + off, pmem_addr, chunk);
				109	kunmap_atomic(mem);
				110	if (rc)
				111	return BLK_STS_IOERR;
				112	len -= chunk;
				113	off = 0;
				114	page++;
				115	pmem_addr += PAGE_SIZE;
				116	}
				117	return BLK_STS_OK;
				118	}
				119
				120	static blk_status_t pmem_do_bvec(struct pmem_device pmem, struct page page,
				121	unsigned int len, unsigned int off, bool is_write,
				122	sector_t sector)
				123	{
				124	blk_status_t rc = BLK_STS_OK;
				125	bool bad_pmem = false;
				126	phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
				127	void *pmem_addr = pmem->virt_addr + pmem_off;
				128
				129	if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
				130	bad_pmem = true;
				131
				132	if (!is_write) {
				133	if (unlikely(bad_pmem))
				134	rc = BLK_STS_IOERR;
				135	else {
				136	rc = read_pmem(page, off, pmem_addr, len);
				137	flush_dcache_page(page);
				138	}
				139	} else {
				140	/*
				141	* Note that we write the data both before and after
				142	* clearing poison. The write before clear poison
				143	* handles situations where the latest written data is
				144	* preserved and the clear poison operation simply marks
				145	* the address range as valid without changing the data.
				146	* In this case application software can assume that an
				147	* interrupted write will either return the new good
				148	* data or an error.
				149	*
				150	* However, if pmem_clear_poison() leaves the data in an
				151	* indeterminate state we need to perform the write
				152	* after clear poison.
				153	*/
				154	flush_dcache_page(page);
				155	write_pmem(pmem_addr, page, off, len);
				156	if (unlikely(bad_pmem)) {
				157	rc = pmem_clear_poison(pmem, pmem_off, len);
				158	write_pmem(pmem_addr, page, off, len);
				159	}
				160	}
				161
				162	return rc;
				163	}
				164
				165	/* account for REQ_FLUSH rename, replace with REQ_PREFLUSH after v4.8-rc1 */
				166	#ifndef REQ_FLUSH
				167	#define REQ_FLUSH REQ_PREFLUSH
				168	#endif
				169
				170	static blk_qc_t pmem_make_request(struct request_queue q, struct bio bio)
				171	{
				172	blk_status_t rc = 0;
				173	bool do_acct;
				174	unsigned long start;
				175	struct bio_vec bvec;
				176	struct bvec_iter iter;
				177	struct pmem_device *pmem = q->queuedata;
				178	struct nd_region *nd_region = to_region(pmem);
				179
				180	if (bio->bi_opf & REQ_FLUSH)
				181	nvdimm_flush(nd_region);
				182
				183	do_acct = nd_iostat_start(bio, &start);
				184	bio_for_each_segment(bvec, bio, iter) {
				185	rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
				186	bvec.bv_offset, op_is_write(bio_op(bio)),
				187	iter.bi_sector);
				188	if (rc) {
				189	bio->bi_status = rc;
				190	break;
				191	}
				192	}
				193	if (do_acct)
				194	nd_iostat_end(bio, start);
				195
				196	if (bio->bi_opf & REQ_FUA)
				197	nvdimm_flush(nd_region);
				198
				199	bio_endio(bio);
				200	return BLK_QC_T_NONE;
				201	}
				202
				203	static int pmem_rw_page(struct block_device *bdev, sector_t sector,
				204	struct page *page, bool is_write)
				205	{
				206	struct pmem_device *pmem = bdev->bd_queue->queuedata;
				207	blk_status_t rc;
				208
				209	rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE,
				210	0, is_write, sector);
				211
				212	/*
				213	* The ->rw_page interface is subtle and tricky. The core
				214	* retries on any error, so we can only invoke page_endio() in
				215	* the successful completion case. Otherwise, we'll see crashes
				216	* caused by double completion.
				217	*/
				218	if (rc == 0)
				219	page_endio(page, is_write, 0);
				220
				221	return blk_status_to_errno(rc);
				222	}
				223
				224	/* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */
				225	__weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
				226	long nr_pages, void *kaddr, pfn_t pfn)
				227	{
				228	resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
				229
				230	if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
				231	PFN_PHYS(nr_pages))))
				232	return -EIO;
				233	*kaddr = pmem->virt_addr + offset;
				234	*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
				235
				236	/*
				237	* If badblocks are present, limit known good range to the
				238	* requested range.
				239	*/
				240	if (unlikely(pmem->bb.count))
				241	return nr_pages;
				242	return PHYS_PFN(pmem->size - pmem->pfn_pad - offset);
				243	}
				244
				245	static const struct block_device_operations pmem_fops = {
				246	.owner = THIS_MODULE,
				247	.rw_page = pmem_rw_page,
				248	.revalidate_disk = nvdimm_revalidate_disk,
				249	};
				250
				251	static long pmem_dax_direct_access(struct dax_device *dax_dev,
				252	pgoff_t pgoff, long nr_pages, void *kaddr, pfn_t pfn)
				253	{
				254	struct pmem_device *pmem = dax_get_private(dax_dev);
				255
				256	return __pmem_direct_access(pmem, pgoff, nr_pages, kaddr, pfn);
				257	}
				258
				259	/*
				260	* Use the 'no check' versions of copy_from_iter_flushcache() and
				261	* copy_to_iter_mcsafe() to bypass HARDENED_USERCOPY overhead. Bounds
				262	* checking, both file offset and device offset, is handled by
				263	* dax_iomap_actor()
				264	*/
				265	static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
				266	void addr, size_t bytes, struct iov_iter i)
				267	{
				268	return _copy_from_iter_flushcache(addr, bytes, i);
				269	}
				270
				271	static const struct dax_operations pmem_dax_ops = {
				272	.direct_access = pmem_dax_direct_access,
				273	.copy_from_iter = pmem_copy_from_iter,
				274	};
				275
				276	static const struct attribute_group *pmem_attribute_groups[] = {
				277	&dax_attribute_group,
				278	NULL,
				279	};
				280
				281	static void pmem_release_queue(void *q)
				282	{
				283	blk_cleanup_queue(q);
				284	}
				285
				286	static void pmem_freeze_queue(void *q)
				287	{
				288	blk_freeze_queue_start(q);
				289	}
				290
				291	static void pmem_release_disk(void *__pmem)
				292	{
				293	struct pmem_device *pmem = __pmem;
				294
				295	kill_dax(pmem->dax_dev);
				296	put_dax(pmem->dax_dev);
				297	del_gendisk(pmem->disk);
				298	put_disk(pmem->disk);
				299	}
				300
				301	static int pmem_attach_disk(struct device *dev,
				302	struct nd_namespace_common *ndns)
				303	{
				304	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
				305	struct nd_region *nd_region = to_nd_region(dev->parent);
				306	struct vmem_altmap __altmap, *altmap = NULL;
				307	int nid = dev_to_node(dev), fua, wbc;
				308	struct resource *res = &nsio->res;
				309	struct nd_pfn *nd_pfn = NULL;
				310	struct dax_device *dax_dev;
				311	struct nd_pfn_sb *pfn_sb;
				312	struct pmem_device *pmem;
				313	struct resource pfn_res;
				314	struct request_queue *q;
				315	struct device *gendev;
				316	struct gendisk *disk;
				317	void *addr;
				318
				319	/* while nsio_rw_bytes is active, parse a pfn info block if present */
				320	if (is_nd_pfn(dev)) {
				321	nd_pfn = to_nd_pfn(dev);
				322	altmap = nvdimm_setup_pfn(nd_pfn, &pfn_res, &__altmap);
				323	if (IS_ERR(altmap))
				324	return PTR_ERR(altmap);
				325	}
				326
				327	/* we're attaching a block device, disable raw namespace access */
				328	devm_nsio_disable(dev, nsio);
				329
				330	pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
				331	if (!pmem)
				332	return -ENOMEM;
				333
				334	dev_set_drvdata(dev, pmem);
				335	pmem->phys_addr = res->start;
				336	pmem->size = resource_size(res);
				337	fua = nvdimm_has_flush(nd_region);
				338	if (!IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) \|\| fua < 0) {
				339	dev_warn(dev, "unable to guarantee persistence of writes\n");
				340	fua = 0;
				341	}
				342	wbc = nvdimm_has_cache(nd_region);
				343
				344	if (!devm_request_mem_region(dev, res->start, resource_size(res),
				345	dev_name(&ndns->dev))) {
				346	dev_warn(dev, "could not reserve region %pR\n", res);
				347	return -EBUSY;
				348	}
				349
				350	q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev));
				351	if (!q)
				352	return -ENOMEM;
				353
				354	if (devm_add_action_or_reset(dev, pmem_release_queue, q))
				355	return -ENOMEM;
				356
				357	pmem->pfn_flags = PFN_DEV;
				358	if (is_nd_pfn(dev)) {
				359	addr = devm_memremap_pages(dev, &pfn_res, &q->q_usage_counter,
				360	altmap);
				361	pfn_sb = nd_pfn->pfn_sb;
				362	pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
				363	pmem->pfn_pad = resource_size(res) - resource_size(&pfn_res);
				364	pmem->pfn_flags \|= PFN_MAP;
				365	res = &pfn_res; /* for badblocks populate */
				366	res->start += pmem->data_offset;
				367	} else if (pmem_should_map_pages(dev)) {
				368	addr = devm_memremap_pages(dev, &nsio->res,
				369	&q->q_usage_counter, NULL);
				370	pmem->pfn_flags \|= PFN_MAP;
				371	} else
				372	addr = devm_memremap(dev, pmem->phys_addr,
				373	pmem->size, ARCH_MEMREMAP_PMEM);
				374
				375	/*
				376	* At release time the queue must be frozen before
				377	* devm_memremap_pages is unwound
				378	*/
				379	if (devm_add_action_or_reset(dev, pmem_freeze_queue, q))
				380	return -ENOMEM;
				381
				382	if (IS_ERR(addr))
				383	return PTR_ERR(addr);
				384	pmem->virt_addr = addr;
				385
				386	blk_queue_write_cache(q, wbc, fua);
				387	blk_queue_make_request(q, pmem_make_request);
				388	blk_queue_physical_block_size(q, PAGE_SIZE);
				389	blk_queue_logical_block_size(q, pmem_sector_size(ndns));
				390	blk_queue_max_hw_sectors(q, UINT_MAX);
				391	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
				392	queue_flag_set_unlocked(QUEUE_FLAG_DAX, q);
				393	q->queuedata = pmem;
				394
				395	disk = alloc_disk_node(0, nid);
				396	if (!disk)
				397	return -ENOMEM;
				398	pmem->disk = disk;
				399
				400	disk->fops = &pmem_fops;
				401	disk->queue = q;
				402	disk->flags = GENHD_FL_EXT_DEVT;
				403	nvdimm_namespace_disk_name(ndns, disk->disk_name);
				404	set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset)
				405	/ 512);
				406	if (devm_init_badblocks(dev, &pmem->bb))
				407	return -ENOMEM;
				408	nvdimm_badblocks_populate(nd_region, &pmem->bb, res);
				409	disk->bb = &pmem->bb;
				410
				411	dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops);
				412	if (!dax_dev) {
				413	put_disk(disk);
				414	return -ENOMEM;
				415	}
				416	dax_write_cache(dax_dev, wbc);
				417	pmem->dax_dev = dax_dev;
				418
				419	gendev = disk_to_dev(disk);
				420	gendev->groups = pmem_attribute_groups;
				421
				422	device_add_disk(dev, disk);
				423	if (devm_add_action_or_reset(dev, pmem_release_disk, pmem))
				424	return -ENOMEM;
				425
				426	revalidate_disk(disk);
				427
				428	pmem->bb_state = sysfs_get_dirent(disk_to_dev(disk)->kobj.sd,
				429	"badblocks");
				430	if (!pmem->bb_state)
				431	dev_warn(dev, "'badblocks' notification disabled\n");
				432
				433	return 0;
				434	}
				435
				436	static int nd_pmem_probe(struct device *dev)
				437	{
				438	struct nd_namespace_common *ndns;
				439
				440	ndns = nvdimm_namespace_common_probe(dev);
				441	if (IS_ERR(ndns))
				442	return PTR_ERR(ndns);
				443
				444	if (devm_nsio_enable(dev, to_nd_namespace_io(&ndns->dev)))
				445	return -ENXIO;
				446
				447	if (is_nd_btt(dev))
				448	return nvdimm_namespace_attach_btt(ndns);
				449
				450	if (is_nd_pfn(dev))
				451	return pmem_attach_disk(dev, ndns);
				452
				453	/* if we find a valid info-block we'll come back as that personality */
				454	if (nd_btt_probe(dev, ndns) == 0 \|\| nd_pfn_probe(dev, ndns) == 0
				455	\|\| nd_dax_probe(dev, ndns) == 0)
				456	return -ENXIO;
				457
				458	/* ...otherwise we're just a raw pmem device */
				459	return pmem_attach_disk(dev, ndns);
				460	}
				461
				462	static int nd_pmem_remove(struct device *dev)
				463	{
				464	struct pmem_device *pmem = dev_get_drvdata(dev);
				465
				466	if (is_nd_btt(dev))
				467	nvdimm_namespace_detach_btt(to_nd_btt(dev));
				468	else {
				469	/*
				470	* Note, this assumes device_lock() context to not race
				471	* nd_pmem_notify()
				472	*/
				473	sysfs_put(pmem->bb_state);
				474	pmem->bb_state = NULL;
				475	}
				476	nvdimm_flush(to_nd_region(dev->parent));
				477
				478	return 0;
				479	}
				480
				481	static void nd_pmem_shutdown(struct device *dev)
				482	{
				483	nvdimm_flush(to_nd_region(dev->parent));
				484	}
				485
				486	static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
				487	{
				488	struct nd_region *nd_region;
				489	resource_size_t offset = 0, end_trunc = 0;
				490	struct nd_namespace_common *ndns;
				491	struct nd_namespace_io *nsio;
				492	struct resource res;
				493	struct badblocks *bb;
				494	struct kernfs_node *bb_state;
				495
				496	if (event != NVDIMM_REVALIDATE_POISON)
				497	return;
				498
				499	if (is_nd_btt(dev)) {
				500	struct nd_btt *nd_btt = to_nd_btt(dev);
				501
				502	ndns = nd_btt->ndns;
				503	nd_region = to_nd_region(ndns->dev.parent);
				504	nsio = to_nd_namespace_io(&ndns->dev);
				505	bb = &nsio->bb;
				506	bb_state = NULL;
				507	} else {
				508	struct pmem_device *pmem = dev_get_drvdata(dev);
				509
				510	nd_region = to_region(pmem);
				511	bb = &pmem->bb;
				512	bb_state = pmem->bb_state;
				513
				514	if (is_nd_pfn(dev)) {
				515	struct nd_pfn *nd_pfn = to_nd_pfn(dev);
				516	struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
				517
				518	ndns = nd_pfn->ndns;
				519	offset = pmem->data_offset +
				520	__le32_to_cpu(pfn_sb->start_pad);
				521	end_trunc = __le32_to_cpu(pfn_sb->end_trunc);
				522	} else {
				523	ndns = to_ndns(dev);
				524	}
				525
				526	nsio = to_nd_namespace_io(&ndns->dev);
				527	}
				528
				529	res.start = nsio->res.start + offset;
				530	res.end = nsio->res.end - end_trunc;
				531	nvdimm_badblocks_populate(nd_region, bb, &res);
				532	if (bb_state)
				533	sysfs_notify_dirent(bb_state);
				534	}
				535
				536	MODULE_ALIAS("pmem");
				537	MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO);
				538	MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM);
				539	static struct nd_device_driver nd_pmem_driver = {
				540	.probe = nd_pmem_probe,
				541	.remove = nd_pmem_remove,
				542	.notify = nd_pmem_notify,
				543	.shutdown = nd_pmem_shutdown,
				544	.drv = {
				545	.name = "nd_pmem",
				546	},
				547	.type = ND_DRIVER_NAMESPACE_IO \| ND_DRIVER_NAMESPACE_PMEM,
				548	};
				549
				550	static int __init pmem_init(void)
				551	{
				552	return nd_driver_register(&nd_pmem_driver);
				553	}
				554	module_init(pmem_init);
				555
				556	static void pmem_exit(void)
				557	{
				558	driver_unregister(&nd_pmem_driver.drv);
				559	}
				560	module_exit(pmem_exit);
				561
				562	MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
				563	MODULE_LICENSE("GPL v2");