updating to mainline 4.13.3

Jake Day 2017-09-20 10:19:29 -04:00
parent 751666cc41
commit 0bcdae08d9
62 changed files with 932 additions and 518 deletions

View file

@@ -45,6 +45,8 @@ Contact: thunderbolt-software@lists.01.org
Description: When a device supports Thunderbolt secure connect it will
have this attribute. Writing a 32-byte hex string changes
authorization to use the secure connection method instead.
Writing an empty string clears the key and the regular connection
method can be used again.
What: /sys/bus/thunderbolt/devices/.../device
Date: Sep 2017

View file

@@ -1,6 +1,6 @@
VERSION = 4
PATCHLEVEL = 13
SUBLEVEL = 2
SUBLEVEL = 3
EXTRAVERSION =
NAME = Fearless Coyote

View file

@@ -204,6 +204,7 @@ void set_personality_ia32(bool);
#define ELF_CORE_COPY_REGS(pr_reg, regs) \
do { \
unsigned long base; \
unsigned v; \
(pr_reg)[0] = (regs)->r15; \
(pr_reg)[1] = (regs)->r14; \
@@ -226,8 +227,8 @@ do { \
(pr_reg)[18] = (regs)->flags; \
(pr_reg)[19] = (regs)->sp; \
(pr_reg)[20] = (regs)->ss; \
(pr_reg)[21] = current->thread.fsbase; \
(pr_reg)[22] = current->thread.gsbase; \
rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \
rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \
asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \
asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \
asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \

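The hunk above switches the core-dump code from the cached current->thread.fsbase/gsbase values to reading the live MSRs with rdmsrl(). For reference, the same architectural register can be inspected from user space through the msr driver; a minimal sketch, assuming CONFIG_X86_MSR is available, root privileges, and CPU 0 (illustrative only, not part of this commit):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint64_t val = 0;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	/* the msr driver uses the MSR number as the file offset;
	 * 0xc0000100 is the architectural MSR_FS_BASE */
	if (fd < 0 || pread(fd, &val, sizeof(val), 0xc0000100) != sizeof(val)) {
		perror("msr");
		return 1;
	}
	printf("MSR_FS_BASE = 0x%llx\n", (unsigned long long)val);
	close(fd);
	return 0;
}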
View file

@@ -51,6 +51,10 @@ static inline void clear_page(void *page)
void copy_page(void *to, void *from);
#ifdef CONFIG_X86_MCE
#define arch_unmap_kpfn arch_unmap_kpfn
#endif
#endif /* !__ASSEMBLY__ */
#ifdef CONFIG_X86_VSYSCALL_EMULATION

View file

@@ -51,6 +51,7 @@
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/reboot.h>
#include <asm/set_memory.h>
#include "mce-internal.h"
@@ -1051,6 +1052,48 @@ static int do_memory_failure(struct mce *m)
return ret;
}
#if defined(arch_unmap_kpfn) && defined(CONFIG_MEMORY_FAILURE)
void arch_unmap_kpfn(unsigned long pfn)
{
unsigned long decoy_addr;
/*
* Unmap this page from the kernel 1:1 mappings to make sure
* we don't log more errors because of speculative access to
* the page.
* We would like to just call:
* set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1);
* but doing that would radically increase the odds of a
speculative access to the poison page because we'd have
* the virtual address of the kernel 1:1 mapping sitting
* around in registers.
* Instead we get tricky. We create a non-canonical address
* that looks just like the one we want, but has bit 63 flipped.
* This relies on set_memory_np() not checking whether we passed
* a legal address.
*/
/*
* Build time check to see if we have a spare virtual bit. Don't want
* to leave this until run time because most developers don't have a
* system that can exercise this code path. This will only become a
* problem if/when we move beyond 5-level page tables.
*
* Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD)
*/
#if PGDIR_SHIFT + 9 < 63
decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
#else
#error "no unused virtual bit available"
#endif
if (set_memory_np(decoy_addr, 1))
pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
}
#endif
/*
* The actual machine check handler. This only handles real
* exceptions when something got corrupted coming in through int 18.

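The arithmetic behind the decoy address is simple enough to check in isolation: XOR-ing bit 63 out of PAGE_OFFSET yields a non-canonical alias of the 1:1 mapping address. A minimal user-space sketch, where PAGE_SHIFT and the direct-map base are illustrative assumptions rather than live kernel values:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT  12
#define PAGE_OFFSET 0xffff880000000000ULL /* example 4-level direct-map base */
#define BIT63       (1ULL << 63)

int main(void)
{
	uint64_t pfn = 0x12345;
	uint64_t kaddr = (pfn << PAGE_SHIFT) + PAGE_OFFSET;
	/* same expression as arch_unmap_kpfn(): flip bit 63 of the base */
	uint64_t decoy = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT63);

	printf("kaddr = 0x%016llx\ndecoy = 0x%016llx\n",
	       (unsigned long long)kaddr, (unsigned long long)decoy);
	return 0;
}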
View file

@@ -149,6 +149,123 @@ void release_thread(struct task_struct *dead_task)
}
}
enum which_selector {
FS,
GS
};
/*
* Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
* not available. The goal is to be reasonably fast on non-FSGSBASE systems.
* It's forcibly inlined because it'll generate better code and this function
* is hot.
*/
static __always_inline void save_base_legacy(struct task_struct *prev_p,
unsigned short selector,
enum which_selector which)
{
if (likely(selector == 0)) {
/*
* On Intel (without X86_BUG_NULL_SEG), the segment base could
* be the pre-existing saved base or it could be zero. On AMD
* (with X86_BUG_NULL_SEG), the segment base could be almost
* anything.
*
* This branch is very hot (it's hit twice on almost every
* context switch between 64-bit programs), and avoiding
* the RDMSR helps a lot, so we just assume that whatever
* value is already saved is correct. This matches historical
* Linux behavior, so it won't break existing applications.
*
* To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
* report that the base is zero, it needs to actually be zero:
* see the corresponding logic in load_seg_legacy.
*/
} else {
/*
* If the selector is 1, 2, or 3, then the base is zero on
* !X86_BUG_NULL_SEG CPUs and could be anything on
* X86_BUG_NULL_SEG CPUs. In the latter case, Linux
* has never attempted to preserve the base across context
* switches.
*
* If selector > 3, then it refers to a real segment, and
* saving the base isn't necessary.
*/
if (which == FS)
prev_p->thread.fsbase = 0;
else
prev_p->thread.gsbase = 0;
}
}
static __always_inline void save_fsgs(struct task_struct *task)
{
savesegment(fs, task->thread.fsindex);
savesegment(gs, task->thread.gsindex);
save_base_legacy(task, task->thread.fsindex, FS);
save_base_legacy(task, task->thread.gsindex, GS);
}
static __always_inline void loadseg(enum which_selector which,
unsigned short sel)
{
if (which == FS)
loadsegment(fs, sel);
else
load_gs_index(sel);
}
static __always_inline void load_seg_legacy(unsigned short prev_index,
unsigned long prev_base,
unsigned short next_index,
unsigned long next_base,
enum which_selector which)
{
if (likely(next_index <= 3)) {
/*
* The next task is using 64-bit TLS, is not using this
* segment at all, or is having fun with arcane CPU features.
*/
if (next_base == 0) {
/*
* Nasty case: on AMD CPUs, we need to forcibly zero
* the base.
*/
if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
loadseg(which, __USER_DS);
loadseg(which, next_index);
} else {
/*
* We could try to exhaustively detect cases
* under which we can skip the segment load,
* but there's really only one case that matters
* for performance: if both the previous and
* next states are fully zeroed, we can skip
* the load.
*
* (This assumes that prev_base == 0 has no
* false positives. This is the case on
* Intel-style CPUs.)
*/
if (likely(prev_index | next_index | prev_base))
loadseg(which, next_index);
}
} else {
if (prev_index != next_index)
loadseg(which, next_index);
wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
next_base);
}
} else {
/*
* The next task is using a real segment. Loading the selector
* is sufficient.
*/
loadseg(which, next_index);
}
}
int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
unsigned long arg, struct task_struct *p, unsigned long tls)
{
@@ -229,10 +346,19 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
unsigned long new_sp,
unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
WARN_ON_ONCE(regs != current_pt_regs());
if (static_cpu_has(X86_BUG_NULL_SEG)) {
/* Loading zero below won't clear the base. */
loadsegment(fs, __USER_DS);
load_gs_index(__USER_DS);
}
loadsegment(fs, 0);
loadsegment(es, _ds);
loadsegment(ds, _ds);
load_gs_index(0);
regs->ip = new_ip;
regs->sp = new_sp;
regs->cs = _cs;
@@ -277,7 +403,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
unsigned prev_fsindex, prev_gsindex;
switch_fpu_prepare(prev_fpu, cpu);
@@ -286,8 +411,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*
* (e.g. xen_load_tls())
*/
savesegment(fs, prev_fsindex);
savesegment(gs, prev_gsindex);
save_fsgs(prev_p);
/*
* Load TLS before restoring any segments so that segment loads
@@ -326,108 +450,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (unlikely(next->ds | prev->ds))
loadsegment(ds, next->ds);
/*
* Switch FS and GS.
*
* These are even more complicated than DS and ES: they have
* 64-bit bases that are controlled by arch_prctl. The bases
* don't necessarily match the selectors, as user code can do
* any number of things to cause them to be inconsistent.
*
* We don't promise to preserve the bases if the selectors are
* nonzero. We also don't promise to preserve the base if the
* selector is zero and the base doesn't match whatever was
* most recently passed to ARCH_SET_FS/GS. (If/when the
* FSGSBASE instructions are enabled, we'll need to offer
* stronger guarantees.)
*
* As an invariant,
* (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is
* impossible.
*/
if (next->fsindex) {
/* Loading a nonzero value into FS sets the index and base. */
loadsegment(fs, next->fsindex);
} else {
if (next->fsbase) {
/* Next index is zero but next base is nonzero. */
if (prev_fsindex)
loadsegment(fs, 0);
wrmsrl(MSR_FS_BASE, next->fsbase);
} else {
/* Next base and index are both zero. */
if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
/*
* We don't know the previous base and can't
* find out without RDMSR. Forcibly clear it.
*/
loadsegment(fs, __USER_DS);
loadsegment(fs, 0);
} else {
/*
* If the previous index is zero and ARCH_SET_FS
* didn't change the base, then the base is
* also zero and we don't need to do anything.
*/
if (prev->fsbase || prev_fsindex)
loadsegment(fs, 0);
}
}
}
/*
* Save the old state and preserve the invariant.
* NB: if prev_fsindex == 0, then we can't reliably learn the base
* without RDMSR because Intel user code can zero it without telling
* us and AMD user code can program any 32-bit value without telling
* us.
*/
if (prev_fsindex)
prev->fsbase = 0;
prev->fsindex = prev_fsindex;
if (next->gsindex) {
/* Loading a nonzero value into GS sets the index and base. */
load_gs_index(next->gsindex);
} else {
if (next->gsbase) {
/* Next index is zero but next base is nonzero. */
if (prev_gsindex)
load_gs_index(0);
wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase);
} else {
/* Next base and index are both zero. */
if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
/*
* We don't know the previous base and can't
* find out without RDMSR. Forcibly clear it.
*
* This contains a pointless SWAPGS pair.
* Fixing it would involve an explicit check
* for Xen or a new pvop.
*/
load_gs_index(__USER_DS);
load_gs_index(0);
} else {
/*
* If the previous index is zero and ARCH_SET_GS
* didn't change the base, then the base is
* also zero and we don't need to do anything.
*/
if (prev->gsbase || prev_gsindex)
load_gs_index(0);
}
}
}
/*
* Save the old state and preserve the invariant.
* NB: if prev_gsindex == 0, then we can't reliably learn the base
* without RDMSR because Intel user code can zero it without telling
* us and AMD user code can program any 32-bit value without telling
* us.
*/
if (prev_gsindex)
prev->gsbase = 0;
prev->gsindex = prev_gsindex;
load_seg_legacy(prev->fsindex, prev->fsbase,
next->fsindex, next->fsbase, FS);
load_seg_legacy(prev->gsindex, prev->gsbase,
next->gsindex, next->gsbase, GS);
switch_fpu_finish(next_fpu, cpu);

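The state this rework has to preserve is exactly what user space sets through arch_prctl(): a zero selector paired with a nonzero base. A quick Linux/x86-64 sketch of that user-visible contract (error handling elided; the base value is arbitrary):

#include <asm/prctl.h>
#include <sys/syscall.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	unsigned long base = 0;

	/* set the GS base directly; the GS selector stays 0 */
	syscall(SYS_arch_prctl, ARCH_SET_GS, 0x7f1234560000UL);
	syscall(SYS_arch_prctl, ARCH_GET_GS, &base);
	printf("gsbase = 0x%lx\n", base);
	return 0;
}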
View file

@@ -2564,6 +2564,23 @@ static int init_resync(struct r1conf *conf)
return 0;
}
static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf)
{
struct r1bio *r1bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
struct resync_pages *rps;
struct bio *bio;
int i;
for (i = conf->poolinfo->raid_disks; i--; ) {
bio = r1bio->bios[i];
rps = bio->bi_private;
bio_reset(bio);
bio->bi_private = rps;
}
r1bio->master_bio = NULL;
return r1bio;
}
/*
* perform a "sync" on one "block"
*
@@ -2649,7 +2666,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
bitmap_cond_end_sync(mddev->bitmap, sector_nr,
mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
r1_bio = raid1_alloc_init_r1buf(conf);
raise_barrier(conf, sector_nr);

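raid1_alloc_init_r1buf() exists because bio_reset() wipes everything in the bio, including ->bi_private, which here carries the resync page state. The save/restore pattern in isolation, as a user-space analogue (the struct and names are invented for illustration):

#include <stdio.h>
#include <string.h>

struct fake_bio { void *bi_private; int other_state[8]; };

static void reset_keep_private(struct fake_bio *bio)
{
	void *saved = bio->bi_private;   /* rescue the attached state */

	memset(bio, 0, sizeof(*bio));    /* stand-in for bio_reset() */
	bio->bi_private = saved;         /* put it back afterwards */
}

int main(void)
{
	int state = 42;
	struct fake_bio b = { .bi_private = &state, .other_state = { 1, 2 } };

	reset_keep_private(&b);
	printf("kept private: %d\n", *(int *)b.bi_private);
	return 0;
}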
View file

@@ -2798,6 +2798,35 @@ static int init_resync(struct r10conf *conf)
return 0;
}
static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
{
struct r10bio *r10bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
struct rsync_pages *rp;
struct bio *bio;
int nalloc;
int i;
if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
nalloc = conf->copies; /* resync */
else
nalloc = 2; /* recovery */
for (i = 0; i < nalloc; i++) {
bio = r10bio->devs[i].bio;
rp = bio->bi_private;
bio_reset(bio);
bio->bi_private = rp;
bio = r10bio->devs[i].repl_bio;
if (bio) {
rp = bio->bi_private;
bio_reset(bio);
bio->bi_private = rp;
}
}
return r10bio;
}
/*
* perform a "sync" on one "block"
*
@@ -3027,7 +3056,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
atomic_inc(&mreplace->nr_pending);
rcu_read_unlock();
r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
r10_bio = raid10_alloc_init_r10buf(conf);
r10_bio->state = 0;
raise_barrier(conf, rb2 != NULL);
atomic_set(&r10_bio->remaining, 0);
@@ -3236,7 +3265,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
}
if (sync_blocks < max_sync)
max_sync = sync_blocks;
r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
r10_bio = raid10_alloc_init_r10buf(conf);
r10_bio->state = 0;
r10_bio->mddev = mddev;
@@ -4360,7 +4389,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
read_more:
/* Now schedule reads for blocks from sector_nr to last */
r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
r10_bio = raid10_alloc_init_r10buf(conf);
r10_bio->state = 0;
raise_barrier(conf, sectors_done != 0);
atomic_set(&r10_bio->remaining, 0);

View file

@@ -6235,6 +6235,10 @@ static void raid5_do_work(struct work_struct *work)
spin_unlock_irq(&conf->device_lock);
flush_deferred_bios(conf);
r5l_flush_stripe_to_raid(conf->log);
async_tx_issue_pending_all();
blk_finish_plug(&plug);

View file

@@ -3687,7 +3687,7 @@ static noinline void gfar_update_link_state(struct gfar_private *priv)
u32 tempval1 = gfar_read(&regs->maccfg1);
u32 tempval = gfar_read(&regs->maccfg2);
u32 ecntrl = gfar_read(&regs->ecntrl);
u32 tx_flow_oldval = (tempval & MACCFG1_TX_FLOW);
u32 tx_flow_oldval = (tempval1 & MACCFG1_TX_FLOW);
if (phydev->duplex != priv->oldduplex) {
if (!(phydev->duplex))

View file

@@ -1429,6 +1429,8 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns)
}
btt_sb = devm_kzalloc(&nd_btt->dev, sizeof(*btt_sb), GFP_KERNEL);
if (!btt_sb)
return -ENOMEM;
/*
* If this returns < 0, that is ok as it just means there wasn't

View file

@@ -905,19 +905,20 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
int read_only, unsigned int ioctl_cmd, unsigned long arg)
{
struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
size_t buf_len = 0, in_len = 0, out_len = 0;
static char out_env[ND_CMD_MAX_ENVELOPE];
static char in_env[ND_CMD_MAX_ENVELOPE];
const struct nd_cmd_desc *desc = NULL;
unsigned int cmd = _IOC_NR(ioctl_cmd);
unsigned int func = cmd;
void __user *p = (void __user *) arg;
struct device *dev = &nvdimm_bus->dev;
struct nd_cmd_pkg pkg;
void __user *p = (void __user *) arg;
const char *cmd_name, *dimm_name;
u32 in_len = 0, out_len = 0;
unsigned int func = cmd;
unsigned long cmd_mask;
void *buf;
struct nd_cmd_pkg pkg;
int rc, i, cmd_rc;
u64 buf_len = 0;
void *buf;
if (nvdimm) {
desc = nd_cmd_dimm_desc(cmd);
@@ -977,7 +978,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
if (cmd == ND_CMD_CALL) {
func = pkg.nd_command;
dev_dbg(dev, "%s:%s, idx: %llu, in: %zu, out: %zu, len %zu\n",
dev_dbg(dev, "%s:%s, idx: %llu, in: %u, out: %u, len %llu\n",
__func__, dimm_name, pkg.nd_command,
in_len, out_len, buf_len);
@@ -1007,9 +1008,9 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
out_len += out_size;
}
buf_len = out_len + in_len;
buf_len = (u64) out_len + (u64) in_len;
if (buf_len > ND_IOCTL_MAX_BUFLEN) {
dev_dbg(dev, "%s:%s cmd: %s buf_len: %zu > %d\n", __func__,
dev_dbg(dev, "%s:%s cmd: %s buf_len: %llu > %d\n", __func__,
dimm_name, cmd_name, buf_len,
ND_IOCTL_MAX_BUFLEN);
return -EINVAL;

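The point of widening buf_len to u64 and casting before the addition: two caller-controlled 32-bit lengths can wrap a 32-bit sum and slip past the ND_IOCTL_MAX_BUFLEN check. A standalone sketch of the failure mode (the values are arbitrary):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t in_len = 0xffffff00u, out_len = 0x200u;

	uint32_t wrapped = in_len + out_len;                   /* mod 2^32 */
	uint64_t widened = (uint64_t)in_len + (uint64_t)out_len;

	/* the wrapped sum looks tiny and would pass a size check */
	printf("wrapped = 0x%x\nwidened = 0x%llx\n",
	       wrapped, (unsigned long long)widened);
	return 0;
}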
View file

@@ -807,11 +807,11 @@ static ssize_t key_store(struct device *dev, struct device_attribute *attr,
struct tb_switch *sw = tb_to_switch(dev);
u8 key[TB_SWITCH_KEY_SIZE];
ssize_t ret = count;
bool clear = false;
if (count < 64)
return -EINVAL;
if (hex2bin(key, buf, sizeof(key)))
if (!strcmp(buf, "\n"))
clear = true;
else if (hex2bin(key, buf, sizeof(key)))
return -EINVAL;
if (mutex_lock_interruptible(&switch_lock))
@@ -821,15 +821,19 @@ static ssize_t key_store(struct device *dev, struct device_attribute *attr,
ret = -EBUSY;
} else {
kfree(sw->key);
sw->key = kmemdup(key, sizeof(key), GFP_KERNEL);
if (!sw->key)
ret = -ENOMEM;
if (clear) {
sw->key = NULL;
} else {
sw->key = kmemdup(key, sizeof(key), GFP_KERNEL);
if (!sw->key)
ret = -ENOMEM;
}
}
mutex_unlock(&switch_lock);
return ret;
}
static DEVICE_ATTR_RW(key);
static DEVICE_ATTR(key, 0600, key_show, key_store);
static ssize_t nvm_authenticate_show(struct device *dev,
struct device_attribute *attr, char *buf)

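From user space the updated attribute accepts either 64 hex characters or a bare newline to clear the key, matching the ABI text at the top of this commit. A sketch of both writes; the device path here is a hypothetical placeholder:

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* path depends on the actual device; placeholder for illustration */
	const char *attr = "/sys/bus/thunderbolt/devices/0-1/key";
	char key[65] = { 0 };
	FILE *f;

	memset(key, 'a', 64);            /* 32 bytes as 64 hex digits */

	f = fopen(attr, "w");
	if (!f)
		return 1;
	fprintf(f, "%s\n", key);         /* set the key */
	fclose(f);

	f = fopen(attr, "w");
	if (!f)
		return 1;
	fputs("\n", f);                  /* an empty string clears it */
	fclose(f);
	return 0;
}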
View file

@@ -634,8 +634,13 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
preempt_enable();
if (vhost_enable_notify(&net->dev, vq))
if (!vhost_vq_avail_empty(&net->dev, vq))
vhost_poll_queue(&vq->poll);
else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
vhost_disable_notify(&net->dev, vq);
vhost_poll_queue(&vq->poll);
}
mutex_unlock(&vq->mutex);
len = peek_head_len(rvq, sk);

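The new code follows the classic lost-wakeup idiom: check for work, arm the notification, then re-check, because a producer may have slipped in between the two steps. Stripped of the vhost machinery, the shape is roughly the following (C11 atomics; none of these names are vhost API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int  pending;
static atomic_bool notify_armed;

static bool need_poll(void)
{
	if (atomic_load(&pending))
		return true;                     /* work already queued */

	atomic_store(&notify_armed, true);       /* arm the notification */
	if (atomic_load(&pending)) {             /* re-check: a producer may
						  * have raced with us */
		atomic_store(&notify_armed, false);
		return true;
	}
	return false;                            /* safe to wait */
}

int main(void)
{
	atomic_store(&pending, 1);               /* simulate a racing producer */
	printf("poll now: %d\n", need_poll());
	return 0;
}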
View file

@@ -291,7 +291,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
return 0;
/* Get the previous summary */
for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) {
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
struct curseg_info *curseg = CURSEG_I(sbi, i);
if (curseg->segno == segno) {
sum = curseg->sum_blk->entries[blkoff];
@@ -599,8 +599,6 @@ out:
}
clear_sbi_flag(sbi, SBI_POR_DOING);
if (err)
set_ckpt_flags(sbi, CP_ERROR_FLAG);
mutex_unlock(&sbi->cp_mutex);
/* let's drop all the directory inodes for clean checkpoint */

View file

@@ -1222,9 +1222,6 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
struct fuse_in *in;
unsigned reqsize;
if (task_active_pid_ns(current) != fc->pid_ns)
return -EIO;
restart:
spin_lock(&fiq->waitq.lock);
err = -EAGAIN;
@@ -1262,6 +1259,13 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
in = &req->in;
reqsize = in->h.len;
if (task_active_pid_ns(current) != fc->pid_ns) {
rcu_read_lock();
in->h.pid = pid_vnr(find_pid_ns(in->h.pid, fc->pid_ns));
rcu_read_unlock();
}
/* If request is too large, reply with an error and restart the read */
if (nbytes < reqsize) {
req->out.h.error = -EIO;
@@ -1823,9 +1827,6 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
struct fuse_req *req;
struct fuse_out_header oh;
if (task_active_pid_ns(current) != fc->pid_ns)
return -EIO;
if (nbytes < sizeof(struct fuse_out_header))
return -EINVAL;

View file

@@ -2181,9 +2181,6 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX)
return 0;
if (pid && pid_nr == 0)
return -EOVERFLOW;
fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
err = fuse_simple_request(fc, &args);

View file

@@ -637,6 +637,7 @@ again:
dispose_list(&dispose);
}
EXPORT_SYMBOL_GPL(evict_inodes);
/**
* invalidate_inodes - attempt to free all inodes on a superblock

View file

@@ -132,7 +132,6 @@ static inline bool atime_needs_update_rcu(const struct path *path,
extern void inode_io_list_del(struct inode *inode);
extern long get_nr_dirty_inodes(void);
extern void evict_inodes(struct super_block *);
extern int invalidate_inodes(struct super_block *, bool);
/*

View file

@@ -576,10 +576,13 @@ static int ovl_inode_set(struct inode *inode, void *data)
static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry,
struct dentry *upperdentry)
{
struct inode *lowerinode = lowerdentry ? d_inode(lowerdentry) : NULL;
/* Lower (origin) inode must match, even if NULL */
if (ovl_inode_lower(inode) != lowerinode)
/*
* Allow non-NULL lower inode in ovl_inode even if lowerdentry is NULL.
* This happens when finding a copied up overlay inode for a renamed
* or hardlinked overlay dentry and lower dentry cannot be followed
* by origin because lower fs does not support file handles.
*/
if (lowerdentry && ovl_inode_lower(inode) != d_inode(lowerdentry))
return false;
/*

View file

@@ -579,7 +579,7 @@ xfs_bmap_validate_ret(
#else
#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0)
#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) do { } while (0)
#endif /* DEBUG */
/*

View file

@@ -858,6 +858,7 @@ xfs_bmbt_change_owner(
cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
if (!cur)
return -ENOMEM;
cur->bc_private.b.flags |= XFS_BTCUR_BPRV_INVALID_OWNER;
error = xfs_btree_change_owner(cur, new_owner, buffer_list);
xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);

View file

@@ -1791,6 +1791,7 @@ xfs_btree_lookup_get_block(
/* Check the inode owner since the verifiers don't. */
if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
!(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) &&
(cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
cur->bc_private.b.ip->i_ino)
@@ -4451,10 +4452,15 @@ xfs_btree_block_change_owner(
/* modify the owner */
block = xfs_btree_get_block(cur, level, &bp);
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
return 0;
block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
else
} else {
if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
return 0;
block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
}
/*
* If the block is a root block hosted in an inode, we might not have a
@@ -4463,16 +4469,19 @@ xfs_btree_block_change_owner(
* block is formatted into the on-disk inode fork. We still change it,
* though, so everything is consistent in memory.
*/
if (bp) {
if (cur->bc_tp) {
xfs_trans_ordered_buf(cur->bc_tp, bp);
xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
} else {
xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
}
} else {
if (!bp) {
ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
ASSERT(level == cur->bc_nlevels - 1);
return 0;
}
if (cur->bc_tp) {
if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
return -EAGAIN;
}
} else {
xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
}
return 0;

View file

@@ -233,7 +233,8 @@ typedef struct xfs_btree_cur
short forksize; /* fork's inode space */
char whichfork; /* data or attr fork */
char flags; /* flags */
#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */
#define XFS_BTCUR_BPRV_WASDEL (1<<0) /* was delayed */
#define XFS_BTCUR_BPRV_INVALID_OWNER (1<<1) /* for ext swap */
} b;
} bc_private; /* per-btree type data */
} xfs_btree_cur_t;

View file

@@ -378,8 +378,6 @@ xfs_ialloc_inode_init(
* transaction and pin the log appropriately.
*/
xfs_trans_ordered_buf(tp, fbuf);
xfs_trans_log_buf(tp, fbuf, 0,
BBTOB(fbuf->b_length) - 1);
}
} else {
fbuf->b_flags |= XBF_DONE;
@@ -1133,6 +1131,7 @@ xfs_dialloc_ag_inobt(
int error;
int offset;
int i, j;
int searchdistance = 10;
pag = xfs_perag_get(mp, agno);
@@ -1159,7 +1158,6 @@ xfs_dialloc_ag_inobt(
if (pagno == agno) {
int doneleft; /* done, to the left */
int doneright; /* done, to the right */
int searchdistance = 10;
error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
if (error)
@@ -1220,21 +1218,9 @@
/*
* Loop until we find an inode chunk with a free inode.
*/
while (!doneleft || !doneright) {
while (--searchdistance > 0 && (!doneleft || !doneright)) {
int useleft; /* using left inode chunk this time */
if (!--searchdistance) {
/*
* Not in range - save last search
* location and allocate a new inode
*/
xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
pag->pagl_leftrec = trec.ir_startino;
pag->pagl_rightrec = rec.ir_startino;
pag->pagl_pagino = pagino;
goto newino;
}
/* figure out the closer block if both are valid. */
if (!doneleft && !doneright) {
useleft = pagino -
@@ -1278,26 +1264,37 @@
goto error1;
}
/*
* We've reached the end of the btree. because
* we are only searching a small chunk of the
* btree each search, there are obviously free
* inodes closer to the parent inode than we
* are now. restart the search again.
*/
pag->pagl_pagino = NULLAGINO;
pag->pagl_leftrec = NULLAGINO;
pag->pagl_rightrec = NULLAGINO;
xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
goto restart_pagno;
if (searchdistance <= 0) {
/*
* Not in range - save last search
* location and allocate a new inode
*/
xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
pag->pagl_leftrec = trec.ir_startino;
pag->pagl_rightrec = rec.ir_startino;
pag->pagl_pagino = pagino;
} else {
/*
* We've reached the end of the btree. because
* we are only searching a small chunk of the
* btree each search, there are obviously free
* inodes closer to the parent inode than we
* are now. restart the search again.
*/
pag->pagl_pagino = NULLAGINO;
pag->pagl_leftrec = NULLAGINO;
pag->pagl_rightrec = NULLAGINO;
xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
goto restart_pagno;
}
}
/*
* In a different AG from the parent.
* See if the most recently allocated block has any free.
*/
newino:
if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
XFS_LOOKUP_EQ, &i);

View file

@@ -1499,14 +1499,11 @@ xfs_iext_realloc_indirect(
xfs_ifork_t *ifp, /* inode fork pointer */
int new_size) /* new indirection array size */
{
int nlists; /* number of irec's (ex lists) */
int size; /* current indirection array size */
ASSERT(ifp->if_flags & XFS_IFEXTIREC);
nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
size = nlists * sizeof(xfs_ext_irec_t);
ASSERT(ifp->if_real_bytes);
ASSERT((new_size >= 0) && (new_size != size));
ASSERT((new_size >= 0) &&
(new_size != ((ifp->if_real_bytes / XFS_IEXT_BUFSZ) *
sizeof(xfs_ext_irec_t))));
if (new_size == 0) {
xfs_iext_destroy(ifp);
} else {

View file

@@ -85,11 +85,11 @@ xfs_find_bdev_for_inode(
* associated buffer_heads, paying attention to the start and end offsets that
* we need to process on the page.
*
* Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
* buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
* the page at all, as we may be racing with memory reclaim and it can free both
* the bufferhead chain and the page as it will see the page as clean and
* unused.
* Note that we open code the action in end_buffer_async_write here so that we
* only have to iterate over the buffers attached to the page once. This is not
* only more efficient, but also ensures that we only call end_page_writeback
* at the end of the iteration, and thus avoids the pitfall of having the page
* and buffers potentially freed after every call to end_buffer_async_write.
*/
static void
xfs_finish_page_writeback(
@@ -97,29 +97,44 @@ xfs_finish_page_writeback(
struct bio_vec *bvec,
int error)
{
unsigned int end = bvec->bv_offset + bvec->bv_len - 1;
struct buffer_head *head, *bh, *next;
struct buffer_head *head = page_buffers(bvec->bv_page), *bh = head;
bool busy = false;
unsigned int off = 0;
unsigned int bsize;
unsigned long flags;
ASSERT(bvec->bv_offset < PAGE_SIZE);
ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0);
ASSERT(end < PAGE_SIZE);
ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
bh = head = page_buffers(bvec->bv_page);
bsize = bh->b_size;
local_irq_save(flags);
bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
do {
if (off > end)
break;
next = bh->b_this_page;
if (off < bvec->bv_offset)
goto next_bh;
bh->b_end_io(bh, !error);
next_bh:
off += bsize;
} while ((bh = next) != head);
if (off >= bvec->bv_offset &&
off < bvec->bv_offset + bvec->bv_len) {
ASSERT(buffer_async_write(bh));
ASSERT(bh->b_end_io == NULL);
if (error) {
mark_buffer_write_io_error(bh);
clear_buffer_uptodate(bh);
SetPageError(bvec->bv_page);
} else {
set_buffer_uptodate(bh);
}
clear_buffer_async_write(bh);
unlock_buffer(bh);
} else if (buffer_async_write(bh)) {
ASSERT(buffer_locked(bh));
busy = true;
}
off += bh->b_size;
} while ((bh = bh->b_this_page) != head);
bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
local_irq_restore(flags);
if (!busy)
end_page_writeback(bvec->bv_page);
}
/*
@@ -133,8 +148,10 @@ xfs_destroy_ioend(
int error)
{
struct inode *inode = ioend->io_inode;
struct bio *last = ioend->io_bio;
struct bio *bio, *next;
struct bio *bio = &ioend->io_inline_bio;
struct bio *last = ioend->io_bio, *next;
u64 start = bio->bi_iter.bi_sector;
bool quiet = bio_flagged(bio, BIO_QUIET);
for (bio = &ioend->io_inline_bio; bio; bio = next) {
struct bio_vec *bvec;
@ -155,6 +172,11 @@ xfs_destroy_ioend(
bio_put(bio);
}
if (unlikely(error && !quiet)) {
xfs_err_ratelimited(XFS_I(inode)->i_mount,
"writeback error on sector %llu", start);
}
}
/*
@@ -423,7 +445,8 @@ xfs_start_buffer_writeback(
ASSERT(!buffer_delay(bh));
ASSERT(!buffer_unwritten(bh));
mark_buffer_async_write(bh);
bh->b_end_io = NULL;
set_buffer_async_write(bh);
set_buffer_uptodate(bh);
clear_buffer_dirty(bh);
}

View file

@@ -1840,29 +1840,18 @@ xfs_swap_extent_forks(
}
/*
* Before we've swapped the forks, lets set the owners of the forks
* appropriately. We have to do this as we are demand paging the btree
* buffers, and so the validation done on read will expect the owner
* field to be correctly set. Once we change the owners, we can swap the
* inode forks.
* Btree format (v3) inodes have the inode number stamped in the bmbt
* block headers. We can't start changing the bmbt blocks until the
* inode owner change is logged so recovery does the right thing in the
* event of a crash. Set the owner change log flags now and leave the
* bmbt scan as the last step.
*/
if (ip->i_d.di_version == 3 &&
ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
ip->i_d.di_format == XFS_DINODE_FMT_BTREE)
(*target_log_flags) |= XFS_ILOG_DOWNER;
error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
tip->i_ino, NULL);
if (error)
return error;
}
if (tip->i_d.di_version == 3 &&
tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
(*src_log_flags) |= XFS_ILOG_DOWNER;
error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
ip->i_ino, NULL);
if (error)
return error;
}
/*
* Swap the data forks of the inodes
@@ -1940,6 +1929,48 @@ xfs_swap_extent_forks(
return 0;
}
/*
* Fix up the owners of the bmbt blocks to refer to the current inode. The
* change owner scan attempts to order all modified buffers in the current
* transaction. In the event of ordered buffer failure, the offending buffer is
* physically logged as a fallback and the scan returns -EAGAIN. We must roll
* the transaction in this case to replenish the fallback log reservation and
* restart the scan. This process repeats until the scan completes.
*/
static int
xfs_swap_change_owner(
struct xfs_trans **tpp,
struct xfs_inode *ip,
struct xfs_inode *tmpip)
{
int error;
struct xfs_trans *tp = *tpp;
do {
error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino,
NULL);
/* success or fatal error */
if (error != -EAGAIN)
break;
error = xfs_trans_roll(tpp, NULL);
if (error)
break;
tp = *tpp;
/*
* Redirty both inodes so they can relog and keep the log tail
* moving forward.
*/
xfs_trans_ijoin(tp, ip, 0);
xfs_trans_ijoin(tp, tmpip, 0);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE);
} while (true);
return error;
}
int
xfs_swap_extents(
struct xfs_inode *ip, /* target inode */
@@ -1954,7 +1985,7 @@ xfs_swap_extents(
int lock_flags;
struct xfs_ifork *cowfp;
uint64_t f;
int resblks;
int resblks = 0;
/*
* Lock the inodes against other IO, page faults and truncate to
@@ -2002,11 +2033,8 @@ xfs_swap_extents(
XFS_SWAP_RMAP_SPACE_RES(mp,
XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK),
XFS_DATA_FORK);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
0, 0, &tp);
} else
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0,
0, 0, &tp);
}
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
if (error)
goto out_unlock;
@@ -2091,6 +2119,23 @@ xfs_swap_extents(
xfs_trans_log_inode(tp, ip, src_log_flags);
xfs_trans_log_inode(tp, tip, target_log_flags);
/*
* The extent forks have been swapped, but crc=1,rmapbt=0 filesystems
* have inode number owner values in the bmbt blocks that still refer to
* the old inode. Scan each bmbt to fix up the owner values with the
* inode number of the current inode.
*/
if (src_log_flags & XFS_ILOG_DOWNER) {
error = xfs_swap_change_owner(&tp, ip, tip);
if (error)
goto out_trans_cancel;
}
if (target_log_flags & XFS_ILOG_DOWNER) {
error = xfs_swap_change_owner(&tp, tip, ip);
if (error)
goto out_trans_cancel;
}
/*
* If this is a synchronous mount, make sure that the
* transaction goes to disk before returning to the user.

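The shape of xfs_swap_change_owner() above, retry until done with a transaction roll on -EAGAIN, is worth seeing without the XFS specifics. A runnable sketch with stubbed-out helpers (all names invented; the stub fails twice before succeeding):

#include <errno.h>
#include <stdio.h>

static int attempts;

static int do_scan(void)                 /* stand-in for the owner-change scan */
{
	return (++attempts < 3) ? -EAGAIN : 0;
}

static int roll_transaction(void)        /* stand-in for xfs_trans_roll() */
{
	printf("rolled, reservation replenished\n");
	return 0;
}

static int change_owner(void)
{
	int error;

	do {
		error = do_scan();
		if (error != -EAGAIN)
			break;           /* success or fatal error */
		error = roll_transaction();
	} while (!error);

	return error;
}

int main(void)
{
	printf("change_owner() = %d after %d attempts\n", change_owner(), attempts);
	return 0;
}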
View file

@@ -29,6 +29,7 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_inode.h"
kmem_zone_t *xfs_buf_item_zone;
@@ -322,6 +323,8 @@ xfs_buf_item_format(
ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
(xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
&& xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) ||
(bip->bli_flags & XFS_BLI_STALE));
/*
@@ -346,16 +349,6 @@ xfs_buf_item_format(
bip->bli_flags &= ~XFS_BLI_INODE_BUF;
}
if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) ==
XFS_BLI_ORDERED) {
/*
* The buffer has been logged just to order it. It is not being
* included in the transaction commit, so don't format it.
*/
trace_xfs_buf_item_format_ordered(bip);
return;
}
for (i = 0; i < bip->bli_format_count; i++) {
xfs_buf_item_format_segment(bip, lv, &vecp, offset,
&bip->bli_formats[i]);
@@ -574,26 +567,20 @@ xfs_buf_item_unlock(
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
struct xfs_buf *bp = bip->bli_buf;
bool clean;
bool aborted;
int flags;
bool aborted = !!(lip->li_flags & XFS_LI_ABORTED);
bool hold = !!(bip->bli_flags & XFS_BLI_HOLD);
bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY);
#if defined(DEBUG) || defined(XFS_WARN)
bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED);
#endif
/* Clear the buffer's association with this transaction. */
bp->b_transp = NULL;
/*
* If this is a transaction abort, don't return early. Instead, allow
* the brelse to happen. Normally it would be done for stale
* (cancelled) buffers at unpin time, but we'll never go through the
* pin/unpin cycle if we abort inside commit.
* The per-transaction state has been copied above so clear it from the
* bli.
*/
aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false;
/*
* Before possibly freeing the buf item, copy the per-transaction state
* so we can reference it safely later after clearing it from the
* buffer log item.
*/
flags = bip->bli_flags;
bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
/*
@@ -601,7 +588,7 @@ xfs_buf_item_unlock(
* unlock the buffer and free the buf item when the buffer is unpinned
* for the last time.
*/
if (flags & XFS_BLI_STALE) {
if (bip->bli_flags & XFS_BLI_STALE) {
trace_xfs_buf_item_unlock_stale(bip);
ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
if (!aborted) {
@@ -619,20 +606,11 @@ xfs_buf_item_unlock(
* regardless of whether it is dirty or not. A dirty abort implies a
* shutdown, anyway.
*
* Ordered buffers are dirty but may have no recorded changes, so ensure
* we only release clean items here.
* The bli dirty state should match whether the blf has logged segments
* except for ordered buffers, where only the bli should be dirty.
*/
clean = (flags & XFS_BLI_DIRTY) ? false : true;
if (clean) {
int i;
for (i = 0; i < bip->bli_format_count; i++) {
if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
bip->bli_formats[i].blf_map_size)) {
clean = false;
break;
}
}
}
ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) ||
(ordered && dirty && !xfs_buf_item_dirty_format(bip)));
/*
* Clean buffers, by definition, cannot be in the AIL. However, aborted
@ -651,11 +629,11 @@ xfs_buf_item_unlock(
ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR);
xfs_buf_item_relse(bp);
} else if (clean)
} else if (!dirty)
xfs_buf_item_relse(bp);
}
if (!(flags & XFS_BLI_HOLD))
if (!hold)
xfs_buf_relse(bp);
}
@@ -945,14 +923,22 @@ xfs_buf_item_log(
/*
* Return 1 if the buffer has been logged or ordered in a transaction (at any
* point, not just the current transaction) and 0 if not.
* Return true if the buffer has any ranges logged/dirtied by a transaction,
* false otherwise.
*/
uint
xfs_buf_item_dirty(
xfs_buf_log_item_t *bip)
bool
xfs_buf_item_dirty_format(
struct xfs_buf_log_item *bip)
{
return (bip->bli_flags & XFS_BLI_DIRTY);
int i;
for (i = 0; i < bip->bli_format_count; i++) {
if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
bip->bli_formats[i].blf_map_size))
return true;
}
return false;
}
STATIC void
@@ -1054,6 +1040,31 @@ xfs_buf_do_callbacks(
}
}
/*
* Invoke the error state callback for each log item affected by the failed I/O.
*
* If a metadata buffer write fails with a non-permanent error, the buffer is
* eventually resubmitted and so the completion callbacks are not run. The error
* state may need to be propagated to the log items attached to the buffer,
* however, so the next AIL push of the item knows how to handle it correctly.
*/
STATIC void
xfs_buf_do_callbacks_fail(
struct xfs_buf *bp)
{
struct xfs_log_item *next;
struct xfs_log_item *lip = bp->b_fspriv;
struct xfs_ail *ailp = lip->li_ailp;
spin_lock(&ailp->xa_lock);
for (; lip; lip = next) {
next = lip->li_bio_list;
if (lip->li_ops->iop_error)
lip->li_ops->iop_error(lip, bp);
}
spin_unlock(&ailp->xa_lock);
}
static bool
xfs_buf_iodone_callback_error(
struct xfs_buf *bp)
@@ -1123,7 +1134,11 @@ xfs_buf_iodone_callback_error(
if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
goto permanent_error;
/* still a transient error, higher layers will retry */
/*
* Still a transient error, run IO completion failure callbacks and let
* the higher layers retry the buffer.
*/
xfs_buf_do_callbacks_fail(bp);
xfs_buf_ioerror(bp, 0);
xfs_buf_relse(bp);
return true;
@@ -1204,3 +1219,31 @@ xfs_buf_iodone(
xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
xfs_buf_item_free(BUF_ITEM(lip));
}
/*
* Requeue a failed buffer for writeback
*
* Return true if the buffer has been re-queued properly, false otherwise
*/
bool
xfs_buf_resubmit_failed_buffers(
struct xfs_buf *bp,
struct xfs_log_item *lip,
struct list_head *buffer_list)
{
struct xfs_log_item *next;
/*
* Clear XFS_LI_FAILED flag from all items before resubmit
*
* XFS_LI_FAILED set/clear is protected by xa_lock; the caller of
* this function must already hold it.
*/
for (; lip; lip = next) {
next = lip->li_bio_list;
xfs_clear_li_failed(lip);
}
/* Add this buffer back to the delayed write list */
return xfs_buf_delwri_queue(bp, buffer_list);
}

View file

@@ -64,12 +64,15 @@ typedef struct xfs_buf_log_item {
int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
void xfs_buf_item_relse(struct xfs_buf *);
void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
uint xfs_buf_item_dirty(xfs_buf_log_item_t *);
bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *);
void xfs_buf_attach_iodone(struct xfs_buf *,
void(*)(struct xfs_buf *, xfs_log_item_t *),
xfs_log_item_t *);
void xfs_buf_iodone_callbacks(struct xfs_buf *);
void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *,
struct xfs_log_item *,
struct list_head *);
extern kmem_zone_t *xfs_buf_item_zone;

View file

@@ -1124,11 +1124,11 @@ reclaim:
* Because we use RCU freeing we need to ensure the inode always appears
* to be reclaimed with an invalid inode number when in the free state.
* We do this as early as possible under the ILOCK so that
* xfs_iflush_cluster() can be guaranteed to detect races with us here.
* By doing this, we guarantee that once xfs_iflush_cluster has locked
* XFS_ILOCK that it will see either a valid, flushable inode that will
* serialise correctly, or it will see a clean (and invalid) inode that
* it can skip.
* xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
* detect races with us here. By doing this, we guarantee that once
* xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
* it will see either a valid inode that will serialise correctly, or it
* will see an invalid inode that it can skip.
*/
spin_lock(&ip->i_flags_lock);
ip->i_flags = XFS_IRECLAIM;

View file

@@ -2359,11 +2359,24 @@ retry:
* already marked stale. If we can't lock it, back off
* and retry.
*/
if (ip != free_ip &&
!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
rcu_read_unlock();
delay(1);
goto retry;
if (ip != free_ip) {
if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
rcu_read_unlock();
delay(1);
goto retry;
}
/*
* Check the inode number again in case we're
* racing with freeing in xfs_reclaim_inode().
* See the comments in that function for more
* information as to why the initial check is
* not sufficient.
*/
if (ip->i_ino != inum + i) {
xfs_iunlock(ip, XFS_ILOCK_EXCL);
continue;
}
}
rcu_read_unlock();

View file

@@ -27,6 +27,7 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_trans_priv.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
@@ -475,6 +476,23 @@ xfs_inode_item_unpin(
wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
}
/*
* Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer
* have been failed during writeback
*
* This informs the AIL that the inode is already flush locked on the next push,
* and acquires a hold on the buffer to ensure that it isn't reclaimed before
* dirty data makes it to disk.
*/
STATIC void
xfs_inode_item_error(
struct xfs_log_item *lip,
struct xfs_buf *bp)
{
ASSERT(xfs_isiflocked(INODE_ITEM(lip)->ili_inode));
xfs_set_li_failed(lip, bp);
}
STATIC uint
xfs_inode_item_push(
struct xfs_log_item *lip,
@@ -484,13 +502,28 @@ xfs_inode_item_push(
{
struct xfs_inode_log_item *iip = INODE_ITEM(lip);
struct xfs_inode *ip = iip->ili_inode;
struct xfs_buf *bp = NULL;
struct xfs_buf *bp = lip->li_buf;
uint rval = XFS_ITEM_SUCCESS;
int error;
if (xfs_ipincount(ip) > 0)
return XFS_ITEM_PINNED;
/*
* The buffer containing this item failed to be written back
* previously. Resubmit the buffer for IO.
*/
if (lip->li_flags & XFS_LI_FAILED) {
if (!xfs_buf_trylock(bp))
return XFS_ITEM_LOCKED;
if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list))
rval = XFS_ITEM_FLUSHING;
xfs_buf_unlock(bp);
return rval;
}
if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
return XFS_ITEM_LOCKED;
@@ -622,7 +655,8 @@ static const struct xfs_item_ops xfs_inode_item_ops = {
.iop_unlock = xfs_inode_item_unlock,
.iop_committed = xfs_inode_item_committed,
.iop_push = xfs_inode_item_push,
.iop_committing = xfs_inode_item_committing
.iop_committing = xfs_inode_item_committing,
.iop_error = xfs_inode_item_error
};
@@ -710,7 +744,8 @@ xfs_iflush_done(
* the AIL lock.
*/
iip = INODE_ITEM(blip);
if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn)
if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) ||
lip->li_flags & XFS_LI_FAILED)
need_ail++;
blip = next;
@@ -718,7 +753,8 @@ xfs_iflush_done(
/* make sure we capture the state of the initial inode. */
iip = INODE_ITEM(lip);
if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn)
if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) ||
lip->li_flags & XFS_LI_FAILED)
need_ail++;
/*
@@ -739,6 +775,9 @@ xfs_iflush_done(
if (INODE_ITEM(blip)->ili_logged &&
blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn)
mlip_changed |= xfs_ail_delete_one(ailp, blip);
else {
xfs_clear_li_failed(blip);
}
}
if (mlip_changed) {

View file

@@ -931,16 +931,15 @@ xfs_ioc_fsgetxattr(
return 0;
}
STATIC void
xfs_set_diflags(
STATIC uint16_t
xfs_flags2diflags(
struct xfs_inode *ip,
unsigned int xflags)
{
unsigned int di_flags;
uint64_t di_flags2;
/* can't set PREALLOC this way, just preserve it */
di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
uint16_t di_flags =
(ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
if (xflags & FS_XFLAG_IMMUTABLE)
di_flags |= XFS_DIFLAG_IMMUTABLE;
if (xflags & FS_XFLAG_APPEND)
@@ -970,19 +969,24 @@ xfs_set_diflags(
if (xflags & FS_XFLAG_EXTSIZE)
di_flags |= XFS_DIFLAG_EXTSIZE;
}
ip->i_d.di_flags = di_flags;
/* diflags2 only valid for v3 inodes. */
if (ip->i_d.di_version < 3)
return;
return di_flags;
}
STATIC uint64_t
xfs_flags2diflags2(
struct xfs_inode *ip,
unsigned int xflags)
{
uint64_t di_flags2 =
(ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK);
di_flags2 = (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK);
if (xflags & FS_XFLAG_DAX)
di_flags2 |= XFS_DIFLAG2_DAX;
if (xflags & FS_XFLAG_COWEXTSIZE)
di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
ip->i_d.di_flags2 = di_flags2;
return di_flags2;
}
STATIC void
@@ -1008,11 +1012,12 @@ xfs_diflags_to_linux(
inode->i_flags |= S_NOATIME;
else
inode->i_flags &= ~S_NOATIME;
#if 0 /* disabled until the flag switching races are sorted out */
if (xflags & FS_XFLAG_DAX)
inode->i_flags |= S_DAX;
else
inode->i_flags &= ~S_DAX;
#endif
}
static int
@@ -1022,6 +1027,7 @@ xfs_ioctl_setattr_xflags(
struct fsxattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
uint64_t di_flags2;
/* Can't change realtime flag if any extents are allocated. */
if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
@@ -1052,7 +1058,14 @@ xfs_ioctl_setattr_xflags(
!capable(CAP_LINUX_IMMUTABLE))
return -EPERM;
xfs_set_diflags(ip, fa->fsx_xflags);
/* diflags2 only valid for v3 inodes. */
di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
if (di_flags2 && ip->i_d.di_version < 3)
return -EINVAL;
ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags);
ip->i_d.di_flags2 = di_flags2;
xfs_diflags_to_linux(ip);
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

View file

@@ -817,7 +817,7 @@ xfs_vn_setattr_nonsize(
* Caution: The caller of this function is responsible for calling
* setattr_prepare() or otherwise verifying the change is fine.
*/
int
STATIC int
xfs_setattr_size(
struct xfs_inode *ip,
struct iattr *iattr)

View file

@@ -743,10 +743,14 @@ xfs_log_mount_finish(
struct xfs_mount *mp)
{
int error = 0;
bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY);
if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
return 0;
} else if (readonly) {
/* Allow unlinked processing to proceed */
mp->m_flags &= ~XFS_MOUNT_RDONLY;
}
/*
@@ -757,12 +761,27 @@ xfs_log_mount_finish(
* inodes. Turn it off immediately after recovery finishes
* so that we don't leak the quota inodes if subsequent mount
* activities fail.
*
* We let all inodes involved in redo item processing end up on
* the LRU instead of being evicted immediately so that if we do
* something to an unlinked inode, the irele won't cause
* premature truncation and freeing of the inode, which results
* in log recovery failure. We have to evict the unreferenced
* lru inodes after clearing MS_ACTIVE because we don't
* otherwise clean up the lru if there's a subsequent failure in
* xfs_mountfs, which leads to us leaking the inodes if nothing
* else (e.g. quotacheck) references the inodes before the
* mount failure occurs.
*/
mp->m_super->s_flags |= MS_ACTIVE;
error = xlog_recover_finish(mp->m_log);
if (!error)
xfs_log_work_queue(mp);
mp->m_super->s_flags &= ~MS_ACTIVE;
evict_inodes(mp->m_super);
if (readonly)
mp->m_flags |= XFS_MOUNT_RDONLY;
return error;
}
@@ -812,11 +831,14 @@ xfs_log_unmount_write(xfs_mount_t *mp)
int error;
/*
* Don't write out unmount record on read-only mounts.
* Don't write out unmount record on norecovery mounts or ro devices.
* Or, if we are doing a forced umount (typically because of IO errors).
*/
if (mp->m_flags & XFS_MOUNT_RDONLY)
if (mp->m_flags & XFS_MOUNT_NORECOVERY ||
xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) {
ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
return 0;
}
error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
@@ -3353,8 +3375,6 @@ maybe_sleep:
*/
if (iclog->ic_state & XLOG_STATE_IOERROR)
return -EIO;
if (log_flushed)
*log_flushed = 1;
} else {
no_sleep:
@@ -3458,8 +3478,6 @@ try_again:
xlog_wait(&iclog->ic_prev->ic_write_wait,
&log->l_icloglock);
if (log_flushed)
*log_flushed = 1;
already_slept = 1;
goto try_again;
}
@@ -3493,9 +3511,6 @@ try_again:
*/
if (iclog->ic_state & XLOG_STATE_IOERROR)
return -EIO;
if (log_flushed)
*log_flushed = 1;
} else { /* just return */
spin_unlock(&log->l_icloglock);
}

View file

@@ -1029,61 +1029,106 @@ out_error:
}
/*
* Check the log tail for torn writes. This is required when torn writes are
* detected at the head and the head had to be walked back to a previous record.
* The tail of the previous record must now be verified to ensure the torn
* writes didn't corrupt the previous tail.
* Calculate distance from head to tail (i.e., unused space in the log).
*/
static inline int
xlog_tail_distance(
struct xlog *log,
xfs_daddr_t head_blk,
xfs_daddr_t tail_blk)
{
if (head_blk < tail_blk)
return tail_blk - head_blk;
return tail_blk + (log->l_logBBsize - head_blk);
}
/*
* Verify the log tail. This is particularly important when torn or incomplete
* writes have been detected near the front of the log and the head has been
* walked back accordingly.
*
* Return an error if CRC verification fails as recovery cannot proceed.
* We also have to handle the case where the tail was pinned and the head
* blocked behind the tail right before a crash. If the tail had been pushed
* immediately prior to the crash and the subsequent checkpoint was only
* partially written, it's possible it overwrote the last referenced tail in the
* log with garbage. This is not a coherency problem because the tail must have
* been pushed before it can be overwritten, but appears as log corruption to
* recovery because we have no way to know the tail was updated if the
* subsequent checkpoint didn't write successfully.
*
* Therefore, CRC check the log from tail to head. If a failure occurs and the
* offending record is within max iclog bufs from the head, walk the tail
* forward and retry until a valid tail is found or corruption is detected out
* of the range of a possible overwrite.
*/
STATIC int
xlog_verify_tail(
struct xlog *log,
xfs_daddr_t head_blk,
xfs_daddr_t tail_blk)
xfs_daddr_t *tail_blk,
int hsize)
{
struct xlog_rec_header *thead;
struct xfs_buf *bp;
xfs_daddr_t first_bad;
int count;
int error = 0;
bool wrapped;
xfs_daddr_t tmp_head;
xfs_daddr_t tmp_tail;
xfs_daddr_t orig_tail = *tail_blk;
bp = xlog_get_bp(log, 1);
if (!bp)
return -ENOMEM;
/*
* Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get
* a temporary head block that points after the last possible
* concurrently written record of the tail.
* Make sure the tail points to a record (returns positive count on
* success).
*/
count = xlog_seek_logrec_hdr(log, head_blk, tail_blk,
XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead,
&wrapped);
if (count < 0) {
error = count;
error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, bp,
&tmp_tail, &thead, &wrapped);
if (error < 0)
goto out;
if (*tail_blk != tmp_tail)
*tail_blk = tmp_tail;
/*
* Run a CRC check from the tail to the head. We can't just check
* MAX_ICLOGS records past the tail because the tail may point to stale
* blocks cleared during the search for the head/tail. These blocks are
* overwritten with zero-length records and thus record count is not a
* reliable indicator of the iclog state before a crash.
*/
first_bad = 0;
error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
XLOG_RECOVER_CRCPASS, &first_bad);
while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
int tail_distance;
/*
* Is corruption within range of the head? If so, retry from
* the next record. Otherwise return an error.
*/
tail_distance = xlog_tail_distance(log, head_blk, first_bad);
if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize))
break;
/* skip to the next record; returns positive count on success */
error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2, bp,
&tmp_tail, &thead, &wrapped);
if (error < 0)
goto out;
*tail_blk = tmp_tail;
first_bad = 0;
error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
XLOG_RECOVER_CRCPASS, &first_bad);
}
/*
* If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran
* into the actual log head. tmp_head points to the start of the record
* so update it to the actual head block.
*/
if (count < XLOG_MAX_ICLOGS + 1)
tmp_head = head_blk;
/*
* We now have a tail and temporary head block that covers at least
* XLOG_MAX_ICLOGS records from the tail. We need to verify that these
* records were completely written. Run a CRC verification pass from
* tail to head and return the result.
*/
error = xlog_do_recovery_pass(log, tmp_head, tail_blk,
XLOG_RECOVER_CRCPASS, &first_bad);
if (!error && *tail_blk != orig_tail)
xfs_warn(log->l_mp,
"Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
orig_tail, *tail_blk);
out:
xlog_put_bp(bp);
return error;
@@ -1143,7 +1188,7 @@ xlog_verify_head(
*/
error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
XLOG_RECOVER_CRCPASS, &first_bad);
if (error == -EFSBADCRC) {
if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
/*
* We've hit a potential torn write. Reset the error and warn
* about it.
@@ -1183,31 +1228,12 @@ xlog_verify_head(
ASSERT(0);
return 0;
}
/*
* Now verify the tail based on the updated head. This is
* required because the torn writes trimmed from the head could
* have been written over the tail of a previous record. Return
* any errors since recovery cannot proceed if the tail is
* corrupt.
*
* XXX: This leaves a gap in truly robust protection from torn
* writes in the log. If the head is behind the tail, the tail
* pushes forward to create some space and then a crash occurs
* causing the writes into the previous record's tail region to
* tear, log recovery isn't able to recover.
*
* How likely is this to occur? If possible, can we do something
* more intelligent here? Is it safe to push the tail forward if
* we can determine that the tail is within the range of the
* torn write (e.g., the kernel can only overwrite the tail if
* it has actually been pushed forward)? Alternatively, could we
* somehow prevent this condition at runtime?
*/
error = xlog_verify_tail(log, *head_blk, *tail_blk);
}
if (error)
return error;
return error;
return xlog_verify_tail(log, *head_blk, tail_blk,
be32_to_cpu((*rhead)->h_size));
}
/*
@@ -4801,12 +4827,16 @@ xlog_recover_process_intents(
int error = 0;
struct xfs_ail_cursor cur;
struct xfs_ail *ailp;
#if defined(DEBUG) || defined(XFS_WARN)
xfs_lsn_t last_lsn;
#endif
ailp = log->l_ailp;
spin_lock(&ailp->xa_lock);
lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
#if defined(DEBUG) || defined(XFS_WARN)
last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
#endif
while (lip != NULL) {
/*
* We're done when we see something other than an intent.
@@ -5218,7 +5248,7 @@ xlog_do_recovery_pass(
xfs_daddr_t *first_bad) /* out: first bad log rec */
{
xlog_rec_header_t *rhead;
xfs_daddr_t blk_no;
xfs_daddr_t blk_no, rblk_no;
xfs_daddr_t rhead_blk;
char *offset;
xfs_buf_t *hbp, *dbp;
@@ -5231,7 +5261,7 @@ xlog_do_recovery_pass(
LIST_HEAD (buffer_list);
ASSERT(head_blk != tail_blk);
rhead_blk = 0;
blk_no = rhead_blk = tail_blk;
for (i = 0; i < XLOG_RHASH_SIZE; i++)
INIT_HLIST_HEAD(&rhash[i]);
@@ -5309,7 +5339,6 @@ xlog_do_recovery_pass(
}
memset(rhash, 0, sizeof(rhash));
blk_no = rhead_blk = tail_blk;
if (tail_blk > head_blk) {
/*
* Perform recovery around the end of the physical log.
@@ -5371,9 +5400,19 @@ xlog_do_recovery_pass(
bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
blk_no += hblks;
/* Read in data for log record */
if (blk_no + bblks <= log->l_logBBsize) {
error = xlog_bread(log, blk_no, bblks, dbp,
/*
* Read the log record data in multiple reads if it
* wraps around the end of the log. Note that if the
* header already wrapped, blk_no could point past the
* end of the log. The record data is contiguous in
* that case.
*/
if (blk_no + bblks <= log->l_logBBsize ||
blk_no >= log->l_logBBsize) {
/* mod blk_no in case the header wrapped and
* pushed it beyond the end of the log */
rblk_no = do_mod(blk_no, log->l_logBBsize);
error = xlog_bread(log, rblk_no, bblks, dbp,
&offset);
if (error)
goto bread_err2;

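xlog_tail_distance() is just circular-buffer arithmetic over the physical log. Isolated, with toy block numbers:

#include <stdio.h>

/* mirrors xlog_tail_distance(): free space from head forward to tail */
static int tail_distance(int log_size, int head_blk, int tail_blk)
{
	if (head_blk < tail_blk)
		return tail_blk - head_blk;
	return tail_blk + (log_size - head_blk); /* distance wraps the end */
}

int main(void)
{
	printf("%d\n", tail_distance(1000, 100, 400)); /* 300, no wrap */
	printf("%d\n", tail_distance(1000, 900, 200)); /* 300, wrapped */
	return 0;
}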
View file

@@ -1220,7 +1220,7 @@ xfs_test_remount_options(
tmp_mp->m_super = sb;
error = xfs_parseargs(tmp_mp, options);
xfs_free_fsname(tmp_mp);
kfree(tmp_mp);
kmem_free(tmp_mp);
return error;
}

View file

@@ -517,7 +517,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);

View file

@ -49,6 +49,7 @@ typedef struct xfs_log_item {
struct xfs_ail *li_ailp; /* ptr to AIL */
uint li_type; /* item type */
uint li_flags; /* misc flags */
struct xfs_buf *li_buf; /* real buffer pointer */
struct xfs_log_item *li_bio_list; /* buffer item list */
void (*li_cb)(struct xfs_buf *,
struct xfs_log_item *);
@ -64,11 +65,13 @@ typedef struct xfs_log_item {
} xfs_log_item_t;
#define XFS_LI_IN_AIL 0x1
#define XFS_LI_ABORTED 0x2
#define XFS_LI_FAILED 0x4
#define XFS_LI_FLAGS \
{ XFS_LI_IN_AIL, "IN_AIL" }, \
{ XFS_LI_ABORTED, "ABORTED" }
{ XFS_LI_ABORTED, "ABORTED" }, \
{ XFS_LI_FAILED, "FAILED" }
struct xfs_item_ops {
void (*iop_size)(xfs_log_item_t *, int *, int *);
@ -79,6 +82,7 @@ struct xfs_item_ops {
void (*iop_unlock)(xfs_log_item_t *);
xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
void (*iop_error)(xfs_log_item_t *, xfs_buf_t *);
};
void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
@ -208,12 +212,14 @@ void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
bool xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
void xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint,
uint);
void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *);
void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
void xfs_extent_free_init_defer_op(void);

View file

@ -687,12 +687,13 @@ xfs_trans_ail_update_bulk(
bool
xfs_ail_delete_one(
struct xfs_ail *ailp,
struct xfs_log_item *lip)
{
struct xfs_log_item *mlip = xfs_ail_min(ailp);
trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
xfs_ail_delete(ailp, lip);
xfs_clear_li_failed(lip);
lip->li_flags &= ~XFS_LI_IN_AIL;
lip->li_lsn = 0;

View file

@ -435,7 +435,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) {
xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR);
xfs_buf_item_relse(bp);
} else if (!xfs_buf_item_dirty(bip)) {
} else if (!(bip->bli_flags & XFS_BLI_DIRTY)) {
/***
ASSERT(bp->b_pincount == 0);
***/
@ -493,25 +493,17 @@ xfs_trans_bhold_release(xfs_trans_t *tp,
}
/*
* This is called to mark bytes first through last inclusive of the given
* buffer as needing to be logged when the transaction is committed.
* The buffer must already be associated with the given transaction.
*
* First and last are numbers relative to the beginning of this buffer,
* so the first byte in the buffer is numbered 0 regardless of the
* value of b_blkno.
* Mark a buffer dirty in the transaction.
*/
void
xfs_trans_log_buf(xfs_trans_t *tp,
xfs_buf_t *bp,
uint first,
uint last)
xfs_trans_dirty_buf(
struct xfs_trans *tp,
struct xfs_buf *bp)
{
xfs_buf_log_item_t *bip = bp->b_fspriv;
struct xfs_buf_log_item *bip = bp->b_fspriv;
ASSERT(bp->b_transp == tp);
ASSERT(bip != NULL);
ASSERT(first <= last && last < BBTOB(bp->b_length));
ASSERT(bp->b_iodone == NULL ||
bp->b_iodone == xfs_buf_iodone_callbacks);
@ -531,8 +523,6 @@ xfs_trans_log_buf(xfs_trans_t *tp,
bp->b_iodone = xfs_buf_iodone_callbacks;
bip->bli_item.li_cb = xfs_buf_iodone;
trace_xfs_trans_log_buf(bip);
/*
* If we invalidated the buffer within this transaction, then
* cancel the invalidation now that we're dirtying the buffer
@ -545,17 +535,37 @@ xfs_trans_log_buf(xfs_trans_t *tp,
bp->b_flags &= ~XBF_STALE;
bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL;
}
bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
tp->t_flags |= XFS_TRANS_DIRTY;
bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
}
/*
* If we have an ordered buffer we are not logging any dirty range but
* it still needs to be marked dirty and that it has been logged.
*/
bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
if (!(bip->bli_flags & XFS_BLI_ORDERED))
xfs_buf_item_log(bip, first, last);
/*
* This is called to mark bytes first through last inclusive of the given
* buffer as needing to be logged when the transaction is committed.
* The buffer must already be associated with the given transaction.
*
* First and last are numbers relative to the beginning of this buffer,
* so the first byte in the buffer is numbered 0 regardless of the
* value of b_blkno.
*/
void
xfs_trans_log_buf(
struct xfs_trans *tp,
struct xfs_buf *bp,
uint first,
uint last)
{
struct xfs_buf_log_item *bip = bp->b_fspriv;
ASSERT(first <= last && last < BBTOB(bp->b_length));
ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED));
xfs_trans_dirty_buf(tp, bp);
trace_xfs_trans_log_buf(bip);
xfs_buf_item_log(bip, first, last);
}
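The refactor above splits the old xfs_trans_log_buf() into two steps: xfs_trans_dirty_buf() flags the buffer and the transaction dirty, and xfs_buf_item_log() records which bytes changed. A stubbed, compilable sketch of that call structure (the sketch_* names are stand-ins, not the kernel API):

#include <stdio.h>

static void sketch_dirty_buf(void)
{
	/* role of xfs_trans_dirty_buf(): flag buffer and transaction dirty */
	printf("buffer and transaction marked dirty\n");
}

static void sketch_buf_item_log(unsigned first, unsigned last)
{
	/* role of xfs_buf_item_log(): remember which bytes changed */
	printf("dirty byte range: [%u, %u]\n", first, last);
}

/* shape of the new xfs_trans_log_buf(): dirty first, then log the range;
 * first/last are relative to the start of the buffer, so byte 0 is the
 * first byte regardless of the buffer's disk address */
static void sketch_trans_log_buf(unsigned first, unsigned last)
{
	sketch_dirty_buf();
	sketch_buf_item_log(first, last);
}

int main(void)
{
	sketch_trans_log_buf(0, 127);
	return 0;
}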
@ -708,14 +718,13 @@ xfs_trans_inode_alloc_buf(
}
/*
* Mark the buffer as ordered for this transaction. This means
* that the contents of the buffer are not recorded in the transaction
* but it is tracked in the AIL as though it was. This allows us
* to record logical changes in transactions rather than the physical
* changes we make to the buffer without changing writeback ordering
* constraints of metadata buffers.
* Mark the buffer as ordered for this transaction. This means that the contents
* of the buffer are not recorded in the transaction but it is tracked in the
* AIL as though it was. This allows us to record logical changes in
* transactions rather than the physical changes we make to the buffer without
* changing writeback ordering constraints of metadata buffers.
*/
void
bool
xfs_trans_ordered_buf(
struct xfs_trans *tp,
struct xfs_buf *bp)
@ -726,8 +735,18 @@ xfs_trans_ordered_buf(
ASSERT(bip != NULL);
ASSERT(atomic_read(&bip->bli_refcount) > 0);
if (xfs_buf_item_dirty_format(bip))
return false;
bip->bli_flags |= XFS_BLI_ORDERED;
trace_xfs_buf_item_ordered(bip);
/*
* We don't log a dirty range of an ordered buffer but it still needs
* to be marked dirty and that it has been logged.
*/
xfs_trans_dirty_buf(tp, bp);
return true;
}
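The bool return suggests a caller contract along these lines: if the item already has dirty format ranges, ordering is refused and the caller presumably falls back to logging the buffer contents. A stubbed sketch of that assumed contract (sketch_* names are invented):

#include <stdbool.h>
#include <stdio.h>

/* shape of the new xfs_trans_ordered_buf(): refuse to order a buffer
 * whose format ranges are already dirty, since ordering logs no range */
static bool sketch_ordered_buf(bool dirty_format)
{
	if (dirty_format)
		return false;
	printf("buffer ordered: dirtied, no byte range logged\n");
	return true;
}

int main(void)
{
	if (!sketch_ordered_buf(true))
		printf("fall back: log the buffer contents instead\n");
	return 0;
}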
/*

View file

@ -164,4 +164,35 @@ xfs_trans_ail_copy_lsn(
*dst = *src;
}
#endif
static inline void
xfs_clear_li_failed(
struct xfs_log_item *lip)
{
struct xfs_buf *bp = lip->li_buf;
ASSERT(lip->li_flags & XFS_LI_IN_AIL);
lockdep_assert_held(&lip->li_ailp->xa_lock);
if (lip->li_flags & XFS_LI_FAILED) {
lip->li_flags &= ~XFS_LI_FAILED;
lip->li_buf = NULL;
xfs_buf_rele(bp);
}
}
static inline void
xfs_set_li_failed(
struct xfs_log_item *lip,
struct xfs_buf *bp)
{
lockdep_assert_held(&lip->li_ailp->xa_lock);
if (!(lip->li_flags & XFS_LI_FAILED)) {
xfs_buf_hold(bp);
lip->li_flags |= XFS_LI_FAILED;
lip->li_buf = bp;
}
}
#endif /* __XFS_TRANS_PRIV_H__ */
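The two helpers pair the XFS_LI_FAILED flag with a buffer reference: setting the flag takes a hold on the buffer stored in li_buf, clearing it drops that hold, and both are idempotent under the AIL lock, so li_buf can neither dangle nor leak. A user-space analogue with a plain reference count (the sketch types are stand-ins):

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

struct sketch_buf  { int refcount; };
struct sketch_item { bool failed; struct sketch_buf *buf; };

static void set_failed(struct sketch_item *ip, struct sketch_buf *bp)
{
	if (!ip->failed) {          /* idempotent, like xfs_set_li_failed */
		bp->refcount++;     /* role of xfs_buf_hold() */
		ip->failed = true;
		ip->buf = bp;
	}
}

static void clear_failed(struct sketch_item *ip)
{
	if (ip->failed) {
		struct sketch_buf *bp = ip->buf;

		ip->failed = false;
		ip->buf = NULL;
		bp->refcount--;     /* role of xfs_buf_rele() */
	}
}

int main(void)
{
	struct sketch_buf bp = { .refcount = 1 };
	struct sketch_item ip = { false, NULL };

	set_failed(&ip, &bp);
	set_failed(&ip, &bp);       /* second call is a no-op */
	assert(bp.refcount == 2);
	clear_failed(&ip);
	assert(bp.refcount == 1 && ip.buf == NULL);
	return 0;
}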

View file

@ -2831,6 +2831,7 @@ static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { };
#endif
extern void unlock_new_inode(struct inode *);
extern unsigned int get_next_ino(void);
extern void evict_inodes(struct super_block *sb);
extern void __iget(struct inode * inode);
extern void iget_failed(struct inode *);

View file

@ -126,4 +126,10 @@ static __always_inline enum lru_list page_lru(struct page *page)
#define lru_to_page(head) (list_entry((head)->prev, struct page, lru))
#ifdef arch_unmap_kpfn
extern void arch_unmap_kpfn(unsigned long pfn);
#else
static __always_inline void arch_unmap_kpfn(unsigned long pfn) { }
#endif
#endif
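The #ifdef pattern above lets an architecture override the hook by defining the macro in its page header (as the x86 hunk earlier in this commit does) while everyone else gets an empty inline. The same pattern, condensed into a standalone file with the opt-in left commented out:

/* an architecture opts in by defining the macro before this point,
 * e.g. in its asm/page.h:
 *
 *	#define arch_unmap_kpfn arch_unmap_kpfn
 */

#ifdef arch_unmap_kpfn
extern void arch_unmap_kpfn(unsigned long pfn);
#else
/* generic fallback: compiles away on architectures that don't need to
 * unmap poisoned pages from their kernel 1:1 mapping */
static inline void arch_unmap_kpfn(unsigned long pfn) { (void)pfn; }
#endif

int main(void)
{
	arch_unmap_kpfn(42UL);   /* the no-op branch in this standalone build */
	return 0;
}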

View file

@ -885,7 +885,7 @@ void kfree_skb(struct sk_buff *skb);
void kfree_skb_list(struct sk_buff *segs);
void skb_tx_error(struct sk_buff *skb);
void consume_skb(struct sk_buff *skb);
void consume_stateless_skb(struct sk_buff *skb);
void __consume_stateless_skb(struct sk_buff *skb);
void __kfree_skb(struct sk_buff *skb);
extern struct kmem_cache *skbuff_head_cache;

View file

@ -1,14 +1,9 @@
#ifndef __NET_FRAG_H__
#define __NET_FRAG_H__
#include <linux/percpu_counter.h>
struct netns_frags {
/* The percpu_counter "mem" need to be cacheline aligned.
* mem.count must not share cacheline with other writers
*/
struct percpu_counter mem ____cacheline_aligned_in_smp;
/* Keep atomic mem on separate cachelines in structs that include it */
atomic_t mem ____cacheline_aligned_in_smp;
/* sysctls */
int timeout;
int high_thresh;
@ -108,15 +103,10 @@ struct inet_frags {
int inet_frags_init(struct inet_frags *);
void inet_frags_fini(struct inet_frags *);
static inline int inet_frags_init_net(struct netns_frags *nf)
static inline void inet_frags_init_net(struct netns_frags *nf)
{
return percpu_counter_init(&nf->mem, 0, GFP_KERNEL);
atomic_set(&nf->mem, 0);
}
static inline void inet_frags_uninit_net(struct netns_frags *nf)
{
percpu_counter_destroy(&nf->mem);
}
void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f);
@ -140,31 +130,24 @@ static inline bool inet_frag_evicting(struct inet_frag_queue *q)
/* Memory Tracking Functions. */
/* The default percpu_counter batch size is not big enough to scale to
* fragmentation mem acct sizes.
* The mem size of a 64K fragment is approx:
* (44 fragments * 2944 truesize) + frag_queue struct(200) = 129736 bytes
*/
static unsigned int frag_percpu_counter_batch = 130000;
static inline int frag_mem_limit(struct netns_frags *nf)
{
return percpu_counter_read(&nf->mem);
return atomic_read(&nf->mem);
}
static inline void sub_frag_mem_limit(struct netns_frags *nf, int i)
{
percpu_counter_add_batch(&nf->mem, -i, frag_percpu_counter_batch);
atomic_sub(i, &nf->mem);
}
static inline void add_frag_mem_limit(struct netns_frags *nf, int i)
{
percpu_counter_add_batch(&nf->mem, i, frag_percpu_counter_batch);
atomic_add(i, &nf->mem);
}
static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf)
static inline int sum_frag_mem_limit(struct netns_frags *nf)
{
return percpu_counter_sum_positive(&nf->mem);
return atomic_read(&nf->mem);
}
/* RFC 3168 support :
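The conversion replaces the percpu_counter, its batch tuning, and the approximate-read/exact-sum split with a single atomic_t, so frag_mem_limit() and sum_frag_mem_limit() now return the same exact value. A user-space analogue of the three accounting operations using C11 atomics:

#include <assert.h>
#include <stdatomic.h>

static atomic_int mem;                          /* stand-in for nf->mem */

static int  frag_mem(void)      { return atomic_load(&mem); }
static void add_frag_mem(int i) { atomic_fetch_add(&mem, i); }
static void sub_frag_mem(int i) { atomic_fetch_sub(&mem, i); }

int main(void)
{
	/* one truesize-sized charge per fragment, as in the removed batch
	 * comment; read and sum now agree exactly at every point */
	add_frag_mem(2944);
	add_frag_mem(2944);
	sub_frag_mem(2944);
	assert(frag_mem() == 2944);
	return 0;
}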

View file

@ -154,7 +154,7 @@ void *idr_replace(struct idr *idr, void *ptr, int id)
void __rcu **slot = NULL;
void *entry;
if (WARN_ON_ONCE(id < 0))
if (id < 0)
return ERR_PTR(-EINVAL);
if (WARN_ON_ONCE(radix_tree_is_internal_node(ptr)))
return ERR_PTR(-EINVAL);

View file

@ -1146,6 +1146,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
return 0;
}
arch_unmap_kpfn(pfn);
orig_head = hpage = compound_head(p);
num_poisoned_pages_inc();

View file

@ -753,14 +753,11 @@ EXPORT_SYMBOL(consume_skb);
* consume_stateless_skb - free an skbuff, assuming it is stateless
* @skb: buffer to free
*
* Works like consume_skb(), but this variant assumes that all the head
* states have been already dropped.
* Alike consume_skb(), but this variant assumes that this is the last
* skb reference and all the head states have been already dropped
*/
void consume_stateless_skb(struct sk_buff *skb)
void __consume_stateless_skb(struct sk_buff *skb)
{
if (!skb_unref(skb))
return;
trace_consume_skb(skb);
if (likely(skb->head))
skb_release_data(skb);

View file

@ -580,19 +580,14 @@ static int __net_init lowpan_frags_init_net(struct net *net)
{
struct netns_ieee802154_lowpan *ieee802154_lowpan =
net_ieee802154_lowpan(net);
int res;
ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
res = inet_frags_init_net(&ieee802154_lowpan->frags);
if (res)
return res;
res = lowpan_frags_ns_sysctl_register(net);
if (res)
inet_frags_uninit_net(&ieee802154_lowpan->frags);
return res;
inet_frags_init_net(&ieee802154_lowpan->frags);
return lowpan_frags_ns_sysctl_register(net);
}
static void __net_exit lowpan_frags_exit_net(struct net *net)

View file

@ -234,10 +234,8 @@ evict_again:
cond_resched();
if (read_seqretry(&f->rnd_seqlock, seq) ||
percpu_counter_sum(&nf->mem))
sum_frag_mem_limit(nf))
goto evict_again;
percpu_counter_destroy(&nf->mem);
}
EXPORT_SYMBOL(inet_frags_exit_net);

View file

@ -844,8 +844,6 @@ static void __init ip4_frags_ctl_register(void)
static int __net_init ipv4_frags_init_net(struct net *net)
{
int res;
/* Fragment cache limits.
*
* The fragment memory accounting code, (tries to) account for
@ -871,13 +869,9 @@ static int __net_init ipv4_frags_init_net(struct net *net)
net->ipv4.frags.max_dist = 64;
res = inet_frags_init_net(&net->ipv4.frags);
if (res)
return res;
res = ip4_frags_ns_ctl_register(net);
if (res)
inet_frags_uninit_net(&net->ipv4.frags);
return res;
inet_frags_init_net(&net->ipv4.frags);
return ip4_frags_ns_ctl_register(net);
}
static void __net_exit ipv4_frags_exit_net(struct net *net)

View file

@ -618,8 +618,8 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
ip_rt_put(rt);
goto tx_dropped;
}
iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos,
key->ttl, df, !net_eq(tunnel->net, dev_net(dev)));
iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
df, !net_eq(tunnel->net, dev_net(dev)));
return;
tx_error:
dev->stats.tx_errors++;

View file

@ -1722,9 +1722,9 @@ process:
*/
sock_hold(sk);
refcounted = true;
if (tcp_filter(sk, skb))
goto discard_and_relse;
nsk = tcp_check_req(sk, skb, req, false);
nsk = NULL;
if (!tcp_filter(sk, skb))
nsk = tcp_check_req(sk, skb, req, false);
if (!nsk) {
reqsk_put(req);
goto discard_and_relse;
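The reordering guarantees tcp_check_req() never runs on an skb that tcp_filter() rejected, while a filtered packet still takes the same reqsk_put()/discard path as a failed check. A control-flow sketch of that shape (filter_drops and check_req are invented stand-ins):

#include <stdbool.h>
#include <stdio.h>

static bool filter_drops(void) { return true; }  /* say the filter drops it */
static int check_req(void)     { return 1; }     /* would yield a new socket */

int main(void)
{
	int nsk = 0;                      /* NULL until proven otherwise */

	if (!filter_drops())              /* only consult the request */
		nsk = check_req();        /* socket when the filter passed */
	if (!nsk) {
		printf("reqsk_put(); goto discard_and_relse;\n");
		return 0;
	}
	printf("proceed with the new socket\n");
	return 0;
}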

View file

@ -1386,12 +1386,15 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
unlock_sock_fast(sk, slow);
}
if (!skb_unref(skb))
return;
/* In the more common cases we cleared the head states previously,
* see __udp_queue_rcv_skb().
*/
if (unlikely(udp_skb_has_head_state(skb)))
skb_release_head_state(skb);
consume_stateless_skb(skb);
__consume_stateless_skb(skb);
}
EXPORT_SYMBOL_GPL(skb_consume_udp);
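The split moves the last-reference check out of the consume helper: skb_consume_udp() calls skb_unref() itself and only then hands the skb to __consume_stateless_skb(), which frees unconditionally. A user-space analogue of that contract with a plain counter (the sketch names are stand-ins):

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct sketch_skb { int users; };

static bool sketch_unref(struct sketch_skb *skb)
{
	return --skb->users == 0;        /* true only for the last ref */
}

/* like __consume_stateless_skb(): trusts the caller to have proven it
 * holds the last reference, so it frees without re-checking */
static void sketch_consume_stateless(struct sketch_skb *skb)
{
	(void)skb;
	printf("release skb data and free\n");
}

int main(void)
{
	struct sketch_skb skb = { .users = 2 };

	if (sketch_unref(&skb))          /* not the last ref: no free */
		sketch_consume_stateless(&skb);
	if (sketch_unref(&skb))          /* last ref: free now */
		sketch_consume_stateless(&skb);
	assert(skb.users == 0);
	return 0;
}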

View file

@ -198,6 +198,12 @@ static void rt6_release(struct rt6_info *rt)
}
}
static void fib6_free_table(struct fib6_table *table)
{
inetpeer_invalidate_tree(&table->tb6_peers);
kfree(table);
}
static void fib6_link_table(struct net *net, struct fib6_table *tb)
{
unsigned int h;
@ -1915,15 +1921,22 @@ out_timer:
static void fib6_net_exit(struct net *net)
{
unsigned int i;
rt6_ifdown(net, NULL);
del_timer_sync(&net->ipv6.ip6_fib_timer);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
inetpeer_invalidate_tree(&net->ipv6.fib6_local_tbl->tb6_peers);
kfree(net->ipv6.fib6_local_tbl);
#endif
inetpeer_invalidate_tree(&net->ipv6.fib6_main_tbl->tb6_peers);
kfree(net->ipv6.fib6_main_tbl);
for (i = 0; i < FIB6_TABLE_HASHSZ; i++) {
struct hlist_head *head = &net->ipv6.fib_table_hash[i];
struct hlist_node *tmp;
struct fib6_table *tb;
hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) {
hlist_del(&tb->tb6_hlist);
fib6_free_table(tb);
}
}
kfree(net->ipv6.fib_table_hash);
kfree(net->ipv6.rt6_stats);
}
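fib6_net_exit() now walks every bucket of fib_table_hash and frees each table through the safe list iterator, rather than freeing only the main (and, with multiple tables, local) table and leaking the rest. A user-space analogue of the same teardown shape, taking the next pointer before the free as hlist_for_each_entry_safe() does:

#include <stdio.h>
#include <stdlib.h>

struct tbl { struct tbl *next; int id; };

#define HASHSZ 4

int main(void)
{
	struct tbl *hash[HASHSZ] = { NULL };
	int i;

	/* populate a couple of buckets */
	for (i = 0; i < 6; i++) {
		struct tbl *t = malloc(sizeof(*t));

		t->id = i;
		t->next = hash[i % HASHSZ];
		hash[i % HASHSZ] = t;
	}

	/* teardown mirrors the hlist_for_each_entry_safe() loop: grab the
	 * next pointer before freeing, so the walk survives the free */
	for (i = 0; i < HASHSZ; i++) {
		struct tbl *t = hash[i], *tmp;

		while (t) {
			tmp = t->next;
			printf("free table %d\n", t->id);
			free(t);
			t = tmp;
		}
	}
	return 0;
}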

View file

@ -432,7 +432,9 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
}
break;
case ICMPV6_PKT_TOOBIG:
mtu = be32_to_cpu(info) - offset;
mtu = be32_to_cpu(info) - offset - t->tun_hlen;
if (t->dev->type == ARPHRD_ETHER)
mtu -= ETH_HLEN;
if (mtu < IPV6_MIN_MTU)
mtu = IPV6_MIN_MTU;
t->dev->mtu = mtu;
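Worked numbers for the corrected ICMPV6_PKT_TOOBIG handling above: the tunnel header, and on ARPHRD_ETHER devices the MAC header too, must come out of the reported MTU before it is clamped and stored. A standalone check of that arithmetic (gre6_mtu is an invented helper mirroring the hunk):

#include <assert.h>

#define ETH_HLEN      14
#define IPV6_MIN_MTU  1280

static int gre6_mtu(int info, int offset, int tun_hlen, int is_ether)
{
	int mtu = info - offset - tun_hlen;

	if (is_ether)
		mtu -= ETH_HLEN;
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;
	return mtu;
}

int main(void)
{
	/* PKT_TOOBIG reporting 1500 with an 8-byte GRE header on an
	 * ARPHRD_ETHER tunnel: 1500 - 0 - 8 - 14 = 1478 */
	assert(gre6_mtu(1500, 0, 8, 1) == 1478);
	/* a small reported value clamps to the IPv6 minimum */
	assert(gre6_mtu(1300, 0, 8, 1) == IPV6_MIN_MTU);
	return 0;
}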

View file

@ -622,18 +622,12 @@ EXPORT_SYMBOL_GPL(nf_ct_frag6_gather);
static int nf_ct_net_init(struct net *net)
{
int res;
net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT;
res = inet_frags_init_net(&net->nf_frag.frags);
if (res)
return res;
res = nf_ct_frag6_sysctl_register(net);
if (res)
inet_frags_uninit_net(&net->nf_frag.frags);
return res;
inet_frags_init_net(&net->nf_frag.frags);
return nf_ct_frag6_sysctl_register(net);
}
static void nf_ct_net_exit(struct net *net)

View file

@ -714,19 +714,13 @@ static void ip6_frags_sysctl_unregister(void)
static int __net_init ipv6_frags_init_net(struct net *net)
{
int res;
net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
res = inet_frags_init_net(&net->ipv6.frags);
if (res)
return res;
res = ip6_frags_ns_sysctl_register(net);
if (res)
inet_frags_uninit_net(&net->ipv6.frags);
return res;
inet_frags_init_net(&net->ipv6.frags);
return ip6_frags_ns_sysctl_register(net);
}
static void __net_exit ipv6_frags_exit_net(struct net *net)

View file

@ -1456,9 +1456,9 @@ process:
}
sock_hold(sk);
refcounted = true;
if (tcp_filter(sk, skb))
goto discard_and_relse;
nsk = tcp_check_req(sk, skb, req, false);
nsk = NULL;
if (!tcp_filter(sk, skb))
nsk = tcp_check_req(sk, skb, req, false);
if (!nsk) {
reqsk_put(req);
goto discard_and_relse;

View file

@ -265,7 +265,8 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
sctp_ulpq_clear_pd(ulpq);
if (queue == &sk->sk_receive_queue && !sp->data_ready_signalled) {
sp->data_ready_signalled = 1;
if (!sock_owned_by_user(sk))
sp->data_ready_signalled = 1;
sk->sk_data_ready(sk);
}
return 1;