Merge 5.7-rc1 into android-mainline

Linux 5.7-rc1

Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
Change-Id: I40037d3be5c3181d58f7aa1122d1fb06353d98b9
This commit is contained in:
Greg Kroah-Hartman
2020-04-13 09:13:20 +02:00
34 changed files with 3830 additions and 3159 deletions

View File

@@ -65,6 +65,12 @@ max_pid_namespaces
The maximum number of pid namespaces that any user in the current
user namespace may create.
max_time_namespaces
===================
The maximum number of time namespaces that any user in the current
user namespace may create.
max_user_namespaces
===================

File diff suppressed because it is too large Load Diff

View File

@@ -1,8 +1,8 @@
# SPDX-License-Identifier: GPL-2.0
VERSION = 5
PATCHLEVEL = 6
PATCHLEVEL = 7
SUBLEVEL = 0
EXTRAVERSION =
EXTRAVERSION = -rc1
NAME = Kleptomaniac Octopus
# *DOCUMENTATION*

View File

@@ -1476,6 +1476,12 @@ static const struct intel_uncore_init_fun tgl_l_uncore_init __initconst = {
.mmio_init = tgl_l_uncore_mmio_init,
};
static const struct intel_uncore_init_fun icx_uncore_init __initconst = {
.cpu_init = icx_uncore_cpu_init,
.pci_init = icx_uncore_pci_init,
.mmio_init = icx_uncore_mmio_init,
};
static const struct intel_uncore_init_fun snr_uncore_init __initconst = {
.cpu_init = snr_uncore_cpu_init,
.pci_init = snr_uncore_pci_init,
@@ -1511,6 +1517,8 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI, &icl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &icx_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &icx_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &tgl_l_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &tgl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init),

View File

@@ -550,6 +550,9 @@ void skx_uncore_cpu_init(void);
int snr_uncore_pci_init(void);
void snr_uncore_cpu_init(void);
void snr_uncore_mmio_init(void);
int icx_uncore_pci_init(void);
void icx_uncore_cpu_init(void);
void icx_uncore_mmio_init(void);
/* uncore_nhmex.c */
void nhmex_uncore_cpu_init(void);

View File

@@ -382,6 +382,42 @@
#define SNR_IMC_MMIO_MEM0_OFFSET 0xd8
#define SNR_IMC_MMIO_MEM0_MASK 0x7FF
/* ICX CHA */
#define ICX_C34_MSR_PMON_CTR0 0xb68
#define ICX_C34_MSR_PMON_CTL0 0xb61
#define ICX_C34_MSR_PMON_BOX_CTL 0xb60
#define ICX_C34_MSR_PMON_BOX_FILTER0 0xb65
/* ICX IIO */
#define ICX_IIO_MSR_PMON_CTL0 0xa58
#define ICX_IIO_MSR_PMON_CTR0 0xa51
#define ICX_IIO_MSR_PMON_BOX_CTL 0xa50
/* ICX IRP */
#define ICX_IRP0_MSR_PMON_CTL0 0xa4d
#define ICX_IRP0_MSR_PMON_CTR0 0xa4b
#define ICX_IRP0_MSR_PMON_BOX_CTL 0xa4a
/* ICX M2PCIE */
#define ICX_M2PCIE_MSR_PMON_CTL0 0xa46
#define ICX_M2PCIE_MSR_PMON_CTR0 0xa41
#define ICX_M2PCIE_MSR_PMON_BOX_CTL 0xa40
/* ICX UPI */
#define ICX_UPI_PCI_PMON_CTL0 0x350
#define ICX_UPI_PCI_PMON_CTR0 0x320
#define ICX_UPI_PCI_PMON_BOX_CTL 0x318
#define ICX_UPI_CTL_UMASK_EXT 0xffffff
/* ICX M3UPI*/
#define ICX_M3UPI_PCI_PMON_CTL0 0xd8
#define ICX_M3UPI_PCI_PMON_CTR0 0xa8
#define ICX_M3UPI_PCI_PMON_BOX_CTL 0xa0
/* ICX IMC */
#define ICX_NUMBER_IMC_CHN 2
#define ICX_IMC_MEM_STRIDE 0x4
DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6");
DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
@@ -390,6 +426,7 @@ DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
DEFINE_UNCORE_FORMAT_ATTR(umask_ext, umask, "config:8-15,32-43,45-55");
DEFINE_UNCORE_FORMAT_ATTR(umask_ext2, umask, "config:8-15,32-57");
DEFINE_UNCORE_FORMAT_ATTR(umask_ext3, umask, "config:8-15,32-39");
DEFINE_UNCORE_FORMAT_ATTR(umask_ext4, umask, "config:8-15,32-55");
DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16");
DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
@@ -4551,3 +4588,477 @@ void snr_uncore_mmio_init(void)
}
/* end of SNR uncore support */
/* ICX uncore support */
static unsigned icx_cha_msr_offsets[] = {
0x2a0, 0x2ae, 0x2bc, 0x2ca, 0x2d8, 0x2e6, 0x2f4, 0x302, 0x310,
0x31e, 0x32c, 0x33a, 0x348, 0x356, 0x364, 0x372, 0x380, 0x38e,
0x3aa, 0x3b8, 0x3c6, 0x3d4, 0x3e2, 0x3f0, 0x3fe, 0x40c, 0x41a,
0x428, 0x436, 0x444, 0x452, 0x460, 0x46e, 0x47c, 0x0, 0xe,
0x1c, 0x2a, 0x38, 0x46,
};
static int icx_cha_hw_config(struct intel_uncore_box *box, struct perf_event *event)
{
struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
bool tie_en = !!(event->hw.config & SNBEP_CBO_PMON_CTL_TID_EN);
if (tie_en) {
reg1->reg = ICX_C34_MSR_PMON_BOX_FILTER0 +
icx_cha_msr_offsets[box->pmu->pmu_idx];
reg1->config = event->attr.config1 & SKX_CHA_MSR_PMON_BOX_FILTER_TID;
reg1->idx = 0;
}
return 0;
}
static struct intel_uncore_ops icx_uncore_chabox_ops = {
.init_box = ivbep_uncore_msr_init_box,
.disable_box = snbep_uncore_msr_disable_box,
.enable_box = snbep_uncore_msr_enable_box,
.disable_event = snbep_uncore_msr_disable_event,
.enable_event = snr_cha_enable_event,
.read_counter = uncore_msr_read_counter,
.hw_config = icx_cha_hw_config,
};
static struct intel_uncore_type icx_uncore_chabox = {
.name = "cha",
.num_counters = 4,
.perf_ctr_bits = 48,
.event_ctl = ICX_C34_MSR_PMON_CTL0,
.perf_ctr = ICX_C34_MSR_PMON_CTR0,
.box_ctl = ICX_C34_MSR_PMON_BOX_CTL,
.msr_offsets = icx_cha_msr_offsets,
.event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
.event_mask_ext = SNR_CHA_RAW_EVENT_MASK_EXT,
.constraints = skx_uncore_chabox_constraints,
.ops = &icx_uncore_chabox_ops,
.format_group = &snr_uncore_chabox_format_group,
};
static unsigned icx_msr_offsets[] = {
0x0, 0x20, 0x40, 0x90, 0xb0, 0xd0,
};
static struct event_constraint icx_uncore_iio_constraints[] = {
UNCORE_EVENT_CONSTRAINT(0x02, 0x3),
UNCORE_EVENT_CONSTRAINT(0x03, 0x3),
UNCORE_EVENT_CONSTRAINT(0x83, 0x3),
UNCORE_EVENT_CONSTRAINT(0xc0, 0xc),
UNCORE_EVENT_CONSTRAINT(0xc5, 0xc),
EVENT_CONSTRAINT_END
};
static struct intel_uncore_type icx_uncore_iio = {
.name = "iio",
.num_counters = 4,
.num_boxes = 6,
.perf_ctr_bits = 48,
.event_ctl = ICX_IIO_MSR_PMON_CTL0,
.perf_ctr = ICX_IIO_MSR_PMON_CTR0,
.event_mask = SNBEP_PMON_RAW_EVENT_MASK,
.event_mask_ext = SNR_IIO_PMON_RAW_EVENT_MASK_EXT,
.box_ctl = ICX_IIO_MSR_PMON_BOX_CTL,
.msr_offsets = icx_msr_offsets,
.constraints = icx_uncore_iio_constraints,
.ops = &skx_uncore_iio_ops,
.format_group = &snr_uncore_iio_format_group,
};
static struct intel_uncore_type icx_uncore_irp = {
.name = "irp",
.num_counters = 2,
.num_boxes = 6,
.perf_ctr_bits = 48,
.event_ctl = ICX_IRP0_MSR_PMON_CTL0,
.perf_ctr = ICX_IRP0_MSR_PMON_CTR0,
.event_mask = SNBEP_PMON_RAW_EVENT_MASK,
.box_ctl = ICX_IRP0_MSR_PMON_BOX_CTL,
.msr_offsets = icx_msr_offsets,
.ops = &ivbep_uncore_msr_ops,
.format_group = &ivbep_uncore_format_group,
};
static struct event_constraint icx_uncore_m2pcie_constraints[] = {
UNCORE_EVENT_CONSTRAINT(0x14, 0x3),
UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
EVENT_CONSTRAINT_END
};
static struct intel_uncore_type icx_uncore_m2pcie = {
.name = "m2pcie",
.num_counters = 4,
.num_boxes = 6,
.perf_ctr_bits = 48,
.event_ctl = ICX_M2PCIE_MSR_PMON_CTL0,
.perf_ctr = ICX_M2PCIE_MSR_PMON_CTR0,
.box_ctl = ICX_M2PCIE_MSR_PMON_BOX_CTL,
.msr_offsets = icx_msr_offsets,
.constraints = icx_uncore_m2pcie_constraints,
.event_mask = SNBEP_PMON_RAW_EVENT_MASK,
.ops = &ivbep_uncore_msr_ops,
.format_group = &ivbep_uncore_format_group,
};
enum perf_uncore_icx_iio_freerunning_type_id {
ICX_IIO_MSR_IOCLK,
ICX_IIO_MSR_BW_IN,
ICX_IIO_FREERUNNING_TYPE_MAX,
};
static unsigned icx_iio_clk_freerunning_box_offsets[] = {
0x0, 0x20, 0x40, 0x90, 0xb0, 0xd0,
};
static unsigned icx_iio_bw_freerunning_box_offsets[] = {
0x0, 0x10, 0x20, 0x90, 0xa0, 0xb0,
};
static struct freerunning_counters icx_iio_freerunning[] = {
[ICX_IIO_MSR_IOCLK] = { 0xa55, 0x1, 0x20, 1, 48, icx_iio_clk_freerunning_box_offsets },
[ICX_IIO_MSR_BW_IN] = { 0xaa0, 0x1, 0x10, 8, 48, icx_iio_bw_freerunning_box_offsets },
};
static struct uncore_event_desc icx_uncore_iio_freerunning_events[] = {
/* Free-Running IIO CLOCKS Counter */
INTEL_UNCORE_EVENT_DESC(ioclk, "event=0xff,umask=0x10"),
/* Free-Running IIO BANDWIDTH IN Counters */
INTEL_UNCORE_EVENT_DESC(bw_in_port0, "event=0xff,umask=0x20"),
INTEL_UNCORE_EVENT_DESC(bw_in_port0.scale, "3.814697266e-6"),
INTEL_UNCORE_EVENT_DESC(bw_in_port0.unit, "MiB"),
INTEL_UNCORE_EVENT_DESC(bw_in_port1, "event=0xff,umask=0x21"),
INTEL_UNCORE_EVENT_DESC(bw_in_port1.scale, "3.814697266e-6"),
INTEL_UNCORE_EVENT_DESC(bw_in_port1.unit, "MiB"),
INTEL_UNCORE_EVENT_DESC(bw_in_port2, "event=0xff,umask=0x22"),
INTEL_UNCORE_EVENT_DESC(bw_in_port2.scale, "3.814697266e-6"),
INTEL_UNCORE_EVENT_DESC(bw_in_port2.unit, "MiB"),
INTEL_UNCORE_EVENT_DESC(bw_in_port3, "event=0xff,umask=0x23"),
INTEL_UNCORE_EVENT_DESC(bw_in_port3.scale, "3.814697266e-6"),
INTEL_UNCORE_EVENT_DESC(bw_in_port3.unit, "MiB"),
INTEL_UNCORE_EVENT_DESC(bw_in_port4, "event=0xff,umask=0x24"),
INTEL_UNCORE_EVENT_DESC(bw_in_port4.scale, "3.814697266e-6"),
INTEL_UNCORE_EVENT_DESC(bw_in_port4.unit, "MiB"),
INTEL_UNCORE_EVENT_DESC(bw_in_port5, "event=0xff,umask=0x25"),
INTEL_UNCORE_EVENT_DESC(bw_in_port5.scale, "3.814697266e-6"),
INTEL_UNCORE_EVENT_DESC(bw_in_port5.unit, "MiB"),
INTEL_UNCORE_EVENT_DESC(bw_in_port6, "event=0xff,umask=0x26"),
INTEL_UNCORE_EVENT_DESC(bw_in_port6.scale, "3.814697266e-6"),
INTEL_UNCORE_EVENT_DESC(bw_in_port6.unit, "MiB"),
INTEL_UNCORE_EVENT_DESC(bw_in_port7, "event=0xff,umask=0x27"),
INTEL_UNCORE_EVENT_DESC(bw_in_port7.scale, "3.814697266e-6"),
INTEL_UNCORE_EVENT_DESC(bw_in_port7.unit, "MiB"),
{ /* end: all zeroes */ },
};
static struct intel_uncore_type icx_uncore_iio_free_running = {
.name = "iio_free_running",
.num_counters = 9,
.num_boxes = 6,
.num_freerunning_types = ICX_IIO_FREERUNNING_TYPE_MAX,
.freerunning = icx_iio_freerunning,
.ops = &skx_uncore_iio_freerunning_ops,
.event_descs = icx_uncore_iio_freerunning_events,
.format_group = &skx_uncore_iio_freerunning_format_group,
};
static struct intel_uncore_type *icx_msr_uncores[] = {
&skx_uncore_ubox,
&icx_uncore_chabox,
&icx_uncore_iio,
&icx_uncore_irp,
&icx_uncore_m2pcie,
&skx_uncore_pcu,
&icx_uncore_iio_free_running,
NULL,
};
/*
* To determine the number of CHAs, it should read CAPID6(Low) and CAPID7 (High)
* registers which located at Device 30, Function 3
*/
#define ICX_CAPID6 0x9c
#define ICX_CAPID7 0xa0
static u64 icx_count_chabox(void)
{
struct pci_dev *dev = NULL;
u64 caps = 0;
dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x345b, dev);
if (!dev)
goto out;
pci_read_config_dword(dev, ICX_CAPID6, (u32 *)&caps);
pci_read_config_dword(dev, ICX_CAPID7, (u32 *)&caps + 1);
out:
pci_dev_put(dev);
return hweight64(caps);
}
void icx_uncore_cpu_init(void)
{
u64 num_boxes = icx_count_chabox();
if (WARN_ON(num_boxes > ARRAY_SIZE(icx_cha_msr_offsets)))
return;
icx_uncore_chabox.num_boxes = num_boxes;
uncore_msr_uncores = icx_msr_uncores;
}
static struct intel_uncore_type icx_uncore_m2m = {
.name = "m2m",
.num_counters = 4,
.num_boxes = 4,
.perf_ctr_bits = 48,
.perf_ctr = SNR_M2M_PCI_PMON_CTR0,
.event_ctl = SNR_M2M_PCI_PMON_CTL0,
.event_mask = SNBEP_PMON_RAW_EVENT_MASK,
.box_ctl = SNR_M2M_PCI_PMON_BOX_CTL,
.ops = &snr_m2m_uncore_pci_ops,
.format_group = &skx_uncore_format_group,
};
static struct attribute *icx_upi_uncore_formats_attr[] = {
&format_attr_event.attr,
&format_attr_umask_ext4.attr,
&format_attr_edge.attr,
&format_attr_inv.attr,
&format_attr_thresh8.attr,
NULL,
};
static const struct attribute_group icx_upi_uncore_format_group = {
.name = "format",
.attrs = icx_upi_uncore_formats_attr,
};
static struct intel_uncore_type icx_uncore_upi = {
.name = "upi",
.num_counters = 4,
.num_boxes = 3,
.perf_ctr_bits = 48,
.perf_ctr = ICX_UPI_PCI_PMON_CTR0,
.event_ctl = ICX_UPI_PCI_PMON_CTL0,
.event_mask = SNBEP_PMON_RAW_EVENT_MASK,
.event_mask_ext = ICX_UPI_CTL_UMASK_EXT,
.box_ctl = ICX_UPI_PCI_PMON_BOX_CTL,
.ops = &skx_upi_uncore_pci_ops,
.format_group = &icx_upi_uncore_format_group,
};
static struct event_constraint icx_uncore_m3upi_constraints[] = {
UNCORE_EVENT_CONSTRAINT(0x1c, 0x1),
UNCORE_EVENT_CONSTRAINT(0x1d, 0x1),
UNCORE_EVENT_CONSTRAINT(0x1e, 0x1),
UNCORE_EVENT_CONSTRAINT(0x1f, 0x1),
UNCORE_EVENT_CONSTRAINT(0x40, 0x7),
UNCORE_EVENT_CONSTRAINT(0x4e, 0x7),
UNCORE_EVENT_CONSTRAINT(0x4f, 0x7),
UNCORE_EVENT_CONSTRAINT(0x50, 0x7),
EVENT_CONSTRAINT_END
};
static struct intel_uncore_type icx_uncore_m3upi = {
.name = "m3upi",
.num_counters = 4,
.num_boxes = 3,
.perf_ctr_bits = 48,
.perf_ctr = ICX_M3UPI_PCI_PMON_CTR0,
.event_ctl = ICX_M3UPI_PCI_PMON_CTL0,
.event_mask = SNBEP_PMON_RAW_EVENT_MASK,
.box_ctl = ICX_M3UPI_PCI_PMON_BOX_CTL,
.constraints = icx_uncore_m3upi_constraints,
.ops = &ivbep_uncore_pci_ops,
.format_group = &skx_uncore_format_group,
};
enum {
ICX_PCI_UNCORE_M2M,
ICX_PCI_UNCORE_UPI,
ICX_PCI_UNCORE_M3UPI,
};
static struct intel_uncore_type *icx_pci_uncores[] = {
[ICX_PCI_UNCORE_M2M] = &icx_uncore_m2m,
[ICX_PCI_UNCORE_UPI] = &icx_uncore_upi,
[ICX_PCI_UNCORE_M3UPI] = &icx_uncore_m3upi,
NULL,
};
static const struct pci_device_id icx_uncore_pci_ids[] = {
{ /* M2M 0 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x344a),
.driver_data = UNCORE_PCI_DEV_FULL_DATA(12, 0, ICX_PCI_UNCORE_M2M, 0),
},
{ /* M2M 1 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x344a),
.driver_data = UNCORE_PCI_DEV_FULL_DATA(13, 0, ICX_PCI_UNCORE_M2M, 1),
},
{ /* M2M 2 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x344a),
.driver_data = UNCORE_PCI_DEV_FULL_DATA(14, 0, ICX_PCI_UNCORE_M2M, 2),
},
{ /* M2M 3 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x344a),
.driver_data = UNCORE_PCI_DEV_FULL_DATA(15, 0, ICX_PCI_UNCORE_M2M, 3),
},
{ /* UPI Link 0 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3441),
.driver_data = UNCORE_PCI_DEV_FULL_DATA(2, 1, ICX_PCI_UNCORE_UPI, 0),
},
{ /* UPI Link 1 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3441),
.driver_data = UNCORE_PCI_DEV_FULL_DATA(3, 1, ICX_PCI_UNCORE_UPI, 1),
},
{ /* UPI Link 2 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3441),
.driver_data = UNCORE_PCI_DEV_FULL_DATA(4, 1, ICX_PCI_UNCORE_UPI, 2),
},
{ /* M3UPI Link 0 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3446),
.driver_data = UNCORE_PCI_DEV_FULL_DATA(5, 1, ICX_PCI_UNCORE_M3UPI, 0),
},
{ /* M3UPI Link 1 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3446),
.driver_data = UNCORE_PCI_DEV_FULL_DATA(6, 1, ICX_PCI_UNCORE_M3UPI, 1),
},
{ /* M3UPI Link 2 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3446),
.driver_data = UNCORE_PCI_DEV_FULL_DATA(7, 1, ICX_PCI_UNCORE_M3UPI, 2),
},
{ /* end: all zeroes */ }
};
static struct pci_driver icx_uncore_pci_driver = {
.name = "icx_uncore",
.id_table = icx_uncore_pci_ids,
};
int icx_uncore_pci_init(void)
{
/* ICX UBOX DID */
int ret = snbep_pci2phy_map_init(0x3450, SKX_CPUNODEID,
SKX_GIDNIDMAP, true);
if (ret)
return ret;
uncore_pci_uncores = icx_pci_uncores;
uncore_pci_driver = &icx_uncore_pci_driver;
return 0;
}
static void icx_uncore_imc_init_box(struct intel_uncore_box *box)
{
unsigned int box_ctl = box->pmu->type->box_ctl +
box->pmu->type->mmio_offset * (box->pmu->pmu_idx % ICX_NUMBER_IMC_CHN);
int mem_offset = (box->pmu->pmu_idx / ICX_NUMBER_IMC_CHN) * ICX_IMC_MEM_STRIDE +
SNR_IMC_MMIO_MEM0_OFFSET;
__snr_uncore_mmio_init_box(box, box_ctl, mem_offset);
}
static struct intel_uncore_ops icx_uncore_mmio_ops = {
.init_box = icx_uncore_imc_init_box,
.exit_box = uncore_mmio_exit_box,
.disable_box = snr_uncore_mmio_disable_box,
.enable_box = snr_uncore_mmio_enable_box,
.disable_event = snr_uncore_mmio_disable_event,
.enable_event = snr_uncore_mmio_enable_event,
.read_counter = uncore_mmio_read_counter,
};
static struct intel_uncore_type icx_uncore_imc = {
.name = "imc",
.num_counters = 4,
.num_boxes = 8,
.perf_ctr_bits = 48,
.fixed_ctr_bits = 48,
.fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR,
.fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL,
.event_descs = hswep_uncore_imc_events,
.perf_ctr = SNR_IMC_MMIO_PMON_CTR0,
.event_ctl = SNR_IMC_MMIO_PMON_CTL0,
.event_mask = SNBEP_PMON_RAW_EVENT_MASK,
.box_ctl = SNR_IMC_MMIO_PMON_BOX_CTL,
.mmio_offset = SNR_IMC_MMIO_OFFSET,
.ops = &icx_uncore_mmio_ops,
.format_group = &skx_uncore_format_group,
};
enum perf_uncore_icx_imc_freerunning_type_id {
ICX_IMC_DCLK,
ICX_IMC_DDR,
ICX_IMC_DDRT,
ICX_IMC_FREERUNNING_TYPE_MAX,
};
static struct freerunning_counters icx_imc_freerunning[] = {
[ICX_IMC_DCLK] = { 0x22b0, 0x0, 0, 1, 48 },
[ICX_IMC_DDR] = { 0x2290, 0x8, 0, 2, 48 },
[ICX_IMC_DDRT] = { 0x22a0, 0x8, 0, 2, 48 },
};
static struct uncore_event_desc icx_uncore_imc_freerunning_events[] = {
INTEL_UNCORE_EVENT_DESC(dclk, "event=0xff,umask=0x10"),
INTEL_UNCORE_EVENT_DESC(read, "event=0xff,umask=0x20"),
INTEL_UNCORE_EVENT_DESC(read.scale, "3.814697266e-6"),
INTEL_UNCORE_EVENT_DESC(read.unit, "MiB"),
INTEL_UNCORE_EVENT_DESC(write, "event=0xff,umask=0x21"),
INTEL_UNCORE_EVENT_DESC(write.scale, "3.814697266e-6"),
INTEL_UNCORE_EVENT_DESC(write.unit, "MiB"),
INTEL_UNCORE_EVENT_DESC(ddrt_read, "event=0xff,umask=0x30"),
INTEL_UNCORE_EVENT_DESC(ddrt_read.scale, "3.814697266e-6"),
INTEL_UNCORE_EVENT_DESC(ddrt_read.unit, "MiB"),
INTEL_UNCORE_EVENT_DESC(ddrt_write, "event=0xff,umask=0x31"),
INTEL_UNCORE_EVENT_DESC(ddrt_write.scale, "3.814697266e-6"),
INTEL_UNCORE_EVENT_DESC(ddrt_write.unit, "MiB"),
{ /* end: all zeroes */ },
};
static void icx_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
{
int mem_offset = box->pmu->pmu_idx * ICX_IMC_MEM_STRIDE +
SNR_IMC_MMIO_MEM0_OFFSET;
__snr_uncore_mmio_init_box(box, uncore_mmio_box_ctl(box), mem_offset);
}
static struct intel_uncore_ops icx_uncore_imc_freerunning_ops = {
.init_box = icx_uncore_imc_freerunning_init_box,
.exit_box = uncore_mmio_exit_box,
.read_counter = uncore_mmio_read_counter,
.hw_config = uncore_freerunning_hw_config,
};
static struct intel_uncore_type icx_uncore_imc_free_running = {
.name = "imc_free_running",
.num_counters = 5,
.num_boxes = 4,
.num_freerunning_types = ICX_IMC_FREERUNNING_TYPE_MAX,
.freerunning = icx_imc_freerunning,
.ops = &icx_uncore_imc_freerunning_ops,
.event_descs = icx_uncore_imc_freerunning_events,
.format_group = &skx_uncore_iio_freerunning_format_group,
};
static struct intel_uncore_type *icx_mmio_uncores[] = {
&icx_uncore_imc,
&icx_uncore_imc_free_running,
NULL,
};
void icx_uncore_mmio_init(void)
{
uncore_mmio_uncores = icx_mmio_uncores;
}
/* end of ICX uncore support */

View File

@@ -44,6 +44,7 @@ unsigned int x86_stepping(unsigned int sig);
extern void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c);
extern void switch_to_sld(unsigned long tifn);
extern bool handle_user_split_lock(struct pt_regs *regs, long error_code);
extern bool handle_guest_split_lock(unsigned long ip);
#else
static inline void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) {}
static inline void switch_to_sld(unsigned long tifn) {}
@@ -51,5 +52,10 @@ static inline bool handle_user_split_lock(struct pt_regs *regs, long error_code)
{
return false;
}
static inline bool handle_guest_split_lock(unsigned long ip)
{
return false;
}
#endif
#endif /* _ASM_X86_CPU_H */

View File

@@ -21,6 +21,7 @@
#include <asm/elf.h>
#include <asm/cpu_device_id.h>
#include <asm/cmdline.h>
#include <asm/traps.h>
#ifdef CONFIG_X86_64
#include <linux/topology.h>
@@ -1066,13 +1067,10 @@ static void split_lock_init(void)
split_lock_verify_msr(sld_state != sld_off);
}
bool handle_user_split_lock(struct pt_regs *regs, long error_code)
static void split_lock_warn(unsigned long ip)
{
if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal)
return false;
pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n",
current->comm, current->pid, regs->ip);
current->comm, current->pid, ip);
/*
* Disable the split lock detection for this task so it can make
@@ -1081,6 +1079,31 @@ bool handle_user_split_lock(struct pt_regs *regs, long error_code)
*/
sld_update_msr(false);
set_tsk_thread_flag(current, TIF_SLD);
}
bool handle_guest_split_lock(unsigned long ip)
{
if (sld_state == sld_warn) {
split_lock_warn(ip);
return true;
}
pr_warn_once("#AC: %s/%d %s split_lock trap at address: 0x%lx\n",
current->comm, current->pid,
sld_state == sld_fatal ? "fatal" : "bogus", ip);
current->thread.error_code = 0;
current->thread.trap_nr = X86_TRAP_AC;
force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
return false;
}
EXPORT_SYMBOL_GPL(handle_guest_split_lock);
bool handle_user_split_lock(struct pt_regs *regs, long error_code)
{
if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal)
return false;
split_lock_warn(regs->ip);
return true;
}

View File

@@ -4588,6 +4588,26 @@ static int handle_machine_check(struct kvm_vcpu *vcpu)
return 1;
}
/*
* If the host has split lock detection disabled, then #AC is
* unconditionally injected into the guest, which is the pre split lock
* detection behaviour.
*
* If the host has split lock detection enabled then #AC is
* only injected into the guest when:
* - Guest CPL == 3 (user mode)
* - Guest has #AC detection enabled in CR0
* - Guest EFLAGS has AC bit set
*/
static inline bool guest_inject_ac(struct kvm_vcpu *vcpu)
{
if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
return true;
return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) &&
(kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
}
static int handle_exception_nmi(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4653,9 +4673,6 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
return handle_rmode_exception(vcpu, ex_no, error_code);
switch (ex_no) {
case AC_VECTOR:
kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
return 1;
case DB_VECTOR:
dr6 = vmcs_readl(EXIT_QUALIFICATION);
if (!(vcpu->guest_debug &
@@ -4684,6 +4701,20 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
kvm_run->debug.arch.exception = ex_no;
break;
case AC_VECTOR:
if (guest_inject_ac(vcpu)) {
kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
return 1;
}
/*
* Handle split lock. Depending on detection mode this will
* either warn and disable split lock detection for this
* task or force SIGBUS on it.
*/
if (handle_guest_split_lock(kvm_rip_read(vcpu)))
return 1;
fallthrough;
default:
kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
kvm_run->ex.exception = ex_no;

View File

@@ -5839,6 +5839,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
{
struct kvm_host_map map;
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
u64 page_line_mask;
gpa_t gpa;
char *kaddr;
bool exchanged;
@@ -5853,7 +5854,16 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
(gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
goto emul_write;
if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
/*
* Emulate the atomic as a straight write to avoid #AC if SLD is
* enabled in the host and the access splits a cache line.
*/
if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
page_line_mask = ~(cache_line_size() - 1);
else
page_line_mask = PAGE_MASK;
if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask))
goto emul_write;
if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))

View File

@@ -202,7 +202,7 @@ config CIFS_SMB_DIRECT
help
Enables SMB Direct support for SMB 3.0, 3.02 and 3.1.1.
SMB Direct allows transferring SMB packets over RDMA. If unsure,
say N.
say Y.
config CIFS_FSCACHE
bool "Provide CIFS client caching support"

View File

@@ -323,10 +323,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
atomic_read(&server->smbd_conn->send_credits),
atomic_read(&server->smbd_conn->receive_credits),
server->smbd_conn->receive_credit_target);
seq_printf(m, "\nPending send_pending: %x "
"send_payload_pending: %x",
atomic_read(&server->smbd_conn->send_pending),
atomic_read(&server->smbd_conn->send_payload_pending));
seq_printf(m, "\nPending send_pending: %x ",
atomic_read(&server->smbd_conn->send_pending));
seq_printf(m, "\nReceive buffers count_receive_queue: %x "
"count_empty_packet_queue: %x",
server->smbd_conn->count_receive_queue,

View File

@@ -1208,6 +1208,10 @@ static ssize_t cifs_copy_file_range(struct file *src_file, loff_t off,
{
unsigned int xid = get_xid();
ssize_t rc;
struct cifsFileInfo *cfile = dst_file->private_data;
if (cfile->swapfile)
return -EOPNOTSUPP;
rc = cifs_file_copychunk_range(xid, src_file, off, dst_file, destoff,
len, flags);

View File

@@ -426,7 +426,8 @@ struct smb_version_operations {
/* generate new lease key */
void (*new_lease_key)(struct cifs_fid *);
int (*generate_signingkey)(struct cifs_ses *);
int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *);
int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *,
bool allocate_crypto);
int (*set_integrity)(const unsigned int, struct cifs_tcon *tcon,
struct cifsFileInfo *src_file);
int (*enum_snapshots)(const unsigned int xid, struct cifs_tcon *tcon,
@@ -1312,6 +1313,7 @@ struct cifsFileInfo {
struct tcon_link *tlink;
unsigned int f_flags;
bool invalidHandle:1; /* file closed via session abend */
bool swapfile:1;
bool oplock_break_cancelled:1;
unsigned int oplock_epoch; /* epoch from the lease break */
__u32 oplock_level; /* oplock/lease level from the lease break */

View File

@@ -4808,6 +4808,60 @@ cifs_direct_io(struct kiocb *iocb, struct iov_iter *iter)
return -EINVAL;
}
static int cifs_swap_activate(struct swap_info_struct *sis,
struct file *swap_file, sector_t *span)
{
struct cifsFileInfo *cfile = swap_file->private_data;
struct inode *inode = swap_file->f_mapping->host;
unsigned long blocks;
long long isize;
cifs_dbg(FYI, "swap activate\n");
spin_lock(&inode->i_lock);
blocks = inode->i_blocks;
isize = inode->i_size;
spin_unlock(&inode->i_lock);
if (blocks*512 < isize) {
pr_warn("swap activate: swapfile has holes\n");
return -EINVAL;
}
*span = sis->pages;
printk_once(KERN_WARNING "Swap support over SMB3 is experimental\n");
/*
* TODO: consider adding ACL (or documenting how) to prevent other
* users (on this or other systems) from reading it
*/
/* TODO: add sk_set_memalloc(inet) or similar */
if (cfile)
cfile->swapfile = true;
/*
* TODO: Since file already open, we can't open with DENY_ALL here
* but we could add call to grab a byte range lock to prevent others
* from reading or writing the file
*/
return 0;
}
static void cifs_swap_deactivate(struct file *file)
{
struct cifsFileInfo *cfile = file->private_data;
cifs_dbg(FYI, "swap deactivate\n");
/* TODO: undo sk_set_memalloc(inet) will eventually be needed */
if (cfile)
cfile->swapfile = false;
/* do we need to unpin (or unlock) the file */
}
const struct address_space_operations cifs_addr_ops = {
.readpage = cifs_readpage,
@@ -4821,6 +4875,13 @@ const struct address_space_operations cifs_addr_ops = {
.direct_IO = cifs_direct_io,
.invalidatepage = cifs_invalidate_page,
.launder_page = cifs_launder_page,
/*
* TODO: investigate and if useful we could add an cifs_migratePage
* helper (under an CONFIG_MIGRATION) in the future, and also
* investigate and add an is_dirty_writeback helper if needed
*/
.swap_activate = cifs_swap_activate,
.swap_deactivate = cifs_swap_deactivate,
};
/*

View File

@@ -2026,6 +2026,10 @@ cifs_revalidate_mapping(struct inode *inode)
int rc;
unsigned long *flags = &CIFS_I(inode)->flags;
/* swapfiles are not supposed to be shared */
if (IS_SWAPFILE(inode))
return 0;
rc = wait_on_bit_lock_action(flags, CIFS_INO_LOCK, cifs_wait_bit_killable,
TASK_KILLABLE);
if (rc)

View File

@@ -246,7 +246,7 @@ cifs_posix_to_fattr(struct cifs_fattr *fattr, struct smb2_posix_info *info,
*/
fattr->cf_mode = le32_to_cpu(info->Mode) & ~S_IFMT;
cifs_dbg(VFS, "XXX dev %d, reparse %d, mode %o",
cifs_dbg(FYI, "posix fattr: dev %d, reparse %d, mode %o",
le32_to_cpu(info->DeviceId),
le32_to_cpu(info->ReparseTag),
le32_to_cpu(info->Mode));

View File

@@ -766,6 +766,20 @@ smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid,
cifs_dbg(FYI, "%s: tc_count=%d\n", __func__, tcon->tc_count);
spin_lock(&cifs_tcp_ses_lock);
if (tcon->tc_count <= 0) {
struct TCP_Server_Info *server = NULL;
WARN_ONCE(tcon->tc_count < 0, "tcon refcount is negative");
spin_unlock(&cifs_tcp_ses_lock);
if (tcon->ses)
server = tcon->ses->server;
cifs_server_dbg(FYI, "tid=%u: tcon is closing, skipping async close retry of fid %llu %llu\n",
tcon->tid, persistent_fid, volatile_fid);
return 0;
}
tcon->tc_count++;
spin_unlock(&cifs_tcp_ses_lock);

View File

@@ -55,9 +55,11 @@ extern struct cifs_ses *smb2_find_smb_ses(struct TCP_Server_Info *server,
extern struct cifs_tcon *smb2_find_smb_tcon(struct TCP_Server_Info *server,
__u64 ses_id, __u32 tid);
extern int smb2_calc_signature(struct smb_rqst *rqst,
struct TCP_Server_Info *server);
struct TCP_Server_Info *server,
bool allocate_crypto);
extern int smb3_calc_signature(struct smb_rqst *rqst,
struct TCP_Server_Info *server);
struct TCP_Server_Info *server,
bool allocate_crypto);
extern void smb2_echo_request(struct work_struct *work);
extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode);
extern bool smb2_is_valid_oplock_break(char *buffer,

View File

@@ -40,14 +40,6 @@
#include "smb2status.h"
#include "smb2glob.h"
static int
smb2_crypto_shash_allocate(struct TCP_Server_Info *server)
{
return cifs_alloc_hash("hmac(sha256)",
&server->secmech.hmacsha256,
&server->secmech.sdeschmacsha256);
}
static int
smb3_crypto_shash_allocate(struct TCP_Server_Info *server)
{
@@ -219,7 +211,8 @@ smb2_find_smb_tcon(struct TCP_Server_Info *server, __u64 ses_id, __u32 tid)
}
int
smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
bool allocate_crypto)
{
int rc;
unsigned char smb2_signature[SMB2_HMACSHA256_SIZE];
@@ -228,6 +221,8 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base;
struct cifs_ses *ses;
struct shash_desc *shash;
struct crypto_shash *hash;
struct sdesc *sdesc = NULL;
struct smb_rqst drqst;
ses = smb2_find_smb_ses(server, shdr->SessionId);
@@ -239,24 +234,32 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE);
memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE);
rc = smb2_crypto_shash_allocate(server);
if (rc) {
cifs_server_dbg(VFS, "%s: sha256 alloc failed\n", __func__);
return rc;
if (allocate_crypto) {
rc = cifs_alloc_hash("hmac(sha256)", &hash, &sdesc);
if (rc) {
cifs_server_dbg(VFS,
"%s: sha256 alloc failed\n", __func__);
return rc;
}
shash = &sdesc->shash;
} else {
hash = server->secmech.hmacsha256;
shash = &server->secmech.sdeschmacsha256->shash;
}
rc = crypto_shash_setkey(server->secmech.hmacsha256,
ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
rc = crypto_shash_setkey(hash, ses->auth_key.response,
SMB2_NTLMV2_SESSKEY_SIZE);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not update with response\n", __func__);
return rc;
cifs_server_dbg(VFS,
"%s: Could not update with response\n",
__func__);
goto out;
}
shash = &server->secmech.sdeschmacsha256->shash;
rc = crypto_shash_init(shash);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not init sha256", __func__);
return rc;
goto out;
}
/*
@@ -271,9 +274,10 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
rc = crypto_shash_update(shash, iov[0].iov_base,
iov[0].iov_len);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not update with payload\n",
__func__);
return rc;
cifs_server_dbg(VFS,
"%s: Could not update with payload\n",
__func__);
goto out;
}
drqst.rq_iov++;
drqst.rq_nvec--;
@@ -283,6 +287,9 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
if (!rc)
memcpy(shdr->Signature, sigptr, SMB2_SIGNATURE_SIZE);
out:
if (allocate_crypto)
cifs_free_hash(&hash, &sdesc);
return rc;
}
@@ -504,14 +511,17 @@ generate_smb311signingkey(struct cifs_ses *ses)
}
int
smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
bool allocate_crypto)
{
int rc;
unsigned char smb3_signature[SMB2_CMACAES_SIZE];
unsigned char *sigptr = smb3_signature;
struct kvec *iov = rqst->rq_iov;
struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base;
struct shash_desc *shash = &server->secmech.sdesccmacaes->shash;
struct shash_desc *shash;
struct crypto_shash *hash;
struct sdesc *sdesc = NULL;
struct smb_rqst drqst;
u8 key[SMB3_SIGN_KEY_SIZE];
@@ -519,14 +529,24 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
if (rc)
return 0;
if (allocate_crypto) {
rc = cifs_alloc_hash("cmac(aes)", &hash, &sdesc);
if (rc)
return rc;
shash = &sdesc->shash;
} else {
hash = server->secmech.cmacaes;
shash = &server->secmech.sdesccmacaes->shash;
}
memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE);
memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE);
rc = crypto_shash_setkey(server->secmech.cmacaes,
key, SMB2_CMACAES_SIZE);
rc = crypto_shash_setkey(hash, key, SMB2_CMACAES_SIZE);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__);
return rc;
goto out;
}
/*
@@ -537,7 +557,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
rc = crypto_shash_init(shash);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not init cmac aes\n", __func__);
return rc;
goto out;
}
/*
@@ -554,7 +574,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
if (rc) {
cifs_server_dbg(VFS, "%s: Could not update with payload\n",
__func__);
return rc;
goto out;
}
drqst.rq_iov++;
drqst.rq_nvec--;
@@ -564,6 +584,9 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
if (!rc)
memcpy(shdr->Signature, sigptr, SMB2_SIGNATURE_SIZE);
out:
if (allocate_crypto)
cifs_free_hash(&hash, &sdesc);
return rc;
}
@@ -593,7 +616,7 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
return 0;
}
rc = server->ops->calc_signature(rqst, server);
rc = server->ops->calc_signature(rqst, server, false);
return rc;
}
@@ -631,9 +654,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
memset(shdr->Signature, 0, SMB2_SIGNATURE_SIZE);
mutex_lock(&server->srv_mutex);
rc = server->ops->calc_signature(rqst, server);
mutex_unlock(&server->srv_mutex);
rc = server->ops->calc_signature(rqst, server, true);
if (rc)
return rc;

View File

@@ -284,13 +284,10 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)
request->sge[i].length,
DMA_TO_DEVICE);
if (request->has_payload) {
if (atomic_dec_and_test(&request->info->send_payload_pending))
wake_up(&request->info->wait_send_payload_pending);
} else {
if (atomic_dec_and_test(&request->info->send_pending))
wake_up(&request->info->wait_send_pending);
}
if (atomic_dec_and_test(&request->info->send_pending))
wake_up(&request->info->wait_send_pending);
wake_up(&request->info->wait_post_send);
mempool_free(request, request->info->request_mempool);
}
@@ -383,27 +380,6 @@ static bool process_negotiation_response(
return true;
}
/*
* Check and schedule to send an immediate packet
* This is used to extend credtis to remote peer to keep the transport busy
*/
static void check_and_send_immediate(struct smbd_connection *info)
{
if (info->transport_status != SMBD_CONNECTED)
return;
info->send_immediate = true;
/*
* Promptly send a packet if our peer is running low on receive
* credits
*/
if (atomic_read(&info->receive_credits) <
info->receive_credit_target - 1)
queue_delayed_work(
info->workqueue, &info->send_immediate_work, 0);
}
static void smbd_post_send_credits(struct work_struct *work)
{
int ret = 0;
@@ -453,10 +429,16 @@ static void smbd_post_send_credits(struct work_struct *work)
info->new_credits_offered += ret;
spin_unlock(&info->lock_new_credits_offered);
atomic_add(ret, &info->receive_credits);
/* Check if we can post new receive and grant credits to peer */
check_and_send_immediate(info);
/* Promptly send an immediate packet as defined in [MS-SMBD] 3.1.1.1 */
info->send_immediate = true;
if (atomic_read(&info->receive_credits) <
info->receive_credit_target - 1) {
if (info->keep_alive_requested == KEEP_ALIVE_PENDING ||
info->send_immediate) {
log_keep_alive(INFO, "send an empty message\n");
smbd_post_send_empty(info);
}
}
}
/* Called from softirq, when recv is done */
@@ -551,12 +533,6 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
info->keep_alive_requested = KEEP_ALIVE_PENDING;
}
/*
* Check if we need to send something to remote peer to
* grant more credits or respond to KEEP_ALIVE packet
*/
check_and_send_immediate(info);
return;
default:
@@ -749,7 +725,6 @@ static int smbd_post_send_negotiate_req(struct smbd_connection *info)
request->sge[0].addr,
request->sge[0].length, request->sge[0].lkey);
request->has_payload = false;
atomic_inc(&info->send_pending);
rc = ib_post_send(info->id->qp, &send_wr, NULL);
if (!rc)
@@ -806,120 +781,9 @@ static int manage_keep_alive_before_sending(struct smbd_connection *info)
return 0;
}
/*
* Build and prepare the SMBD packet header
* This function waits for avaialbe send credits and build a SMBD packet
* header. The caller then optional append payload to the packet after
* the header
* intput values
* size: the size of the payload
* remaining_data_length: remaining data to send if this is part of a
* fragmented packet
* output values
* request_out: the request allocated from this function
* return values: 0 on success, otherwise actual error code returned
*/
static int smbd_create_header(struct smbd_connection *info,
int size, int remaining_data_length,
struct smbd_request **request_out)
{
struct smbd_request *request;
struct smbd_data_transfer *packet;
int header_length;
int rc;
/* Wait for send credits. A SMBD packet needs one credit */
rc = wait_event_interruptible(info->wait_send_queue,
atomic_read(&info->send_credits) > 0 ||
info->transport_status != SMBD_CONNECTED);
if (rc)
return rc;
if (info->transport_status != SMBD_CONNECTED) {
log_outgoing(ERR, "disconnected not sending\n");
return -EAGAIN;
}
atomic_dec(&info->send_credits);
request = mempool_alloc(info->request_mempool, GFP_KERNEL);
if (!request) {
rc = -ENOMEM;
goto err;
}
request->info = info;
/* Fill in the packet header */
packet = smbd_request_payload(request);
packet->credits_requested = cpu_to_le16(info->send_credit_target);
packet->credits_granted =
cpu_to_le16(manage_credits_prior_sending(info));
info->send_immediate = false;
packet->flags = 0;
if (manage_keep_alive_before_sending(info))
packet->flags |= cpu_to_le16(SMB_DIRECT_RESPONSE_REQUESTED);
packet->reserved = 0;
if (!size)
packet->data_offset = 0;
else
packet->data_offset = cpu_to_le32(24);
packet->data_length = cpu_to_le32(size);
packet->remaining_data_length = cpu_to_le32(remaining_data_length);
packet->padding = 0;
log_outgoing(INFO, "credits_requested=%d credits_granted=%d "
"data_offset=%d data_length=%d remaining_data_length=%d\n",
le16_to_cpu(packet->credits_requested),
le16_to_cpu(packet->credits_granted),
le32_to_cpu(packet->data_offset),
le32_to_cpu(packet->data_length),
le32_to_cpu(packet->remaining_data_length));
/* Map the packet to DMA */
header_length = sizeof(struct smbd_data_transfer);
/* If this is a packet without payload, don't send padding */
if (!size)
header_length = offsetof(struct smbd_data_transfer, padding);
request->num_sge = 1;
request->sge[0].addr = ib_dma_map_single(info->id->device,
(void *)packet,
header_length,
DMA_TO_DEVICE);
if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) {
mempool_free(request, info->request_mempool);
rc = -EIO;
goto err;
}
request->sge[0].length = header_length;
request->sge[0].lkey = info->pd->local_dma_lkey;
*request_out = request;
return 0;
err:
atomic_inc(&info->send_credits);
return rc;
}
static void smbd_destroy_header(struct smbd_connection *info,
struct smbd_request *request)
{
ib_dma_unmap_single(info->id->device,
request->sge[0].addr,
request->sge[0].length,
DMA_TO_DEVICE);
mempool_free(request, info->request_mempool);
atomic_inc(&info->send_credits);
}
/* Post the send request */
static int smbd_post_send(struct smbd_connection *info,
struct smbd_request *request, bool has_payload)
struct smbd_request *request)
{
struct ib_send_wr send_wr;
int rc, i;
@@ -944,24 +808,9 @@ static int smbd_post_send(struct smbd_connection *info,
send_wr.opcode = IB_WR_SEND;
send_wr.send_flags = IB_SEND_SIGNALED;
if (has_payload) {
request->has_payload = true;
atomic_inc(&info->send_payload_pending);
} else {
request->has_payload = false;
atomic_inc(&info->send_pending);
}
rc = ib_post_send(info->id->qp, &send_wr, NULL);
if (rc) {
log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
if (has_payload) {
if (atomic_dec_and_test(&info->send_payload_pending))
wake_up(&info->wait_send_payload_pending);
} else {
if (atomic_dec_and_test(&info->send_pending))
wake_up(&info->wait_send_pending);
}
smbd_disconnect_rdma_connection(info);
rc = -EAGAIN;
} else
@@ -977,14 +826,107 @@ static int smbd_post_send_sgl(struct smbd_connection *info,
{
int num_sgs;
int i, rc;
int header_length;
struct smbd_request *request;
struct smbd_data_transfer *packet;
int new_credits;
struct scatterlist *sg;
rc = smbd_create_header(
info, data_length, remaining_data_length, &request);
wait_credit:
/* Wait for send credits. A SMBD packet needs one credit */
rc = wait_event_interruptible(info->wait_send_queue,
atomic_read(&info->send_credits) > 0 ||
info->transport_status != SMBD_CONNECTED);
if (rc)
return rc;
goto err_wait_credit;
if (info->transport_status != SMBD_CONNECTED) {
log_outgoing(ERR, "disconnected not sending on wait_credit\n");
rc = -EAGAIN;
goto err_wait_credit;
}
if (unlikely(atomic_dec_return(&info->send_credits) < 0)) {
atomic_inc(&info->send_credits);
goto wait_credit;
}
wait_send_queue:
wait_event(info->wait_post_send,
atomic_read(&info->send_pending) < info->send_credit_target ||
info->transport_status != SMBD_CONNECTED);
if (info->transport_status != SMBD_CONNECTED) {
log_outgoing(ERR, "disconnected not sending on wait_send_queue\n");
rc = -EAGAIN;
goto err_wait_send_queue;
}
if (unlikely(atomic_inc_return(&info->send_pending) >
info->send_credit_target)) {
atomic_dec(&info->send_pending);
goto wait_send_queue;
}
request = mempool_alloc(info->request_mempool, GFP_KERNEL);
if (!request) {
rc = -ENOMEM;
goto err_alloc;
}
request->info = info;
/* Fill in the packet header */
packet = smbd_request_payload(request);
packet->credits_requested = cpu_to_le16(info->send_credit_target);
new_credits = manage_credits_prior_sending(info);
atomic_add(new_credits, &info->receive_credits);
packet->credits_granted = cpu_to_le16(new_credits);
info->send_immediate = false;
packet->flags = 0;
if (manage_keep_alive_before_sending(info))
packet->flags |= cpu_to_le16(SMB_DIRECT_RESPONSE_REQUESTED);
packet->reserved = 0;
if (!data_length)
packet->data_offset = 0;
else
packet->data_offset = cpu_to_le32(24);
packet->data_length = cpu_to_le32(data_length);
packet->remaining_data_length = cpu_to_le32(remaining_data_length);
packet->padding = 0;
log_outgoing(INFO, "credits_requested=%d credits_granted=%d "
"data_offset=%d data_length=%d remaining_data_length=%d\n",
le16_to_cpu(packet->credits_requested),
le16_to_cpu(packet->credits_granted),
le32_to_cpu(packet->data_offset),
le32_to_cpu(packet->data_length),
le32_to_cpu(packet->remaining_data_length));
/* Map the packet to DMA */
header_length = sizeof(struct smbd_data_transfer);
/* If this is a packet without payload, don't send padding */
if (!data_length)
header_length = offsetof(struct smbd_data_transfer, padding);
request->num_sge = 1;
request->sge[0].addr = ib_dma_map_single(info->id->device,
(void *)packet,
header_length,
DMA_TO_DEVICE);
if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) {
rc = -EIO;
request->sge[0].addr = 0;
goto err_dma;
}
request->sge[0].length = header_length;
request->sge[0].lkey = info->pd->local_dma_lkey;
/* Fill in the packet data payload */
num_sgs = sgl ? sg_nents(sgl) : 0;
for_each_sg(sgl, sg, num_sgs, i) {
request->sge[i+1].addr =
@@ -994,25 +936,41 @@ static int smbd_post_send_sgl(struct smbd_connection *info,
info->id->device, request->sge[i+1].addr)) {
rc = -EIO;
request->sge[i+1].addr = 0;
goto dma_mapping_failure;
goto err_dma;
}
request->sge[i+1].length = sg->length;
request->sge[i+1].lkey = info->pd->local_dma_lkey;
request->num_sge++;
}
rc = smbd_post_send(info, request, data_length);
rc = smbd_post_send(info, request);
if (!rc)
return 0;
dma_mapping_failure:
for (i = 1; i < request->num_sge; i++)
err_dma:
for (i = 0; i < request->num_sge; i++)
if (request->sge[i].addr)
ib_dma_unmap_single(info->id->device,
request->sge[i].addr,
request->sge[i].length,
DMA_TO_DEVICE);
smbd_destroy_header(info, request);
mempool_free(request, info->request_mempool);
/* roll back receive credits and credits to be offered */
spin_lock(&info->lock_new_credits_offered);
info->new_credits_offered += new_credits;
spin_unlock(&info->lock_new_credits_offered);
atomic_sub(new_credits, &info->receive_credits);
err_alloc:
if (atomic_dec_and_test(&info->send_pending))
wake_up(&info->wait_send_pending);
err_wait_send_queue:
/* roll back send credits and pending */
atomic_inc(&info->send_credits);
err_wait_credit:
return rc;
}
@@ -1334,25 +1292,6 @@ static void destroy_receive_buffers(struct smbd_connection *info)
mempool_free(response, info->response_mempool);
}
/*
* Check and send an immediate or keep alive packet
* The condition to send those packets are defined in [MS-SMBD] 3.1.1.1
* Connection.KeepaliveRequested and Connection.SendImmediate
* The idea is to extend credits to server as soon as it becomes available
*/
static void send_immediate_work(struct work_struct *work)
{
struct smbd_connection *info = container_of(
work, struct smbd_connection,
send_immediate_work.work);
if (info->keep_alive_requested == KEEP_ALIVE_PENDING ||
info->send_immediate) {
log_keep_alive(INFO, "send an empty message\n");
smbd_post_send_empty(info);
}
}
/* Implement idle connection timer [MS-SMBD] 3.1.6.2 */
static void idle_connection_timer(struct work_struct *work)
{
@@ -1407,14 +1346,10 @@ void smbd_destroy(struct TCP_Server_Info *server)
log_rdma_event(INFO, "cancelling idle timer\n");
cancel_delayed_work_sync(&info->idle_timer_work);
log_rdma_event(INFO, "cancelling send immediate work\n");
cancel_delayed_work_sync(&info->send_immediate_work);
log_rdma_event(INFO, "wait for all send posted to IB to finish\n");
wait_event(info->wait_send_pending,
atomic_read(&info->send_pending) == 0);
wait_event(info->wait_send_payload_pending,
atomic_read(&info->send_payload_pending) == 0);
/* It's not posssible for upper layer to get to reassembly */
log_rdma_event(INFO, "drain the reassembly queue\n");
@@ -1744,15 +1679,13 @@ static struct smbd_connection *_smbd_get_connection(
init_waitqueue_head(&info->wait_send_queue);
INIT_DELAYED_WORK(&info->idle_timer_work, idle_connection_timer);
INIT_DELAYED_WORK(&info->send_immediate_work, send_immediate_work);
queue_delayed_work(info->workqueue, &info->idle_timer_work,
info->keep_alive_interval*HZ);
init_waitqueue_head(&info->wait_send_pending);
atomic_set(&info->send_pending, 0);
init_waitqueue_head(&info->wait_send_payload_pending);
atomic_set(&info->send_payload_pending, 0);
init_waitqueue_head(&info->wait_post_send);
INIT_WORK(&info->disconnect_work, smbd_disconnect_rdma_work);
INIT_WORK(&info->post_send_credits_work, smbd_post_send_credits);
@@ -2226,8 +2159,8 @@ done:
* that means all the I/Os have been out and we are good to return
*/
wait_event(info->wait_send_payload_pending,
atomic_read(&info->send_payload_pending) == 0);
wait_event(info->wait_send_pending,
atomic_read(&info->send_pending) == 0);
return rc;
}

View File

@@ -114,8 +114,7 @@ struct smbd_connection {
/* Activity accoutning */
atomic_t send_pending;
wait_queue_head_t wait_send_pending;
atomic_t send_payload_pending;
wait_queue_head_t wait_send_payload_pending;
wait_queue_head_t wait_post_send;
/* Receive queue */
struct list_head receive_queue;
@@ -154,7 +153,6 @@ struct smbd_connection {
struct workqueue_struct *workqueue;
struct delayed_work idle_timer_work;
struct delayed_work send_immediate_work;
/* Memory pool for preallocating buffers */
/* request pool for RDMA send */
@@ -234,9 +232,6 @@ struct smbd_request {
struct smbd_connection *info;
struct ib_cqe cqe;
/* true if this request carries upper layer payload */
bool has_payload;
/* the SGE entries for this packet */
struct ib_sge sge[SMBDIRECT_MAX_SGE];
int num_sge;

View File

@@ -501,6 +501,7 @@ pnfs_alloc_ds_commits_list(struct list_head *list,
rcu_read_lock();
pnfs_put_commit_array(array, cinfo->inode);
}
rcu_read_unlock();
return ret;
}

View File

@@ -38,11 +38,24 @@
* atomic operations, then the count will continue to edge closer to 0. If it
* reaches a value of 1 before /any/ of the threads reset it to the saturated
* value, then a concurrent refcount_dec_and_test() may erroneously free the
* underlying object. Given the precise timing details involved with the
* round-robin scheduling of each thread manipulating the refcount and the need
* to hit the race multiple times in succession, there doesn't appear to be a
* practical avenue of attack even if using refcount_add() operations with
* larger increments.
* underlying object.
* Linux limits the maximum number of tasks to PID_MAX_LIMIT, which is currently
* 0x400000 (and can't easily be raised in the future beyond FUTEX_TID_MASK).
* With the current PID limit, if no batched refcounting operations are used and
* the attacker can't repeatedly trigger kernel oopses in the middle of refcount
* operations, this makes it impossible for a saturated refcount to leave the
* saturation range, even if it is possible for multiple uses of the same
* refcount to nest in the context of a single task:
*
* (UINT_MAX+1-REFCOUNT_SATURATED) / PID_MAX_LIMIT =
* 0x40000000 / 0x400000 = 0x100 = 256
*
* If hundreds of references are added/removed with a single refcounting
* operation, it may potentially be possible to leave the saturation range; but
* given the precise timing details involved with the round-robin scheduling of
* each thread manipulating the refcount and the need to hit the race multiple
* times in succession, there doesn't appear to be a practical avenue of attack
* even if using refcount_add() operations with larger increments.
*
* Memory ordering
* ===============

View File

@@ -983,16 +983,10 @@ perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
event->shadow_ctx_time = now - t->timestamp;
}
/*
* Update cpuctx->cgrp so that it is set when first cgroup event is added and
* cleared when last cgroup event is removed.
*/
static inline void
list_update_cgroup_event(struct perf_event *event,
struct perf_event_context *ctx, bool add)
perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
struct perf_cpu_context *cpuctx;
struct list_head *cpuctx_entry;
if (!is_cgroup_event(event))
return;
@@ -1009,28 +1003,41 @@ list_update_cgroup_event(struct perf_event *event,
* because if the first would mismatch, the second would not try again
* and we would leave cpuctx->cgrp unset.
*/
if (add && !cpuctx->cgrp) {
if (ctx->is_active && !cpuctx->cgrp) {
struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
cpuctx->cgrp = cgrp;
}
if (add && ctx->nr_cgroups++)
return;
else if (!add && --ctx->nr_cgroups)
if (ctx->nr_cgroups++)
return;
/* no cgroup running */
if (!add)
list_add(&cpuctx->cgrp_cpuctx_entry,
per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
}
static inline void
perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
{
struct perf_cpu_context *cpuctx;
if (!is_cgroup_event(event))
return;
/*
* Because cgroup events are always per-cpu events,
* @ctx == &cpuctx->ctx.
*/
cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
if (--ctx->nr_cgroups)
return;
if (ctx->is_active && cpuctx->cgrp)
cpuctx->cgrp = NULL;
cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
if (add)
list_add(cpuctx_entry,
per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
else
list_del(cpuctx_entry);
list_del(&cpuctx->cgrp_cpuctx_entry);
}
#else /* !CONFIG_CGROUP_PERF */
@@ -1096,11 +1103,14 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event)
}
static inline void
list_update_cgroup_event(struct perf_event *event,
struct perf_event_context *ctx, bool add)
perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
}
static inline void
perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
{
}
#endif
/*
@@ -1791,13 +1801,14 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
add_event_to_groups(event, ctx);
}
list_update_cgroup_event(event, ctx, true);
list_add_rcu(&event->event_entry, &ctx->event_list);
ctx->nr_events++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
if (event->state > PERF_EVENT_STATE_OFF)
perf_cgroup_event_enable(event, ctx);
ctx->generation++;
}
@@ -1976,8 +1987,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
event->attach_state &= ~PERF_ATTACH_CONTEXT;
list_update_cgroup_event(event, ctx, false);
ctx->nr_events--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
@@ -1994,8 +2003,10 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
* of error state is by explicit re-enabling
* of the event
*/
if (event->state > PERF_EVENT_STATE_OFF)
if (event->state > PERF_EVENT_STATE_OFF) {
perf_cgroup_event_disable(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_OFF);
}
ctx->generation++;
}
@@ -2226,6 +2237,7 @@ event_sched_out(struct perf_event *event,
if (READ_ONCE(event->pending_disable) >= 0) {
WRITE_ONCE(event->pending_disable, -1);
perf_cgroup_event_disable(event, ctx);
state = PERF_EVENT_STATE_OFF;
}
perf_event_set_state(event, state);
@@ -2363,6 +2375,7 @@ static void __perf_event_disable(struct perf_event *event,
event_sched_out(event, cpuctx, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_OFF);
perf_cgroup_event_disable(event, ctx);
}
/*
@@ -2746,7 +2759,7 @@ static int __perf_install_in_context(void *info)
}
#ifdef CONFIG_CGROUP_PERF
if (is_cgroup_event(event)) {
if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
/*
* If the current cgroup doesn't match the event's
* cgroup, we should not try to schedule it.
@@ -2906,6 +2919,7 @@ static void __perf_event_enable(struct perf_event *event,
ctx_sched_out(ctx, cpuctx, EVENT_TIME);
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
perf_cgroup_event_enable(event, ctx);
if (!ctx->is_active)
return;
@@ -3508,7 +3522,8 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
static bool perf_less_group_idx(const void *l, const void *r)
{
const struct perf_event *le = l, *re = r;
const struct perf_event *le = *(const struct perf_event **)l;
const struct perf_event *re = *(const struct perf_event **)r;
return le->group_index < re->group_index;
}
@@ -3616,8 +3631,10 @@ static int merge_sched_in(struct perf_event *event, void *data)
}
if (event->state == PERF_EVENT_STATE_INACTIVE) {
if (event->attr.pinned)
if (event->attr.pinned) {
perf_cgroup_event_disable(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
}
*can_add_hw = 0;
ctx->rotate_necessary = 1;
@@ -6917,9 +6934,12 @@ static u64 perf_virt_to_phys(u64 virt)
* Try IRQ-safe __get_user_pages_fast first.
* If failed, leave phys_addr as 0.
*/
if ((current->mm != NULL) &&
(__get_user_pages_fast(virt, 1, 0, &p) == 1))
phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
if (current->mm != NULL) {
pagefault_disable();
if (__get_user_pages_fast(virt, 1, 0, &p) == 1)
phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
pagefault_enable();
}
if (p)
put_page(p);

View File

@@ -3952,10 +3952,36 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
return ret;
}
static inline short task_wait_context(struct task_struct *curr)
{
/*
* Set appropriate wait type for the context; for IRQs we have to take
* into account force_irqthread as that is implied by PREEMPT_RT.
*/
if (curr->hardirq_context) {
/*
* Check if force_irqthreads will run us threaded.
*/
if (curr->hardirq_threaded || curr->irq_config)
return LD_WAIT_CONFIG;
return LD_WAIT_SPIN;
} else if (curr->softirq_context) {
/*
* Softirqs are always threaded.
*/
return LD_WAIT_CONFIG;
}
return LD_WAIT_MAX;
}
static int
print_lock_invalid_wait_context(struct task_struct *curr,
struct held_lock *hlock)
{
short curr_inner;
if (!debug_locks_off())
return 0;
if (debug_locks_silent)
@@ -3971,6 +3997,10 @@ print_lock_invalid_wait_context(struct task_struct *curr,
print_lock(hlock);
pr_warn("other info that might help us debug this:\n");
curr_inner = task_wait_context(curr);
pr_warn("context-{%d:%d}\n", curr_inner, curr_inner);
lockdep_print_held_locks(curr);
pr_warn("stack backtrace:\n");
@@ -4017,26 +4047,7 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
}
depth++;
/*
* Set appropriate wait type for the context; for IRQs we have to take
* into account force_irqthread as that is implied by PREEMPT_RT.
*/
if (curr->hardirq_context) {
/*
* Check if force_irqthreads will run us threaded.
*/
if (curr->hardirq_threaded || curr->irq_config)
curr_inner = LD_WAIT_CONFIG;
else
curr_inner = LD_WAIT_SPIN;
} else if (curr->softirq_context) {
/*
* Softirqs are always threaded.
*/
curr_inner = LD_WAIT_CONFIG;
} else {
curr_inner = LD_WAIT_MAX;
}
curr_inner = task_wait_context(curr);
for (; depth < curr->lockdep_depth; depth++) {
struct held_lock *prev = curr->held_locks + depth;

View File

@@ -118,14 +118,15 @@ static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
unsigned int mode, int wake_flags,
void *key)
{
struct task_struct *p = get_task_struct(wq_entry->private);
bool reader = wq_entry->flags & WQ_FLAG_CUSTOM;
struct percpu_rw_semaphore *sem = key;
struct task_struct *p;
/* concurrent against percpu_down_write(), can get stolen */
if (!__percpu_rwsem_trylock(sem, reader))
return 1;
p = get_task_struct(wq_entry->private);
list_del_init(&wq_entry->entry);
smp_store_release(&wq_entry->private, NULL);

View File

@@ -2120,12 +2120,6 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
return cpu;
}
static void update_avg(u64 *avg, u64 sample)
{
s64 diff = sample - *avg;
*avg += diff >> 3;
}
void sched_set_stop_task(int cpu, struct task_struct *stop)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -4127,7 +4121,8 @@ static inline void sched_submit_work(struct task_struct *tsk)
* it wants to wake up a task to maintain concurrency.
* As this function is called inside the schedule() context,
* we disable preemption to avoid it calling schedule() again
* in the possible wakeup of a kworker.
* in the possible wakeup of a kworker and because wq_worker_sleeping()
* requires it.
*/
if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
preempt_disable();
@@ -6687,7 +6682,6 @@ void __init sched_init(void)
rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ_COMMON
rq->last_load_update_tick = jiffies;
rq->last_blocked_load_update_tick = jiffies;
atomic_set(&rq->nohz_flags, 0);
#endif

View File

@@ -816,10 +816,12 @@ static int __init init_sched_debug_procfs(void)
__initcall(init_sched_debug_procfs);
#define __P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
#define P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
#define __PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
#define PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
#define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F))
#define __P(F) __PS(#F, F)
#define P(F) __PS(#F, p->F)
#define __PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F)))
#define __PN(F) __PSN(#F, F)
#define PN(F) __PSN(#F, p->F)
#ifdef CONFIG_NUMA_BALANCING
@@ -868,18 +870,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
SEQ_printf(m,
"---------------------------------------------------------"
"----------\n");
#define __P(F) \
SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
#define P(F) \
SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
#define P_SCHEDSTAT(F) \
SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F))
#define __PN(F) \
SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
#define PN(F) \
SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
#define PN_SCHEDSTAT(F) \
SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F)))
#define P_SCHEDSTAT(F) __PS(#F, schedstat_val(p->F))
#define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->F))
PN(se.exec_start);
PN(se.vruntime);
@@ -939,10 +932,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
}
__P(nr_switches);
SEQ_printf(m, "%-45s:%21Ld\n",
"nr_voluntary_switches", (long long)p->nvcsw);
SEQ_printf(m, "%-45s:%21Ld\n",
"nr_involuntary_switches", (long long)p->nivcsw);
__PS("nr_voluntary_switches", p->nvcsw);
__PS("nr_involuntary_switches", p->nivcsw);
P(se.load.weight);
#ifdef CONFIG_SMP
@@ -955,6 +946,12 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
P(se.avg.last_update_time);
P(se.avg.util_est.ewma);
P(se.avg.util_est.enqueued);
#endif
#ifdef CONFIG_UCLAMP_TASK
__PS("uclamp.min", p->uclamp[UCLAMP_MIN].value);
__PS("uclamp.max", p->uclamp[UCLAMP_MAX].value);
__PS("effective uclamp.min", uclamp_eff_value(p, UCLAMP_MIN));
__PS("effective uclamp.max", uclamp_eff_value(p, UCLAMP_MAX));
#endif
P(policy);
P(prio);
@@ -963,11 +960,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
P(dl.deadline);
}
#undef PN_SCHEDSTAT
#undef PN
#undef __PN
#undef P_SCHEDSTAT
#undef P
#undef __P
{
unsigned int this_cpu = raw_smp_processor_id();
@@ -975,8 +968,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
t0 = cpu_clock(this_cpu);
t1 = cpu_clock(this_cpu);
SEQ_printf(m, "%-45s:%21Ld\n",
"clock-delta", (long long)(t1-t0));
__PS("clock-delta", t1-t0);
}
sched_show_numa(p, m);

View File

@@ -4836,11 +4836,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
resched_curr(rq);
}
static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
{
struct cfs_rq *cfs_rq;
u64 runtime;
u64 starting_runtime = remaining;
u64 runtime, remaining = 1;
rcu_read_lock();
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
@@ -4855,10 +4854,13 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
/* By the above check, this should never be true */
SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
raw_spin_lock(&cfs_b->lock);
runtime = -cfs_rq->runtime_remaining + 1;
if (runtime > remaining)
runtime = remaining;
remaining -= runtime;
if (runtime > cfs_b->runtime)
runtime = cfs_b->runtime;
cfs_b->runtime -= runtime;
remaining = cfs_b->runtime;
raw_spin_unlock(&cfs_b->lock);
cfs_rq->runtime_remaining += runtime;
@@ -4873,8 +4875,6 @@ next:
break;
}
rcu_read_unlock();
return starting_runtime - remaining;
}
/*
@@ -4885,7 +4885,6 @@ next:
*/
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
{
u64 runtime;
int throttled;
/* no need to continue the timer with no bandwidth constraint */
@@ -4914,24 +4913,17 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
cfs_b->nr_throttled += overrun;
/*
* This check is repeated as we are holding onto the new bandwidth while
* we unthrottle. This can potentially race with an unthrottled group
* trying to acquire new bandwidth from the global pool. This can result
* in us over-using our runtime if it is all used during this loop, but
* only by limited amounts in that extreme case.
* This check is repeated as we release cfs_b->lock while we unthrottle.
*/
while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
runtime = cfs_b->runtime;
cfs_b->distribute_running = 1;
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
runtime = distribute_cfs_runtime(cfs_b, runtime);
distribute_cfs_runtime(cfs_b);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
cfs_b->distribute_running = 0;
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
lsub_positive(&cfs_b->runtime, runtime);
}
/*
@@ -5065,10 +5057,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
if (!runtime)
return;
runtime = distribute_cfs_runtime(cfs_b, runtime);
distribute_cfs_runtime(cfs_b);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
lsub_positive(&cfs_b->runtime, runtime);
cfs_b->distribute_running = 0;
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
}
@@ -6080,8 +6071,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
struct sched_domain *this_sd;
u64 avg_cost, avg_idle;
u64 time, cost;
s64 delta;
u64 time;
int this = smp_processor_id();
int cpu, nr = INT_MAX;
@@ -6119,9 +6109,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
}
time = cpu_clock(this) - time;
cost = this_sd->avg_scan_cost;
delta = (s64)(time - cost) / 8;
this_sd->avg_scan_cost += delta;
update_avg(&this_sd->avg_scan_cost, time);
return cpu;
}
@@ -9091,6 +9079,14 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
sds->total_capacity;
/*
* If the local group is more loaded than the selected
* busiest group don't try to pull any tasks.
*/
if (local->avg_load >= busiest->avg_load) {
env->imbalance = 0;
return;
}
}
/*

View File

@@ -195,6 +195,12 @@ static inline int task_has_dl_policy(struct task_struct *p)
#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
static inline void update_avg(u64 *avg, u64 sample)
{
s64 diff = sample - *avg;
*avg += diff / 8;
}
/*
* !! For sched_setattr_nocheck() (kernel) only !!
*
@@ -884,7 +890,6 @@ struct rq {
#endif
#ifdef CONFIG_NO_HZ_COMMON
#ifdef CONFIG_SMP
unsigned long last_load_update_tick;
unsigned long last_blocked_load_update_tick;
unsigned int has_blocked_load;
#endif /* CONFIG_SMP */

View File

@@ -447,6 +447,7 @@ const struct proc_ns_operations timens_operations = {
const struct proc_ns_operations timens_for_children_operations = {
.name = "time_for_children",
.real_ns_name = "time",
.type = CLONE_NEWTIME,
.get = timens_for_children_get,
.put = timens_put,

View File

@@ -69,6 +69,7 @@ static struct ctl_table user_table[] = {
UCOUNT_ENTRY("max_net_namespaces"),
UCOUNT_ENTRY("max_mnt_namespaces"),
UCOUNT_ENTRY("max_cgroup_namespaces"),
UCOUNT_ENTRY("max_time_namespaces"),
#ifdef CONFIG_INOTIFY_USER
UCOUNT_ENTRY("max_inotify_instances"),
UCOUNT_ENTRY("max_inotify_watches"),
@@ -81,6 +82,8 @@ bool setup_userns_sysctls(struct user_namespace *ns)
{
#ifdef CONFIG_SYSCTL
struct ctl_table *tbl;
BUILD_BUG_ON(ARRAY_SIZE(user_table) != UCOUNT_COUNTS + 1);
setup_sysctl_set(&ns->set, &set_root, set_is_seen);
tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL);
if (tbl) {

View File

@@ -858,7 +858,8 @@ void wq_worker_running(struct task_struct *task)
* @task: task going to sleep
*
* This function is called from schedule() when a busy worker is
* going to sleep.
* going to sleep. Preemption needs to be disabled to protect ->sleeping
* assignment.
*/
void wq_worker_sleeping(struct task_struct *task)
{
@@ -875,7 +876,8 @@ void wq_worker_sleeping(struct task_struct *task)
pool = worker->pool;
if (WARN_ON_ONCE(worker->sleeping))
/* Return if preempted before wq_worker_running() was reached */
if (worker->sleeping)
return;
worker->sleeping = 1;