From e000acc145928693833f09152244242a678d3cd5 Mon Sep 17 00:00:00 2001 From: Kristen Carlson Accardi Date: Wed, 15 Apr 2020 14:04:43 -0700 Subject: [PATCH 001/502] objtool: Do not assume order of parent/child functions If a .cold function is examined prior to its parent, the link to the parent/child function can be overwritten when the parent is examined. Only update pfunc and cfunc if they were previously NULL to prevent this from happening. This fixes an issue seen when compiling with -ffunction-sections. Signed-off-by: Kristen Carlson Accardi Signed-off-by: Josh Poimboeuf --- tools/objtool/elf.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 84225679f96d..f953d3a15612 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -434,7 +434,13 @@ static int read_symbols(struct elf *elf) size_t pnamelen; if (sym->type != STT_FUNC) continue; - sym->pfunc = sym->cfunc = sym; + + if (sym->pfunc == NULL) + sym->pfunc = sym; + + if (sym->cfunc == NULL) + sym->cfunc = sym; + coldstr = strstr(sym->name, ".cold"); if (!coldstr) continue; From 1e968bf5caf65eff3f080102879aaa5440c261b6 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Tue, 21 Apr 2020 11:25:01 -0700 Subject: [PATCH 002/502] objtool: Use sh_info to find the base for .rela sections ELF doesn't require .rela section names to match the base section. Use the section index in sh_info to find the section instead of looking it up by name. LLD, for example, generates a .rela section that doesn't match the base section name when we merge sections in a linker script for a binary compiled with -ffunction-sections. 
Signed-off-by: Sami Tolvanen Signed-off-by: Josh Poimboeuf Reviewed-by: Kees Cook --- tools/objtool/elf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index f953d3a15612..5bc259c9d892 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -508,7 +508,7 @@ static int read_relas(struct elf *elf) if (sec->sh.sh_type != SHT_RELA) continue; - sec->base = find_section_by_name(elf, sec->name + 5); + sec->base = find_section_by_index(elf, sec->sh.sh_info); if (!sec->base) { WARN("can't find base section for rela section %s", sec->name); From f1974222634010486c1692e843af0ab11304dd2c Mon Sep 17 00:00:00 2001 From: Matt Helsley Date: Fri, 29 May 2020 14:01:13 -0700 Subject: [PATCH 003/502] objtool: Rename rela to reloc Before supporting additional relocation types, rename the relevant types and functions from "rela" to "reloc". This work can be done with the following regex: sed -e 's/struct rela/struct reloc/g' \ -e 's/\([_\*]\)rela\(s\{0,1\}\)/\1reloc\2/g' \ -e 's/tmprela\(s\{0,1\}\)/tmpreloc\1/g' \ -e 's/relasec/relocsec/g' \ -e 's/rela_list/reloc_list/g' \ -e 's/rela_hash/reloc_hash/g' \ -e 's/add_rela/add_reloc/g' \ -e 's/rela->/reloc->/g' \ -e '/rela[,\.]/{ s/\([^\.>]\)rela\([\.,]\)/\1reloc\2/g ; }' \ -e 's/rela =/reloc =/g' \ -e 's/relas =/relocs =/g' \ -e 's/relas\[/relocs[/g' \ -e 's/relaname =/relocname =/g' \ -e 's/= rela\;/= reloc\;/g' \ -e 's/= relas\;/= relocs\;/g' \ -e 's/= relaname\;/= relocname\;/g' \ -e 's/, rela)/, reloc)/g' \ -e 's/\([ @]\)rela\([ "]\)/\1reloc\2/g' \ -e 's/ rela$/ reloc/g' \ -e 's/, relaname/, relocname/g' \ -e 's/sec->rela/sec->reloc/g' \ -e 's/(\(!\{0,1\}\)rela/(\1reloc/g' \ -i \ arch.h \ arch/x86/decode.c \ check.c \ check.h \ elf.c \ elf.h \ orc_gen.c \ special.c Notable exceptions which complicate the regex include gelf_* library calls and standard/expected section names which still use "rela" because they encode the type of relocation expected. 
Also, keep "rela" in the struct because it encodes a specific type of relocation we currently expect. It will eventually turn into a member of an anonymous union when a subsequent patch adds implicit addend, or "rel", relocation support. Signed-off-by: Matt Helsley Signed-off-by: Josh Poimboeuf --- tools/objtool/arch.h | 2 +- tools/objtool/arch/x86/decode.c | 2 +- tools/objtool/check.c | 196 ++++++++++++++++---------------- tools/objtool/check.h | 2 +- tools/objtool/elf.c | 138 +++++++++++----------- tools/objtool/elf.h | 22 ++-- tools/objtool/orc_gen.c | 46 ++++---- tools/objtool/special.c | 28 ++--- 8 files changed, 218 insertions(+), 218 deletions(-) diff --git a/tools/objtool/arch.h b/tools/objtool/arch.h index eda15a5a285e..d0969a9328c2 100644 --- a/tools/objtool/arch.h +++ b/tools/objtool/arch.h @@ -82,6 +82,6 @@ bool arch_callee_saved_reg(unsigned char reg); unsigned long arch_jump_destination(struct instruction *insn); -unsigned long arch_dest_rela_offset(int addend); +unsigned long arch_dest_reloc_offset(int addend); #endif /* _ARCH_H */ diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 4b504fc90bbb..fe83d4c92825 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -67,7 +67,7 @@ bool arch_callee_saved_reg(unsigned char reg) } } -unsigned long arch_dest_rela_offset(int addend) +unsigned long arch_dest_reloc_offset(int addend) { return addend + 4; } diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 63d65a702900..28ce311ea90c 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -352,7 +352,7 @@ static struct instruction *find_last_insn(struct objtool_file *file, static int add_dead_ends(struct objtool_file *file) { struct section *sec; - struct rela *rela; + struct reloc *reloc; struct instruction *insn; /* @@ -370,24 +370,24 @@ static int add_dead_ends(struct objtool_file *file) if (!sec) goto reachable; - list_for_each_entry(rela, &sec->rela_list, list) { - if 
(rela->sym->type != STT_SECTION) { + list_for_each_entry(reloc, &sec->reloc_list, list) { + if (reloc->sym->type != STT_SECTION) { WARN("unexpected relocation symbol type in %s", sec->name); return -1; } - insn = find_insn(file, rela->sym->sec, rela->addend); + insn = find_insn(file, reloc->sym->sec, reloc->addend); if (insn) insn = list_prev_entry(insn, list); - else if (rela->addend == rela->sym->sec->len) { - insn = find_last_insn(file, rela->sym->sec); + else if (reloc->addend == reloc->sym->sec->len) { + insn = find_last_insn(file, reloc->sym->sec); if (!insn) { WARN("can't find unreachable insn at %s+0x%x", - rela->sym->sec->name, rela->addend); + reloc->sym->sec->name, reloc->addend); return -1; } } else { WARN("can't find unreachable insn at %s+0x%x", - rela->sym->sec->name, rela->addend); + reloc->sym->sec->name, reloc->addend); return -1; } @@ -405,24 +405,24 @@ reachable: if (!sec) return 0; - list_for_each_entry(rela, &sec->rela_list, list) { - if (rela->sym->type != STT_SECTION) { + list_for_each_entry(reloc, &sec->reloc_list, list) { + if (reloc->sym->type != STT_SECTION) { WARN("unexpected relocation symbol type in %s", sec->name); return -1; } - insn = find_insn(file, rela->sym->sec, rela->addend); + insn = find_insn(file, reloc->sym->sec, reloc->addend); if (insn) insn = list_prev_entry(insn, list); - else if (rela->addend == rela->sym->sec->len) { - insn = find_last_insn(file, rela->sym->sec); + else if (reloc->addend == reloc->sym->sec->len) { + insn = find_last_insn(file, reloc->sym->sec); if (!insn) { WARN("can't find reachable insn at %s+0x%x", - rela->sym->sec->name, rela->addend); + reloc->sym->sec->name, reloc->addend); return -1; } } else { WARN("can't find reachable insn at %s+0x%x", - rela->sym->sec->name, rela->addend); + reloc->sym->sec->name, reloc->addend); return -1; } @@ -440,26 +440,26 @@ static void add_ignores(struct objtool_file *file) struct instruction *insn; struct section *sec; struct symbol *func; - struct rela *rela; + 
struct reloc *reloc; sec = find_section_by_name(file->elf, ".rela.discard.func_stack_frame_non_standard"); if (!sec) return; - list_for_each_entry(rela, &sec->rela_list, list) { - switch (rela->sym->type) { + list_for_each_entry(reloc, &sec->reloc_list, list) { + switch (reloc->sym->type) { case STT_FUNC: - func = rela->sym; + func = reloc->sym; break; case STT_SECTION: - func = find_func_by_offset(rela->sym->sec, rela->addend); + func = find_func_by_offset(reloc->sym->sec, reloc->addend); if (!func) continue; break; default: - WARN("unexpected relocation symbol type in %s: %d", sec->name, rela->sym->type); + WARN("unexpected relocation symbol type in %s: %d", sec->name, reloc->sym->type); continue; } @@ -557,20 +557,20 @@ static void add_uaccess_safe(struct objtool_file *file) static int add_ignore_alternatives(struct objtool_file *file) { struct section *sec; - struct rela *rela; + struct reloc *reloc; struct instruction *insn; sec = find_section_by_name(file->elf, ".rela.discard.ignore_alts"); if (!sec) return 0; - list_for_each_entry(rela, &sec->rela_list, list) { - if (rela->sym->type != STT_SECTION) { + list_for_each_entry(reloc, &sec->reloc_list, list) { + if (reloc->sym->type != STT_SECTION) { WARN("unexpected relocation symbol type in %s", sec->name); return -1; } - insn = find_insn(file, rela->sym->sec, rela->addend); + insn = find_insn(file, reloc->sym->sec, reloc->addend); if (!insn) { WARN("bad .discard.ignore_alts entry"); return -1; @@ -588,7 +588,7 @@ static int add_ignore_alternatives(struct objtool_file *file) static int add_jump_destinations(struct objtool_file *file) { struct instruction *insn; - struct rela *rela; + struct reloc *reloc; struct section *dest_sec; unsigned long dest_off; @@ -599,19 +599,19 @@ static int add_jump_destinations(struct objtool_file *file) if (insn->ignore || insn->offset == FAKE_JUMP_OFFSET) continue; - rela = find_rela_by_dest_range(file->elf, insn->sec, + reloc = find_reloc_by_dest_range(file->elf, insn->sec, 
insn->offset, insn->len); - if (!rela) { + if (!reloc) { dest_sec = insn->sec; dest_off = arch_jump_destination(insn); - } else if (rela->sym->type == STT_SECTION) { - dest_sec = rela->sym->sec; - dest_off = arch_dest_rela_offset(rela->addend); - } else if (rela->sym->sec->idx) { - dest_sec = rela->sym->sec; - dest_off = rela->sym->sym.st_value + - arch_dest_rela_offset(rela->addend); - } else if (strstr(rela->sym->name, "_indirect_thunk_")) { + } else if (reloc->sym->type == STT_SECTION) { + dest_sec = reloc->sym->sec; + dest_off = arch_dest_reloc_offset(reloc->addend); + } else if (reloc->sym->sec->idx) { + dest_sec = reloc->sym->sec; + dest_off = reloc->sym->sym.st_value + + arch_dest_reloc_offset(reloc->addend); + } else if (strstr(reloc->sym->name, "_indirect_thunk_")) { /* * Retpoline jumps are really dynamic jumps in * disguise, so convert them accordingly. @@ -625,7 +625,7 @@ static int add_jump_destinations(struct objtool_file *file) continue; } else { /* external sibling call */ - insn->call_dest = rela->sym; + insn->call_dest = reloc->sym; continue; } @@ -701,15 +701,15 @@ static int add_call_destinations(struct objtool_file *file) { struct instruction *insn; unsigned long dest_off; - struct rela *rela; + struct reloc *reloc; for_each_insn(file, insn) { if (insn->type != INSN_CALL) continue; - rela = find_rela_by_dest_range(file->elf, insn->sec, + reloc = find_reloc_by_dest_range(file->elf, insn->sec, insn->offset, insn->len); - if (!rela) { + if (!reloc) { dest_off = arch_jump_destination(insn); insn->call_dest = find_func_by_offset(insn->sec, dest_off); if (!insn->call_dest) @@ -729,19 +729,19 @@ static int add_call_destinations(struct objtool_file *file) return -1; } - } else if (rela->sym->type == STT_SECTION) { - dest_off = arch_dest_rela_offset(rela->addend); - insn->call_dest = find_func_by_offset(rela->sym->sec, + } else if (reloc->sym->type == STT_SECTION) { + dest_off = arch_dest_reloc_offset(reloc->addend); + insn->call_dest = 
find_func_by_offset(reloc->sym->sec, dest_off); if (!insn->call_dest) { WARN_FUNC("can't find call dest symbol at %s+0x%lx", insn->sec, insn->offset, - rela->sym->sec->name, + reloc->sym->sec->name, dest_off); return -1; } } else - insn->call_dest = rela->sym; + insn->call_dest = reloc->sym; /* * Whatever stack impact regular CALLs have, should be undone @@ -849,7 +849,7 @@ static int handle_group_alt(struct objtool_file *file, */ if ((insn->offset != special_alt->new_off || (insn->type != INSN_CALL && !is_static_jump(insn))) && - find_rela_by_dest_range(file->elf, insn->sec, insn->offset, insn->len)) { + find_reloc_by_dest_range(file->elf, insn->sec, insn->offset, insn->len)) { WARN_FUNC("unsupported relocation in alternatives section", insn->sec, insn->offset); @@ -995,34 +995,34 @@ out: } static int add_jump_table(struct objtool_file *file, struct instruction *insn, - struct rela *table) + struct reloc *table) { - struct rela *rela = table; + struct reloc *reloc = table; struct instruction *dest_insn; struct alternative *alt; struct symbol *pfunc = insn->func->pfunc; unsigned int prev_offset = 0; /* - * Each @rela is a switch table relocation which points to the target + * Each @reloc is a switch table relocation which points to the target * instruction. 
*/ - list_for_each_entry_from(rela, &table->sec->rela_list, list) { + list_for_each_entry_from(reloc, &table->sec->reloc_list, list) { /* Check for the end of the table: */ - if (rela != table && rela->jump_table_start) + if (reloc != table && reloc->jump_table_start) break; /* Make sure the table entries are consecutive: */ - if (prev_offset && rela->offset != prev_offset + 8) + if (prev_offset && reloc->offset != prev_offset + 8) break; /* Detect function pointers from contiguous objects: */ - if (rela->sym->sec == pfunc->sec && - rela->addend == pfunc->offset) + if (reloc->sym->sec == pfunc->sec && + reloc->addend == pfunc->offset) break; - dest_insn = find_insn(file, rela->sym->sec, rela->addend); + dest_insn = find_insn(file, reloc->sym->sec, reloc->addend); if (!dest_insn) break; @@ -1038,7 +1038,7 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn, alt->insn = dest_insn; list_add_tail(&alt->list, &insn->alts); - prev_offset = rela->offset; + prev_offset = reloc->offset; } if (!prev_offset) { @@ -1093,11 +1093,11 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn, * * NOTE: RETPOLINE made it harder still to decode dynamic jumps. 
*/ -static struct rela *find_jump_table(struct objtool_file *file, +static struct reloc *find_jump_table(struct objtool_file *file, struct symbol *func, struct instruction *insn) { - struct rela *text_rela, *table_rela; + struct reloc *text_reloc, *table_reloc; struct instruction *dest_insn, *orig_insn = insn; struct section *table_sec; unsigned long table_offset; @@ -1122,16 +1122,16 @@ static struct rela *find_jump_table(struct objtool_file *file, break; /* look for a relocation which references .rodata */ - text_rela = find_rela_by_dest_range(file->elf, insn->sec, + text_reloc = find_reloc_by_dest_range(file->elf, insn->sec, insn->offset, insn->len); - if (!text_rela || text_rela->sym->type != STT_SECTION || - !text_rela->sym->sec->rodata) + if (!text_reloc || text_reloc->sym->type != STT_SECTION || + !text_reloc->sym->sec->rodata) continue; - table_offset = text_rela->addend; - table_sec = text_rela->sym->sec; + table_offset = text_reloc->addend; + table_sec = text_reloc->sym->sec; - if (text_rela->type == R_X86_64_PC32) + if (text_reloc->type == R_X86_64_PC32) table_offset += 4; /* @@ -1148,14 +1148,14 @@ static struct rela *find_jump_table(struct objtool_file *file, continue; /* - * Each table entry has a rela associated with it. The rela + * Each table entry has a reloc associated with it. The reloc * should reference text in the same function as the original * instruction. */ - table_rela = find_rela_by_dest(file->elf, table_sec, table_offset); - if (!table_rela) + table_reloc = find_reloc_by_dest(file->elf, table_sec, table_offset); + if (!table_reloc) continue; - dest_insn = find_insn(file, table_rela->sym->sec, table_rela->addend); + dest_insn = find_insn(file, table_reloc->sym->sec, table_reloc->addend); if (!dest_insn || !dest_insn->func || dest_insn->func->pfunc != func) continue; @@ -1164,10 +1164,10 @@ static struct rela *find_jump_table(struct objtool_file *file, * indicates a rare GCC quirk/bug which can leave dead code * behind. 
*/ - if (text_rela->type == R_X86_64_PC32) + if (text_reloc->type == R_X86_64_PC32) file->ignore_unreachables = true; - return table_rela; + return table_reloc; } return NULL; @@ -1181,7 +1181,7 @@ static void mark_func_jump_tables(struct objtool_file *file, struct symbol *func) { struct instruction *insn, *last = NULL; - struct rela *rela; + struct reloc *reloc; func_for_each_insn(file, func, insn) { if (!last) @@ -1204,10 +1204,10 @@ static void mark_func_jump_tables(struct objtool_file *file, if (insn->type != INSN_JUMP_DYNAMIC) continue; - rela = find_jump_table(file, func, insn); - if (rela) { - rela->jump_table_start = true; - insn->jump_table = rela; + reloc = find_jump_table(file, func, insn); + if (reloc) { + reloc->jump_table_start = true; + insn->jump_table = reloc; } } } @@ -1261,8 +1261,8 @@ static int add_jump_table_alts(struct objtool_file *file) static int read_unwind_hints(struct objtool_file *file) { - struct section *sec, *relasec; - struct rela *rela; + struct section *sec, *relocsec; + struct reloc *reloc; struct unwind_hint *hint; struct instruction *insn; struct cfi_reg *cfa; @@ -1272,8 +1272,8 @@ static int read_unwind_hints(struct objtool_file *file) if (!sec) return 0; - relasec = sec->rela; - if (!relasec) { + relocsec = sec->reloc; + if (!relocsec) { WARN("missing .rela.discard.unwind_hints section"); return -1; } @@ -1288,13 +1288,13 @@ static int read_unwind_hints(struct objtool_file *file) for (i = 0; i < sec->len / sizeof(struct unwind_hint); i++) { hint = (struct unwind_hint *)sec->data->d_buf + i; - rela = find_rela_by_dest(file->elf, sec, i * sizeof(*hint)); - if (!rela) { - WARN("can't find rela for unwind_hints[%d]", i); + reloc = find_reloc_by_dest(file->elf, sec, i * sizeof(*hint)); + if (!reloc) { + WARN("can't find reloc for unwind_hints[%d]", i); return -1; } - insn = find_insn(file, rela->sym->sec, rela->addend); + insn = find_insn(file, reloc->sym->sec, reloc->addend); if (!insn) { WARN("can't find insn for 
unwind_hints[%d]", i); return -1; @@ -1352,19 +1352,19 @@ static int read_retpoline_hints(struct objtool_file *file) { struct section *sec; struct instruction *insn; - struct rela *rela; + struct reloc *reloc; sec = find_section_by_name(file->elf, ".rela.discard.retpoline_safe"); if (!sec) return 0; - list_for_each_entry(rela, &sec->rela_list, list) { - if (rela->sym->type != STT_SECTION) { + list_for_each_entry(reloc, &sec->reloc_list, list) { + if (reloc->sym->type != STT_SECTION) { WARN("unexpected relocation symbol type in %s", sec->name); return -1; } - insn = find_insn(file, rela->sym->sec, rela->addend); + insn = find_insn(file, reloc->sym->sec, reloc->addend); if (!insn) { WARN("bad .discard.retpoline_safe entry"); return -1; @@ -1387,19 +1387,19 @@ static int read_instr_hints(struct objtool_file *file) { struct section *sec; struct instruction *insn; - struct rela *rela; + struct reloc *reloc; sec = find_section_by_name(file->elf, ".rela.discard.instr_end"); if (!sec) return 0; - list_for_each_entry(rela, &sec->rela_list, list) { - if (rela->sym->type != STT_SECTION) { + list_for_each_entry(reloc, &sec->reloc_list, list) { + if (reloc->sym->type != STT_SECTION) { WARN("unexpected relocation symbol type in %s", sec->name); return -1; } - insn = find_insn(file, rela->sym->sec, rela->addend); + insn = find_insn(file, reloc->sym->sec, reloc->addend); if (!insn) { WARN("bad .discard.instr_end entry"); return -1; @@ -1412,13 +1412,13 @@ static int read_instr_hints(struct objtool_file *file) if (!sec) return 0; - list_for_each_entry(rela, &sec->rela_list, list) { - if (rela->sym->type != STT_SECTION) { + list_for_each_entry(reloc, &sec->reloc_list, list) { + if (reloc->sym->type != STT_SECTION) { WARN("unexpected relocation symbol type in %s", sec->name); return -1; } - insn = find_insn(file, rela->sym->sec, rela->addend); + insn = find_insn(file, reloc->sym->sec, reloc->addend); if (!insn) { WARN("bad .discard.instr_begin entry"); return -1; @@ -1434,22 +1434,22 
@@ static int read_intra_function_calls(struct objtool_file *file) { struct instruction *insn; struct section *sec; - struct rela *rela; + struct reloc *reloc; sec = find_section_by_name(file->elf, ".rela.discard.intra_function_calls"); if (!sec) return 0; - list_for_each_entry(rela, &sec->rela_list, list) { + list_for_each_entry(reloc, &sec->reloc_list, list) { unsigned long dest_off; - if (rela->sym->type != STT_SECTION) { + if (reloc->sym->type != STT_SECTION) { WARN("unexpected relocation symbol type in %s", sec->name); return -1; } - insn = find_insn(file, rela->sym->sec, rela->addend); + insn = find_insn(file, reloc->sym->sec, reloc->addend); if (!insn) { WARN("bad .discard.intra_function_call entry"); return -1; diff --git a/tools/objtool/check.h b/tools/objtool/check.h index 906b5210f7ca..061aa96e15d3 100644 --- a/tools/objtool/check.h +++ b/tools/objtool/check.h @@ -37,7 +37,7 @@ struct instruction { struct symbol *call_dest; struct instruction *jump_dest; struct instruction *first_jump_src; - struct rela *jump_table; + struct reloc *jump_table; struct list_head alts; struct symbol *func; struct list_head stack_ops; diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 5bc259c9d892..3160931e858c 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -228,26 +228,26 @@ struct symbol *find_symbol_by_name(const struct elf *elf, const char *name) return NULL; } -struct rela *find_rela_by_dest_range(const struct elf *elf, struct section *sec, +struct reloc *find_reloc_by_dest_range(const struct elf *elf, struct section *sec, unsigned long offset, unsigned int len) { - struct rela *rela, *r = NULL; + struct reloc *reloc, *r = NULL; unsigned long o; - if (!sec->rela) + if (!sec->reloc) return NULL; - sec = sec->rela; + sec = sec->reloc; for_offset_range(o, offset, offset + len) { - elf_hash_for_each_possible(elf->rela_hash, rela, hash, + elf_hash_for_each_possible(elf->reloc_hash, reloc, hash, sec_offset_hash(sec, o)) { - if (rela->sec != sec) + if 
(reloc->sec != sec) continue; - if (rela->offset >= offset && rela->offset < offset + len) { - if (!r || rela->offset < r->offset) - r = rela; + if (reloc->offset >= offset && reloc->offset < offset + len) { + if (!r || reloc->offset < r->offset) + r = reloc; } } if (r) @@ -257,9 +257,9 @@ struct rela *find_rela_by_dest_range(const struct elf *elf, struct section *sec, return NULL; } -struct rela *find_rela_by_dest(const struct elf *elf, struct section *sec, unsigned long offset) +struct reloc *find_reloc_by_dest(const struct elf *elf, struct section *sec, unsigned long offset) { - return find_rela_by_dest_range(elf, sec, offset, 1); + return find_reloc_by_dest_range(elf, sec, offset, 1); } static int read_sections(struct elf *elf) @@ -288,7 +288,7 @@ static int read_sections(struct elf *elf) memset(sec, 0, sizeof(*sec)); INIT_LIST_HEAD(&sec->symbol_list); - INIT_LIST_HEAD(&sec->rela_list); + INIT_LIST_HEAD(&sec->reloc_list); s = elf_getscn(elf->elf, i); if (!s) { @@ -488,21 +488,21 @@ err: return -1; } -void elf_add_rela(struct elf *elf, struct rela *rela) +void elf_add_reloc(struct elf *elf, struct reloc *reloc) { - struct section *sec = rela->sec; + struct section *sec = reloc->sec; - list_add_tail(&rela->list, &sec->rela_list); - elf_hash_add(elf->rela_hash, &rela->hash, rela_hash(rela)); + list_add_tail(&reloc->list, &sec->reloc_list); + elf_hash_add(elf->reloc_hash, &reloc->hash, reloc_hash(reloc)); } -static int read_relas(struct elf *elf) +static int read_relocs(struct elf *elf) { struct section *sec; - struct rela *rela; + struct reloc *reloc; int i; unsigned int symndx; - unsigned long nr_rela, max_rela = 0, tot_rela = 0; + unsigned long nr_reloc, max_reloc = 0, tot_reloc = 0; list_for_each_entry(sec, &elf->sections, list) { if (sec->sh.sh_type != SHT_RELA) @@ -510,49 +510,49 @@ static int read_relas(struct elf *elf) sec->base = find_section_by_index(elf, sec->sh.sh_info); if (!sec->base) { - WARN("can't find base section for rela section %s", + 
WARN("can't find base section for reloc section %s", sec->name); return -1; } - sec->base->rela = sec; + sec->base->reloc = sec; - nr_rela = 0; + nr_reloc = 0; for (i = 0; i < sec->sh.sh_size / sec->sh.sh_entsize; i++) { - rela = malloc(sizeof(*rela)); - if (!rela) { + reloc = malloc(sizeof(*reloc)); + if (!reloc) { perror("malloc"); return -1; } - memset(rela, 0, sizeof(*rela)); + memset(reloc, 0, sizeof(*reloc)); - if (!gelf_getrela(sec->data, i, &rela->rela)) { + if (!gelf_getrela(sec->data, i, &reloc->rela)) { WARN_ELF("gelf_getrela"); return -1; } - rela->type = GELF_R_TYPE(rela->rela.r_info); - rela->addend = rela->rela.r_addend; - rela->offset = rela->rela.r_offset; - symndx = GELF_R_SYM(rela->rela.r_info); - rela->sym = find_symbol_by_index(elf, symndx); - rela->sec = sec; - if (!rela->sym) { - WARN("can't find rela entry symbol %d for %s", + reloc->type = GELF_R_TYPE(reloc->rela.r_info); + reloc->addend = reloc->rela.r_addend; + reloc->offset = reloc->rela.r_offset; + symndx = GELF_R_SYM(reloc->rela.r_info); + reloc->sym = find_symbol_by_index(elf, symndx); + reloc->sec = sec; + if (!reloc->sym) { + WARN("can't find reloc entry symbol %d for %s", symndx, sec->name); return -1; } - elf_add_rela(elf, rela); - nr_rela++; + elf_add_reloc(elf, reloc); + nr_reloc++; } - max_rela = max(max_rela, nr_rela); - tot_rela += nr_rela; + max_reloc = max(max_reloc, nr_reloc); + tot_reloc += nr_reloc; } if (stats) { - printf("max_rela: %lu\n", max_rela); - printf("tot_rela: %lu\n", tot_rela); + printf("max_reloc: %lu\n", max_reloc); + printf("tot_reloc: %lu\n", tot_reloc); } return 0; @@ -578,7 +578,7 @@ struct elf *elf_open_read(const char *name, int flags) elf_hash_init(elf->symbol_name_hash); elf_hash_init(elf->section_hash); elf_hash_init(elf->section_name_hash); - elf_hash_init(elf->rela_hash); + elf_hash_init(elf->reloc_hash); elf->fd = open(name, flags); if (elf->fd == -1) { @@ -611,7 +611,7 @@ struct elf *elf_open_read(const char *name, int flags) if 
(read_symbols(elf)) goto err; - if (read_relas(elf)) + if (read_relocs(elf)) goto err; return elf; @@ -637,7 +637,7 @@ struct section *elf_create_section(struct elf *elf, const char *name, memset(sec, 0, sizeof(*sec)); INIT_LIST_HEAD(&sec->symbol_list); - INIT_LIST_HEAD(&sec->rela_list); + INIT_LIST_HEAD(&sec->reloc_list); s = elf_newscn(elf->elf); if (!s) { @@ -722,25 +722,25 @@ struct section *elf_create_section(struct elf *elf, const char *name, return sec; } -struct section *elf_create_rela_section(struct elf *elf, struct section *base) +struct section *elf_create_reloc_section(struct elf *elf, struct section *base) { - char *relaname; + char *relocname; struct section *sec; - relaname = malloc(strlen(base->name) + strlen(".rela") + 1); - if (!relaname) { + relocname = malloc(strlen(base->name) + strlen(".rela") + 1); + if (!relocname) { perror("malloc"); return NULL; } - strcpy(relaname, ".rela"); - strcat(relaname, base->name); + strcpy(relocname, ".rela"); + strcat(relocname, base->name); - sec = elf_create_section(elf, relaname, sizeof(GElf_Rela), 0); - free(relaname); + sec = elf_create_section(elf, relocname, sizeof(GElf_Rela), 0); + free(relocname); if (!sec) return NULL; - base->rela = sec; + base->reloc = sec; sec->base = base; sec->sh.sh_type = SHT_RELA; @@ -752,33 +752,33 @@ struct section *elf_create_rela_section(struct elf *elf, struct section *base) return sec; } -int elf_rebuild_rela_section(struct section *sec) +int elf_rebuild_reloc_section(struct section *sec) { - struct rela *rela; + struct reloc *reloc; int nr, idx = 0, size; - GElf_Rela *relas; + GElf_Rela *relocs; nr = 0; - list_for_each_entry(rela, &sec->rela_list, list) + list_for_each_entry(reloc, &sec->reloc_list, list) nr++; - size = nr * sizeof(*relas); - relas = malloc(size); - if (!relas) { + size = nr * sizeof(*relocs); + relocs = malloc(size); + if (!relocs) { perror("malloc"); return -1; } - sec->data->d_buf = relas; + sec->data->d_buf = relocs; sec->data->d_size = size; 
sec->sh.sh_size = size; idx = 0; - list_for_each_entry(rela, &sec->rela_list, list) { - relas[idx].r_offset = rela->offset; - relas[idx].r_addend = rela->addend; - relas[idx].r_info = GELF_R_INFO(rela->sym->idx, rela->type); + list_for_each_entry(reloc, &sec->reloc_list, list) { + relocs[idx].r_offset = reloc->offset; + relocs[idx].r_addend = reloc->addend; + relocs[idx].r_info = GELF_R_INFO(reloc->sym->idx, reloc->type); idx++; } @@ -821,7 +821,7 @@ void elf_close(struct elf *elf) { struct section *sec, *tmpsec; struct symbol *sym, *tmpsym; - struct rela *rela, *tmprela; + struct reloc *reloc, *tmpreloc; if (elf->elf) elf_end(elf->elf); @@ -835,10 +835,10 @@ void elf_close(struct elf *elf) hash_del(&sym->hash); free(sym); } - list_for_each_entry_safe(rela, tmprela, &sec->rela_list, list) { - list_del(&rela->list); - hash_del(&rela->hash); - free(rela); + list_for_each_entry_safe(reloc, tmpreloc, &sec->reloc_list, list) { + list_del(&reloc->list); + hash_del(&reloc->hash); + free(reloc); } list_del(&sec->list); free(sec); diff --git a/tools/objtool/elf.h b/tools/objtool/elf.h index f4fe1d6ea392..6ad759fd778e 100644 --- a/tools/objtool/elf.h +++ b/tools/objtool/elf.h @@ -32,8 +32,8 @@ struct section { GElf_Shdr sh; struct rb_root symbol_tree; struct list_head symbol_list; - struct list_head rela_list; - struct section *base, *rela; + struct list_head reloc_list; + struct section *base, *reloc; struct symbol *sym; Elf_Data *data; char *name; @@ -58,7 +58,7 @@ struct symbol { bool uaccess_safe; }; -struct rela { +struct reloc { struct list_head list; struct hlist_node hash; GElf_Rela rela; @@ -82,7 +82,7 @@ struct elf { DECLARE_HASHTABLE(symbol_name_hash, ELF_HASH_BITS); DECLARE_HASHTABLE(section_hash, ELF_HASH_BITS); DECLARE_HASHTABLE(section_name_hash, ELF_HASH_BITS); - DECLARE_HASHTABLE(rela_hash, ELF_HASH_BITS); + DECLARE_HASHTABLE(reloc_hash, ELF_HASH_BITS); }; #define OFFSET_STRIDE_BITS 4 @@ -109,15 +109,15 @@ static inline u32 sec_offset_hash(struct section 
*sec, unsigned long offset) return ol; } -static inline u32 rela_hash(struct rela *rela) +static inline u32 reloc_hash(struct reloc *reloc) { - return sec_offset_hash(rela->sec, rela->offset); + return sec_offset_hash(reloc->sec, reloc->offset); } struct elf *elf_open_read(const char *name, int flags); struct section *elf_create_section(struct elf *elf, const char *name, size_t entsize, int nr); -struct section *elf_create_rela_section(struct elf *elf, struct section *base); -void elf_add_rela(struct elf *elf, struct rela *rela); +struct section *elf_create_reloc_section(struct elf *elf, struct section *base); +void elf_add_reloc(struct elf *elf, struct reloc *reloc); int elf_write(const struct elf *elf); void elf_close(struct elf *elf); @@ -126,11 +126,11 @@ struct symbol *find_func_by_offset(struct section *sec, unsigned long offset); struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset); struct symbol *find_symbol_by_name(const struct elf *elf, const char *name); struct symbol *find_symbol_containing(const struct section *sec, unsigned long offset); -struct rela *find_rela_by_dest(const struct elf *elf, struct section *sec, unsigned long offset); -struct rela *find_rela_by_dest_range(const struct elf *elf, struct section *sec, +struct reloc *find_reloc_by_dest(const struct elf *elf, struct section *sec, unsigned long offset); +struct reloc *find_reloc_by_dest_range(const struct elf *elf, struct section *sec, unsigned long offset, unsigned int len); struct symbol *find_func_containing(struct section *sec, unsigned long offset); -int elf_rebuild_rela_section(struct section *sec); +int elf_rebuild_reloc_section(struct section *sec); #define for_each_sec(file, sec) \ list_for_each_entry(sec, &file->elf->sections, list) diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c index c9549988121a..93c720baea66 100644 --- a/tools/objtool/orc_gen.c +++ b/tools/objtool/orc_gen.c @@ -80,56 +80,56 @@ int create_orc(struct objtool_file *file) 
return 0; } -static int create_orc_entry(struct elf *elf, struct section *u_sec, struct section *ip_relasec, +static int create_orc_entry(struct elf *elf, struct section *u_sec, struct section *ip_relocsec, unsigned int idx, struct section *insn_sec, unsigned long insn_off, struct orc_entry *o) { struct orc_entry *orc; - struct rela *rela; + struct reloc *reloc; /* populate ORC data */ orc = (struct orc_entry *)u_sec->data->d_buf + idx; memcpy(orc, o, sizeof(*orc)); - /* populate rela for ip */ - rela = malloc(sizeof(*rela)); - if (!rela) { + /* populate reloc for ip */ + reloc = malloc(sizeof(*reloc)); + if (!reloc) { perror("malloc"); return -1; } - memset(rela, 0, sizeof(*rela)); + memset(reloc, 0, sizeof(*reloc)); if (insn_sec->sym) { - rela->sym = insn_sec->sym; - rela->addend = insn_off; + reloc->sym = insn_sec->sym; + reloc->addend = insn_off; } else { /* * The Clang assembler doesn't produce section symbols, so we * have to reference the function symbol instead: */ - rela->sym = find_symbol_containing(insn_sec, insn_off); - if (!rela->sym) { + reloc->sym = find_symbol_containing(insn_sec, insn_off); + if (!reloc->sym) { /* * Hack alert. This happens when we need to reference * the NOP pad insn immediately after the function. 
*/ - rela->sym = find_symbol_containing(insn_sec, + reloc->sym = find_symbol_containing(insn_sec, insn_off - 1); } - if (!rela->sym) { + if (!reloc->sym) { WARN("missing symbol for insn at offset 0x%lx\n", insn_off); return -1; } - rela->addend = insn_off - rela->sym->offset; + reloc->addend = insn_off - reloc->sym->offset; } - rela->type = R_X86_64_PC32; - rela->offset = idx * sizeof(int); - rela->sec = ip_relasec; + reloc->type = R_X86_64_PC32; + reloc->offset = idx * sizeof(int); + reloc->sec = ip_relocsec; - elf_add_rela(elf, rela); + elf_add_reloc(elf, reloc); return 0; } @@ -137,7 +137,7 @@ static int create_orc_entry(struct elf *elf, struct section *u_sec, struct secti int create_orc_sections(struct objtool_file *file) { struct instruction *insn, *prev_insn; - struct section *sec, *u_sec, *ip_relasec; + struct section *sec, *u_sec, *ip_relocsec; unsigned int idx; struct orc_entry empty = { @@ -181,8 +181,8 @@ int create_orc_sections(struct objtool_file *file) if (!sec) return -1; - ip_relasec = elf_create_rela_section(file->elf, sec); - if (!ip_relasec) + ip_relocsec = elf_create_reloc_section(file->elf, sec); + if (!ip_relocsec) return -1; /* create .orc_unwind section */ @@ -200,7 +200,7 @@ int create_orc_sections(struct objtool_file *file) if (!prev_insn || memcmp(&insn->orc, &prev_insn->orc, sizeof(struct orc_entry))) { - if (create_orc_entry(file->elf, u_sec, ip_relasec, idx, + if (create_orc_entry(file->elf, u_sec, ip_relocsec, idx, insn->sec, insn->offset, &insn->orc)) return -1; @@ -212,7 +212,7 @@ int create_orc_sections(struct objtool_file *file) /* section terminator */ if (prev_insn) { - if (create_orc_entry(file->elf, u_sec, ip_relasec, idx, + if (create_orc_entry(file->elf, u_sec, ip_relocsec, idx, prev_insn->sec, prev_insn->offset + prev_insn->len, &empty)) @@ -222,7 +222,7 @@ int create_orc_sections(struct objtool_file *file) } } - if (elf_rebuild_rela_section(ip_relasec)) + if (elf_rebuild_reloc_section(ip_relocsec)) return -1; return 0; 
diff --git a/tools/objtool/special.c b/tools/objtool/special.c index e74e0189de22..e893f1e48e44 100644 --- a/tools/objtool/special.c +++ b/tools/objtool/special.c @@ -72,7 +72,7 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry, struct section *sec, int idx, struct special_alt *alt) { - struct rela *orig_rela, *new_rela; + struct reloc *orig_reloc, *new_reloc; unsigned long offset; offset = idx * entry->size; @@ -118,30 +118,30 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry, } } - orig_rela = find_rela_by_dest(elf, sec, offset + entry->orig); - if (!orig_rela) { - WARN_FUNC("can't find orig rela", sec, offset + entry->orig); + orig_reloc = find_reloc_by_dest(elf, sec, offset + entry->orig); + if (!orig_reloc) { + WARN_FUNC("can't find orig reloc", sec, offset + entry->orig); return -1; } - if (orig_rela->sym->type != STT_SECTION) { - WARN_FUNC("don't know how to handle non-section rela symbol %s", - sec, offset + entry->orig, orig_rela->sym->name); + if (orig_reloc->sym->type != STT_SECTION) { + WARN_FUNC("don't know how to handle non-section reloc symbol %s", + sec, offset + entry->orig, orig_reloc->sym->name); return -1; } - alt->orig_sec = orig_rela->sym->sec; - alt->orig_off = orig_rela->addend; + alt->orig_sec = orig_reloc->sym->sec; + alt->orig_off = orig_reloc->addend; if (!entry->group || alt->new_len) { - new_rela = find_rela_by_dest(elf, sec, offset + entry->new); - if (!new_rela) { - WARN_FUNC("can't find new rela", + new_reloc = find_reloc_by_dest(elf, sec, offset + entry->new); + if (!new_reloc) { + WARN_FUNC("can't find new reloc", sec, offset + entry->new); return -1; } - alt->new_sec = new_rela->sym->sec; - alt->new_off = (unsigned int)new_rela->addend; + alt->new_sec = new_reloc->sym->sec; + alt->new_off = (unsigned int)new_reloc->addend; /* _ASM_EXTABLE_EX hack */ if (alt->new_off >= 0x7ffffff0) From fb414783b65c880606fbc1463e6849f017e60d46 Mon Sep 17 00:00:00 2001 From: Matt Helsley Date: Fri, 29 
May 2020 14:01:14 -0700 Subject: [PATCH 004/502] objtool: Add support for relocations without addends Currently objtool only collects information about relocations with addends. In recordmcount, which we are about to merge into objtool, some supported architectures do not use rela relocations. Signed-off-by: Matt Helsley Reviewed-by: Julien Thierry Reviewed-by: Kamalesh Babulal Signed-off-by: Josh Poimboeuf --- tools/objtool/elf.c | 145 +++++++++++++++++++++++++++++++++++----- tools/objtool/elf.h | 7 +- tools/objtool/orc_gen.c | 2 +- 3 files changed, 134 insertions(+), 20 deletions(-) diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 3160931e858c..95d86bcb9512 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -496,6 +496,32 @@ void elf_add_reloc(struct elf *elf, struct reloc *reloc) elf_hash_add(elf->reloc_hash, &reloc->hash, reloc_hash(reloc)); } +static int read_rel_reloc(struct section *sec, int i, struct reloc *reloc, unsigned int *symndx) +{ + if (!gelf_getrel(sec->data, i, &reloc->rel)) { + WARN_ELF("gelf_getrel"); + return -1; + } + reloc->type = GELF_R_TYPE(reloc->rel.r_info); + reloc->addend = 0; + reloc->offset = reloc->rel.r_offset; + *symndx = GELF_R_SYM(reloc->rel.r_info); + return 0; +} + +static int read_rela_reloc(struct section *sec, int i, struct reloc *reloc, unsigned int *symndx) +{ + if (!gelf_getrela(sec->data, i, &reloc->rela)) { + WARN_ELF("gelf_getrela"); + return -1; + } + reloc->type = GELF_R_TYPE(reloc->rela.r_info); + reloc->addend = reloc->rela.r_addend; + reloc->offset = reloc->rela.r_offset; + *symndx = GELF_R_SYM(reloc->rela.r_info); + return 0; +} + static int read_relocs(struct elf *elf) { struct section *sec; @@ -505,7 +531,8 @@ static int read_relocs(struct elf *elf) unsigned long nr_reloc, max_reloc = 0, tot_reloc = 0; list_for_each_entry(sec, &elf->sections, list) { - if (sec->sh.sh_type != SHT_RELA) + if ((sec->sh.sh_type != SHT_RELA) && + (sec->sh.sh_type != SHT_REL)) continue; sec->base = 
find_section_by_index(elf, sec->sh.sh_info); @@ -525,16 +552,17 @@ static int read_relocs(struct elf *elf) return -1; } memset(reloc, 0, sizeof(*reloc)); - - if (!gelf_getrela(sec->data, i, &reloc->rela)) { - WARN_ELF("gelf_getrela"); - return -1; + switch (sec->sh.sh_type) { + case SHT_REL: + if (read_rel_reloc(sec, i, reloc, &symndx)) + return -1; + break; + case SHT_RELA: + if (read_rela_reloc(sec, i, reloc, &symndx)) + return -1; + break; + default: return -1; } - - reloc->type = GELF_R_TYPE(reloc->rela.r_info); - reloc->addend = reloc->rela.r_addend; - reloc->offset = reloc->rela.r_offset; - symndx = GELF_R_SYM(reloc->rela.r_info); reloc->sym = find_symbol_by_index(elf, symndx); reloc->sec = sec; if (!reloc->sym) { @@ -722,7 +750,37 @@ struct section *elf_create_section(struct elf *elf, const char *name, return sec; } -struct section *elf_create_reloc_section(struct elf *elf, struct section *base) +static struct section *elf_create_rel_reloc_section(struct elf *elf, struct section *base) +{ + char *relocname; + struct section *sec; + + relocname = malloc(strlen(base->name) + strlen(".rel") + 1); + if (!relocname) { + perror("malloc"); + return NULL; + } + strcpy(relocname, ".rel"); + strcat(relocname, base->name); + + sec = elf_create_section(elf, relocname, sizeof(GElf_Rel), 0); + free(relocname); + if (!sec) + return NULL; + + base->reloc = sec; + sec->base = base; + + sec->sh.sh_type = SHT_REL; + sec->sh.sh_addralign = 8; + sec->sh.sh_link = find_section_by_name(elf, ".symtab")->idx; + sec->sh.sh_info = base->idx; + sec->sh.sh_flags = SHF_INFO_LINK; + + return sec; +} + +static struct section *elf_create_rela_reloc_section(struct elf *elf, struct section *base) { char *relocname; struct section *sec; @@ -752,16 +810,53 @@ struct section *elf_create_reloc_section(struct elf *elf, struct section *base) return sec; } -int elf_rebuild_reloc_section(struct section *sec) +struct section *elf_create_reloc_section(struct elf *elf, + struct section *base, + int 
reltype) +{ + switch (reltype) { + case SHT_REL: return elf_create_rel_reloc_section(elf, base); + case SHT_RELA: return elf_create_rela_reloc_section(elf, base); + default: return NULL; + } +} + +static int elf_rebuild_rel_reloc_section(struct section *sec, int nr) { struct reloc *reloc; - int nr, idx = 0, size; + int idx = 0, size; + GElf_Rel *relocs; + + /* Allocate a buffer for relocations */ + size = nr * sizeof(*relocs); + relocs = malloc(size); + if (!relocs) { + perror("malloc"); + return -1; + } + + sec->data->d_buf = relocs; + sec->data->d_size = size; + + sec->sh.sh_size = size; + + idx = 0; + list_for_each_entry(reloc, &sec->reloc_list, list) { + relocs[idx].r_offset = reloc->offset; + relocs[idx].r_info = GELF_R_INFO(reloc->sym->idx, reloc->type); + idx++; + } + + return 0; +} + +static int elf_rebuild_rela_reloc_section(struct section *sec, int nr) +{ + struct reloc *reloc; + int idx = 0, size; GElf_Rela *relocs; - nr = 0; - list_for_each_entry(reloc, &sec->reloc_list, list) - nr++; - + /* Allocate a buffer for relocations with addends */ size = nr * sizeof(*relocs); relocs = malloc(size); if (!relocs) { @@ -785,6 +880,22 @@ int elf_rebuild_reloc_section(struct section *sec) return 0; } +int elf_rebuild_reloc_section(struct section *sec) +{ + struct reloc *reloc; + int nr; + + nr = 0; + list_for_each_entry(reloc, &sec->reloc_list, list) + nr++; + + switch (sec->sh.sh_type) { + case SHT_REL: return elf_rebuild_rel_reloc_section(sec, nr); + case SHT_RELA: return elf_rebuild_rela_reloc_section(sec, nr); + default: return -1; + } +} + int elf_write(const struct elf *elf) { struct section *sec; diff --git a/tools/objtool/elf.h b/tools/objtool/elf.h index 6ad759fd778e..78a2db23b8b6 100644 --- a/tools/objtool/elf.h +++ b/tools/objtool/elf.h @@ -61,7 +61,10 @@ struct symbol { struct reloc { struct list_head list; struct hlist_node hash; - GElf_Rela rela; + union { + GElf_Rela rela; + GElf_Rel rel; + }; struct section *sec; struct symbol *sym; unsigned int 
type; @@ -116,7 +119,7 @@ static inline u32 reloc_hash(struct reloc *reloc) struct elf *elf_open_read(const char *name, int flags); struct section *elf_create_section(struct elf *elf, const char *name, size_t entsize, int nr); -struct section *elf_create_reloc_section(struct elf *elf, struct section *base); +struct section *elf_create_reloc_section(struct elf *elf, struct section *base, int reltype); void elf_add_reloc(struct elf *elf, struct reloc *reloc); int elf_write(const struct elf *elf); void elf_close(struct elf *elf); diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c index 93c720baea66..75e08cf0709b 100644 --- a/tools/objtool/orc_gen.c +++ b/tools/objtool/orc_gen.c @@ -181,7 +181,7 @@ int create_orc_sections(struct objtool_file *file) if (!sec) return -1; - ip_relocsec = elf_create_reloc_section(file->elf, sec); + ip_relocsec = elf_create_reloc_section(file->elf, sec, SHT_RELA); if (!ip_relocsec) return -1; From bb85429a9bf2e7d370b8e1afd72f933a88f0629f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 19 May 2020 12:18:25 -0700 Subject: [PATCH 005/502] perf/x86/intel/uncore: Add Comet Lake support The uncore subsystem on Comet Lake is similar to Sky Lake. The only difference is the new PCI IDs for IMC. Share the perf code with Sky Lake. Add new PCI IDs in the table. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1589915905-55870-1-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore.c | 2 + arch/x86/events/intel/uncore_snb.c | 66 ++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index cf76d6631afa..b9c28765bf33 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1514,6 +1514,8 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &skx_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &skl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &skl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &skl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &skl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI, &icl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_uncore_init), diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 3de1065eefc4..5c4036710b7a 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -42,6 +42,17 @@ #define PCI_DEVICE_ID_INTEL_WHL_UQ_IMC 0x3ed0 #define PCI_DEVICE_ID_INTEL_WHL_4_UQ_IMC 0x3e34 #define PCI_DEVICE_ID_INTEL_WHL_UD_IMC 0x3e35 +#define PCI_DEVICE_ID_INTEL_CML_H1_IMC 0x9b44 +#define PCI_DEVICE_ID_INTEL_CML_H2_IMC 0x9b54 +#define PCI_DEVICE_ID_INTEL_CML_H3_IMC 0x9b64 +#define PCI_DEVICE_ID_INTEL_CML_U1_IMC 0x9b51 +#define PCI_DEVICE_ID_INTEL_CML_U2_IMC 0x9b61 +#define PCI_DEVICE_ID_INTEL_CML_U3_IMC 0x9b71 +#define PCI_DEVICE_ID_INTEL_CML_S1_IMC 0x9b33 +#define PCI_DEVICE_ID_INTEL_CML_S2_IMC 0x9b43 +#define PCI_DEVICE_ID_INTEL_CML_S3_IMC 0x9b53 +#define PCI_DEVICE_ID_INTEL_CML_S4_IMC 0x9b63 +#define PCI_DEVICE_ID_INTEL_CML_S5_IMC 0x9b73 #define PCI_DEVICE_ID_INTEL_ICL_U_IMC 0x8a02 #define 
PCI_DEVICE_ID_INTEL_ICL_U2_IMC 0x8a12 #define PCI_DEVICE_ID_INTEL_TGL_U1_IMC 0x9a02 @@ -771,6 +782,50 @@ static const struct pci_device_id skl_uncore_pci_ids[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_UD_IMC), .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H1_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H2_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H3_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U1_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U2_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U3_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S1_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S2_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S3_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S4_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S5_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, { /* end: all zeroes */ }, }; @@ -863,6 +918,17 @@ static const struct imc_uncore_pci_dev 
desktop_imc_pci_ids[] = { IMC_DEV(WHL_UQ_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Quad Core */ IMC_DEV(WHL_4_UQ_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Quad Core */ IMC_DEV(WHL_UD_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Dual Core */ + IMC_DEV(CML_H1_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_H2_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_H3_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_U1_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_U2_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_U3_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_S1_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_S2_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_S3_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_S4_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_S5_IMC, &skl_uncore_pci_driver), IMC_DEV(ICL_U_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */ IMC_DEV(ICL_U2_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */ { /* end marker */ } From e17d43b93e544f5016c0251d2074c15568d5d963 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 12 May 2020 15:19:08 +0300 Subject: [PATCH 006/502] perf: Add perf text poke event Record (single instruction) changes to the kernel text (i.e. self-modifying code) in order to support tracers like Intel PT and ARM CoreSight. A copy of the running kernel code is needed as a reference point (e.g. from /proc/kcore). The text poke event records the old bytes and the new bytes so that the event can be processed forwards or backwards. The basic problem is recording the modified instruction in an unambiguous manner given SMP instruction cache (in)coherence. That is, when modifying an instruction concurrently any solution with one or multiple timestamps is not sufficient: CPU0 CPU1 0 1 write insn A 2 execute insn A 3 sync-I$ 4 Due to I$, CPU1 might execute either the old or new A. 
No matter where we record tracepoints on CPU0, one simply cannot tell what CPU1 will have observed, except that at 0 it must be the old one and at 4 it must be the new one. To solve this, take inspiration from x86 text poking, which has to solve this exact problem due to variable length instruction encoding and I-fetch windows. 1) overwrite the instruction with a breakpoint and sync I$ This guarantees that code flow will never hit the target instruction anymore, on any CPU (or rather, it will cause an exception). 2) issue the TEXT_POKE event 3) overwrite the breakpoint with the new instruction and sync I$ Now we know that any execution after the TEXT_POKE event will either observe the breakpoint (and hit the exception) or the new instruction. So by guarding the TEXT_POKE event with an exception on either side, we can now tell, without doubt, which instruction another CPU will have observed. Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200512121922.8997-2-adrian.hunter@intel.com --- include/linux/perf_event.h | 8 +++ include/uapi/linux/perf_event.h | 21 +++++++- kernel/events/core.c | 90 ++++++++++++++++++++++++++++++++- 3 files changed, 117 insertions(+), 2 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b4bb32082342..46fe5cfb5163 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1232,6 +1232,9 @@ extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk, bool exec); extern void perf_event_namespaces(struct task_struct *tsk); extern void perf_event_fork(struct task_struct *tsk); +extern void perf_event_text_poke(const void *addr, + const void *old_bytes, size_t old_len, + const void *new_bytes, size_t new_len); /* Callchains */ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); @@ -1479,6 +1482,11 @@ static inline void perf_event_exec(void) { } static inline
void perf_event_comm(struct task_struct *tsk, bool exec) { } static inline void perf_event_namespaces(struct task_struct *tsk) { } static inline void perf_event_fork(struct task_struct *tsk) { } +static inline void perf_event_text_poke(const void *addr, + const void *old_bytes, + size_t old_len, + const void *new_bytes, + size_t new_len) { } static inline void perf_event_init(void) { } static inline int perf_swevent_get_recursion_context(void) { return -1; } static inline void perf_swevent_put_recursion_context(int rctx) { } diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 7b2d6fc9e6ed..e5bee6c17b86 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -383,7 +383,8 @@ struct perf_event_attr { bpf_event : 1, /* include bpf events */ aux_output : 1, /* generate AUX records instead of events */ cgroup : 1, /* include cgroup events */ - __reserved_1 : 31; + text_poke : 1, /* include text poke events */ + __reserved_1 : 30; union { __u32 wakeup_events; /* wakeup every n events */ @@ -1024,6 +1025,24 @@ enum perf_event_type { */ PERF_RECORD_CGROUP = 19, + /* + * Records changes to kernel text i.e. self-modified code. 'old_len' is + * the number of old bytes, 'new_len' is the number of new bytes. Either + * 'old_len' or 'new_len' may be zero to indicate, for example, the + * addition or removal of a trampoline. 'bytes' contains the old bytes + * followed immediately by the new bytes. 
+ * + * struct { + * struct perf_event_header header; + * u64 addr; + * u16 old_len; + * u16 new_len; + * u8 bytes[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_TEXT_POKE = 20, + PERF_RECORD_MAX, /* non-ABI */ }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 856d98c36f56..9b8f92500833 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -394,6 +394,7 @@ static atomic_t nr_switch_events __read_mostly; static atomic_t nr_ksymbol_events __read_mostly; static atomic_t nr_bpf_events __read_mostly; static atomic_t nr_cgroup_events __read_mostly; +static atomic_t nr_text_poke_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -4575,7 +4576,7 @@ static bool is_sb_event(struct perf_event *event) if (attr->mmap || attr->mmap_data || attr->mmap2 || attr->comm || attr->comm_exec || attr->task || attr->ksymbol || - attr->context_switch || + attr->context_switch || attr->text_poke || attr->bpf_event) return true; return false; @@ -4651,6 +4652,8 @@ static void unaccount_event(struct perf_event *event) atomic_dec(&nr_ksymbol_events); if (event->attr.bpf_event) atomic_dec(&nr_bpf_events); + if (event->attr.text_poke) + atomic_dec(&nr_text_poke_events); if (dec) { if (!atomic_add_unless(&perf_sched_count, -1, 1)) @@ -8628,6 +8631,89 @@ void perf_event_bpf_event(struct bpf_prog *prog, perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); } +struct perf_text_poke_event { + const void *old_bytes; + const void *new_bytes; + size_t pad; + u16 old_len; + u16 new_len; + + struct { + struct perf_event_header header; + + u64 addr; + } event_id; +}; + +static int perf_event_text_poke_match(struct perf_event *event) +{ + return event->attr.text_poke; +} + +static void perf_event_text_poke_output(struct perf_event *event, void *data) +{ + struct perf_text_poke_event *text_poke_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + u64 padding = 0; + int ret; + + if 
(!perf_event_text_poke_match(event)) + return; + + perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event); + + ret = perf_output_begin(&handle, event, text_poke_event->event_id.header.size); + if (ret) + return; + + perf_output_put(&handle, text_poke_event->event_id); + perf_output_put(&handle, text_poke_event->old_len); + perf_output_put(&handle, text_poke_event->new_len); + + __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len); + __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len); + + if (text_poke_event->pad) + __output_copy(&handle, &padding, text_poke_event->pad); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +void perf_event_text_poke(const void *addr, const void *old_bytes, + size_t old_len, const void *new_bytes, size_t new_len) +{ + struct perf_text_poke_event text_poke_event; + size_t tot, pad; + + if (!atomic_read(&nr_text_poke_events)) + return; + + tot = sizeof(text_poke_event.old_len) + old_len; + tot += sizeof(text_poke_event.new_len) + new_len; + pad = ALIGN(tot, sizeof(u64)) - tot; + + text_poke_event = (struct perf_text_poke_event){ + .old_bytes = old_bytes, + .new_bytes = new_bytes, + .pad = pad, + .old_len = old_len, + .new_len = new_len, + .event_id = { + .header = { + .type = PERF_RECORD_TEXT_POKE, + .misc = PERF_RECORD_MISC_KERNEL, + .size = sizeof(text_poke_event.event_id) + tot + pad, + }, + .addr = (unsigned long)addr, + }, + }; + + perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL); +} + void perf_event_itrace_started(struct perf_event *event) { event->attach_state |= PERF_ATTACH_ITRACE; @@ -10945,6 +11031,8 @@ static void account_event(struct perf_event *event) atomic_inc(&nr_ksymbol_events); if (event->attr.bpf_event) atomic_inc(&nr_bpf_events); + if (event->attr.text_poke) + atomic_inc(&nr_text_poke_events); if (inc) { /* From d769811ca93303deb1d8729d20cceaca7051a6f1 Mon Sep 17 00:00:00 2001 From: 
Adrian Hunter Date: Tue, 12 May 2020 15:19:09 +0300 Subject: [PATCH 007/502] perf/x86: Add support for perf text poke event for text_poke_bp_batch() callers Add support for perf text poke event for text_poke_bp_batch() callers. That includes jump labels. See comments for more details. Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200512121922.8997-3-adrian.hunter@intel.com --- arch/x86/kernel/alternative.c | 37 ++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 8fd39ff74a49..f94c9f371411 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -1001,6 +1002,7 @@ struct text_poke_loc { s32 rel32; u8 opcode; const u8 text[POKE_MAX_OPCODE_SIZE]; + u8 old; }; struct bp_patching_desc { @@ -1168,8 +1170,10 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries /* * First step: add a int3 trap to the address that will be patched. */ - for (i = 0; i < nr_entries; i++) + for (i = 0; i < nr_entries; i++) { + tp[i].old = *(u8 *)text_poke_addr(&tp[i]); text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE); + } text_poke_sync(); @@ -1177,14 +1181,45 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries * Second step: update all but the first byte of the patched range. 
*/ for (do_sync = 0, i = 0; i < nr_entries; i++) { + u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, }; int len = text_opcode_size(tp[i].opcode); if (len - INT3_INSN_SIZE > 0) { + memcpy(old + INT3_INSN_SIZE, + text_poke_addr(&tp[i]) + INT3_INSN_SIZE, + len - INT3_INSN_SIZE); text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE, (const char *)tp[i].text + INT3_INSN_SIZE, len - INT3_INSN_SIZE); do_sync++; } + + /* + * Emit a perf event to record the text poke, primarily to + * support Intel PT decoding which must walk the executable code + * to reconstruct the trace. The flow up to here is: + * - write INT3 byte + * - IPI-SYNC + * - write instruction tail + * At this point the actual control flow will be through the + * INT3 and handler and not hit the old or new instruction. + * Intel PT outputs FUP/TIP packets for the INT3, so the flow + * can still be decoded. Subsequently: + * - emit RECORD_TEXT_POKE with the new instruction + * - IPI-SYNC + * - write first byte + * - IPI-SYNC + * So before the text poke event timestamp, the decoder will see + * either the old instruction flow or FUP/TIP of INT3. After the + * text poke event timestamp, the decoder will see either the + * new instruction flow or FUP/TIP of INT3. Thus decoders can + * use the timestamp as the point at which to modify the + * executable code. + * The old instruction is recorded so that the event can be + * processed forwards or backwards. + */ + perf_event_text_poke(text_poke_addr(&tp[i]), old, len, + tp[i].text, len); } if (do_sync) { From d002b8bc6dbc20e9043e279196cff8795dba05fe Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 28 May 2020 11:00:58 +0300 Subject: [PATCH 008/502] kprobes: Add symbols for kprobe insn pages Symbols are needed for tools to describe instruction addresses. Pages allocated for kprobe's purposes need symbols to be created for them. Add such symbols to be visible via /proc/kallsyms. Note: kprobe insn pages are not used if ftrace is configured. 
To see the effect of this patch, the kernel must be configured with: # CONFIG_FUNCTION_TRACER is not set CONFIG_KPROBES=y and for optimised kprobes: CONFIG_OPTPROBES=y Example on x86: # perf probe __schedule Added new event: probe:__schedule (on __schedule) # cat /proc/kallsyms | grep '\[__builtin__kprobes\]' ffffffffc00d4000 t kprobe_insn_page [__builtin__kprobes] ffffffffc00d6000 t kprobe_optinsn_page [__builtin__kprobes] Note: This patch adds "__builtin__kprobes" as a module name in /proc/kallsyms for symbols for pages allocated for kprobes' purposes, even though "__builtin__kprobes" is not a module. Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Acked-by: Masami Hiramatsu Link: https://lkml.kernel.org/r/20200528080058.20230-1-adrian.hunter@intel.com --- include/linux/kprobes.h | 15 ++++++++++++++ kernel/kallsyms.c | 37 +++++++++++++++++++++++++++++---- kernel/kprobes.c | 45 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 93 insertions(+), 4 deletions(-) diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 594265bfd390..13fc58a74c04 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -242,6 +242,7 @@ struct kprobe_insn_cache { struct mutex mutex; void *(*alloc)(void); /* allocate insn page */ void (*free)(void *); /* free insn page */ + const char *sym; /* symbol for insn pages */ struct list_head pages; /* list of kprobe_insn_page */ size_t insn_size; /* size of instruction slot */ int nr_garbage; @@ -272,6 +273,10 @@ static inline bool is_kprobe_##__name##_slot(unsigned long addr) \ { \ return __is_insn_slot_addr(&kprobe_##__name##_slots, addr); \ } +#define KPROBE_INSN_PAGE_SYM "kprobe_insn_page" +#define KPROBE_OPTINSN_PAGE_SYM "kprobe_optinsn_page" +int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum, + unsigned long *value, char *type, char *sym); #else /* __ARCH_WANT_KPROBES_INSN_SLOT */ #define DEFINE_INSN_CACHE_OPS(__name) \ static inline bool 
is_kprobe_##__name##_slot(unsigned long addr) \ @@ -373,6 +378,11 @@ void dump_kprobe(struct kprobe *kp); void *alloc_insn_page(void); void free_insn_page(void *page); +int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *sym); + +int arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value, + char *type, char *sym); #else /* !CONFIG_KPROBES: */ static inline int kprobes_built_in(void) @@ -435,6 +445,11 @@ static inline bool within_kprobe_blacklist(unsigned long addr) { return true; } +static inline int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *sym) +{ + return -ERANGE; +} #endif /* CONFIG_KPROBES */ static inline int disable_kretprobe(struct kretprobe *rp) { diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 16c8c605f4b0..c6cc293c0e67 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -24,6 +24,7 @@ #include #include #include +#include #include /* @@ -437,6 +438,7 @@ struct kallsym_iter { loff_t pos_arch_end; loff_t pos_mod_end; loff_t pos_ftrace_mod_end; + loff_t pos_bpf_end; unsigned long value; unsigned int nameoff; /* If iterating in core kernel symbols. */ char type; @@ -496,11 +498,33 @@ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter) static int get_ksymbol_bpf(struct kallsym_iter *iter) { + int ret; + strlcpy(iter->module_name, "bpf", MODULE_NAME_LEN); iter->exported = 0; - return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, - &iter->value, &iter->type, - iter->name) < 0 ? 0 : 1; + ret = bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, + &iter->value, &iter->type, + iter->name); + if (ret < 0) { + iter->pos_bpf_end = iter->pos; + return 0; + } + + return 1; +} + +/* + * This uses "__builtin__kprobes" as a module name for symbols for pages + * allocated for kprobes' purposes, even though "__builtin__kprobes" is not a + * module. 
+ */ +static int get_ksymbol_kprobe(struct kallsym_iter *iter) +{ + strlcpy(iter->module_name, "__builtin__kprobes", MODULE_NAME_LEN); + iter->exported = 0; + return kprobe_get_kallsym(iter->pos - iter->pos_bpf_end, + &iter->value, &iter->type, + iter->name) < 0 ? 0 : 1; } /* Returns space to next name. */ @@ -527,6 +551,7 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) iter->pos_arch_end = 0; iter->pos_mod_end = 0; iter->pos_ftrace_mod_end = 0; + iter->pos_bpf_end = 0; } } @@ -551,7 +576,11 @@ static int update_iter_mod(struct kallsym_iter *iter, loff_t pos) get_ksymbol_ftrace_mod(iter)) return 1; - return get_ksymbol_bpf(iter); + if ((!iter->pos_bpf_end || iter->pos_bpf_end > pos) && + get_ksymbol_bpf(iter)) + return 1; + + return get_ksymbol_kprobe(iter); } /* Returns false if pos at or past end of file. */ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 50cd84f53df0..058c0be3464b 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -118,6 +118,7 @@ struct kprobe_insn_cache kprobe_insn_slots = { .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex), .alloc = alloc_insn_page, .free = free_insn_page, + .sym = KPROBE_INSN_PAGE_SYM, .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), .insn_size = MAX_INSN_SIZE, .nr_garbage = 0, @@ -290,12 +291,34 @@ bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr) return ret; } +int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum, + unsigned long *value, char *type, char *sym) +{ + struct kprobe_insn_page *kip; + int ret = -ERANGE; + + rcu_read_lock(); + list_for_each_entry_rcu(kip, &c->pages, list) { + if ((*symnum)--) + continue; + strlcpy(sym, c->sym, KSYM_NAME_LEN); + *type = 't'; + *value = (unsigned long)kip->insns; + ret = 0; + break; + } + rcu_read_unlock(); + + return ret; +} + #ifdef CONFIG_OPTPROBES /* For optimized_kprobe buffer */ struct kprobe_insn_cache kprobe_optinsn_slots = { .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex), 
.alloc = alloc_insn_page, .free = free_insn_page, + .sym = KPROBE_OPTINSN_PAGE_SYM, .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), /* .insn_size is initialized later */ .nr_garbage = 0, @@ -2197,6 +2220,28 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry) kprobe_remove_area_blacklist(entry, entry + 1); } +int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value, + char *type, char *sym) +{ + return -ERANGE; +} + +int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *sym) +{ +#ifdef __ARCH_WANT_KPROBES_INSN_SLOT + if (!kprobe_cache_get_kallsym(&kprobe_insn_slots, &symnum, value, type, sym)) + return 0; +#ifdef CONFIG_OPTPROBES + if (!kprobe_cache_get_kallsym(&kprobe_optinsn_slots, &symnum, value, type, sym)) + return 0; +#endif +#endif + if (!arch_kprobe_get_kallsym(&symnum, value, type, sym)) + return 0; + return -ERANGE; +} + int __init __weak arch_populate_kprobe_blacklist(void) { return 0; From 69e49088692899d25dedfa22f00dfb9761e86ed7 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 12 May 2020 15:19:11 +0300 Subject: [PATCH 009/502] kprobes: Add perf ksymbol events for kprobe insn pages Symbols are needed for tools to describe instruction addresses. Pages allocated for kprobe's purposes need symbols to be created for them. Add such symbols to be visible via perf ksymbol events. 
Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Acked-by: Peter Zijlstra (Intel) Acked-by: Masami Hiramatsu Link: https://lkml.kernel.org/r/20200512121922.8997-5-adrian.hunter@intel.com --- include/uapi/linux/perf_event.h | 5 +++++ kernel/kprobes.c | 12 ++++++++++++ 2 files changed, 17 insertions(+) diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index e5bee6c17b86..e1a4179144a1 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1049,6 +1049,11 @@ enum perf_event_type { enum perf_record_ksymbol_type { PERF_RECORD_KSYMBOL_TYPE_UNKNOWN = 0, PERF_RECORD_KSYMBOL_TYPE_BPF = 1, + /* + * Out of line code such as kprobe-replaced instructions or optimized + * kprobes. + */ + PERF_RECORD_KSYMBOL_TYPE_OOL = 2, PERF_RECORD_KSYMBOL_TYPE_MAX /* non-ABI */ }; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 058c0be3464b..2b58740ca0f3 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -184,6 +185,10 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) kip->cache = c; list_add_rcu(&kip->list, &c->pages); slot = kip->insns; + + /* Record the perf ksymbol register event after adding the page */ + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns, + PAGE_SIZE, false, c->sym); out: mutex_unlock(&c->mutex); return slot; @@ -202,6 +207,13 @@ static int collect_one_slot(struct kprobe_insn_page *kip, int idx) * next time somebody inserts a probe. */ if (!list_is_singular(&kip->list)) { + /* + * Record perf ksymbol unregister event before removing + * the page. 
+ */ + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + (unsigned long)kip->insns, PAGE_SIZE, true, + kip->cache->sym); list_del_rcu(&kip->list); synchronize_rcu(); kip->cache->free(kip->insns); From 3e46bb40af8c12947c093efb8af56e0e921cd39b Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 12 May 2020 15:19:12 +0300 Subject: [PATCH 010/502] perf/x86: Add perf text poke events for kprobes Add perf text poke events for kprobes. That includes: - the replaced instruction(s) which are executed out-of-line i.e. arch_copy_kprobe() and arch_remove_kprobe() - the INT3 that activates the kprobe i.e. arch_arm_kprobe() and arch_disarm_kprobe() - optimised kprobe function i.e. arch_prepare_optimized_kprobe() and __arch_remove_optimized_kprobe() - optimised kprobe i.e. arch_optimize_kprobes() and arch_unoptimize_kprobe() Resulting in 8 possible text_poke events: 0: NULL -> probe.ainsn.insn (if ainsn.boostable && !kp.post_handler) arch_copy_kprobe() 1: old0 -> INT3 arch_arm_kprobe() // boosted kprobe active 2: NULL -> optprobe_trampoline arch_prepare_optimized_kprobe() 3: INT3,old1,old2,old3,old4 -> JMP32 arch_optimize_kprobes() // optprobe active 4: JMP32 -> INT3,old1,old2,old3,old4 // optprobe disabled and kprobe active (this sometimes goes back to 3) arch_unoptimize_kprobe() 5: optprobe_trampoline -> NULL arch_remove_optimized_kprobe() // boosted kprobe active 6: INT3 -> old0 arch_disarm_kprobe() 7: probe.ainsn.insn -> NULL (if ainsn.boostable && !kp.post_handler) arch_remove_kprobe() Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Acked-by: Peter Zijlstra (Intel) Acked-by: Masami Hiramatsu Link: https://lkml.kernel.org/r/20200512121922.8997-6-adrian.hunter@intel.com --- arch/x86/include/asm/kprobes.h | 2 ++ arch/x86/kernel/kprobes/core.c | 15 +++++++++++++- arch/x86/kernel/kprobes/opt.c | 38 +++++++++++++++++++++++++++++----- 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kprobes.h 
b/arch/x86/include/asm/kprobes.h index 073eb7ad2f56..143bc9abe99c 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -66,6 +66,8 @@ struct arch_specific_insn { */ bool boostable; bool if_modifier; + /* Number of bytes of text poked */ + int tp_len; }; struct arch_optimized_insn { diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 3bafe1bd4dc7..bcc53c0d17c1 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -471,6 +472,9 @@ static int arch_copy_kprobe(struct kprobe *p) /* Also, displacement change doesn't affect the first byte */ p->opcode = buf[0]; + p->ainsn.tp_len = len; + perf_event_text_poke(p->ainsn.insn, NULL, 0, buf, len); + /* OK, write back the instruction(s) into ROX insn buffer */ text_poke(p->ainsn.insn, buf, len); @@ -502,12 +506,18 @@ int arch_prepare_kprobe(struct kprobe *p) void arch_arm_kprobe(struct kprobe *p) { - text_poke(p->addr, ((unsigned char []){INT3_INSN_OPCODE}), 1); + u8 int3 = INT3_INSN_OPCODE; + + text_poke(p->addr, &int3, 1); text_poke_sync(); + perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1); } void arch_disarm_kprobe(struct kprobe *p) { + u8 int3 = INT3_INSN_OPCODE; + + perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1); text_poke(p->addr, &p->opcode, 1); text_poke_sync(); } @@ -515,6 +525,9 @@ void arch_disarm_kprobe(struct kprobe *p) void arch_remove_kprobe(struct kprobe *p) { if (p->ainsn.insn) { + /* Record the perf event before freeing the slot */ + perf_event_text_poke(p->ainsn.insn, p->ainsn.insn, + p->ainsn.tp_len, NULL, 0); free_insn_slot(p->ainsn.insn, p->ainsn.boostable); p->ainsn.insn = NULL; } diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 321c19950285..3239b6a80bce 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -6,6 +6,7 @@ * Copyright (C) Hitachi Ltd., 2012 */ 
#include +#include #include #include #include @@ -352,8 +353,15 @@ int arch_within_optimized_kprobe(struct optimized_kprobe *op, static void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) { - if (op->optinsn.insn) { - free_optinsn_slot(op->optinsn.insn, dirty); + u8 *slot = op->optinsn.insn; + if (slot) { + int len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE; + + /* Record the perf event before freeing the slot */ + if (dirty) + perf_event_text_poke(slot, slot, len, NULL, 0); + + free_optinsn_slot(slot, dirty); op->optinsn.insn = NULL; op->optinsn.size = 0; } @@ -424,8 +432,15 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, (u8 *)op->kp.addr + op->optinsn.size); len += JMP32_INSN_SIZE; + /* + * Note len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE is also + * used in __arch_remove_optimized_kprobe(). + */ + /* We have to use text_poke() for instruction buffer because it is RO */ + perf_event_text_poke(slot, NULL, 0, buf, len); text_poke(slot, buf, len); + ret = 0; out: kfree(buf); @@ -477,10 +492,23 @@ void arch_optimize_kprobes(struct list_head *oplist) */ void arch_unoptimize_kprobe(struct optimized_kprobe *op) { - arch_arm_kprobe(&op->kp); - text_poke(op->kp.addr + INT3_INSN_SIZE, - op->optinsn.copied_insn, DISP32_SIZE); + u8 new[JMP32_INSN_SIZE] = { INT3_INSN_OPCODE, }; + u8 old[JMP32_INSN_SIZE]; + u8 *addr = op->kp.addr; + + memcpy(old, op->kp.addr, JMP32_INSN_SIZE); + memcpy(new + INT3_INSN_SIZE, + op->optinsn.copied_insn, + JMP32_INSN_SIZE - INT3_INSN_SIZE); + + text_poke(addr, new, INT3_INSN_SIZE); text_poke_sync(); + text_poke(addr + INT3_INSN_SIZE, + new + INT3_INSN_SIZE, + JMP32_INSN_SIZE - INT3_INSN_SIZE); + text_poke_sync(); + + perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE); } /* From fc0ea795f53c8d7040fa42471f74fe51d78d0834 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 12 May 2020 15:19:13 +0300 Subject: [PATCH 011/502] ftrace: Add symbols for ftrace 
trampolines Symbols are needed for tools to describe instruction addresses. Pages allocated for ftrace's purposes need symbols to be created for them. Add such symbols to be visible via /proc/kallsyms. Example on x86 with CONFIG_DYNAMIC_FTRACE=y # echo function > /sys/kernel/debug/tracing/current_tracer # cat /proc/kallsyms | grep '\[__builtin__ftrace\]' ffffffffc0238000 t ftrace_trampoline [__builtin__ftrace] Note: This patch adds "__builtin__ftrace" as a module name in /proc/kallsyms for symbols for pages allocated for ftrace's purposes, even though "__builtin__ftrace" is not a module. Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200512121922.8997-7-adrian.hunter@intel.com --- include/linux/ftrace.h | 12 ++++--- kernel/kallsyms.c | 5 +++ kernel/trace/ftrace.c | 77 ++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 88 insertions(+), 6 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e339dac91ee6..ce2c06f72e86 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -58,9 +58,6 @@ struct ftrace_direct_func; const char * ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym); -int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, - char *type, char *name, - char *module_name, int *exported); #else static inline const char * ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, @@ -68,6 +65,13 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, { return NULL; } +#endif + +#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) +int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *name, + char *module_name, int *exported); +#else static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *name, char *module_name, int *exported) @@ 
-76,7 +80,6 @@ static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *val } #endif - #ifdef CONFIG_FUNCTION_TRACER extern int ftrace_enabled; @@ -207,6 +210,7 @@ struct ftrace_ops { struct ftrace_ops_hash old_hash; unsigned long trampoline; unsigned long trampoline_size; + struct list_head list; #endif }; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index c6cc293c0e67..834bfdc43235 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -482,6 +482,11 @@ static int get_ksymbol_mod(struct kallsym_iter *iter) return 1; } +/* + * ftrace_mod_get_kallsym() may also get symbols for pages allocated for ftrace + * purposes. In that case "__builtin__ftrace" is used as a module name, even + * though "__builtin__ftrace" is not a module. + */ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter) { int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end, diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c163c3531faf..31675b209db2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2764,6 +2764,38 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops) { } +/* List of trace_ops that have allocated trampolines */ +static LIST_HEAD(ftrace_ops_trampoline_list); + +static void ftrace_add_trampoline_to_kallsyms(struct ftrace_ops *ops) +{ + lockdep_assert_held(&ftrace_lock); + list_add_rcu(&ops->list, &ftrace_ops_trampoline_list); +} + +static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops) +{ + lockdep_assert_held(&ftrace_lock); + list_del_rcu(&ops->list); +} + +/* + * "__builtin__ftrace" is used as a module name in /proc/kallsyms for symbols + * for pages allocated for ftrace purposes, even though "__builtin__ftrace" is + * not a module. 
+ */ +#define FTRACE_TRAMPOLINE_MOD "__builtin__ftrace" +#define FTRACE_TRAMPOLINE_SYM "ftrace_trampoline" + +static void ftrace_trampoline_free(struct ftrace_ops *ops) +{ + if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) && + ops->trampoline) + ftrace_remove_trampoline_from_kallsyms(ops); + + arch_ftrace_trampoline_free(ops); +} + static void ftrace_startup_enable(int command) { if (saved_ftrace_func != ftrace_trace_function) { @@ -2934,7 +2966,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) synchronize_rcu_tasks(); free_ops: - arch_ftrace_trampoline_free(ops); + ftrace_trampoline_free(ops); } return 0; @@ -6178,6 +6210,27 @@ struct ftrace_mod_map { unsigned int num_funcs; }; +static int ftrace_get_trampoline_kallsym(unsigned int symnum, + unsigned long *value, char *type, + char *name, char *module_name, + int *exported) +{ + struct ftrace_ops *op; + + list_for_each_entry_rcu(op, &ftrace_ops_trampoline_list, list) { + if (!op->trampoline || symnum--) + continue; + *value = op->trampoline; + *type = 't'; + strlcpy(name, FTRACE_TRAMPOLINE_SYM, KSYM_NAME_LEN); + strlcpy(module_name, FTRACE_TRAMPOLINE_MOD, MODULE_NAME_LEN); + *exported = 0; + return 0; + } + + return -ERANGE; +} + #ifdef CONFIG_MODULES #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) @@ -6514,6 +6567,7 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, { struct ftrace_mod_map *mod_map; struct ftrace_mod_func *mod_func; + int ret; preempt_disable(); list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) { @@ -6540,8 +6594,10 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, WARN_ON(1); break; } + ret = ftrace_get_trampoline_kallsym(symnum, value, type, name, + module_name, exported); preempt_enable(); - return -ERANGE; + return ret; } #else @@ -6553,6 +6609,18 @@ allocate_ftrace_mod_map(struct module *mod, { return NULL; } +int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char 
*name, char *module_name, + int *exported) +{ + int ret; + + preempt_disable(); + ret = ftrace_get_trampoline_kallsym(symnum, value, type, name, + module_name, exported); + preempt_enable(); + return ret; +} #endif /* CONFIG_MODULES */ struct ftrace_init_func { @@ -6733,7 +6801,12 @@ void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops) static void ftrace_update_trampoline(struct ftrace_ops *ops) { + unsigned long trampoline = ops->trampoline; + arch_ftrace_update_trampoline(ops); + if (ops->trampoline && ops->trampoline != trampoline && + (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + ftrace_add_trampoline_to_kallsyms(ops); } void ftrace_init_trace_array(struct trace_array *tr) From dd9ddf466ad7a5d2e247925d81ebb0b878bf3b76 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 12 May 2020 15:19:14 +0300 Subject: [PATCH 012/502] ftrace: Add perf ksymbol events for ftrace trampolines Symbols are needed for tools to describe instruction addresses. Pages allocated for ftrace's purposes need symbols to be created for them. Add such symbols to be visible via perf ksymbol events. Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200512121922.8997-8-adrian.hunter@intel.com --- include/uapi/linux/perf_event.h | 2 +- kernel/trace/ftrace.c | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index e1a4179144a1..52ca2093831c 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1051,7 +1051,7 @@ enum perf_record_ksymbol_type { PERF_RECORD_KSYMBOL_TYPE_BPF = 1, /* * Out of line code such as kprobe-replaced instructions or optimized - * kprobes. + * kprobes or ftrace trampolines. 
*/ PERF_RECORD_KSYMBOL_TYPE_OOL = 2, PERF_RECORD_KSYMBOL_TYPE_MAX /* non-ABI */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 31675b209db2..2baaf7716537 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2790,8 +2790,13 @@ static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops) static void ftrace_trampoline_free(struct ftrace_ops *ops) { if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) && - ops->trampoline) + ops->trampoline) { + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + ops->trampoline, ops->trampoline_size, + true, FTRACE_TRAMPOLINE_SYM); + /* Remove from kallsyms after the perf events */ ftrace_remove_trampoline_from_kallsyms(ops); + } arch_ftrace_trampoline_free(ops); } @@ -6805,8 +6810,13 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops) arch_ftrace_update_trampoline(ops); if (ops->trampoline && ops->trampoline != trampoline && - (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) { + /* Add to kallsyms before the perf events */ ftrace_add_trampoline_to_kallsyms(ops); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + ops->trampoline, ops->trampoline_size, false, + FTRACE_TRAMPOLINE_SYM); + } } void ftrace_init_trace_array(struct trace_array *tr) From 548e1f6c76e1eb80ba29edd4286b9b9f2c37f5bf Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 12 May 2020 15:19:15 +0300 Subject: [PATCH 013/502] ftrace: Add perf text poke events for ftrace trampolines Add perf text poke events for ftrace trampolines when created and when freed. There can be 3 text_poke events for ftrace trampolines: 1. NULL -> trampoline By ftrace_update_trampoline() when !ops->trampoline Trampoline created 2. [e.g. on x86] CALL rel32 -> CALL rel32 By arch_ftrace_update_trampoline() when ops->trampoline and ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP [e.g. on x86] via text_poke_bp() which generates text poke events Trampoline-called function target updated 3. 
trampoline -> NULL By ftrace_trampoline_free() when ops->trampoline and ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP Trampoline freed Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200512121922.8997-9-adrian.hunter@intel.com --- kernel/trace/ftrace.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2baaf7716537..d6bba734ab72 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2791,6 +2791,13 @@ static void ftrace_trampoline_free(struct ftrace_ops *ops) { if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) && ops->trampoline) { + /* + * Record the text poke event before the ksymbol unregister + * event. + */ + perf_event_text_poke((void *)ops->trampoline, + (void *)ops->trampoline, + ops->trampoline_size, NULL, 0); perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, ops->trampoline, ops->trampoline_size, true, FTRACE_TRAMPOLINE_SYM); @@ -6816,6 +6823,13 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops) perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, ops->trampoline, ops->trampoline_size, false, FTRACE_TRAMPOLINE_SYM); + /* + * Record the perf text poke event after the ksymbol register + * event. + */ + perf_event_text_poke((void *)ops->trampoline, NULL, 0, + (void *)ops->trampoline, + ops->trampoline_size); } } From 2af834f1faab3f1e218fcbcab70a399121620d62 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 28 May 2020 08:19:27 -0700 Subject: [PATCH 014/502] perf/x86/intel/uncore: Fix oops when counting IMC uncore events on some TGL When counting IMC uncore events on some TGL machines, an oops will be triggered. [ 393.101262] BUG: unable to handle page fault for address: ffffb45200e15858 [ 393.101269] #PF: supervisor read access in kernel mode [ 393.101271] #PF: error_code(0x0000) - not-present page Current perf uncore driver still use the IMC MAP SIZE inherited from SNB, which is 0x6000. 
However, the offset of IMC uncore counters is larger than 0x6000, e.g. 0xd8a0. Enlarge the IMC MAP SIZE for TGL to 0xe000. Fixes: fdb64822443e ("perf/x86: Add Intel Tiger Lake uncore support") Reported-by: Ammy Yi Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Ammy Yi Tested-by: Chao Qin Link: https://lkml.kernel.org/r/1590679169-61823-1-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore_snb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 5c4036710b7a..d5ae3a822193 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -1151,6 +1151,7 @@ static struct pci_dev *tgl_uncore_get_mc_dev(void) } #define TGL_UNCORE_MMIO_IMC_MEM_OFFSET 0x10000 +#define TGL_UNCORE_PCI_IMC_MAP_SIZE 0xe000 static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) { @@ -1178,7 +1179,7 @@ static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) addr |= ((resource_size_t)mch_bar << 32); #endif - box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE); + box->io_addr = ioremap(addr, TGL_UNCORE_PCI_IMC_MAP_SIZE); } static struct intel_uncore_ops tgl_uncore_imc_freerunning_ops = { From 1b94d31de422399421422af0e63c9685e7485901 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 28 May 2020 08:19:28 -0700 Subject: [PATCH 015/502] perf/x86/intel/uncore: Record the size of mapped area Perf cannot validate an address before the actual access to MMIO space of some uncore units, e.g. IMC on TGL. Accessing an invalid address, which exceeds mapped area, can trigger oops. Perf never records the size of mapped area. Generic functions, e.g. uncore_mmio_read_counter(), cannot get the correct size for address validation. Add mmio_map_size in intel_uncore_type to record the size of mapped area. Print warning message if ioremap fails. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1590679169-61823-2-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore.h | 1 + arch/x86/events/intel/uncore_snb.c | 13 +++++++++++-- arch/x86/events/intel/uncore_snbep.c | 11 +++++++++-- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index b469ddd45515..79ff626b7ea6 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -61,6 +61,7 @@ struct intel_uncore_type { unsigned msr_offset; unsigned mmio_offset; }; + unsigned mmio_map_size; unsigned num_shared_regs:8; unsigned single_fixed:1; unsigned pair_ctr_ctl:1; diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index d5ae3a822193..cb94ba86efd2 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -426,6 +426,7 @@ static const struct attribute_group snb_uncore_imc_format_group = { static void snb_uncore_imc_init_box(struct intel_uncore_box *box) { + struct intel_uncore_type *type = box->pmu->type; struct pci_dev *pdev = box->pci_dev; int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET; resource_size_t addr; @@ -441,7 +442,10 @@ static void snb_uncore_imc_init_box(struct intel_uncore_box *box) addr &= ~(PAGE_SIZE - 1); - box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE); + box->io_addr = ioremap(addr, type->mmio_map_size); + if (!box->io_addr) + pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); + box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL; } @@ -597,6 +601,7 @@ static struct intel_uncore_type snb_uncore_imc = { .num_counters = 2, .num_boxes = 1, .num_freerunning_types = SNB_PCI_UNCORE_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = SNB_UNCORE_PCI_IMC_MAP_SIZE, .freerunning = snb_uncore_imc_freerunning, .event_descs = snb_uncore_imc_events, .format_group = &snb_uncore_imc_format_group, @@ -1157,6 +1162,7 
@@ static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) { struct pci_dev *pdev = tgl_uncore_get_mc_dev(); struct intel_uncore_pmu *pmu = box->pmu; + struct intel_uncore_type *type = pmu->type; resource_size_t addr; u32 mch_bar; @@ -1179,7 +1185,9 @@ static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) addr |= ((resource_size_t)mch_bar << 32); #endif - box->io_addr = ioremap(addr, TGL_UNCORE_PCI_IMC_MAP_SIZE); + box->io_addr = ioremap(addr, type->mmio_map_size); + if (!box->io_addr) + pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); } static struct intel_uncore_ops tgl_uncore_imc_freerunning_ops = { @@ -1205,6 +1213,7 @@ static struct intel_uncore_type tgl_uncore_imc_free_running = { .num_counters = 3, .num_boxes = 2, .num_freerunning_types = TGL_MMIO_UNCORE_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = TGL_UNCORE_PCI_IMC_MAP_SIZE, .freerunning = tgl_uncore_imc_freerunning, .ops = &tgl_uncore_imc_freerunning_ops, .event_descs = tgl_uncore_imc_events, diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 07652fa20ebb..bffb7554f4fb 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -4421,6 +4421,7 @@ static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box, unsigned int box_ctl, int mem_offset) { struct pci_dev *pdev = snr_uncore_get_mc_dev(box->dieid); + struct intel_uncore_type *type = box->pmu->type; resource_size_t addr; u32 pci_dword; @@ -4435,9 +4436,11 @@ static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box, addr += box_ctl; - box->io_addr = ioremap(addr, SNR_IMC_MMIO_SIZE); - if (!box->io_addr) + box->io_addr = ioremap(addr, type->mmio_map_size); + if (!box->io_addr) { + pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); return; + } writel(IVBEP_PMON_BOX_CTL_INT, box->io_addr); } @@ -4530,6 +4533,7 @@ static struct intel_uncore_type snr_uncore_imc = { .event_mask = 
SNBEP_PMON_RAW_EVENT_MASK, .box_ctl = SNR_IMC_MMIO_PMON_BOX_CTL, .mmio_offset = SNR_IMC_MMIO_OFFSET, + .mmio_map_size = SNR_IMC_MMIO_SIZE, .ops = &snr_uncore_mmio_ops, .format_group = &skx_uncore_format_group, }; @@ -4570,6 +4574,7 @@ static struct intel_uncore_type snr_uncore_imc_free_running = { .num_counters = 3, .num_boxes = 1, .num_freerunning_types = SNR_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = SNR_IMC_MMIO_SIZE, .freerunning = snr_imc_freerunning, .ops = &snr_uncore_imc_freerunning_ops, .event_descs = snr_uncore_imc_freerunning_events, @@ -4987,6 +4992,7 @@ static struct intel_uncore_type icx_uncore_imc = { .event_mask = SNBEP_PMON_RAW_EVENT_MASK, .box_ctl = SNR_IMC_MMIO_PMON_BOX_CTL, .mmio_offset = SNR_IMC_MMIO_OFFSET, + .mmio_map_size = SNR_IMC_MMIO_SIZE, .ops = &icx_uncore_mmio_ops, .format_group = &skx_uncore_format_group, }; @@ -5044,6 +5050,7 @@ static struct intel_uncore_type icx_uncore_imc_free_running = { .num_counters = 5, .num_boxes = 4, .num_freerunning_types = ICX_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = SNR_IMC_MMIO_SIZE, .freerunning = icx_imc_freerunning, .ops = &icx_uncore_imc_freerunning_ops, .event_descs = icx_uncore_imc_freerunning_events, From f01719730bbe04b90ae60c7e9d2b6d3533308502 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 28 May 2020 08:19:29 -0700 Subject: [PATCH 016/502] perf/x86/intel/uncore: Validate MMIO address before accessing An oops will be triggered, if perf tries to access an invalid address which exceeds the mapped area. Check the address before the actual access to MMIO space of an uncore unit.
Suggested-by: David Laight Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1590679169-61823-3-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore.c | 3 +++ arch/x86/events/intel/uncore.h | 12 ++++++++++++ arch/x86/events/intel/uncore_snbep.c | 6 ++++++ 3 files changed, 21 insertions(+) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index b9c28765bf33..cbe32d592aad 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -132,6 +132,9 @@ u64 uncore_mmio_read_counter(struct intel_uncore_box *box, if (!box->io_addr) return 0; + if (!uncore_mmio_is_valid_offset(box, event->hw.event_base)) + return 0; + return readq(box->io_addr + event->hw.event_base); } diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 79ff626b7ea6..7859ac01f7a5 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -197,6 +197,18 @@ static inline bool uncore_pmc_freerunning(int idx) return idx == UNCORE_PMC_IDX_FREERUNNING; } +static inline bool uncore_mmio_is_valid_offset(struct intel_uncore_box *box, + unsigned long offset) +{ + if (offset < box->pmu->type->mmio_map_size) + return true; + + pr_warn_once("perf uncore: Invalid offset 0x%lx exceeds mapped area of %s.\n", + offset, box->pmu->type->name); + + return false; +} + static inline unsigned int uncore_mmio_box_ctl(struct intel_uncore_box *box) { diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index bffb7554f4fb..045c2d2231d2 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -4483,6 +4483,9 @@ static void snr_uncore_mmio_enable_event(struct intel_uncore_box *box, if (!box->io_addr) return; + if (!uncore_mmio_is_valid_offset(box, hwc->config_base)) + return; + writel(hwc->config | SNBEP_PMON_CTL_EN, box->io_addr + hwc->config_base); } @@ -4495,6 +4498,9 @@ static void 
snr_uncore_mmio_disable_event(struct intel_uncore_box *box, if (!box->io_addr) return; + if (!uncore_mmio_is_valid_offset(box, hwc->config_base)) + return; + writel(hwc->config, box->io_addr + hwc->config_base); } From 19a39819818dee57e363bd44bd096e2e940a456b Mon Sep 17 00:00:00 2001 From: Roman Sudarikov Date: Mon, 1 Jun 2020 11:35:41 +0300 Subject: [PATCH 017/502] perf/x86/intel/uncore: Expose an Uncore unit to PMON mapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each Uncore unit type, by its nature, can be mapped to its own context - which platform component each PMON block of that type is supposed to monitor. Intel® Xeon® Scalable processor family (code name Skylake-SP) makes significant changes in the integrated I/O (IIO) architecture. The new solution introduces IIO stacks which are responsible for managing traffic between the PCIe domain and the Mesh domain. Each IIO stack has its own PMON block and can handle either DMI port, x16 PCIe root port, MCP-Link or various built-in accelerators. IIO PMON blocks allow concurrent monitoring of I/O flows up to 4 x4 bifurcation within each IIO stack. Software is supposed to program required perf counters within each IIO stack and gather performance data. The tricky thing here is that IIO PMON reports data per IIO stack but users have no idea what IIO stacks are - they only know devices which are connected to the platform. Understanding IIO stack concept to find which IIO stack that particular IO device is connected to, or to identify an IIO PMON block to program for monitoring specific IIO stack assumes a lot of implicit knowledge about given Intel server platform architecture. 
Usage example: ls /sys/devices/uncore__/die* Signed-off-by: Alexander Antonov Signed-off-by: Roman Sudarikov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Reviewed-by: Alexander Shishkin Link: https://lkml.kernel.org/r/20200601083543.30011-2-alexander.antonov@linux.intel.com --- arch/x86/events/intel/uncore.c | 8 ++++++++ arch/x86/events/intel/uncore.h | 12 ++++++++++++ 2 files changed, 20 insertions(+) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index cbe32d592aad..49255e656e85 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -846,10 +846,12 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu) .read = uncore_pmu_event_read, .module = THIS_MODULE, .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + .attr_update = pmu->type->attr_update, }; } else { pmu->pmu = *pmu->type->pmu; pmu->pmu.attr_groups = pmu->type->attr_groups; + pmu->pmu.attr_update = pmu->type->attr_update; } if (pmu->type->num_boxes == 1) { @@ -890,6 +892,9 @@ static void uncore_type_exit(struct intel_uncore_type *type) struct intel_uncore_pmu *pmu = type->pmus; int i; + if (type->cleanup_mapping) + type->cleanup_mapping(type); + if (pmu) { for (i = 0; i < type->num_boxes; i++, pmu++) { uncore_pmu_unregister(pmu); @@ -957,6 +962,9 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) type->pmu_group = &uncore_pmu_attr_group; + if (type->set_mapping) + type->set_mapping(type); + return 0; err: diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 7859ac01f7a5..7caba06c7df5 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -73,7 +73,19 @@ struct intel_uncore_type { struct uncore_event_desc *event_descs; struct freerunning_counters *freerunning; const struct attribute_group *attr_groups[4]; + const struct attribute_group **attr_update; struct pmu *pmu; /* for custom pmu ops */ + /* + * Uncore PMU would store relevant platform 
topology configuration here + * to identify which platform component each PMON block of that type is + * supposed to monitor. + */ + u64 *topology; + /* + * Optional callbacks for managing mapping of Uncore units to PMONs + */ + int (*set_mapping)(struct intel_uncore_type *type); + void (*cleanup_mapping)(struct intel_uncore_type *type); }; #define pmu_group attr_groups[0] From 36b533bc5e3ed1039406f3b27e746b4d18f2cac1 Mon Sep 17 00:00:00 2001 From: Roman Sudarikov Date: Mon, 1 Jun 2020 11:35:42 +0300 Subject: [PATCH 018/502] perf/x86/intel/uncore: Wrap the max dies calculation into an accessor The accessor to return number of dies on the platform. Signed-off-by: Alexander Antonov Signed-off-by: Roman Sudarikov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Reviewed-by: Alexander Shishkin Link: https://lkml.kernel.org/r/20200601083543.30011-3-alexander.antonov@linux.intel.com --- arch/x86/events/intel/uncore.c | 13 +++++++------ arch/x86/events/intel/uncore.h | 3 +++ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 49255e656e85..d5c6d3b340c5 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -16,7 +16,7 @@ struct pci_driver *uncore_pci_driver; DEFINE_RAW_SPINLOCK(pci2phy_map_lock); struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head); struct pci_extra_dev *uncore_extra_pci_dev; -static int max_dies; +int __uncore_max_dies; /* mask of cpus that collect uncore events */ static cpumask_t uncore_cpu_mask; @@ -108,7 +108,7 @@ struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu * The unsigned check also catches the '-1' return value for non * existent mappings in the topology map. */ - return dieid < max_dies ? pmu->boxes[dieid] : NULL; + return dieid < uncore_max_dies() ? 
pmu->boxes[dieid] : NULL; } u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) @@ -882,7 +882,7 @@ static void uncore_free_boxes(struct intel_uncore_pmu *pmu) { int die; - for (die = 0; die < max_dies; die++) + for (die = 0; die < uncore_max_dies(); die++) kfree(pmu->boxes[die]); kfree(pmu->boxes); } @@ -923,7 +923,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) if (!pmus) return -ENOMEM; - size = max_dies * sizeof(struct intel_uncore_box *); + size = uncore_max_dies() * sizeof(struct intel_uncore_box *); for (i = 0; i < type->num_boxes; i++) { pmus[i].func_id = setid ? i : -1; @@ -1123,7 +1123,7 @@ static int __init uncore_pci_init(void) size_t size; int ret; - size = max_dies * sizeof(struct pci_extra_dev); + size = uncore_max_dies() * sizeof(struct pci_extra_dev); uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL); if (!uncore_extra_pci_dev) { ret = -ENOMEM; @@ -1552,7 +1552,8 @@ static int __init intel_uncore_init(void) if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return -ENODEV; - max_dies = topology_max_packages() * topology_max_die_per_package(); + __uncore_max_dies = + topology_max_packages() * topology_max_die_per_package(); uncore_init = (struct intel_uncore_init_fun *)id->driver_data; if (uncore_init->pci_init) { diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 7caba06c7df5..594a2fe20de9 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -182,6 +182,9 @@ int uncore_pcibus_to_physid(struct pci_bus *bus); ssize_t uncore_event_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf); +extern int __uncore_max_dies; +#define uncore_max_dies() (__uncore_max_dies) + #define INTEL_UNCORE_EVENT_DESC(_name, _config) \ { \ .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \ From bb42b3d39781d7fcd3be7f9f9bf11b6661b5fdf1 Mon Sep 17 00:00:00 2001 From: Roman Sudarikov Date: Mon, 1 Jun 2020 11:35:43 +0300 Subject: [PATCH 
019/502] perf/x86/intel/uncore: Expose an Uncore unit to IIO PMON mapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Current version supports a server line starting Intel® Xeon® Processor Scalable Family and introduces mapping for IIO Uncore units only. Other units can be added on demand. IIO stack to PMON mapping is exposed through: /sys/devices/uncore_iio_/dieX where dieX is file which holds "Segment:Root Bus" for PCIe root port, which can be monitored by that IIO PMON block. Details are explained in Documentation/ABI/testing/sysfs-devices-mapping Reported-by: kbuild test robot Signed-off-by: Alexander Antonov Signed-off-by: Roman Sudarikov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Reviewed-by: Alexander Shishkin Link: https://lkml.kernel.org/r/20200601083543.30011-4-alexander.antonov@linux.intel.com --- .../ABI/testing/sysfs-devices-mapping | 33 +++ arch/x86/events/intel/uncore.h | 9 + arch/x86/events/intel/uncore_snbep.c | 191 ++++++++++++++++++ 3 files changed, 233 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-devices-mapping diff --git a/Documentation/ABI/testing/sysfs-devices-mapping b/Documentation/ABI/testing/sysfs-devices-mapping new file mode 100644 index 000000000000..490ccfd67f12 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-devices-mapping @@ -0,0 +1,33 @@ +What: /sys/devices/uncore_iio_x/dieX +Date: February 2020 +Contact: Roman Sudarikov +Description: + Each IIO stack (PCIe root port) has its own IIO PMON block, so + each dieX file (where X is die number) holds "Segment:Root Bus" + for PCIe root port, which can be monitored by that IIO PMON + block. 
+ For example, on 4-die Xeon platform with up to 6 IIO stacks per + die and, therefore, 6 IIO PMON blocks per die, the mapping of + IIO PMON block 0 exposes as the following: + + $ ls /sys/devices/uncore_iio_0/die* + -r--r--r-- /sys/devices/uncore_iio_0/die0 + -r--r--r-- /sys/devices/uncore_iio_0/die1 + -r--r--r-- /sys/devices/uncore_iio_0/die2 + -r--r--r-- /sys/devices/uncore_iio_0/die3 + + $ tail /sys/devices/uncore_iio_0/die* + ==> /sys/devices/uncore_iio_0/die0 <== + 0000:00 + ==> /sys/devices/uncore_iio_0/die1 <== + 0000:40 + ==> /sys/devices/uncore_iio_0/die2 <== + 0000:80 + ==> /sys/devices/uncore_iio_0/die3 <== + 0000:c0 + + Which means: + IIO PMU 0 on die 0 belongs to PCI RP on bus 0x00, domain 0x0000 + IIO PMU 0 on die 1 belongs to PCI RP on bus 0x40, domain 0x0000 + IIO PMU 0 on die 2 belongs to PCI RP on bus 0x80, domain 0x0000 + IIO PMU 0 on die 3 belongs to PCI RP on bus 0xc0, domain 0x0000 diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 594a2fe20de9..105fdc69825e 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -182,6 +182,15 @@ int uncore_pcibus_to_physid(struct pci_bus *bus); ssize_t uncore_event_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf); +static inline struct intel_uncore_pmu *dev_to_uncore_pmu(struct device *dev) +{ + return container_of(dev_get_drvdata(dev), struct intel_uncore_pmu, pmu); +} + +#define to_device_attribute(n) container_of(n, struct device_attribute, attr) +#define to_dev_ext_attribute(n) container_of(n, struct dev_ext_attribute, attr) +#define attr_to_ext_attr(n) to_dev_ext_attribute(to_device_attribute(n)) + extern int __uncore_max_dies; #define uncore_max_dies() (__uncore_max_dies) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 045c2d2231d2..62e88ad919ff 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -273,6 +273,30 @@ #define SKX_CPUNODEID 
0xc0 #define SKX_GIDNIDMAP 0xd4 +/* + * The CPU_BUS_NUMBER MSR returns the values of the respective CPUBUSNO CSR + * that BIOS programmed. MSR has package scope. + * | Bit | Default | Description + * | [63] | 00h | VALID - When set, indicates the CPU bus + * numbers have been initialized. (RO) + * |[62:48]| --- | Reserved + * |[47:40]| 00h | BUS_NUM_5 — Return the bus number BIOS assigned + * CPUBUSNO(5). (RO) + * |[39:32]| 00h | BUS_NUM_4 — Return the bus number BIOS assigned + * CPUBUSNO(4). (RO) + * |[31:24]| 00h | BUS_NUM_3 — Return the bus number BIOS assigned + * CPUBUSNO(3). (RO) + * |[23:16]| 00h | BUS_NUM_2 — Return the bus number BIOS assigned + * CPUBUSNO(2). (RO) + * |[15:8] | 00h | BUS_NUM_1 — Return the bus number BIOS assigned + * CPUBUSNO(1). (RO) + * | [7:0] | 00h | BUS_NUM_0 — Return the bus number BIOS assigned + * CPUBUSNO(0). (RO) + */ +#define SKX_MSR_CPU_BUS_NUMBER 0x300 +#define SKX_MSR_CPU_BUS_VALID_BIT (1ULL << 63) +#define BUS_NUM_STRIDE 8 + /* SKX CHA */ #define SKX_CHA_MSR_PMON_BOX_FILTER_TID (0x1ffULL << 0) #define SKX_CHA_MSR_PMON_BOX_FILTER_LINK (0xfULL << 9) @@ -3612,6 +3636,170 @@ static struct intel_uncore_ops skx_uncore_iio_ops = { .read_counter = uncore_msr_read_counter, }; +static inline u8 skx_iio_stack(struct intel_uncore_pmu *pmu, int die) +{ + return pmu->type->topology[die] >> (pmu->pmu_idx * BUS_NUM_STRIDE); +} + +static umode_t +skx_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die) +{ + struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(kobj_to_dev(kobj)); + + /* Root bus 0x00 is valid only for die 0 AND pmu_idx = 0. */ + return (!skx_iio_stack(pmu, die) && pmu->pmu_idx) ? 
0 : attr->mode; +} + +static ssize_t skx_iio_mapping_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pci_bus *bus = pci_find_next_bus(NULL); + struct intel_uncore_pmu *uncore_pmu = dev_to_uncore_pmu(dev); + struct dev_ext_attribute *ea = to_dev_ext_attribute(attr); + long die = (long)ea->var; + + /* + * Current implementation is for single segment configuration hence it's + * safe to take the segment value from the first available root bus. + */ + return sprintf(buf, "%04x:%02x\n", pci_domain_nr(bus), + skx_iio_stack(uncore_pmu, die)); +} + +static int skx_msr_cpu_bus_read(int cpu, u64 *topology) +{ + u64 msr_value; + + if (rdmsrl_on_cpu(cpu, SKX_MSR_CPU_BUS_NUMBER, &msr_value) || + !(msr_value & SKX_MSR_CPU_BUS_VALID_BIT)) + return -ENXIO; + + *topology = msr_value; + + return 0; +} + +static int die_to_cpu(int die) +{ + int res = 0, cpu, current_die; + /* + * Using cpus_read_lock() to ensure cpu is not going down between + * looking at cpu_online_mask. + */ + cpus_read_lock(); + for_each_online_cpu(cpu) { + current_die = topology_logical_die_id(cpu); + if (current_die == die) { + res = cpu; + break; + } + } + cpus_read_unlock(); + return res; +} + +static int skx_iio_get_topology(struct intel_uncore_type *type) +{ + int i, ret; + struct pci_bus *bus = NULL; + + /* + * Verified single-segment environments only; disabled for multiple + * segment topologies for now except VMD domains. + * VMD domains start at 0x10000 to not clash with ACPI _SEG domains. 
+ */ + while ((bus = pci_find_next_bus(bus)) + && (!pci_domain_nr(bus) || pci_domain_nr(bus) > 0xffff)) + ; + if (bus) + return -EPERM; + + type->topology = kcalloc(uncore_max_dies(), sizeof(u64), GFP_KERNEL); + if (!type->topology) + return -ENOMEM; + + for (i = 0; i < uncore_max_dies(); i++) { + ret = skx_msr_cpu_bus_read(die_to_cpu(i), &type->topology[i]); + if (ret) { + kfree(type->topology); + type->topology = NULL; + return ret; + } + } + + return 0; +} + +static struct attribute_group skx_iio_mapping_group = { + .is_visible = skx_iio_mapping_visible, +}; + +static const struct attribute_group *skx_iio_attr_update[] = { + &skx_iio_mapping_group, + NULL, +}; + +static int skx_iio_set_mapping(struct intel_uncore_type *type) +{ + char buf[64]; + int ret; + long die = -1; + struct attribute **attrs = NULL; + struct dev_ext_attribute *eas = NULL; + + ret = skx_iio_get_topology(type); + if (ret) + return ret; + + /* One more for NULL. */ + attrs = kcalloc((uncore_max_dies() + 1), sizeof(*attrs), GFP_KERNEL); + if (!attrs) + goto err; + + eas = kcalloc(uncore_max_dies(), sizeof(*eas), GFP_KERNEL); + if (!eas) + goto err; + + for (die = 0; die < uncore_max_dies(); die++) { + sprintf(buf, "die%ld", die); + sysfs_attr_init(&eas[die].attr.attr); + eas[die].attr.attr.name = kstrdup(buf, GFP_KERNEL); + if (!eas[die].attr.attr.name) + goto err; + eas[die].attr.attr.mode = 0444; + eas[die].attr.show = skx_iio_mapping_show; + eas[die].attr.store = NULL; + eas[die].var = (void *)die; + attrs[die] = &eas[die].attr.attr; + } + skx_iio_mapping_group.attrs = attrs; + + return 0; +err: + for (; die >= 0; die--) + kfree(eas[die].attr.attr.name); + kfree(eas); + kfree(attrs); + kfree(type->topology); + type->attr_update = NULL; + return -ENOMEM; +} + +static void skx_iio_cleanup_mapping(struct intel_uncore_type *type) +{ + struct attribute **attr = skx_iio_mapping_group.attrs; + + if (!attr) + return; + + for (; *attr; attr++) + kfree((*attr)->name); + 
kfree(attr_to_ext_attr(*skx_iio_mapping_group.attrs)); + kfree(skx_iio_mapping_group.attrs); + skx_iio_mapping_group.attrs = NULL; + kfree(type->topology); +} + static struct intel_uncore_type skx_uncore_iio = { .name = "iio", .num_counters = 4, @@ -3626,6 +3814,9 @@ static struct intel_uncore_type skx_uncore_iio = { .constraints = skx_uncore_iio_constraints, .ops = &skx_uncore_iio_ops, .format_group = &skx_uncore_iio_format_group, + .attr_update = skx_iio_attr_update, + .set_mapping = skx_iio_set_mapping, + .cleanup_mapping = skx_iio_cleanup_mapping, }; enum perf_uncore_iio_freerunning_type_id { From c935cd62d3fe985d7f0ebea185d2759e8992e96f Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 17 Jun 2020 17:17:19 +1000 Subject: [PATCH 020/502] lockdep: Split header file into lockdep and lockdep_types There is a header file inclusion loop between asm-generic/bug.h and linux/kernel.h. This causes potential compile failures depending on which file is included first. One way of breaking this loop is to stop spinlock_types.h from including lockdep.h. This patch splits lockdep.h into two files for this purpose.
Signed-off-by: Herbert Xu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Sergey Senozhatsky Reviewed-by: Andy Shevchenko Acked-by: Petr Mladek Acked-by: Steven Rostedt (VMware) Link: https://lkml.kernel.org/r/E1jlSJz-0003hE-8g@fornost.hmeau.com --- include/linux/lockdep.h | 178 +----------------------------- include/linux/lockdep_types.h | 196 +++++++++++++++++++++++++++++++++ include/linux/spinlock.h | 1 + include/linux/spinlock_types.h | 2 +- 4 files changed, 200 insertions(+), 177 deletions(-) create mode 100644 include/linux/lockdep_types.h diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 8fce5c98a4b0..3b73cf84f77d 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -10,181 +10,20 @@ #ifndef __LINUX_LOCKDEP_H #define __LINUX_LOCKDEP_H +#include + struct task_struct; -struct lockdep_map; /* for sysctl */ extern int prove_locking; extern int lock_stat; -#define MAX_LOCKDEP_SUBCLASSES 8UL - -#include - -enum lockdep_wait_type { - LD_WAIT_INV = 0, /* not checked, catch all */ - - LD_WAIT_FREE, /* wait free, rcu etc.. */ - LD_WAIT_SPIN, /* spin loops, raw_spinlock_t etc.. */ - -#ifdef CONFIG_PROVE_RAW_LOCK_NESTING - LD_WAIT_CONFIG, /* CONFIG_PREEMPT_LOCK, spinlock_t etc.. */ -#else - LD_WAIT_CONFIG = LD_WAIT_SPIN, -#endif - LD_WAIT_SLEEP, /* sleeping locks, mutex_t etc.. */ - - LD_WAIT_MAX, /* must be last */ -}; - #ifdef CONFIG_LOCKDEP #include -#include #include #include -/* - * We'd rather not expose kernel/lockdep_states.h this wide, but we do need - * the total number of states... :-( - */ -#define XXX_LOCK_USAGE_STATES (1+2*4) - -/* - * NR_LOCKDEP_CACHING_CLASSES ... Number of classes - * cached in the instance of lockdep_map - * - * Currently main class (subclass == 0) and signle depth subclass - * are cached in lockdep_map. This optimization is mainly targeting - * on rq->lock. double_rq_lock() acquires this highly competitive with - * single depth. 
- */ -#define NR_LOCKDEP_CACHING_CLASSES 2 - -/* - * A lockdep key is associated with each lock object. For static locks we use - * the lock address itself as the key. Dynamically allocated lock objects can - * have a statically or dynamically allocated key. Dynamically allocated lock - * keys must be registered before being used and must be unregistered before - * the key memory is freed. - */ -struct lockdep_subclass_key { - char __one_byte; -} __attribute__ ((__packed__)); - -/* hash_entry is used to keep track of dynamically allocated keys. */ -struct lock_class_key { - union { - struct hlist_node hash_entry; - struct lockdep_subclass_key subkeys[MAX_LOCKDEP_SUBCLASSES]; - }; -}; - -extern struct lock_class_key __lockdep_no_validate__; - -struct lock_trace; - -#define LOCKSTAT_POINTS 4 - -/* - * The lock-class itself. The order of the structure members matters. - * reinit_class() zeroes the key member and all subsequent members. - */ -struct lock_class { - /* - * class-hash: - */ - struct hlist_node hash_entry; - - /* - * Entry in all_lock_classes when in use. Entry in free_lock_classes - * when not in use. Instances that are being freed are on one of the - * zapped_classes lists. - */ - struct list_head lock_entry; - - /* - * These fields represent a directed graph of lock dependencies, - * to every node we attach a list of "forward" and a list of - * "backward" graph nodes. 
- */ - struct list_head locks_after, locks_before; - - const struct lockdep_subclass_key *key; - unsigned int subclass; - unsigned int dep_gen_id; - - /* - * IRQ/softirq usage tracking bits: - */ - unsigned long usage_mask; - const struct lock_trace *usage_traces[XXX_LOCK_USAGE_STATES]; - - /* - * Generation counter, when doing certain classes of graph walking, - * to ensure that we check one node only once: - */ - int name_version; - const char *name; - - short wait_type_inner; - short wait_type_outer; - -#ifdef CONFIG_LOCK_STAT - unsigned long contention_point[LOCKSTAT_POINTS]; - unsigned long contending_point[LOCKSTAT_POINTS]; -#endif -} __no_randomize_layout; - -#ifdef CONFIG_LOCK_STAT -struct lock_time { - s64 min; - s64 max; - s64 total; - unsigned long nr; -}; - -enum bounce_type { - bounce_acquired_write, - bounce_acquired_read, - bounce_contended_write, - bounce_contended_read, - nr_bounce_types, - - bounce_acquired = bounce_acquired_write, - bounce_contended = bounce_contended_write, -}; - -struct lock_class_stats { - unsigned long contention_point[LOCKSTAT_POINTS]; - unsigned long contending_point[LOCKSTAT_POINTS]; - struct lock_time read_waittime; - struct lock_time write_waittime; - struct lock_time read_holdtime; - struct lock_time write_holdtime; - unsigned long bounces[nr_bounce_types]; -}; - -struct lock_class_stats lock_stats(struct lock_class *class); -void clear_lock_stats(struct lock_class *class); -#endif - -/* - * Map the lock object (the lock instance) to the lock-class object. 
- * This is embedded into specific lock instances: - */ -struct lockdep_map { - struct lock_class_key *key; - struct lock_class *class_cache[NR_LOCKDEP_CACHING_CLASSES]; - const char *name; - short wait_type_outer; /* can be taken in this context */ - short wait_type_inner; /* presents this context */ -#ifdef CONFIG_LOCK_STAT - int cpu; - unsigned long ip; -#endif -}; - static inline void lockdep_copy_map(struct lockdep_map *to, struct lockdep_map *from) { @@ -440,8 +279,6 @@ static inline void lock_set_subclass(struct lockdep_map *lock, extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip); -struct pin_cookie { unsigned int val; }; - #define NIL_COOKIE (struct pin_cookie){ .val = 0U, } extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock); @@ -520,10 +357,6 @@ static inline void lockdep_set_selftest_task(struct task_struct *task) # define lockdep_reset() do { debug_locks = 1; } while (0) # define lockdep_free_key_range(start, size) do { } while (0) # define lockdep_sys_exit() do { } while (0) -/* - * The class key takes no space if lockdep is disabled: - */ -struct lock_class_key { }; static inline void lockdep_register_key(struct lock_class_key *key) { @@ -533,11 +366,6 @@ static inline void lockdep_unregister_key(struct lock_class_key *key) { } -/* - * The lockdep_map takes no space if lockdep is disabled: - */ -struct lockdep_map { }; - #define lockdep_depth(tsk) (0) #define lockdep_is_held_type(l, r) (1) @@ -549,8 +377,6 @@ struct lockdep_map { }; #define lockdep_recursing(tsk) (0) -struct pin_cookie { }; - #define NIL_COOKIE (struct pin_cookie){ } #define lockdep_pin_lock(l) ({ struct pin_cookie cookie = { }; cookie; }) diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h new file mode 100644 index 000000000000..7b9350624577 --- /dev/null +++ b/include/linux/lockdep_types.h @@ -0,0 +1,196 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Runtime locking correctness validator + * + * Copyright (C) 2006,2007 
Red Hat, Inc., Ingo Molnar + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * + * see Documentation/locking/lockdep-design.rst for more details. + */ +#ifndef __LINUX_LOCKDEP_TYPES_H +#define __LINUX_LOCKDEP_TYPES_H + +#include + +#define MAX_LOCKDEP_SUBCLASSES 8UL + +enum lockdep_wait_type { + LD_WAIT_INV = 0, /* not checked, catch all */ + + LD_WAIT_FREE, /* wait free, rcu etc.. */ + LD_WAIT_SPIN, /* spin loops, raw_spinlock_t etc.. */ + +#ifdef CONFIG_PROVE_RAW_LOCK_NESTING + LD_WAIT_CONFIG, /* CONFIG_PREEMPT_LOCK, spinlock_t etc.. */ +#else + LD_WAIT_CONFIG = LD_WAIT_SPIN, +#endif + LD_WAIT_SLEEP, /* sleeping locks, mutex_t etc.. */ + + LD_WAIT_MAX, /* must be last */ +}; + +#ifdef CONFIG_LOCKDEP + +#include + +/* + * We'd rather not expose kernel/lockdep_states.h this wide, but we do need + * the total number of states... :-( + */ +#define XXX_LOCK_USAGE_STATES (1+2*4) + +/* + * NR_LOCKDEP_CACHING_CLASSES ... Number of classes + * cached in the instance of lockdep_map + * + * Currently main class (subclass == 0) and signle depth subclass + * are cached in lockdep_map. This optimization is mainly targeting + * on rq->lock. double_rq_lock() acquires this highly competitive with + * single depth. + */ +#define NR_LOCKDEP_CACHING_CLASSES 2 + +/* + * A lockdep key is associated with each lock object. For static locks we use + * the lock address itself as the key. Dynamically allocated lock objects can + * have a statically or dynamically allocated key. Dynamically allocated lock + * keys must be registered before being used and must be unregistered before + * the key memory is freed. + */ +struct lockdep_subclass_key { + char __one_byte; +} __attribute__ ((__packed__)); + +/* hash_entry is used to keep track of dynamically allocated keys. 
*/ +struct lock_class_key { + union { + struct hlist_node hash_entry; + struct lockdep_subclass_key subkeys[MAX_LOCKDEP_SUBCLASSES]; + }; +}; + +extern struct lock_class_key __lockdep_no_validate__; + +struct lock_trace; + +#define LOCKSTAT_POINTS 4 + +/* + * The lock-class itself. The order of the structure members matters. + * reinit_class() zeroes the key member and all subsequent members. + */ +struct lock_class { + /* + * class-hash: + */ + struct hlist_node hash_entry; + + /* + * Entry in all_lock_classes when in use. Entry in free_lock_classes + * when not in use. Instances that are being freed are on one of the + * zapped_classes lists. + */ + struct list_head lock_entry; + + /* + * These fields represent a directed graph of lock dependencies, + * to every node we attach a list of "forward" and a list of + * "backward" graph nodes. + */ + struct list_head locks_after, locks_before; + + const struct lockdep_subclass_key *key; + unsigned int subclass; + unsigned int dep_gen_id; + + /* + * IRQ/softirq usage tracking bits: + */ + unsigned long usage_mask; + const struct lock_trace *usage_traces[XXX_LOCK_USAGE_STATES]; + + /* + * Generation counter, when doing certain classes of graph walking, + * to ensure that we check one node only once: + */ + int name_version; + const char *name; + + short wait_type_inner; + short wait_type_outer; + +#ifdef CONFIG_LOCK_STAT + unsigned long contention_point[LOCKSTAT_POINTS]; + unsigned long contending_point[LOCKSTAT_POINTS]; +#endif +} __no_randomize_layout; + +#ifdef CONFIG_LOCK_STAT +struct lock_time { + s64 min; + s64 max; + s64 total; + unsigned long nr; +}; + +enum bounce_type { + bounce_acquired_write, + bounce_acquired_read, + bounce_contended_write, + bounce_contended_read, + nr_bounce_types, + + bounce_acquired = bounce_acquired_write, + bounce_contended = bounce_contended_write, +}; + +struct lock_class_stats { + unsigned long contention_point[LOCKSTAT_POINTS]; + unsigned long contending_point[LOCKSTAT_POINTS]; + 
struct lock_time read_waittime; + struct lock_time write_waittime; + struct lock_time read_holdtime; + struct lock_time write_holdtime; + unsigned long bounces[nr_bounce_types]; +}; + +struct lock_class_stats lock_stats(struct lock_class *class); +void clear_lock_stats(struct lock_class *class); +#endif + +/* + * Map the lock object (the lock instance) to the lock-class object. + * This is embedded into specific lock instances: + */ +struct lockdep_map { + struct lock_class_key *key; + struct lock_class *class_cache[NR_LOCKDEP_CACHING_CLASSES]; + const char *name; + short wait_type_outer; /* can be taken in this context */ + short wait_type_inner; /* presents this context */ +#ifdef CONFIG_LOCK_STAT + int cpu; + unsigned long ip; +#endif +}; + +struct pin_cookie { unsigned int val; }; + +#else /* !CONFIG_LOCKDEP */ + +/* + * The class key takes no space if lockdep is disabled: + */ +struct lock_class_key { }; + +/* + * The lockdep_map takes no space if lockdep is disabled: + */ +struct lockdep_map { }; + +struct pin_cookie { }; + +#endif /* !LOCKDEP */ + +#endif /* __LINUX_LOCKDEP_TYPES_H */ diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index d3770b3f9d9a..f2f12d746dbd 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -56,6 +56,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h index 6102e6bff3ae..b981caafe8bf 100644 --- a/include/linux/spinlock_types.h +++ b/include/linux/spinlock_types.h @@ -15,7 +15,7 @@ # include #endif -#include +#include typedef struct raw_spinlock { arch_spinlock_t raw_lock; From 5769a351b89cd4d82016f18fa5f6c4077403564d Mon Sep 17 00:00:00 2001 From: Jiufei Xue Date: Wed, 17 Jun 2020 17:53:55 +0800 Subject: [PATCH 021/502] io_uring: change the poll type to be 32-bits poll events should be 32-bits to cover EPOLLEXCLUSIVE. Explicit word-swap the poll32_events for big endian to make sure the ABI is not changed. 
We call this feature IORING_FEAT_POLL_32BITS, applications who want to use EPOLLEXCLUSIVE should check the feature bit first. Signed-off-by: Jiufei Xue Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 +++++++++---- include/uapi/linux/io_uring.h | 4 +++- tools/io_uring/liburing.h | 6 +++++- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a78201b96179..0eb063daa9b5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4589,7 +4589,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_poll_iocb *poll = &req->poll; - u16 events; + u32 events; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4598,7 +4598,10 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe if (!poll->file) return -EBADF; - events = READ_ONCE(sqe->poll_events); + events = READ_ONCE(sqe->poll32_events); +#ifdef __BIG_ENDIAN + events = swahw32(events); +#endif poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; io_get_req_task(req); @@ -7928,7 +7931,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | - IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL; + IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | + IORING_FEAT_POLL_32BITS; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; @@ -8217,7 +8221,8 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags); BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags); BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags); - BUILD_BUG_SQE_ELEM(28, __u16, poll_events); + BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events); + BUILD_BUG_SQE_ELEM(28, __u32, poll32_events); BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags); BUILD_BUG_SQE_ELEM(28, __u32, 
msg_flags); BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 92c22699a5a7..8d033961cb78 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -31,7 +31,8 @@ struct io_uring_sqe { union { __kernel_rwf_t rw_flags; __u32 fsync_flags; - __u16 poll_events; + __u16 poll_events; /* compatibility */ + __u32 poll32_events; /* word-reversed for BE */ __u32 sync_range_flags; __u32 msg_flags; __u32 timeout_flags; @@ -248,6 +249,7 @@ struct io_uring_params { #define IORING_FEAT_RW_CUR_POS (1U << 3) #define IORING_FEAT_CUR_PERSONALITY (1U << 4) #define IORING_FEAT_FAST_POLL (1U << 5) +#define IORING_FEAT_POLL_32BITS (1U << 6) /* * io_uring_register(2) opcodes and arguments diff --git a/tools/io_uring/liburing.h b/tools/io_uring/liburing.h index 5f305c86b892..28a837b6069d 100644 --- a/tools/io_uring/liburing.h +++ b/tools/io_uring/liburing.h @@ -10,6 +10,7 @@ extern "C" { #include #include "../../include/uapi/linux/io_uring.h" #include +#include #include "barrier.h" /* @@ -145,11 +146,14 @@ static inline void io_uring_prep_write_fixed(struct io_uring_sqe *sqe, int fd, } static inline void io_uring_prep_poll_add(struct io_uring_sqe *sqe, int fd, - short poll_mask) + unsigned poll_mask) { memset(sqe, 0, sizeof(*sqe)); sqe->opcode = IORING_OP_POLL_ADD; sqe->fd = fd; +#if __BYTE_ORDER == __BIG_ENDIAN + poll_mask = __swahw32(poll_mask); +#endif sqe->poll_events = poll_mask; } From a31eb4a2f1650fa578082ad9e9845487ecd90abe Mon Sep 17 00:00:00 2001 From: Jiufei Xue Date: Wed, 17 Jun 2020 17:53:56 +0800 Subject: [PATCH 022/502] io_uring: use EPOLLEXCLUSIVE flag to avoid thundering herd type behavior Applications can pass this flag in to avoid accept thundering herd. 
Signed-off-by: Jiufei Xue Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0eb063daa9b5..311e8038ae58 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4245,7 +4245,11 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, pt->error = 0; poll->head = head; - add_wait_queue(head, &poll->wait); + + if (poll->events & EPOLLEXCLUSIVE) + add_wait_queue_exclusive(head, &poll->wait); + else + add_wait_queue(head, &poll->wait); } static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, @@ -4602,7 +4606,8 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe #ifdef __BIG_ENDIAN events = swahw32(events); #endif - poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; + poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP | + (events & EPOLLEXCLUSIVE); io_get_req_task(req); return 0; From a087e2b519929152fdde8299457e32d5a8994a7c Mon Sep 17 00:00:00 2001 From: Bijan Mottahedeh Date: Tue, 16 Jun 2020 16:36:07 -0700 Subject: [PATCH 023/502] io_uring: add wrappers for memory accounting Facilitate separation of locked memory usage reporting vs. limiting for upcoming patches. No functional changes. 
Signed-off-by: Bijan Mottahedeh [axboe: kill unnecessary () around return in io_account_mem()] Signed-off-by: Jens Axboe --- fs/io_uring.c | 48 ++++++++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 311e8038ae58..9db9f09499d1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6968,12 +6968,14 @@ err: return ret; } -static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages) +static inline void __io_unaccount_mem(struct user_struct *user, + unsigned long nr_pages) { atomic_long_sub(nr_pages, &user->locked_vm); } -static int io_account_mem(struct user_struct *user, unsigned long nr_pages) +static inline int __io_account_mem(struct user_struct *user, + unsigned long nr_pages) { unsigned long page_limit, cur_pages, new_pages; @@ -6991,6 +6993,20 @@ static int io_account_mem(struct user_struct *user, unsigned long nr_pages) return 0; } +static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) +{ + if (ctx->account_mem) + __io_unaccount_mem(ctx->user, nr_pages); +} + +static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) +{ + if (ctx->account_mem) + return __io_account_mem(ctx->user, nr_pages); + + return 0; +} + static void io_mem_free(void *ptr) { struct page *page; @@ -7065,8 +7081,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) for (j = 0; j < imu->nr_bvecs; j++) unpin_user_page(imu->bvec[j].bv_page); - if (ctx->account_mem) - io_unaccount_mem(ctx->user, imu->nr_bvecs); + io_unaccount_mem(ctx, imu->nr_bvecs); kvfree(imu->bvec); imu->nr_bvecs = 0; } @@ -7149,11 +7164,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, start = ubuf >> PAGE_SHIFT; nr_pages = end - start; - if (ctx->account_mem) { - ret = io_account_mem(ctx->user, nr_pages); - if (ret) - goto err; - } + ret = io_account_mem(ctx, nr_pages); + if (ret) + goto err; ret = 0; if (!pages || nr_pages > got_pages) 
{ @@ -7166,8 +7179,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, GFP_KERNEL); if (!pages || !vmas) { ret = -ENOMEM; - if (ctx->account_mem) - io_unaccount_mem(ctx->user, nr_pages); + io_unaccount_mem(ctx, nr_pages); goto err; } got_pages = nr_pages; @@ -7177,8 +7189,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, GFP_KERNEL); ret = -ENOMEM; if (!imu->bvec) { - if (ctx->account_mem) - io_unaccount_mem(ctx->user, nr_pages); + io_unaccount_mem(ctx, nr_pages); goto err; } @@ -7209,8 +7220,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, */ if (pret > 0) unpin_user_pages(pages, pret); - if (ctx->account_mem) - io_unaccount_mem(ctx->user, nr_pages); + io_unaccount_mem(ctx, nr_pages); kvfree(imu->bvec); goto err; } @@ -7315,9 +7325,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) io_mem_free(ctx->sq_sqes); percpu_ref_exit(&ctx->refs); - if (ctx->account_mem) - io_unaccount_mem(ctx->user, - ring_pages(ctx->sq_entries, ctx->cq_entries)); + io_unaccount_mem(ctx, ring_pages(ctx->sq_entries, ctx->cq_entries)); free_uid(ctx->user); put_cred(ctx->creds); kfree(ctx->cancel_hash); @@ -7887,7 +7895,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, account_mem = !capable(CAP_IPC_LOCK); if (account_mem) { - ret = io_account_mem(user, + ret = __io_account_mem(user, ring_pages(p->sq_entries, p->cq_entries)); if (ret) { free_uid(user); @@ -7898,7 +7906,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, ctx = io_ring_ctx_alloc(p); if (!ctx) { if (account_mem) - io_unaccount_mem(user, ring_pages(p->sq_entries, + __io_unaccount_mem(user, ring_pages(p->sq_entries, p->cq_entries)); free_uid(user); return -ENOMEM; From aad5d8da1b301fe399d65f2dcb84df2ec60caaa3 Mon Sep 17 00:00:00 2001 From: Bijan Mottahedeh Date: Tue, 16 Jun 2020 16:36:08 -0700 Subject: [PATCH 024/502] io_uring: rename ctx->account_mem field Rename account_mem to 
limit_mem to clarify its purpose. Signed-off-by: Bijan Mottahedeh Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 9db9f09499d1..fcaf9eee3420 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -226,7 +226,7 @@ struct io_ring_ctx { struct { unsigned int flags; unsigned int compat: 1; - unsigned int account_mem: 1; + unsigned int limit_mem: 1; unsigned int cq_overflow_flushed: 1; unsigned int drain_next: 1; unsigned int eventfd_async: 1; @@ -6995,13 +6995,13 @@ static inline int __io_account_mem(struct user_struct *user, static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { - if (ctx->account_mem) + if (ctx->limit_mem) __io_unaccount_mem(ctx->user, nr_pages); } static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { - if (ctx->account_mem) + if (ctx->limit_mem) return __io_account_mem(ctx->user, nr_pages); return 0; @@ -7853,7 +7853,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, { struct user_struct *user = NULL; struct io_ring_ctx *ctx; - bool account_mem; + bool limit_mem; int ret; if (!entries) @@ -7892,9 +7892,9 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, } user = get_uid(current_user()); - account_mem = !capable(CAP_IPC_LOCK); + limit_mem = !capable(CAP_IPC_LOCK); - if (account_mem) { + if (limit_mem) { ret = __io_account_mem(user, ring_pages(p->sq_entries, p->cq_entries)); if (ret) { @@ -7905,14 +7905,14 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, ctx = io_ring_ctx_alloc(p); if (!ctx) { - if (account_mem) + if (limit_mem) __io_unaccount_mem(user, ring_pages(p->sq_entries, p->cq_entries)); free_uid(user); return -ENOMEM; } ctx->compat = in_compat_syscall(); - ctx->account_mem = account_mem; + ctx->limit_mem = limit_mem; ctx->user = user; ctx->creds = get_current_cred(); From 
309758254ea62e07471abcaeca5b5c2173f4ebc2 Mon Sep 17 00:00:00 2001 From: Bijan Mottahedeh Date: Tue, 16 Jun 2020 16:36:09 -0700 Subject: [PATCH 025/502] io_uring: report pinned memory usage Report pinned memory usage always, regardless of whether locked memory limit is enforced. Signed-off-by: Bijan Mottahedeh Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index fcaf9eee3420..5ea55de3edef 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6997,12 +6997,23 @@ static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { if (ctx->limit_mem) __io_unaccount_mem(ctx->user, nr_pages); + + if (ctx->sqo_mm) + atomic64_sub(nr_pages, &ctx->sqo_mm->pinned_vm); } static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { - if (ctx->limit_mem) - return __io_account_mem(ctx->user, nr_pages); + int ret; + + if (ctx->limit_mem) { + ret = __io_account_mem(ctx->user, nr_pages); + if (ret) + return ret; + } + + if (ctx->sqo_mm) + atomic64_add(nr_pages, &ctx->sqo_mm->pinned_vm); return 0; } @@ -7304,8 +7315,10 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx) static void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_finish_async(ctx); - if (ctx->sqo_mm) + if (ctx->sqo_mm) { mmdrop(ctx->sqo_mm); + ctx->sqo_mm = NULL; + } io_iopoll_reap_events(ctx); io_sqe_buffer_unregister(ctx); @@ -7912,7 +7925,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, return -ENOMEM; } ctx->compat = in_compat_syscall(); - ctx->limit_mem = limit_mem; ctx->user = user; ctx->creds = get_current_cred(); @@ -7960,6 +7972,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, goto err; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); + io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries)); + ctx->limit_mem = limit_mem; return ret; err: io_ring_ctx_wait_and_kill(ctx); From 
2e0464d48f32a9e78e2aa85cbbedc77ecbb6ed60 Mon Sep 17 00:00:00 2001 From: Bijan Mottahedeh Date: Tue, 16 Jun 2020 16:36:10 -0700 Subject: [PATCH 026/502] io_uring: separate reporting of ring pages from registered pages Ring pages are not pinned so it is more appropriate to report them as locked. Signed-off-by: Bijan Mottahedeh Signed-off-by: Jens Axboe --- fs/io_uring.c | 43 ++++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5ea55de3edef..10b293780703 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -880,6 +880,11 @@ static const struct io_op_def io_op_defs[] = { }, }; +enum io_mem_account { + ACCT_LOCKED, + ACCT_PINNED, +}; + static void io_wq_submit_work(struct io_wq_work **workptr); static void io_cqring_fill_event(struct io_kiocb *req, long res); static void io_put_req(struct io_kiocb *req); @@ -6993,16 +6998,22 @@ static inline int __io_account_mem(struct user_struct *user, return 0; } -static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) +static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages, + enum io_mem_account acct) { if (ctx->limit_mem) __io_unaccount_mem(ctx->user, nr_pages); - if (ctx->sqo_mm) - atomic64_sub(nr_pages, &ctx->sqo_mm->pinned_vm); + if (ctx->sqo_mm) { + if (acct == ACCT_LOCKED) + ctx->sqo_mm->locked_vm -= nr_pages; + else if (acct == ACCT_PINNED) + atomic64_sub(nr_pages, &ctx->sqo_mm->pinned_vm); + } } -static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) +static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages, + enum io_mem_account acct) { int ret; @@ -7012,8 +7023,12 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) return ret; } - if (ctx->sqo_mm) - atomic64_add(nr_pages, &ctx->sqo_mm->pinned_vm); + if (ctx->sqo_mm) { + if (acct == ACCT_LOCKED) + ctx->sqo_mm->locked_vm += nr_pages; + else if (acct == ACCT_PINNED) + 
atomic64_add(nr_pages, &ctx->sqo_mm->pinned_vm); + } return 0; } @@ -7092,7 +7107,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) for (j = 0; j < imu->nr_bvecs; j++) unpin_user_page(imu->bvec[j].bv_page); - io_unaccount_mem(ctx, imu->nr_bvecs); + io_unaccount_mem(ctx, imu->nr_bvecs, ACCT_PINNED); kvfree(imu->bvec); imu->nr_bvecs = 0; } @@ -7175,7 +7190,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, start = ubuf >> PAGE_SHIFT; nr_pages = end - start; - ret = io_account_mem(ctx, nr_pages); + ret = io_account_mem(ctx, nr_pages, ACCT_PINNED); if (ret) goto err; @@ -7190,7 +7205,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, GFP_KERNEL); if (!pages || !vmas) { ret = -ENOMEM; - io_unaccount_mem(ctx, nr_pages); + io_unaccount_mem(ctx, nr_pages, ACCT_PINNED); goto err; } got_pages = nr_pages; @@ -7200,7 +7215,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, GFP_KERNEL); ret = -ENOMEM; if (!imu->bvec) { - io_unaccount_mem(ctx, nr_pages); + io_unaccount_mem(ctx, nr_pages, ACCT_PINNED); goto err; } @@ -7231,7 +7246,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, */ if (pret > 0) unpin_user_pages(pages, pret); - io_unaccount_mem(ctx, nr_pages); + io_unaccount_mem(ctx, nr_pages, ACCT_PINNED); kvfree(imu->bvec); goto err; } @@ -7338,7 +7353,8 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) io_mem_free(ctx->sq_sqes); percpu_ref_exit(&ctx->refs); - io_unaccount_mem(ctx, ring_pages(ctx->sq_entries, ctx->cq_entries)); + io_unaccount_mem(ctx, ring_pages(ctx->sq_entries, ctx->cq_entries), + ACCT_LOCKED); free_uid(ctx->user); put_cred(ctx->creds); kfree(ctx->cancel_hash); @@ -7972,7 +7988,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, goto err; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); - io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries)); + 
io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries), + ACCT_LOCKED); ctx->limit_mem = limit_mem; return ret; err: From 5a473e8311b582a40c10409a0f4bb39f42aa8123 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 4 Jun 2020 11:23:39 -0600 Subject: [PATCH 027/502] block: provide plug based way of signaling forced no-wait semantics Provide a way for the caller to specify that IO should be marked with REQ_NOWAIT to avoid blocking on allocation. Signed-off-by: Jens Axboe --- block/blk-core.c | 6 ++++++ include/linux/blkdev.h | 1 + 2 files changed, 7 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index 03252af8c82c..62a4904db921 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -958,6 +958,7 @@ generic_make_request_checks(struct bio *bio) struct request_queue *q; int nr_sectors = bio_sectors(bio); blk_status_t status = BLK_STS_IOERR; + struct blk_plug *plug; char b[BDEVNAME_SIZE]; might_sleep(); @@ -971,6 +972,10 @@ generic_make_request_checks(struct bio *bio) goto end_io; } + plug = blk_mq_plug(q, bio); + if (plug && plug->nowait) + bio->bi_opf |= REQ_NOWAIT; + /* * For a REQ_NOWAIT based request, return -EOPNOTSUPP * if queue is not a request based queue. 
@@ -1800,6 +1805,7 @@ void blk_start_plug(struct blk_plug *plug) INIT_LIST_HEAD(&plug->cb_list); plug->rq_count = 0; plug->multiple_queues = false; + plug->nowait = false; /* * Store ordering should not be needed here, since a potential diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8fd900998b4e..6e067dca94cf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1189,6 +1189,7 @@ struct blk_plug { struct list_head cb_list; /* md requires an unplug callback */ unsigned short rq_count; bool multiple_queues; + bool nowait; }; #define BLK_MAX_REQUEST_COUNT 16 #define BLK_PLUG_FLUSH_SIZE (128 * 1024) From ac8691c415e0ce0b8734cb6d9df2df18608eebed Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 Jun 2020 08:30:41 -0600 Subject: [PATCH 028/502] io_uring: always plug for any number of IOs Currently we only plug if we're doing more than two requests. We're going to be relying on always having the plug there to pass down information, so plug unconditionally. 
Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 10b293780703..de894455f6bd 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -676,7 +676,6 @@ struct io_kiocb { }; }; -#define IO_PLUG_THRESHOLD 2 #define IO_IOPOLL_BATCH 8 struct io_submit_state { @@ -5914,7 +5913,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, struct file *ring_file, int ring_fd) { - struct io_submit_state state, *statep = NULL; + struct io_submit_state state; struct io_kiocb *link = NULL; int i, submitted = 0; @@ -5931,10 +5930,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, if (!percpu_ref_tryget_many(&ctx->refs, nr)) return -EAGAIN; - if (nr > IO_PLUG_THRESHOLD) { - io_submit_state_start(&state, nr); - statep = &state; - } + io_submit_state_start(&state, nr); ctx->ring_fd = ring_fd; ctx->ring_file = ring_file; @@ -5949,14 +5945,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, io_consume_sqe(ctx); break; } - req = io_alloc_req(ctx, statep); + req = io_alloc_req(ctx, &state); if (unlikely(!req)) { if (!submitted) submitted = -EAGAIN; break; } - err = io_init_req(ctx, req, sqe, statep); + err = io_init_req(ctx, req, sqe, &state); io_consume_sqe(ctx); /* will complete beyond this point, count as submitted */ submitted++; @@ -5982,8 +5978,7 @@ fail_req: } if (link) io_queue_link_head(link); - if (statep) - io_submit_state_end(&state); + io_submit_state_end(&state); /* Commit SQ ring head once we've consumed and submitted all SQEs */ io_commit_sqring(ctx); From 4503b7676a2e0abe69c2f2c0d8b03aec53f2f048 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 Jun 2020 10:00:27 -0600 Subject: [PATCH 029/502] io_uring: catch -EIO from buffered issue request failure -EIO bubbles up like -EAGAIN if we fail to allocate a request at the lower level. 
Play it safe and treat it like -EAGAIN in terms of sync retry, to avoid passing back an errant -EIO. Catch some of these early for block based file, as non-mq devices generally do not support NOWAIT. That saves us some overhead by not first trying, then retrying from async context. We can go straight to async punt instead. Signed-off-by: Jens Axboe --- fs/io_uring.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index de894455f6bd..c5ee6d1a92d3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2088,6 +2088,15 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) return state->file; } +static bool io_bdev_nowait(struct block_device *bdev) +{ +#ifdef CONFIG_BLOCK + return !bdev || queue_is_mq(bdev_get_queue(bdev)); +#else + return true; +#endif +} + /* * If we tracked the file through the SCM inflight mechanism, we could support * any file. For now, just ensure that anything potentially problematic is done @@ -2097,10 +2106,19 @@ static bool io_file_supports_async(struct file *file, int rw) { umode_t mode = file_inode(file)->i_mode; - if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode)) - return true; - if (S_ISREG(mode) && file->f_op != &io_uring_fops) + if (S_ISBLK(mode)) { + if (io_bdev_nowait(file->f_inode->i_bdev)) + return true; + return false; + } + if (S_ISCHR(mode) || S_ISSOCK(mode)) return true; + if (S_ISREG(mode)) { + if (io_bdev_nowait(file->f_inode->i_sb->s_bdev) && + file->f_op != &io_uring_fops) + return true; + return false; + } /* any ->read/write should understand O_NONBLOCK */ if (file->f_flags & O_NONBLOCK) @@ -2650,7 +2668,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock) iov_count = iov_iter_count(&iter); ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); if (!ret) { - ssize_t ret2; + ssize_t ret2 = 0; if (req->file->f_op->read_iter) ret2 = call_read_iter(req->file, kiocb, &iter); @@ -2658,7 +2676,7 @@ static 
int io_read(struct io_kiocb *req, bool force_nonblock) ret2 = loop_rw_iter(READ, req->file, kiocb, &iter); /* Catch -EAGAIN return for forced non-blocking submission */ - if (!force_nonblock || ret2 != -EAGAIN) { + if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) { kiocb_done(kiocb, ret2); } else { copy_iov: From b63534c41e20b474483b4ddf47efc858c17352e0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 4 Jun 2020 11:28:00 -0600 Subject: [PATCH 030/502] io_uring: re-issue block requests that failed because of resources Mark the plug with nowait == true, which will cause requests to avoid blocking on request allocation. If they do, we catch them and reissue them from a task_work based handler. Normally we can catch -EAGAIN directly, but the hard case is for split requests. As an example, the application issues a 512KB request. The block core will split this into 128KB if that's the max size for the device. The first request issues just fine, but we run into -EAGAIN for some latter splits for the same request. As the bio is split, we don't get to see the -EAGAIN until one of the actual reads complete, and hence we cannot handle it inline as part of submission. This does potentially cause re-reads of parts of the range, as the whole request is reissued. There's currently no better way to handle this. 
Signed-off-by: Jens Axboe --- fs/io_uring.c | 148 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 124 insertions(+), 24 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c5ee6d1a92d3..f3dbf83fabf3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -900,6 +900,13 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe); +static ssize_t io_import_iovec(int rw, struct io_kiocb *req, + struct iovec **iovec, struct iov_iter *iter, + bool needs_lock); +static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, + struct iovec *iovec, struct iovec *fast_iov, + struct iov_iter *iter); + static struct kmem_cache *req_cachep; static const struct file_operations io_uring_fops; @@ -1978,12 +1985,115 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res) __io_cqring_add_event(req, res, cflags); } +static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) +{ + struct mm_struct *mm = current->mm; + + if (mm) { + kthread_unuse_mm(mm); + mmput(mm); + } +} + +static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + if (io_op_defs[req->opcode].needs_mm && !current->mm) { + if (unlikely(!mmget_not_zero(ctx->sqo_mm))) + return -EFAULT; + kthread_use_mm(ctx->sqo_mm); + } + + return 0; +} + +#ifdef CONFIG_BLOCK +static bool io_resubmit_prep(struct io_kiocb *req, int error) +{ + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + ssize_t ret = -ECANCELED; + struct iov_iter iter; + int rw; + + if (error) { + ret = error; + goto end_req; + } + + switch (req->opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_READ: + rw = READ; + break; + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: + rw = WRITE; + break; + default: + printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n", + req->opcode); + goto end_req; + } + + ret = 
io_import_iovec(rw, req, &iovec, &iter, false); + if (ret < 0) + goto end_req; + ret = io_setup_async_rw(req, ret, iovec, inline_vecs, &iter); + if (!ret) + return true; + kfree(iovec); +end_req: + io_cqring_add_event(req, ret); + req_set_fail_links(req); + io_put_req(req); + return false; +} + +static void io_rw_resubmit(struct callback_head *cb) +{ + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct io_ring_ctx *ctx = req->ctx; + int err; + + __set_current_state(TASK_RUNNING); + + err = io_sq_thread_acquire_mm(ctx, req); + + if (io_resubmit_prep(req, err)) { + refcount_inc(&req->refs); + io_queue_async_work(req); + } +} +#endif + +static bool io_rw_reissue(struct io_kiocb *req, long res) +{ +#ifdef CONFIG_BLOCK + struct task_struct *tsk; + int ret; + + if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker()) + return false; + + tsk = req->task; + init_task_work(&req->task_work, io_rw_resubmit); + ret = task_work_add(tsk, &req->task_work, true); + if (!ret) + return true; +#endif + return false; +} + static void io_complete_rw(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); - io_complete_rw_common(kiocb, res); - io_put_req(req); + if (!io_rw_reissue(req, res)) { + io_complete_rw_common(kiocb, res); + io_put_req(req); + } } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) @@ -2169,6 +2279,9 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (kiocb->ki_flags & IOCB_NOWAIT) req->flags |= REQ_F_NOWAIT; + if (kiocb->ki_flags & IOCB_DIRECT) + io_get_req_task(req); + if (force_nonblock) kiocb->ki_flags |= IOCB_NOWAIT; @@ -2668,6 +2781,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock) iov_count = iov_iter_count(&iter); ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); if (!ret) { + unsigned long nr_segs = iter.nr_segs; ssize_t ret2 = 0; if (req->file->f_op->read_iter) @@ 
-2679,6 +2793,8 @@ static int io_read(struct io_kiocb *req, bool force_nonblock) if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) { kiocb_done(kiocb, ret2); } else { + iter.count = iov_count; + iter.nr_segs = nr_segs; copy_iov: ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, &iter); @@ -2765,6 +2881,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock) iov_count = iov_iter_count(&iter); ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count); if (!ret) { + unsigned long nr_segs = iter.nr_segs; ssize_t ret2; /* @@ -2802,6 +2919,8 @@ static int io_write(struct io_kiocb *req, bool force_nonblock) if (!force_nonblock || ret2 != -EAGAIN) { kiocb_done(kiocb, ret2); } else { + iter.count = iov_count; + iter.nr_segs = nr_segs; copy_iov: ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, &iter); @@ -4282,28 +4401,6 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, __io_queue_proc(&pt->req->apoll->poll, pt, head); } -static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) -{ - struct mm_struct *mm = current->mm; - - if (mm) { - kthread_unuse_mm(mm); - mmput(mm); - } -} - -static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, - struct io_kiocb *req) -{ - if (io_op_defs[req->opcode].needs_mm && !current->mm) { - if (unlikely(!mmget_not_zero(ctx->sqo_mm))) - return -EFAULT; - kthread_use_mm(ctx->sqo_mm); - } - - return 0; -} - static void io_async_task_func(struct callback_head *cb) { struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); @@ -5814,6 +5911,9 @@ static void io_submit_state_start(struct io_submit_state *state, unsigned int max_ios) { blk_start_plug(&state->plug); +#ifdef CONFIG_BLOCK + state->plug.nowait = true; +#endif state->free_reqs = 0; state->file = NULL; state->ios_left = max_ios; From 2e85abf053b99a6488f1b529d7aa3b8d7478adae Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 22 May 2020 08:59:42 -0600 Subject: [PATCH 031/502] mm: allow read-ahead 
with IOCB_NOWAIT set The read-ahead shouldn't block, so allow it to be done even if IOCB_NOWAIT is set in the kiocb. Acked-by: Johannes Weiner Signed-off-by: Jens Axboe --- mm/filemap.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index f0ae9a6308cb..3378d4fca883 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2028,8 +2028,6 @@ find_page: page = find_get_page(mapping, index); if (!page) { - if (iocb->ki_flags & IOCB_NOWAIT) - goto would_block; page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); From c7510ab2cf5ccd997fe7f194edfe09cc511abf99 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 23 May 2020 08:22:14 -0600 Subject: [PATCH 032/502] mm: abstract out wake_page_match() from wake_page_function() No functional changes in this patch, just in preparation for allowing more callers. Acked-by: Johannes Weiner Signed-off-by: Jens Axboe --- include/linux/pagemap.h | 37 +++++++++++++++++++++++++++++++++++++ mm/filemap.c | 35 ++++------------------------------- 2 files changed, 41 insertions(+), 31 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index cf2468da68e9..2f18221bb5c8 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -496,6 +496,43 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma, return pgoff; } +/* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */ +struct wait_page_key { + struct page *page; + int bit_nr; + int page_match; +}; + +struct wait_page_queue { + struct page *page; + int bit_nr; + wait_queue_entry_t wait; +}; + +static inline int wake_page_match(struct wait_page_queue *wait_page, + struct wait_page_key *key) +{ + if (wait_page->page != key->page) + return 0; + key->page_match = 1; + + if (wait_page->bit_nr != key->bit_nr) + return 0; + + /* + * Stop walking if it's locked. + * Is this safe if put_and_wait_on_page_locked() is in use? 
+ * Yes: the waker must hold a reference to this page, and if PG_locked + * has now already been set by another task, that task must also hold + * a reference to the *same usage* of this page; so there is no need + * to walk on to wake even the put_and_wait_on_page_locked() callers. + */ + if (test_bit(key->bit_nr, &key->page->flags)) + return -1; + + return 1; +} + extern void __lock_page(struct page *page); extern int __lock_page_killable(struct page *page); extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm, diff --git a/mm/filemap.c b/mm/filemap.c index 3378d4fca883..c3175dbd8fba 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -987,43 +987,16 @@ void __init pagecache_init(void) page_writeback_init(); } -/* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */ -struct wait_page_key { - struct page *page; - int bit_nr; - int page_match; -}; - -struct wait_page_queue { - struct page *page; - int bit_nr; - wait_queue_entry_t wait; -}; - static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { struct wait_page_key *key = arg; struct wait_page_queue *wait_page = container_of(wait, struct wait_page_queue, wait); + int ret; - if (wait_page->page != key->page) - return 0; - key->page_match = 1; - - if (wait_page->bit_nr != key->bit_nr) - return 0; - - /* - * Stop walking if it's locked. - * Is this safe if put_and_wait_on_page_locked() is in use? - * Yes: the waker must hold a reference to this page, and if PG_locked - * has now already been set by another task, that task must also hold - * a reference to the *same usage* of this page; so there is no need - * to walk on to wake even the put_and_wait_on_page_locked() callers. 
- */ - if (test_bit(key->bit_nr, &key->page->flags)) - return -1; - + ret = wake_page_match(wait_page, key); + if (ret != 1) + return ret; return autoremove_wake_function(wait, mode, sync, key); } From dd3e6d5039de1cbff4e20e2b34390ff44cdb182f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 22 May 2020 09:12:09 -0600 Subject: [PATCH 033/502] mm: add support for async page locking Normally waiting for a page to become unlocked, or locking the page, requires waiting for IO to complete. Add support for lock_page_async() and wait_on_page_locked_async(), which are callback based instead. This allows a caller to get notified when a page becomes unlocked, rather than wait for it. We add a new iocb field, ki_waitq, to pass in the necessary data for this to happen. We can unionize this with ki_cookie, since that is only used for polled IO. Polled IO can never co-exist with async callbacks, as it is (by definition) polled completions. struct wait_page_key is made public, and we define struct wait_page_async as the interface between the caller and the core. 
Acked-by: Johannes Weiner Signed-off-by: Jens Axboe --- include/linux/fs.h | 7 ++++++- include/linux/pagemap.h | 17 ++++++++++++++++ mm/filemap.c | 45 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 67 insertions(+), 2 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 3f881a892ea7..2a5cf6080e68 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -315,6 +315,8 @@ enum rw_hint { #define IOCB_SYNC (1 << 5) #define IOCB_WRITE (1 << 6) #define IOCB_NOWAIT (1 << 7) +/* iocb->ki_waitq is valid */ +#define IOCB_WAITQ (1 << 8) struct kiocb { struct file *ki_filp; @@ -328,7 +330,10 @@ struct kiocb { int ki_flags; u16 ki_hint; u16 ki_ioprio; /* See linux/ioprio.h */ - unsigned int ki_cookie; /* for ->iopoll */ + union { + unsigned int ki_cookie; /* for ->iopoll */ + struct wait_page_queue *ki_waitq; /* for async buffered IO */ + }; randomized_struct_fields_end }; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 2f18221bb5c8..e053e1d9a4d7 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -535,6 +535,7 @@ static inline int wake_page_match(struct wait_page_queue *wait_page, extern void __lock_page(struct page *page); extern int __lock_page_killable(struct page *page); +extern int __lock_page_async(struct page *page, struct wait_page_queue *wait); extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags); extern void unlock_page(struct page *page); @@ -571,6 +572,22 @@ static inline int lock_page_killable(struct page *page) return 0; } +/* + * lock_page_async - Lock the page, unless this would block. If the page + * is already locked, then queue a callback when the page becomes unlocked. + * This callback can then retry the operation. + * + * Returns 0 if the page is locked successfully, or -EIOCBQUEUED if the page + * was already locked and the callback defined in 'wait' was queued. 
+ */ +static inline int lock_page_async(struct page *page, + struct wait_page_queue *wait) +{ + if (!trylock_page(page)) + return __lock_page_async(page, wait); + return 0; +} + /* * lock_page_or_retry - Lock the page, unless this would block and the * caller indicated that it can handle a retry. diff --git a/mm/filemap.c b/mm/filemap.c index c3175dbd8fba..e8aaf43bee9f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1180,6 +1180,36 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr) } EXPORT_SYMBOL(wait_on_page_bit_killable); +static int __wait_on_page_locked_async(struct page *page, + struct wait_page_queue *wait, bool set) +{ + struct wait_queue_head *q = page_waitqueue(page); + int ret = 0; + + wait->page = page; + wait->bit_nr = PG_locked; + + spin_lock_irq(&q->lock); + __add_wait_queue_entry_tail(q, &wait->wait); + SetPageWaiters(page); + if (set) + ret = !trylock_page(page); + else + ret = PageLocked(page); + /* + * If we were succesful now, we know we're still on the + * waitqueue as we're still under the lock. This means it's + * safe to remove and return success, we know the callback + * isn't going to trigger. + */ + if (!ret) + __remove_wait_queue(q, &wait->wait); + else + ret = -EIOCBQUEUED; + spin_unlock_irq(&q->lock); + return ret; +} + /** * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked * @page: The page to wait for. @@ -1342,6 +1372,11 @@ int __lock_page_killable(struct page *__page) } EXPORT_SYMBOL_GPL(__lock_page_killable); +int __lock_page_async(struct page *page, struct wait_page_queue *wait) +{ + return __wait_on_page_locked_async(page, wait, true); +} + /* * Return values: * 1 - page is locked; mmap_lock is still held. @@ -2131,6 +2166,11 @@ page_not_up_to_date_locked: } readpage: + if (iocb->ki_flags & IOCB_NOWAIT) { + unlock_page(page); + put_page(page); + goto would_block; + } /* * A previous I/O error may have been due to temporary * failures, eg. multipath errors. 
@@ -2150,7 +2190,10 @@ readpage: } if (!PageUptodate(page)) { - error = lock_page_killable(page); + if (iocb->ki_flags & IOCB_WAITQ) + error = lock_page_async(page, iocb->ki_waitq); + else + error = lock_page_killable(page); if (unlikely(error)) goto readpage_error; if (!PageUptodate(page)) { From 1a0a7853b901c35a742b3bf176cf4701a5c5817c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 22 May 2020 09:18:38 -0600 Subject: [PATCH 034/502] mm: support async buffered reads in generic_file_buffered_read() Use the async page locking infrastructure, if IOCB_WAITQ is set in the passed in iocb. The caller must expect an -EIOCBQUEUED return value, which means that IO is started but not done yet. This is similar to how O_DIRECT signals the same operation. Once the callback is received by the caller for IO completion, the caller must retry the operation. Acked-by: Johannes Weiner Signed-off-by: Jens Axboe --- mm/filemap.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index e8aaf43bee9f..a5b1fa8f7ce4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1210,6 +1210,14 @@ static int __wait_on_page_locked_async(struct page *page, return ret; } +static int wait_on_page_locked_async(struct page *page, + struct wait_page_queue *wait) +{ + if (!PageLocked(page)) + return 0; + return __wait_on_page_locked_async(compound_head(page), wait, false); +} + /** * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked * @page: The page to wait for. @@ -2049,17 +2057,25 @@ find_page: index, last_index - index); } if (!PageUptodate(page)) { - if (iocb->ki_flags & IOCB_NOWAIT) { - put_page(page); - goto would_block; - } - /* * See comment in do_read_cache_page on why * wait_on_page_locked is used to avoid unnecessarily * serialisations and why it's safe. 
*/ - error = wait_on_page_locked_killable(page); + if (iocb->ki_flags & IOCB_WAITQ) { + if (written) { + put_page(page); + goto out; + } + error = wait_on_page_locked_async(page, + iocb->ki_waitq); + } else { + if (iocb->ki_flags & IOCB_NOWAIT) { + put_page(page); + goto would_block; + } + error = wait_on_page_locked_killable(page); + } if (unlikely(error)) goto readpage_error; if (PageUptodate(page)) @@ -2147,7 +2163,10 @@ page_ok: page_not_up_to_date: /* Get exclusive access to the page ... */ - error = lock_page_killable(page); + if (iocb->ki_flags & IOCB_WAITQ) + error = lock_page_async(page, iocb->ki_waitq); + else + error = lock_page_killable(page); if (unlikely(error)) goto readpage_error; @@ -2190,10 +2209,7 @@ readpage: } if (!PageUptodate(page)) { - if (iocb->ki_flags & IOCB_WAITQ) - error = lock_page_async(page, iocb->ki_waitq); - else - error = lock_page_killable(page); + error = lock_page_killable(page); if (unlikely(error)) goto readpage_error; if (!PageUptodate(page)) { From c2a25ec0f1005dde004cd671484f578a9c8ca7de Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 22 May 2020 09:12:51 -0600 Subject: [PATCH 035/502] fs: add FMODE_BUF_RASYNC If set, this indicates that the file system supports IOCB_WAITQ for buffered reads. 
Signed-off-by: Jens Axboe --- include/linux/fs.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/fs.h b/include/linux/fs.h index 2a5cf6080e68..4090320360f4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -175,6 +175,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File does not contribute to nr_files count */ #define FMODE_NOACCOUNT ((__force fmode_t)0x20000000) +/* File supports async buffered reads */ +#define FMODE_BUF_RASYNC ((__force fmode_t)0x40000000) + /* * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector * that indicates that they should check the contents of the iovec are From a304f0744824fd37d6e1aab4f9715f907724ad11 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 22 May 2020 09:14:08 -0600 Subject: [PATCH 036/502] block: flag block devices as supporting IOCB_WAITQ Signed-off-by: Jens Axboe --- fs/block_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 0ae656e022fd..679d9346b871 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1851,7 +1851,7 @@ static int blkdev_open(struct inode * inode, struct file * filp) */ filp->f_flags |= O_LARGEFILE; - filp->f_mode |= FMODE_NOWAIT; + filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; if (filp->f_flags & O_NDELAY) filp->f_mode |= FMODE_NDELAY; From f89fb730aa02f451fba1f8d5964dfec244d2e2d1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 22 May 2020 09:27:33 -0600 Subject: [PATCH 037/502] xfs: flag files as supporting buffered async reads XFS uses generic_file_read_iter(), which already supports this. Acked-by: Darrick J. 
Wong Signed-off-by: Jens Axboe --- fs/xfs/xfs_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 00db81eac80d..fdbff4860d61 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1080,7 +1080,7 @@ xfs_file_open( return -EFBIG; if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb))) return -EIO; - file->f_mode |= FMODE_NOWAIT; + file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; return 0; } From 8730f12b7962b21ea9ad2756abce1e205d22db84 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 22 May 2020 10:19:22 -0600 Subject: [PATCH 038/502] btrfs: flag files as supporting buffered async reads btrfs uses generic_file_read_iter(), which already supports this. Acked-by: Chris Mason Signed-off-by: Jens Axboe --- fs/btrfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 2c14312b05e8..234a418eb6da 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3472,7 +3472,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) static int btrfs_file_open(struct inode *inode, struct file *filp) { - filp->f_mode |= FMODE_NOWAIT; + filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; return generic_file_open(inode, filp); } From d1932dc3dc268f8dd5201c64971324d06ba977cc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 22 May 2020 10:18:23 -0600 Subject: [PATCH 039/502] mm: add kiocb_wait_page_queue_init() helper Checks if the file supports it, and initializes the values that we need. Caller passes in 'data' pointer, if any, and the callback function to be used. 
Acked-by: Johannes Weiner Signed-off-by: Jens Axboe --- include/linux/pagemap.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e053e1d9a4d7..7386bc67cc5a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -533,6 +533,27 @@ static inline int wake_page_match(struct wait_page_queue *wait_page, return 1; } +static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb, + struct wait_page_queue *wait, + wait_queue_func_t func, + void *data) +{ + /* Can't support async wakeup with polled IO */ + if (kiocb->ki_flags & IOCB_HIPRI) + return -EINVAL; + if (kiocb->ki_filp->f_mode & FMODE_BUF_RASYNC) { + wait->wait.func = func; + wait->wait.private = data; + wait->wait.flags = 0; + INIT_LIST_HEAD(&wait->wait.entry); + kiocb->ki_flags |= IOCB_WAITQ; + kiocb->ki_waitq = wait; + return 0; + } + + return -EOPNOTSUPP; +} + extern void __lock_page(struct page *page); extern int __lock_page_killable(struct page *page); extern int __lock_page_async(struct page *page, struct wait_page_queue *wait); From bcf5a06304d69a3bb194a494d87b532d5e90b01c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 22 May 2020 09:24:42 -0600 Subject: [PATCH 040/502] io_uring: support true async buffered reads, if file provides it If the file is flagged with FMODE_BUF_RASYNC, then we don't have to punt the buffered read to an io-wq worker. Instead we can rely on page unlocking callbacks to support retry based async IO. This is a lot more efficient than doing async thread offload. The retry is done similarly to how we handle poll based retry. From the unlock callback, we simply queue the retry to a task_work based handler. 
Signed-off-by: Jens Axboe --- fs/io_uring.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 135 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f3dbf83fabf3..5d1685e206c1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -78,6 +78,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -503,6 +504,8 @@ struct io_async_rw { struct iovec *iov; ssize_t nr_segs; ssize_t size; + struct wait_page_queue wpq; + struct callback_head task_work; }; struct io_async_ctx { @@ -2750,6 +2753,126 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } +static void __io_async_buf_error(struct io_kiocb *req, int error) +{ + struct io_ring_ctx *ctx = req->ctx; + + spin_lock_irq(&ctx->completion_lock); + io_cqring_fill_event(req, error); + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); + req_set_fail_links(req); + io_double_put_req(req); +} + +static void io_async_buf_cancel(struct callback_head *cb) +{ + struct io_async_rw *rw; + struct io_kiocb *req; + + rw = container_of(cb, struct io_async_rw, task_work); + req = rw->wpq.wait.private; + __io_async_buf_error(req, -ECANCELED); +} + +static void io_async_buf_retry(struct callback_head *cb) +{ + struct io_async_rw *rw; + struct io_ring_ctx *ctx; + struct io_kiocb *req; + + rw = container_of(cb, struct io_async_rw, task_work); + req = rw->wpq.wait.private; + ctx = req->ctx; + + __set_current_state(TASK_RUNNING); + if (!io_sq_thread_acquire_mm(ctx, req)) { + mutex_lock(&ctx->uring_lock); + __io_queue_sqe(req, NULL); + mutex_unlock(&ctx->uring_lock); + } else { + __io_async_buf_error(req, -EFAULT); + } +} + +static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, + int sync, void *arg) +{ + struct wait_page_queue *wpq; + struct io_kiocb *req = wait->private; + struct io_async_rw *rw = &req->io->rw; + struct wait_page_key *key = arg; + struct task_struct 
*tsk; + int ret; + + wpq = container_of(wait, struct wait_page_queue, wait); + + ret = wake_page_match(wpq, key); + if (ret != 1) + return ret; + + list_del_init(&wait->entry); + + init_task_work(&rw->task_work, io_async_buf_retry); + /* submit ref gets dropped, acquire a new one */ + refcount_inc(&req->refs); + tsk = req->task; + ret = task_work_add(tsk, &rw->task_work, true); + if (unlikely(ret)) { + /* queue just for cancelation */ + init_task_work(&rw->task_work, io_async_buf_cancel); + tsk = io_wq_get_task(req->ctx->io_wq); + task_work_add(tsk, &rw->task_work, true); + } + wake_up_process(tsk); + return 1; +} + +static bool io_rw_should_retry(struct io_kiocb *req) +{ + struct kiocb *kiocb = &req->rw.kiocb; + int ret; + + /* never retry for NOWAIT, we just complete with -EAGAIN */ + if (req->flags & REQ_F_NOWAIT) + return false; + + /* already tried, or we're doing O_DIRECT */ + if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_WAITQ)) + return false; + /* + * just use poll if we can, and don't attempt if the fs doesn't + * support callback based unlocks + */ + if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) + return false; + + /* + * If request type doesn't require req->io to defer in general, + * we need to allocate it here + */ + if (!req->io && __io_alloc_async_ctx(req)) + return false; + + ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq, + io_async_buf_func, req); + if (!ret) { + io_get_req_task(req); + return true; + } + + return false; +} + +static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) +{ + if (req->file->f_op->read_iter) + return call_read_iter(req->file, &req->rw.kiocb, iter); + return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter); +} + static int io_read(struct io_kiocb *req, bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; @@ -2784,10 +2907,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock) unsigned long nr_segs = iter.nr_segs; ssize_t ret2 = 
0; - if (req->file->f_op->read_iter) - ret2 = call_read_iter(req->file, kiocb, &iter); - else - ret2 = loop_rw_iter(READ, req->file, kiocb, &iter); + ret2 = io_iter_do_read(req, &iter); /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) { @@ -2804,6 +2924,17 @@ copy_iov: if (!(req->flags & REQ_F_NOWAIT) && !file_can_poll(req->file)) req->flags |= REQ_F_MUST_PUNT; + /* if we can retry, do so with the callbacks armed */ + if (io_rw_should_retry(req)) { + ret2 = io_iter_do_read(req, &iter); + if (ret2 == -EIOCBQUEUED) { + goto out_free; + } else if (ret2 != -EAGAIN) { + kiocb_done(kiocb, ret2); + goto out_free; + } + } + kiocb->ki_flags &= ~IOCB_WAITQ; return -EAGAIN; } } From 62ef73165091476d31f31e33d9d0d48b088c129d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 21 Jun 2020 13:09:50 +0300 Subject: [PATCH 041/502] io_uring: remove setting REQ_F_MUST_PUNT in rw io_{read,write}() { ... copy_iov: // prep async if (!(flags & REQ_F_NOWAIT) && !file_can_poll(file)) flags |= REQ_F_MUST_PUNT; } REQ_F_MUST_PUNT there is pointless, because if it happens then REQ_F_NOWAIT is known to be _not_ set, and the request will go async path in __io_queue_sqe() anyway. file_can_poll() check is also repeated in arm_poll*(), so don't need it. Remove the mentioned assignment REQ_F_MUST_PUNT in preparation for killing the flag. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5d1685e206c1..13f72d2a3fec 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2920,10 +2920,6 @@ copy_iov: inline_vecs, &iter); if (ret) goto out_free; - /* any defer here is final, must blocking retry */ - if (!(req->flags & REQ_F_NOWAIT) && - !file_can_poll(req->file)) - req->flags |= REQ_F_MUST_PUNT; /* if we can retry, do so with the callbacks armed */ if (io_rw_should_retry(req)) { ret2 = io_iter_do_read(req, &iter); @@ -3057,10 +3053,6 @@ copy_iov: inline_vecs, &iter); if (ret) goto out_free; - /* any defer here is final, must blocking retry */ - if (!(req->flags & REQ_F_NOWAIT) && - !file_can_poll(req->file)) - req->flags |= REQ_F_MUST_PUNT; return -EAGAIN; } } From 24c74678634b3cbdb325b3b7706366c83811b311 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 21 Jun 2020 13:09:51 +0300 Subject: [PATCH 042/502] io_uring: remove REQ_F_MUST_PUNT REQ_F_MUST_PUNT may seem looking good and clear, but it's the same as not having REQ_F_NOWAIT set. That rather creates more confusion. Moreover, it doesn't even affect any behaviour (e.g. see the patch removing it from io_{read,write}). Kill the flag and update already outdated comments. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 13f72d2a3fec..93af915a98e6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -534,7 +534,6 @@ enum { REQ_F_LINK_TIMEOUT_BIT, REQ_F_TIMEOUT_BIT, REQ_F_ISREG_BIT, - REQ_F_MUST_PUNT_BIT, REQ_F_TIMEOUT_NOSEQ_BIT, REQ_F_COMP_LOCKED_BIT, REQ_F_NEED_CLEANUP_BIT, @@ -582,8 +581,6 @@ enum { REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT), /* regular file */ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), - /* must be punted even for NONBLOCK */ - REQ_F_MUST_PUNT = BIT(REQ_F_MUST_PUNT_BIT), /* no timeout sequence */ REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT), /* completion under lock */ @@ -2894,10 +2891,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock) if (req->flags & REQ_F_LINK_HEAD) req->result = io_size; - /* - * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so - * we know to async punt it even if it was opened O_NONBLOCK - */ + /* If the file doesn't support async, just async punt */ if (force_nonblock && !io_file_supports_async(req->file, READ)) goto copy_iov; @@ -2993,10 +2987,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock) if (req->flags & REQ_F_LINK_HEAD) req->result = io_size; - /* - * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so - * we know to async punt it even if it was opened O_NONBLOCK - */ + /* If the file doesn't support async, just async punt */ if (force_nonblock && !io_file_supports_async(req->file, WRITE)) goto copy_iov; @@ -3717,8 +3708,10 @@ static int io_close(struct io_kiocb *req, bool force_nonblock) /* if the file has a flush method, be safe and punt to async */ if (close->put_file->f_op->flush && force_nonblock) { + /* was never set, but play safe */ + req->flags &= ~REQ_F_NOWAIT; /* avoid grabbing files - we don't need the files */ - req->flags |= REQ_F_NO_FILE_TABLE | REQ_F_MUST_PUNT; + 
req->flags |= REQ_F_NO_FILE_TABLE; return -EAGAIN; } @@ -4645,7 +4638,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req) if (!req->file || !file_can_poll(req->file)) return false; - if (req->flags & (REQ_F_MUST_PUNT | REQ_F_POLLED)) + if (req->flags & REQ_F_POLLED) return false; if (!def->pollin && !def->pollout) return false; @@ -5852,8 +5845,7 @@ again: * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts */ - if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || - (req->flags & REQ_F_MUST_PUNT))) { + if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { if (io_arm_poll_handler(req)) { if (linked_timeout) io_queue_linked_timeout(linked_timeout); From b90cd197f9315f968d5ee4e6ee9f4e3067f2c883 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 21 Jun 2020 13:09:52 +0300 Subject: [PATCH 043/502] io_uring: set @poll->file after @poll init It's a good practice to modify fields of a struct after but not before it was initialised. Even though io_init_poll_iocb() doesn't touch poll->file, call it first. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 93af915a98e6..cc1f2f3b7bfa 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4596,8 +4596,8 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req, struct io_ring_ctx *ctx = req->ctx; bool cancel = false; - poll->file = req->file; io_init_poll_iocb(poll, mask, wake_func); + poll->file = req->file; poll->wait.private = req; ipt->pt._key = mask; From f6b6c7d6a9600bdbf5826f57137630e1670e2d87 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 21 Jun 2020 13:09:53 +0300 Subject: [PATCH 044/502] io_uring: kill NULL checks for submit state After recent changes, io_submit_sqes() always passes valid submit state, so kill leftovers checking it for NULL. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index cc1f2f3b7bfa..c686061c3762 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1376,11 +1376,7 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx, gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct io_kiocb *req; - if (!state) { - req = kmem_cache_alloc(req_cachep, gfp); - if (unlikely(!req)) - goto fallback; - } else if (!state->free_reqs) { + if (!state->free_reqs) { size_t sz; int ret; From d3cac64c498c4fb2df46b97ee6f4c7d6d75f5e3d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 25 Jun 2020 12:38:13 +0300 Subject: [PATCH 045/502] io_uring: fix NULL-mm for linked reqs __io_queue_sqe() tries to handle all requests of a link, so it's not enough to grab mm in io_sq_thread_acquire_mm() based just on the head. Don't check req->needs_mm and do it always. Signed-off-by: Pavel Begunkov --- fs/io_uring.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c686061c3762..72739188b2ff 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1991,10 +1991,9 @@ static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) } } -static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, - struct io_kiocb *req) +static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) { - if (io_op_defs[req->opcode].needs_mm && !current->mm) { + if (!current->mm) { if (unlikely(!mmget_not_zero(ctx->sqo_mm))) return -EFAULT; kthread_use_mm(ctx->sqo_mm); @@ -2003,6 +2002,14 @@ static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, return 0; } +static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + if (!io_op_defs[req->opcode].needs_mm) + return 0; + return __io_sq_thread_acquire_mm(ctx); +} + #ifdef CONFIG_BLOCK static bool io_resubmit_prep(struct io_kiocb *req, int error) { @@ -2781,7 +2788,7 @@ static 
void io_async_buf_retry(struct callback_head *cb) ctx = req->ctx; __set_current_state(TASK_RUNNING); - if (!io_sq_thread_acquire_mm(ctx, req)) { + if (!__io_sq_thread_acquire_mm(ctx)) { mutex_lock(&ctx->uring_lock); __io_queue_sqe(req, NULL); mutex_unlock(&ctx->uring_lock); From e1e16097e265daac918ce355bf1a0d1677adf0c7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 22 Jun 2020 09:17:17 -0600 Subject: [PATCH 046/502] io_uring: provide generic io_req_complete() helper We have lots of callers of: io_cqring_add_event(req, result); io_put_req(req); Provide a helper that does this for us. It helps clean up the code, and also provides a more convenient location for us to change the completion handling. Signed-off-by: Jens Axboe --- fs/io_uring.c | 106 ++++++++++++++++++++------------------------------ 1 file changed, 43 insertions(+), 63 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 72739188b2ff..17d7bafaf8cf 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1335,7 +1335,7 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res) __io_cqring_fill_event(req, res, 0); } -static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags) +static void io_cqring_add_event(struct io_kiocb *req, long res, long cflags) { struct io_ring_ctx *ctx = req->ctx; unsigned long flags; @@ -1348,9 +1348,15 @@ static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags) io_cqring_ev_posted(ctx); } -static void io_cqring_add_event(struct io_kiocb *req, long res) +static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags) { - __io_cqring_add_event(req, res, 0); + io_cqring_add_event(req, res, cflags); + io_put_req(req); +} + +static void io_req_complete(struct io_kiocb *req, long res) +{ + __io_req_complete(req, res, 0); } static inline bool io_is_fallback_req(struct io_kiocb *req) @@ -1978,7 +1984,7 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res) req_set_fail_links(req); if (req->flags & 
REQ_F_BUFFER_SELECTED) cflags = io_put_kbuf(req); - __io_cqring_add_event(req, res, cflags); + io_cqring_add_event(req, res, cflags); } static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) @@ -2048,9 +2054,8 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error) return true; kfree(iovec); end_req: - io_cqring_add_event(req, ret); req_set_fail_links(req); - io_put_req(req); + io_req_complete(req, ret); return false; } @@ -3117,10 +3122,9 @@ static int io_tee(struct io_kiocb *req, bool force_nonblock) io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); req->flags &= ~REQ_F_NEED_CLEANUP; - io_cqring_add_event(req, ret); if (ret != sp->len) req_set_fail_links(req); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3154,10 +3158,9 @@ static int io_splice(struct io_kiocb *req, bool force_nonblock) io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); req->flags &= ~REQ_F_NEED_CLEANUP; - io_cqring_add_event(req, ret); if (ret != sp->len) req_set_fail_links(req); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3171,8 +3174,7 @@ static int io_nop(struct io_kiocb *req) if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - io_cqring_add_event(req, 0); - io_put_req(req); + io_req_complete(req, 0); return 0; } @@ -3211,8 +3213,7 @@ static int io_fsync(struct io_kiocb *req, bool force_nonblock) req->sync.flags & IORING_FSYNC_DATASYNC); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3245,8 +3246,7 @@ static int io_fallocate(struct io_kiocb *req, bool force_nonblock) current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3342,8 +3342,7 @@ err: req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, 
ret); return 0; } @@ -3416,8 +3415,7 @@ static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock) io_ring_submit_lock(ctx, !force_nonblock); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3504,8 +3502,7 @@ out: io_ring_submit_unlock(ctx, !force_nonblock); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3548,8 +3545,7 @@ static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock) if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; #else return -EOPNOTSUPP; @@ -3585,8 +3581,7 @@ static int io_madvise(struct io_kiocb *req, bool force_nonblock) ret = do_madvise(ma->addr, ma->len, ma->advice); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; #else return -EOPNOTSUPP; @@ -3625,8 +3620,7 @@ static int io_fadvise(struct io_kiocb *req, bool force_nonblock) ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3665,8 +3659,7 @@ static int io_statx(struct io_kiocb *req, bool force_nonblock) if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3722,10 +3715,9 @@ static int io_close(struct io_kiocb *req, bool force_nonblock) ret = filp_close(close->put_file, req->work.files); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); fput(close->put_file); close->put_file = NULL; - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3759,8 +3751,7 @@ static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock) req->sync.flags); if (ret < 0) req_set_fail_links(req); - 
io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3859,10 +3850,9 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock) if (kmsg && kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); req->flags &= ~REQ_F_NEED_CLEANUP; - io_cqring_add_event(req, ret); if (ret < 0) req_set_fail_links(req); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3902,10 +3892,9 @@ static int io_send(struct io_kiocb *req, bool force_nonblock) ret = -EINTR; } - io_cqring_add_event(req, ret); if (ret < 0) req_set_fail_links(req); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -4102,10 +4091,9 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock) if (kmsg && kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); req->flags &= ~REQ_F_NEED_CLEANUP; - __io_cqring_add_event(req, ret, cflags); if (ret < 0) req_set_fail_links(req); - io_put_req(req); + __io_req_complete(req, ret, cflags); return 0; } @@ -4159,10 +4147,9 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock) kfree(kbuf); req->flags &= ~REQ_F_NEED_CLEANUP; - __io_cqring_add_event(req, ret, cflags); if (ret < 0) req_set_fail_links(req); - io_put_req(req); + __io_req_complete(req, ret, cflags); return 0; } @@ -4201,8 +4188,7 @@ static int io_accept(struct io_kiocb *req, bool force_nonblock) ret = -EINTR; req_set_fail_links(req); } - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -4262,8 +4248,7 @@ static int io_connect(struct io_kiocb *req, bool force_nonblock) out: if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } #else /* !CONFIG_NET */ @@ -4555,7 +4540,7 @@ static void io_async_task_func(struct callback_head *cb) if (!canceled) { __set_current_state(TASK_RUNNING); if (io_sq_thread_acquire_mm(ctx, req)) { - io_cqring_add_event(req, -EFAULT); + io_cqring_add_event(req, -EFAULT, 0); goto end_req; } 
mutex_lock(&ctx->uring_lock); @@ -4804,10 +4789,9 @@ static int io_poll_remove(struct io_kiocb *req) ret = io_poll_cancel(ctx, addr); spin_unlock_irq(&ctx->completion_lock); - io_cqring_add_event(req, ret); if (ret < 0) req_set_fail_links(req); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -5163,8 +5147,7 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock) if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -5657,8 +5640,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr) if (ret) { req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); } io_steal_work(req, workptr); @@ -5775,8 +5757,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME); io_put_req(prev); } else { - io_cqring_add_event(req, -ETIME); - io_put_req(req); + io_req_complete(req, -ETIME); } return HRTIMER_NORESTART; } @@ -5885,9 +5866,8 @@ err: /* and drop final reference, if we failed */ if (ret) { - io_cqring_add_event(req, ret); req_set_fail_links(req); - io_put_req(req); + io_req_complete(req, ret); } if (nxt) { req = nxt; @@ -5909,9 +5889,9 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (ret) { if (ret != -EIOCBQUEUED) { fail_req: - io_cqring_add_event(req, ret); req_set_fail_links(req); - io_double_put_req(req); + io_put_req(req); + io_req_complete(req, ret); } } else if (req->flags & REQ_F_FORCE_ASYNC) { if (!req->io) { @@ -5937,8 +5917,8 @@ fail_req: static inline void io_queue_link_head(struct io_kiocb *req) { if (unlikely(req->flags & REQ_F_FAIL_LINK)) { - io_cqring_add_event(req, -ECANCELED); - io_double_put_req(req); + io_put_req(req); + io_req_complete(req, -ECANCELED); } else io_queue_sqe(req, NULL); } @@ -6195,8 +6175,8 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, if 
(unlikely(err)) { fail_req: - io_cqring_add_event(req, err); - io_double_put_req(req); + io_put_req(req); + io_req_complete(req, err); break; } From 013538bd65fd3cdbf3ca8b0c99b962c70473c803 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 22 Jun 2020 09:29:15 -0600 Subject: [PATCH 047/502] io_uring: add 'io_comp_state' to struct io_submit_state No functional changes in this patch, just in preparation for passing back pending completions to the caller and completing them in a batched fashion. Signed-off-by: Jens Axboe --- fs/io_uring.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 17d7bafaf8cf..002ab5eae20f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -678,6 +678,12 @@ struct io_kiocb { #define IO_IOPOLL_BATCH 8 +struct io_comp_state { + unsigned int nr; + struct list_head list; + struct io_ring_ctx *ctx; +}; + struct io_submit_state { struct blk_plug plug; @@ -687,6 +693,11 @@ struct io_submit_state { void *reqs[IO_IOPOLL_BATCH]; unsigned int free_reqs; + /* + * Batch completion logic + */ + struct io_comp_state comp; + /* * File reference cache */ @@ -6006,12 +6017,15 @@ static void io_submit_state_end(struct io_submit_state *state) * Start submission side cache. 
*/ static void io_submit_state_start(struct io_submit_state *state, - unsigned int max_ios) + struct io_ring_ctx *ctx, unsigned int max_ios) { blk_start_plug(&state->plug); #ifdef CONFIG_BLOCK state->plug.nowait = true; #endif + state->comp.nr = 0; + INIT_LIST_HEAD(&state->comp.list); + state->comp.ctx = ctx; state->free_reqs = 0; state->file = NULL; state->ios_left = max_ios; @@ -6146,7 +6160,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, if (!percpu_ref_tryget_many(&ctx->refs, nr)) return -EAGAIN; - io_submit_state_start(&state, nr); + io_submit_state_start(&state, ctx, nr); ctx->ring_fd = ring_fd; ctx->ring_file = ring_file; From f13fad7ba41cef806358885fbb3f9004f3214b2d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 22 Jun 2020 09:34:30 -0600 Subject: [PATCH 048/502] io_uring: pass down completion state on the issue side No functional changes in this patch, just in preparation for having the completion state be available on the issue side. Later on, this will allow requests that complete inline to be completed in batches. 
Signed-off-by: Jens Axboe --- fs/io_uring.c | 67 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 50 insertions(+), 17 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 002ab5eae20f..46241c1ad1b8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -909,7 +909,8 @@ static void io_cleanup_req(struct io_kiocb *req); static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, int fd, struct file **out_file, bool fixed); static void __io_queue_sqe(struct io_kiocb *req, - const struct io_uring_sqe *sqe); + const struct io_uring_sqe *sqe, + struct io_comp_state *cs); static ssize_t io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, struct iov_iter *iter, @@ -2806,7 +2807,7 @@ static void io_async_buf_retry(struct callback_head *cb) __set_current_state(TASK_RUNNING); if (!__io_sq_thread_acquire_mm(ctx)) { mutex_lock(&ctx->uring_lock); - __io_queue_sqe(req, NULL); + __io_queue_sqe(req, NULL, NULL); mutex_unlock(&ctx->uring_lock); } else { __io_async_buf_error(req, -EFAULT); @@ -4430,7 +4431,7 @@ static void io_poll_task_func(struct callback_head *cb) struct io_ring_ctx *ctx = nxt->ctx; mutex_lock(&ctx->uring_lock); - __io_queue_sqe(nxt, NULL); + __io_queue_sqe(nxt, NULL, NULL); mutex_unlock(&ctx->uring_lock); } } @@ -4555,7 +4556,7 @@ static void io_async_task_func(struct callback_head *cb) goto end_req; } mutex_lock(&ctx->uring_lock); - __io_queue_sqe(req, NULL); + __io_queue_sqe(req, NULL, NULL); mutex_unlock(&ctx->uring_lock); } else { io_cqring_ev_posted(ctx); @@ -5352,7 +5353,7 @@ static void io_cleanup_req(struct io_kiocb *req) } static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - bool force_nonblock) + bool force_nonblock, struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -5637,7 +5638,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr) if (!ret) { do { - ret = io_issue_sqe(req, NULL, false); + ret = io_issue_sqe(req, NULL, false, NULL); /* 
* We can get EAGAIN for polled IO even though we're * forcing a sync submission from here, since we can't @@ -5814,7 +5815,8 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) return nxt; } -static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_comp_state *cs) { struct io_kiocb *linked_timeout; struct io_kiocb *nxt; @@ -5834,7 +5836,7 @@ again: old_creds = override_creds(req->work.creds); } - ret = io_issue_sqe(req, sqe, true); + ret = io_issue_sqe(req, sqe, true, cs); /* * We async punt it if the file wasn't marked NOWAIT, or if the file @@ -5892,7 +5894,8 @@ exit: revert_creds(old_creds); } -static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_comp_state *cs) { int ret; @@ -5921,21 +5924,22 @@ fail_req: req->work.flags |= IO_WQ_WORK_CONCURRENT; io_queue_async_work(req); } else { - __io_queue_sqe(req, sqe); + __io_queue_sqe(req, sqe, cs); } } -static inline void io_queue_link_head(struct io_kiocb *req) +static inline void io_queue_link_head(struct io_kiocb *req, + struct io_comp_state *cs) { if (unlikely(req->flags & REQ_F_FAIL_LINK)) { io_put_req(req); io_req_complete(req, -ECANCELED); } else - io_queue_sqe(req, NULL); + io_queue_sqe(req, NULL, cs); } static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **link) + struct io_kiocb **link, struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -5975,7 +5979,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, /* last request of a link, enqueue the link */ if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { - io_queue_link_head(head); + io_queue_link_head(head, cs); *link = NULL; } } else { @@ -5995,18 +5999,47 @@ static int io_submit_sqe(struct io_kiocb *req, const 
struct io_uring_sqe *sqe, req->flags |= REQ_F_FAIL_LINK; *link = req; } else { - io_queue_sqe(req, sqe); + io_queue_sqe(req, sqe, cs); } } return 0; } +static void io_submit_flush_completions(struct io_comp_state *cs) +{ + struct io_ring_ctx *ctx = cs->ctx; + + spin_lock_irq(&ctx->completion_lock); + while (!list_empty(&cs->list)) { + struct io_kiocb *req; + + req = list_first_entry(&cs->list, struct io_kiocb, list); + list_del(&req->list); + io_cqring_fill_event(req, req->result); + if (!(req->flags & REQ_F_LINK_HEAD)) { + req->flags |= REQ_F_COMP_LOCKED; + io_put_req(req); + } else { + spin_unlock_irq(&ctx->completion_lock); + io_put_req(req); + spin_lock_irq(&ctx->completion_lock); + } + } + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); + cs->nr = 0; +} + /* * Batched submission is done, ensure local IO is flushed out. */ static void io_submit_state_end(struct io_submit_state *state) { + if (!list_empty(&state->comp.list)) + io_submit_flush_completions(&state->comp); blk_finish_plug(&state->plug); io_state_file_put(state); if (state->free_reqs) @@ -6196,7 +6229,7 @@ fail_req: trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, true, io_async_submit(ctx)); - err = io_submit_sqe(req, sqe, &link); + err = io_submit_sqe(req, sqe, &link, &state.comp); if (err) goto fail_req; } @@ -6207,7 +6240,7 @@ fail_req: percpu_ref_put_many(&ctx->refs, nr - ref_used); } if (link) - io_queue_link_head(link); + io_queue_link_head(link, &state.comp); io_submit_state_end(&state); /* Commit SQ ring head once we've consumed and submitted all SQEs */ From 229a7b63507a3e84afb17c3bbb67505a81d28a1d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 22 Jun 2020 10:13:11 -0600 Subject: [PATCH 049/502] io_uring: pass in completion state to appropriate issue side handlers Provide the completion state to the handlers that we know can complete inline, so they can utilize this for batching completions. Cap the max batch count at 32. 
This should be enough to provide a good amortization of the cost of the lock+commit dance for completions, while still being low enough not to cause any real latency issues for SQPOLL applications. Xuan Zhuo reports that this changes his profile from: 17.97% [kernel] [k] copy_user_generic_unrolled 13.92% [kernel] [k] io_commit_cqring 11.04% [kernel] [k] __io_cqring_fill_event 10.33% [kernel] [k] udp_recvmsg 5.94% [kernel] [k] skb_release_data 4.31% [kernel] [k] udp_rmem_release 2.68% [kernel] [k] __check_object_size 2.24% [kernel] [k] __slab_free 2.22% [kernel] [k] _raw_spin_lock_bh 2.21% [kernel] [k] kmem_cache_free 2.13% [kernel] [k] free_pcppages_bulk 1.83% [kernel] [k] io_submit_sqes 1.38% [kernel] [k] page_frag_free 1.31% [kernel] [k] inet_recvmsg to 19.99% [kernel] [k] copy_user_generic_unrolled 11.63% [kernel] [k] skb_release_data 9.36% [kernel] [k] udp_rmem_release 8.64% [kernel] [k] udp_recvmsg 6.21% [kernel] [k] __slab_free 4.39% [kernel] [k] __check_object_size 3.64% [kernel] [k] free_pcppages_bulk 2.41% [kernel] [k] kmem_cache_free 2.00% [kernel] [k] io_submit_sqes 1.95% [kernel] [k] page_frag_free 1.54% [kernel] [k] io_put_req [...] 
0.07% [kernel] [k] io_commit_cqring 0.44% [kernel] [k] __io_cqring_fill_event Signed-off-by: Jens Axboe --- fs/io_uring.c | 153 ++++++++++++++++++++++++++++---------------------- 1 file changed, 86 insertions(+), 67 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 46241c1ad1b8..6c9ca4fcbc31 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1360,15 +1360,50 @@ static void io_cqring_add_event(struct io_kiocb *req, long res, long cflags) io_cqring_ev_posted(ctx); } -static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags) +static void io_submit_flush_completions(struct io_comp_state *cs) { - io_cqring_add_event(req, res, cflags); - io_put_req(req); + struct io_ring_ctx *ctx = cs->ctx; + + spin_lock_irq(&ctx->completion_lock); + while (!list_empty(&cs->list)) { + struct io_kiocb *req; + + req = list_first_entry(&cs->list, struct io_kiocb, list); + list_del(&req->list); + io_cqring_fill_event(req, req->result); + if (!(req->flags & REQ_F_LINK_HEAD)) { + req->flags |= REQ_F_COMP_LOCKED; + io_put_req(req); + } else { + spin_unlock_irq(&ctx->completion_lock); + io_put_req(req); + spin_lock_irq(&ctx->completion_lock); + } + } + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); + cs->nr = 0; +} + +static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags, + struct io_comp_state *cs) +{ + if (!cs) { + io_cqring_add_event(req, res, cflags); + io_put_req(req); + } else { + req->result = res; + list_add_tail(&req->list, &cs->list); + if (++cs->nr >= 32) + io_submit_flush_completions(cs); + } } static void io_req_complete(struct io_kiocb *req, long res) { - __io_req_complete(req, res, 0); + __io_req_complete(req, res, 0, NULL); } static inline bool io_is_fallback_req(struct io_kiocb *req) @@ -3179,14 +3214,14 @@ static int io_splice(struct io_kiocb *req, bool force_nonblock) /* * IORING_OP_NOP just posts a completion event, nothing else. 
*/ -static int io_nop(struct io_kiocb *req) +static int io_nop(struct io_kiocb *req, struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - io_req_complete(req, 0); + __io_req_complete(req, 0, 0, cs); return 0; } @@ -3408,7 +3443,8 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, return i; } -static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock) +static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_provide_buf *p = &req->pbuf; struct io_ring_ctx *ctx = req->ctx; @@ -3427,7 +3463,7 @@ static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock) io_ring_submit_lock(ctx, !force_nonblock); if (ret < 0) req_set_fail_links(req); - io_req_complete(req, ret); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -3485,7 +3521,8 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) return i ? 
i : -ENOMEM; } -static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock) +static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_provide_buf *p = &req->pbuf; struct io_ring_ctx *ctx = req->ctx; @@ -3514,7 +3551,7 @@ out: io_ring_submit_unlock(ctx, !force_nonblock); if (ret < 0) req_set_fail_links(req); - io_req_complete(req, ret); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -3545,7 +3582,8 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, #endif } -static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock) +static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { #if defined(CONFIG_EPOLL) struct io_epoll *ie = &req->epoll; @@ -3557,7 +3595,7 @@ static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock) if (ret < 0) req_set_fail_links(req); - io_req_complete(req, ret); + __io_req_complete(req, ret, 0, cs); return 0; #else return -EOPNOTSUPP; @@ -3702,7 +3740,8 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int io_close(struct io_kiocb *req, bool force_nonblock) +static int io_close(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_close *close = &req->close; int ret; @@ -3729,7 +3768,7 @@ static int io_close(struct io_kiocb *req, bool force_nonblock) req_set_fail_links(req); fput(close->put_file); close->put_file = NULL; - io_req_complete(req, ret); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -3815,7 +3854,8 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return ret; } -static int io_sendmsg(struct io_kiocb *req, bool force_nonblock) +static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_async_msghdr *kmsg = NULL; struct socket *sock; @@ -3864,11 +3904,12 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock) req->flags &= 
~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); - io_req_complete(req, ret); + __io_req_complete(req, ret, 0, cs); return 0; } -static int io_send(struct io_kiocb *req, bool force_nonblock) +static int io_send(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct socket *sock; int ret; @@ -3906,7 +3947,7 @@ static int io_send(struct io_kiocb *req, bool force_nonblock) if (ret < 0) req_set_fail_links(req); - io_req_complete(req, ret); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -4049,7 +4090,8 @@ static int io_recvmsg_prep(struct io_kiocb *req, return ret; } -static int io_recvmsg(struct io_kiocb *req, bool force_nonblock) +static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_async_msghdr *kmsg = NULL; struct socket *sock; @@ -4105,11 +4147,12 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock) req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); - __io_req_complete(req, ret, cflags); + __io_req_complete(req, ret, cflags, cs); return 0; } -static int io_recv(struct io_kiocb *req, bool force_nonblock) +static int io_recv(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_buffer *kbuf = NULL; struct socket *sock; @@ -4161,7 +4204,7 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock) req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); - __io_req_complete(req, ret, cflags); + __io_req_complete(req, ret, cflags, cs); return 0; } @@ -4181,7 +4224,8 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int io_accept(struct io_kiocb *req, bool force_nonblock) +static int io_accept(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_accept *accept = &req->accept; unsigned int file_flags = force_nonblock ? 
O_NONBLOCK : 0; @@ -4200,7 +4244,7 @@ static int io_accept(struct io_kiocb *req, bool force_nonblock) ret = -EINTR; req_set_fail_links(req); } - io_req_complete(req, ret); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -4224,7 +4268,8 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) &io->connect.address); } -static int io_connect(struct io_kiocb *req, bool force_nonblock) +static int io_connect(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_async_ctx __io, *io; unsigned file_flags; @@ -4260,7 +4305,7 @@ static int io_connect(struct io_kiocb *req, bool force_nonblock) out: if (ret < 0) req_set_fail_links(req); - io_req_complete(req, ret); + __io_req_complete(req, ret, 0, cs); return 0; } #else /* !CONFIG_NET */ @@ -5141,7 +5186,8 @@ static int io_files_update_prep(struct io_kiocb *req, return 0; } -static int io_files_update(struct io_kiocb *req, bool force_nonblock) +static int io_files_update(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; struct io_uring_files_update up; @@ -5159,7 +5205,7 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock) if (ret < 0) req_set_fail_links(req); - io_req_complete(req, ret); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -5360,7 +5406,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, switch (req->opcode) { case IORING_OP_NOP: - ret = io_nop(req); + ret = io_nop(req, cs); break; case IORING_OP_READV: case IORING_OP_READ_FIXED: @@ -5422,9 +5468,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; } if (req->opcode == IORING_OP_SENDMSG) - ret = io_sendmsg(req, force_nonblock); + ret = io_sendmsg(req, force_nonblock, cs); else - ret = io_send(req, force_nonblock); + ret = io_send(req, force_nonblock, cs); break; case IORING_OP_RECVMSG: case IORING_OP_RECV: @@ -5434,9 +5480,9 @@ static int 
io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; } if (req->opcode == IORING_OP_RECVMSG) - ret = io_recvmsg(req, force_nonblock); + ret = io_recvmsg(req, force_nonblock, cs); else - ret = io_recv(req, force_nonblock); + ret = io_recv(req, force_nonblock, cs); break; case IORING_OP_TIMEOUT: if (sqe) { @@ -5460,7 +5506,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_accept(req, force_nonblock); + ret = io_accept(req, force_nonblock, cs); break; case IORING_OP_CONNECT: if (sqe) { @@ -5468,7 +5514,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_connect(req, force_nonblock); + ret = io_connect(req, force_nonblock, cs); break; case IORING_OP_ASYNC_CANCEL: if (sqe) { @@ -5500,7 +5546,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_close(req, force_nonblock); + ret = io_close(req, force_nonblock, cs); break; case IORING_OP_FILES_UPDATE: if (sqe) { @@ -5508,7 +5554,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_files_update(req, force_nonblock); + ret = io_files_update(req, force_nonblock, cs); break; case IORING_OP_STATX: if (sqe) { @@ -5548,7 +5594,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_epoll_ctl(req, force_nonblock); + ret = io_epoll_ctl(req, force_nonblock, cs); break; case IORING_OP_SPLICE: if (sqe) { @@ -5564,7 +5610,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_provide_buffers(req, force_nonblock); + ret = io_provide_buffers(req, force_nonblock, cs); break; case IORING_OP_REMOVE_BUFFERS: if (sqe) { @@ -5572,7 +5618,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_remove_buffers(req, 
force_nonblock); + ret = io_remove_buffers(req, force_nonblock, cs); break; case IORING_OP_TEE: if (sqe) { @@ -6006,33 +6052,6 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } -static void io_submit_flush_completions(struct io_comp_state *cs) -{ - struct io_ring_ctx *ctx = cs->ctx; - - spin_lock_irq(&ctx->completion_lock); - while (!list_empty(&cs->list)) { - struct io_kiocb *req; - - req = list_first_entry(&cs->list, struct io_kiocb, list); - list_del(&req->list); - io_cqring_fill_event(req, req->result); - if (!(req->flags & REQ_F_LINK_HEAD)) { - req->flags |= REQ_F_COMP_LOCKED; - io_put_req(req); - } else { - spin_unlock_irq(&ctx->completion_lock); - io_put_req(req); - spin_lock_irq(&ctx->completion_lock); - } - } - io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); - - io_cqring_ev_posted(ctx); - cs->nr = 0; -} - /* * Batched submission is done, ensure local IO is flushed out. */ From a1d7c393c4711a9ce6c239c3ab053a50dc96505a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 22 Jun 2020 11:09:46 -0600 Subject: [PATCH 050/502] io_uring: enable READ/WRITE to use deferred completions A bit more surgery required here, as completions are generally done through the kiocb->ki_complete() callback, even if they complete inline. This enables the regular read/write path to use the io_comp_state logic to batch inline completions. 
Signed-off-by: Jens Axboe --- fs/io_uring.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6c9ca4fcbc31..0bba12e4e559 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2019,7 +2019,8 @@ static inline void req_set_fail_links(struct io_kiocb *req) req->flags |= REQ_F_FAIL_LINK; } -static void io_complete_rw_common(struct kiocb *kiocb, long res) +static void io_complete_rw_common(struct kiocb *kiocb, long res, + struct io_comp_state *cs) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); int cflags = 0; @@ -2031,7 +2032,7 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res) req_set_fail_links(req); if (req->flags & REQ_F_BUFFER_SELECTED) cflags = io_put_kbuf(req); - io_cqring_add_event(req, res, cflags); + __io_req_complete(req, res, cflags, cs); } static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) @@ -2141,14 +2142,18 @@ static bool io_rw_reissue(struct io_kiocb *req, long res) return false; } +static void __io_complete_rw(struct io_kiocb *req, long res, long res2, + struct io_comp_state *cs) +{ + if (!io_rw_reissue(req, res)) + io_complete_rw_common(&req->rw.kiocb, res, cs); +} + static void io_complete_rw(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); - if (!io_rw_reissue(req, res)) { - io_complete_rw_common(kiocb, res); - io_put_req(req); - } + __io_complete_rw(req, res, res2, NULL); } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) @@ -2382,14 +2387,15 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) } } -static void kiocb_done(struct kiocb *kiocb, ssize_t ret) +static void kiocb_done(struct kiocb *kiocb, ssize_t ret, + struct io_comp_state *cs) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); if (req->flags & REQ_F_CUR_POS) req->file->f_pos = kiocb->ki_pos; if (ret >= 0 && 
kiocb->ki_complete == io_complete_rw) - io_complete_rw(kiocb, ret, 0); + __io_complete_rw(req, ret, 0, cs); else io_rw_done(kiocb, ret); } @@ -2925,7 +2931,8 @@ static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter); } -static int io_read(struct io_kiocb *req, bool force_nonblock) +static int io_read(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; @@ -2960,7 +2967,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock) /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) { - kiocb_done(kiocb, ret2); + kiocb_done(kiocb, ret2, cs); } else { iter.count = iov_count; iter.nr_segs = nr_segs; @@ -2975,7 +2982,7 @@ copy_iov: if (ret2 == -EIOCBQUEUED) { goto out_free; } else if (ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2); + kiocb_done(kiocb, ret2, cs); goto out_free; } } @@ -3021,7 +3028,8 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } -static int io_write(struct io_kiocb *req, bool force_nonblock) +static int io_write(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; @@ -3090,7 +3098,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock) if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) ret2 = -EAGAIN; if (!force_nonblock || ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2); + kiocb_done(kiocb, ret2, cs); } else { iter.count = iov_count; iter.nr_segs = nr_segs; @@ -5416,7 +5424,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0) break; } - ret = io_read(req, force_nonblock); + ret = io_read(req, force_nonblock, cs); break; case IORING_OP_WRITEV: case 
IORING_OP_WRITE_FIXED: @@ -5426,7 +5434,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0) break; } - ret = io_write(req, force_nonblock); + ret = io_write(req, force_nonblock, cs); break; case IORING_OP_FSYNC: if (sqe) { From c40f63790ec957e9449056fb78d8c2523eff96b5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 25 Jun 2020 15:39:59 -0600 Subject: [PATCH 051/502] io_uring: use task_work for links if possible Currently links are always done in an async fashion, unless we catch them inline after we successfully complete a request without having to resort to blocking. This isn't necessarily the most efficient approach, it'd be more ideal if we could just use the task_work handling for this. Outside of saving an async jump, we can also do less prep work for these kinds of requests. Running dependent links from the task_work handler yields some nice performance benefits. As an example, examples/link-cp from the liburing repository uses read+write links to implement a copy operation. Without this patch, a cache cold 4G file read from a VM runs in about 3 seconds: $ time examples/link-cp /data/file /dev/null real 0m2.986s user 0m0.051s sys 0m2.843s and a subsequent cache hot run looks like this: $ time examples/link-cp /data/file /dev/null real 0m0.898s user 0m0.069s sys 0m0.797s With this patch in place, the cold case takes about 2.4 seconds: $ time examples/link-cp /data/file /dev/null real 0m2.400s user 0m0.020s sys 0m2.366s and the cache hot case looks like this: $ time examples/link-cp /data/file /dev/null real 0m0.676s user 0m0.010s sys 0m0.665s As expected, the (mostly) cache hot case yields the biggest improvement, running about 25% faster with this change, while the cache cold case yields about a 20% increase in performance. Outside of the performance increase, we're using less CPU as well, as we're not using the async offload threads at all for this anymore.
Signed-off-by: Jens Axboe --- fs/io_uring.c | 191 +++++++++++++++++++++++++++++++------------------- 1 file changed, 117 insertions(+), 74 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0bba12e4e559..b628e4429b75 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -898,6 +898,7 @@ enum io_mem_account { static void io_wq_submit_work(struct io_wq_work **workptr); static void io_cqring_fill_event(struct io_kiocb *req, long res); static void io_put_req(struct io_kiocb *req); +static void io_double_put_req(struct io_kiocb *req); static void __io_double_put_req(struct io_kiocb *req); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); @@ -951,6 +952,41 @@ static void __io_put_req_task(struct io_kiocb *req) put_task_struct(req->task); } +static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) +{ + struct mm_struct *mm = current->mm; + + if (mm) { + kthread_unuse_mm(mm); + mmput(mm); + } +} + +static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) +{ + if (!current->mm) { + if (unlikely(!mmget_not_zero(ctx->sqo_mm))) + return -EFAULT; + kthread_use_mm(ctx->sqo_mm); + } + + return 0; +} + +static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + if (!io_op_defs[req->opcode].needs_mm) + return 0; + return __io_sq_thread_acquire_mm(ctx); +} + +static inline void req_set_fail_links(struct io_kiocb *req) +{ + if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; +} + static void io_file_put_work(struct work_struct *work); /* @@ -1664,6 +1700,64 @@ static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) } } +static void __io_req_task_cancel(struct io_kiocb *req, int error) +{ + struct io_ring_ctx *ctx = req->ctx; + + spin_lock_irq(&ctx->completion_lock); + io_cqring_fill_event(req, error); + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); + 
req_set_fail_links(req); + io_double_put_req(req); +} + +static void io_req_task_cancel(struct callback_head *cb) +{ + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + + __io_req_task_cancel(req, -ECANCELED); +} + +static void __io_req_task_submit(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + __set_current_state(TASK_RUNNING); + if (!__io_sq_thread_acquire_mm(ctx)) { + mutex_lock(&ctx->uring_lock); + __io_queue_sqe(req, NULL, NULL); + mutex_unlock(&ctx->uring_lock); + } else { + __io_req_task_cancel(req, -EFAULT); + } +} + +static void io_req_task_submit(struct callback_head *cb) +{ + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + + __io_req_task_submit(req); +} + +static void io_req_task_queue(struct io_kiocb *req) +{ + struct task_struct *tsk = req->task; + int ret; + + init_task_work(&req->task_work, io_req_task_submit); + + ret = task_work_add(tsk, &req->task_work, true); + if (unlikely(ret)) { + init_task_work(&req->task_work, io_req_task_cancel); + tsk = io_wq_get_task(req->ctx->io_wq); + task_work_add(tsk, &req->task_work, true); + } + wake_up_process(tsk); +} + static void io_free_req(struct io_kiocb *req) { struct io_kiocb *nxt = NULL; @@ -1671,8 +1765,12 @@ static void io_free_req(struct io_kiocb *req) io_req_find_next(req, &nxt); __io_free_req(req); - if (nxt) - io_queue_async_work(nxt); + if (nxt) { + if (nxt->flags & REQ_F_WORK_INITIALIZED) + io_queue_async_work(nxt); + else + io_req_task_queue(nxt); + } } static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt) @@ -2013,12 +2111,6 @@ static void kiocb_end_write(struct io_kiocb *req) file_end_write(req->file); } -static inline void req_set_fail_links(struct io_kiocb *req) -{ - if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; -} - static void io_complete_rw_common(struct kiocb *kiocb, long res, struct io_comp_state *cs) { @@ -2035,35 +2127,6 @@ static void 
io_complete_rw_common(struct kiocb *kiocb, long res, __io_req_complete(req, res, cflags, cs); } -static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) -{ - struct mm_struct *mm = current->mm; - - if (mm) { - kthread_unuse_mm(mm); - mmput(mm); - } -} - -static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) -{ - if (!current->mm) { - if (unlikely(!mmget_not_zero(ctx->sqo_mm))) - return -EFAULT; - kthread_use_mm(ctx->sqo_mm); - } - - return 0; -} - -static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, - struct io_kiocb *req) -{ - if (!io_op_defs[req->opcode].needs_mm) - return 0; - return __io_sq_thread_acquire_mm(ctx); -} - #ifdef CONFIG_BLOCK static bool io_resubmit_prep(struct io_kiocb *req, int error) { @@ -2811,20 +2874,6 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } -static void __io_async_buf_error(struct io_kiocb *req, int error) -{ - struct io_ring_ctx *ctx = req->ctx; - - spin_lock_irq(&ctx->completion_lock); - io_cqring_fill_event(req, error); - io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); - - io_cqring_ev_posted(ctx); - req_set_fail_links(req); - io_double_put_req(req); -} - static void io_async_buf_cancel(struct callback_head *cb) { struct io_async_rw *rw; @@ -2832,27 +2881,18 @@ static void io_async_buf_cancel(struct callback_head *cb) rw = container_of(cb, struct io_async_rw, task_work); req = rw->wpq.wait.private; - __io_async_buf_error(req, -ECANCELED); + __io_req_task_cancel(req, -ECANCELED); } static void io_async_buf_retry(struct callback_head *cb) { struct io_async_rw *rw; - struct io_ring_ctx *ctx; struct io_kiocb *req; rw = container_of(cb, struct io_async_rw, task_work); req = rw->wpq.wait.private; - ctx = req->ctx; - __set_current_state(TASK_RUNNING); - if (!__io_sq_thread_acquire_mm(ctx)) { - mutex_lock(&ctx->uring_lock); - __io_queue_sqe(req, NULL, NULL); - mutex_unlock(&ctx->uring_lock); - } else { - __io_async_buf_error(req, -EFAULT); - } + 
__io_req_task_submit(req); } static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, @@ -5218,23 +5258,25 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock, } static int io_req_defer_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) + const struct io_uring_sqe *sqe, bool for_async) { ssize_t ret = 0; if (!sqe) return 0; - io_req_init_async(req); + if (for_async || (req->flags & REQ_F_WORK_INITIALIZED)) { + io_req_init_async(req); - if (io_op_defs[req->opcode].file_table) { - ret = io_grab_files(req); - if (unlikely(ret)) - return ret; + if (io_op_defs[req->opcode].file_table) { + ret = io_grab_files(req); + if (unlikely(ret)) + return ret; + } + + io_req_work_grab_env(req, &io_op_defs[req->opcode]); } - io_req_work_grab_env(req, &io_op_defs[req->opcode]); - switch (req->opcode) { case IORING_OP_NOP: break; @@ -5347,7 +5389,7 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!req->io) { if (io_alloc_async_ctx(req)) return -EAGAIN; - ret = io_req_defer_prep(req, sqe); + ret = io_req_defer_prep(req, sqe, true); if (ret < 0) return ret; } @@ -5966,7 +6008,7 @@ fail_req: ret = -EAGAIN; if (io_alloc_async_ctx(req)) goto fail_req; - ret = io_req_defer_prep(req, sqe); + ret = io_req_defer_prep(req, sqe, true); if (unlikely(ret < 0)) goto fail_req; } @@ -6022,13 +6064,14 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (io_alloc_async_ctx(req)) return -EAGAIN; - ret = io_req_defer_prep(req, sqe); + ret = io_req_defer_prep(req, sqe, false); if (ret) { /* fail even hard links since we don't submit */ head->flags |= REQ_F_FAIL_LINK; return ret; } trace_io_uring_link(ctx, req, head); + io_get_req_task(req); list_add_tail(&req->link_list, &head->link_list); /* last request of a link, enqueue the link */ @@ -6048,7 +6091,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (io_alloc_async_ctx(req)) return -EAGAIN; - ret = 
io_req_defer_prep(req, sqe); + ret = io_req_defer_prep(req, sqe, true); if (ret) req->flags |= REQ_F_FAIL_LINK; *link = req; From e883a79d8ced8e123f8c4042a29a7524c39935ab Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 25 Jun 2020 18:20:53 +0300 Subject: [PATCH 052/502] io-wq: compact io-wq flags numbers Renumerate IO_WQ flags, so they take adjacent bits Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/io-wq.h b/fs/io-wq.h index 071f1a997800..04239dfb12b0 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -5,10 +5,10 @@ struct io_wq; enum { IO_WQ_WORK_CANCEL = 1, - IO_WQ_WORK_HASHED = 4, - IO_WQ_WORK_UNBOUND = 32, - IO_WQ_WORK_NO_CANCEL = 256, - IO_WQ_WORK_CONCURRENT = 512, + IO_WQ_WORK_HASHED = 2, + IO_WQ_WORK_UNBOUND = 4, + IO_WQ_WORK_NO_CANCEL = 8, + IO_WQ_WORK_CONCURRENT = 16, IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */ }; From f4db7182e0de981a3f1b356e0cf43c6815423055 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 25 Jun 2020 18:20:54 +0300 Subject: [PATCH 053/502] io-wq: return next work from ->do_work() directly It's easier to return next work from ->do_work() than having an in-out argument. Looks nicer and easier to compile. Also, merge io_wq_assign_next() into its only user. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.c | 8 +++----- fs/io-wq.h | 2 +- fs/io_uring.c | 57 +++++++++++++++++++++------------------------------ 3 files changed, 27 insertions(+), 40 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 47c5f3aeb460..72f759e1d6eb 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -523,9 +523,8 @@ get_next: work->flags |= IO_WQ_WORK_CANCEL; hash = io_get_work_hash(work); - linked = old_work = work; - wq->do_work(&linked); - linked = (old_work == linked) ? 
NULL : linked; + old_work = work; + linked = wq->do_work(work); work = next_hashed; if (!work && linked && !io_wq_is_hashed(linked)) { @@ -781,8 +780,7 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) struct io_wq_work *old_work = work; work->flags |= IO_WQ_WORK_CANCEL; - wq->do_work(&work); - work = (work == old_work) ? NULL : work; + work = wq->do_work(work); wq->free_work(old_work); } while (work); } diff --git a/fs/io-wq.h b/fs/io-wq.h index 04239dfb12b0..114f12ec2d65 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -101,7 +101,7 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) } typedef void (free_work_fn)(struct io_wq_work *); -typedef void (io_wq_work_fn)(struct io_wq_work **); +typedef struct io_wq_work *(io_wq_work_fn)(struct io_wq_work *); struct io_wq_data { struct user_struct *user; diff --git a/fs/io_uring.c b/fs/io_uring.c index b628e4429b75..2e44b3788265 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -895,7 +895,6 @@ enum io_mem_account { ACCT_PINNED, }; -static void io_wq_submit_work(struct io_wq_work **workptr); static void io_cqring_fill_event(struct io_kiocb *req, long res); static void io_put_req(struct io_kiocb *req); static void io_double_put_req(struct io_kiocb *req); @@ -1773,20 +1772,6 @@ static void io_free_req(struct io_kiocb *req) } } -static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt) -{ - struct io_kiocb *link; - const struct io_op_def *def = &io_op_defs[nxt->opcode]; - - if ((nxt->flags & REQ_F_ISREG) && def->hash_reg_file) - io_wq_hash_work(&nxt->work, file_inode(nxt->file)); - - *workptr = &nxt->work; - link = io_prep_linked_timeout(nxt); - if (link) - nxt->flags |= REQ_F_QUEUE_TIMEOUT; -} - /* * Drop reference to request, return next in chain (if there is one) if this * was the last reference to this request. 
@@ -1806,24 +1791,29 @@ static void io_put_req(struct io_kiocb *req) io_free_req(req); } -static void io_steal_work(struct io_kiocb *req, - struct io_wq_work **workptr) +static struct io_wq_work *io_steal_work(struct io_kiocb *req) { - /* - * It's in an io-wq worker, so there always should be at least - * one reference, which will be dropped in io_put_work() just - * after the current handler returns. - * - * It also means, that if the counter dropped to 1, then there is - * no asynchronous users left, so it's safe to steal the next work. - */ - if (refcount_read(&req->refs) == 1) { - struct io_kiocb *nxt = NULL; + struct io_kiocb *link, *nxt = NULL; - io_req_find_next(req, &nxt); - if (nxt) - io_wq_assign_next(workptr, nxt); - } + /* + * A ref is owned by io-wq in which context we're. So, if that's the + * last one, it's safe to steal next work. False negatives are Ok, + * it just will be re-punted async in io_put_work() + */ + if (refcount_read(&req->refs) != 1) + return NULL; + + io_req_find_next(req, &nxt); + if (!nxt) + return NULL; + + if ((nxt->flags & REQ_F_ISREG) && io_op_defs[nxt->opcode].hash_reg_file) + io_wq_hash_work(&nxt->work, file_inode(nxt->file)); + + link = io_prep_linked_timeout(nxt); + if (link) + nxt->flags |= REQ_F_QUEUE_TIMEOUT; + return &nxt->work; } /* @@ -5718,9 +5708,8 @@ static void io_arm_async_linked_timeout(struct io_kiocb *req) io_queue_linked_timeout(link); } -static void io_wq_submit_work(struct io_wq_work **workptr) +static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work) { - struct io_wq_work *work = *workptr; struct io_kiocb *req = container_of(work, struct io_kiocb, work); int ret = 0; @@ -5751,7 +5740,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr) io_req_complete(req, ret); } - io_steal_work(req, workptr); + return io_steal_work(req); } static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, From 1e16c2f917a59d27fb6b540c44d66978c8ad29ef Mon Sep 17 00:00:00 2001 From: Randy Dunlap 
Date: Fri, 26 Jun 2020 16:32:50 -0700 Subject: [PATCH 054/502] io_uring: fix function args for !CONFIG_NET MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix build errors when CONFIG_NET is not set/enabled: ../fs/io_uring.c:5472:10: error: too many arguments to function ‘io_sendmsg’ ../fs/io_uring.c:5474:10: error: too many arguments to function ‘io_send’ ../fs/io_uring.c:5484:10: error: too many arguments to function ‘io_recvmsg’ ../fs/io_uring.c:5486:10: error: too many arguments to function ‘io_recv’ ../fs/io_uring.c:5510:9: error: too many arguments to function ‘io_accept’ ../fs/io_uring.c:5518:9: error: too many arguments to function ‘io_connect’ Signed-off-by: Randy Dunlap Cc: Jens Axboe Cc: io-uring@vger.kernel.org Signed-off-by: Jens Axboe --- fs/io_uring.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index af4d7a5c49f4..43ddda2a3d49 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4360,12 +4360,14 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EOPNOTSUPP; } -static int io_sendmsg(struct io_kiocb *req, bool force_nonblock) +static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } -static int io_send(struct io_kiocb *req, bool force_nonblock) +static int io_send(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } @@ -4376,12 +4378,14 @@ static int io_recvmsg_prep(struct io_kiocb *req, return -EOPNOTSUPP; } -static int io_recvmsg(struct io_kiocb *req, bool force_nonblock) +static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } -static int io_recv(struct io_kiocb *req, bool force_nonblock) +static int io_recv(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } @@ -4391,7 +4395,8 @@ static int 
io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EOPNOTSUPP; } -static int io_accept(struct io_kiocb *req, bool force_nonblock) +static int io_accept(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } @@ -4401,7 +4406,8 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EOPNOTSUPP; } -static int io_connect(struct io_kiocb *req, bool force_nonblock) +static int io_connect(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } From 8ef77766ba8694968ed4ba24311b4bacee14f235 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 27 Jun 2020 14:04:59 +0300 Subject: [PATCH 055/502] io_uring: fix req->work corruption req->work and req->task_work are in a union, so io_req_task_queue() screws everything that was in work. De-union them for now. [ 704.367253] BUG: unable to handle page fault for address: ffffffffaf7330d0 [ 704.367256] #PF: supervisor write access in kernel mode [ 704.367256] #PF: error_code(0x0003) - permissions violation [ 704.367261] CPU: 6 PID: 1654 Comm: io_wqe_worker-0 Tainted: G I 5.8.0-rc2-00038-ge28d0bdc4863-dirty #498 [ 704.367265] RIP: 0010:_raw_spin_lock+0x1e/0x36 ... [ 704.367276] __alloc_fd+0x35/0x150 [ 704.367279] __get_unused_fd_flags+0x25/0x30 [ 704.367280] io_openat2+0xcb/0x1b0 [ 704.367283] io_issue_sqe+0x36a/0x1320 [ 704.367294] io_wq_submit_work+0x58/0x160 [ 704.367295] io_worker_handle_work+0x2a3/0x430 [ 704.367296] io_wqe_worker+0x2a0/0x350 [ 704.367301] kthread+0x136/0x180 [ 704.367304] ret_from_fork+0x22/0x30 Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 43ddda2a3d49..dcf3ffb5ecf3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -668,12 +668,12 @@ struct io_kiocb { * restore the work, if needed. 
*/ struct { - struct callback_head task_work; struct hlist_node hash_node; struct async_poll *apoll; }; struct io_wq_work work; }; + struct callback_head task_work; }; #define IO_IOPOLL_BATCH 8 From 906a8c3fdbc367325d4200e39212a2a7715b7b0e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 27 Jun 2020 14:04:55 +0300 Subject: [PATCH 056/502] io_uring: fix punting req w/o grabbed env It's not enough to check for REQ_F_WORK_INITIALIZED and punt async assuming that io_req_work_grab_env() was called, it may not have been. E.g. io_close_prep() and personality path set the flag without further async init. As a quick fix, always pass next work through io_req_task_queue(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index dcf3ffb5ecf3..483457f6a7df 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1766,12 +1766,8 @@ static void io_free_req(struct io_kiocb *req) io_req_find_next(req, &nxt); __io_free_req(req); - if (nxt) { - if (nxt->flags & REQ_F_WORK_INITIALIZED) - io_queue_async_work(nxt); - else - io_req_task_queue(nxt); - } + if (nxt) + io_req_task_queue(nxt); } /* From 1bcb8c5d65a845e0ecb9e82237c399b29b8d15ea Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 27 Jun 2020 14:04:56 +0300 Subject: [PATCH 057/502] io_uring: fix feeding io-wq with uninit reqs io_steal_work() can't be sure that @nxt has req->work properly set, so we can't pass it to io-wq as is. A dirty quick fix -- drag it through io_req_task_queue(), and always return NULL from io_steal_work(). e.g. [ 50.770161] BUG: kernel NULL pointer dereference, address: 00000000 [ 50.770164] #PF: supervisor write access in kernel mode [ 50.770164] #PF: error_code(0x0002) - not-present page [ 50.770168] CPU: 1 PID: 1448 Comm: io_wqe_worker-0 Tainted: G I 5.8.0-rc2-00035-g2237d76530eb-dirty #494 [ 50.770172] RIP: 0010:override_creds+0x19/0x30 ... 
[ 50.770183] io_worker_handle_work+0x25c/0x430 [ 50.770185] io_wqe_worker+0x2a0/0x350 [ 50.770190] kthread+0x136/0x180 [ 50.770194] ret_from_fork+0x22/0x30 Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 483457f6a7df..658949bed77f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1791,7 +1791,7 @@ static void io_put_req(struct io_kiocb *req) static struct io_wq_work *io_steal_work(struct io_kiocb *req) { - struct io_kiocb *link, *nxt = NULL; + struct io_kiocb *nxt = NULL; /* * A ref is owned by io-wq in which context we're. So, if that's the @@ -1808,10 +1808,15 @@ static struct io_wq_work *io_steal_work(struct io_kiocb *req) if ((nxt->flags & REQ_F_ISREG) && io_op_defs[nxt->opcode].hash_reg_file) io_wq_hash_work(&nxt->work, file_inode(nxt->file)); - link = io_prep_linked_timeout(nxt); - if (link) - nxt->flags |= REQ_F_QUEUE_TIMEOUT; - return &nxt->work; + io_req_task_queue(nxt); + /* + * If we're going to return actual work, here should be timeout prep: + * + * link = io_prep_linked_timeout(nxt); + * if (link) + * nxt->flags |= REQ_F_QUEUE_TIMEOUT; + */ + return NULL; } /* From a6d45dd0d43e6d1275e002704540688b6768bc22 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 27 Jun 2020 14:04:57 +0300 Subject: [PATCH 058/502] io_uring: don't mark link's head for_async No reason to mark a head of a link as for-async in io_req_defer_prep(). grab_env(), etc. That will be done further during submission if necessary. Mark for_async=false saving extra grab_env() in many cases.
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 658949bed77f..545b137c7b4a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6092,7 +6092,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (io_alloc_async_ctx(req)) return -EAGAIN; - ret = io_req_defer_prep(req, sqe, true); + ret = io_req_defer_prep(req, sqe, false); if (ret) req->flags |= REQ_F_FAIL_LINK; *link = req; From 710c2bfb66474a186b0196e3342d43db0e6c04e1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 27 Jun 2020 14:04:58 +0300 Subject: [PATCH 059/502] io_uring: fix missing io_grab_files() We won't have valid ring_fd, ring_file in task work. Grab files early. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 545b137c7b4a..4a9929c0b4ad 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5270,15 +5270,15 @@ static int io_req_defer_prep(struct io_kiocb *req, if (!sqe) return 0; + if (io_op_defs[req->opcode].file_table) { + io_req_init_async(req); + ret = io_grab_files(req); + if (unlikely(ret)) + return ret; + } + if (for_async || (req->flags & REQ_F_WORK_INITIALIZED)) { io_req_init_async(req); - - if (io_op_defs[req->opcode].file_table) { - ret = io_grab_files(req); - if (unlikely(ret)) - return ret; - } - io_req_work_grab_env(req, &io_op_defs[req->opcode]); } From 8c9cb6cd9a46ae6fb7cb6c39cf6a48a53440feef Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Jun 2020 12:52:29 +0300 Subject: [PATCH 060/502] io_uring: fix refs underflow in io_iopoll_queue() Now io_complete_rw_common() puts a ref, extra io_put_req() in io_iopoll_queue() causes underflow. Remove it. [ 455.998620] refcount_t: underflow; use-after-free.
[ 455.998743] WARNING: CPU: 6 PID: 285394 at lib/refcount.c:28 refcount_warn_saturate+0xae/0xf0 [ 455.998772] CPU: 6 PID: 285394 Comm: read-write2 Tainted: G I E 5.8.0-rc2-00048-g1b1aa738f167-dirty #509 [ 455.998772] RIP: 0010:refcount_warn_saturate+0xae/0xf0 ... [ 455.998778] Call Trace: [ 455.998778] io_put_req+0x44/0x50 [ 455.998778] io_iopoll_complete+0x245/0x370 [ 455.998779] io_iopoll_getevents+0x12f/0x1a0 [ 455.998779] io_iopoll_reap_events.part.0+0x5e/0xa0 [ 455.998780] io_ring_ctx_wait_and_kill+0x132/0x1c0 [ 455.998780] io_uring_release+0x20/0x30 [ 455.998780] __fput+0xcd/0x230 [ 455.998781] ____fput+0xe/0x10 [ 455.998781] task_work_run+0x67/0xa0 [ 455.998781] do_exit+0x35d/0xb70 [ 455.998782] do_group_exit+0x43/0xa0 [ 455.998783] get_signal+0x140/0x900 [ 455.998783] do_signal+0x37/0x780 [ 455.998784] __prepare_exit_to_usermode+0x126/0x1c0 [ 455.998785] __syscall_return_slowpath+0x3b/0x1c0 [ 455.998785] do_syscall_64+0x5f/0xa0 [ 455.998785] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: a1d7c393c47 ("io_uring: enable READ/WRITE to use deferred completions") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 4a9929c0b4ad..ab9f2f3a9b56 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1904,7 +1904,6 @@ static void io_iopoll_queue(struct list_head *again) /* shouldn't happen unless io_uring is dying, cancel reqs */ if (unlikely(!current->mm)) { io_complete_rw_common(&req->rw.kiocb, -EAGAIN, NULL); - io_put_req(req); continue; } From e6543a816edca00b6b4c48625d142059d7211059 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Jun 2020 12:52:30 +0300 Subject: [PATCH 061/502] io_uring: remove inflight batching in free_many() io_free_req_many() is used only for iopoll requests, i.e. reads/writes. Hence no need to batch inflight unhooking. 
For safety, it'll be done by io_dismantle_req(), which replaces __io_req_aux_free(), and looks more solid and cleaner. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 41 ++++++++--------------------------------- 1 file changed, 8 insertions(+), 33 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ab9f2f3a9b56..9863cec8020f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1504,7 +1504,7 @@ static inline void io_put_file(struct io_kiocb *req, struct file *file, fput(file); } -static void __io_req_aux_free(struct io_kiocb *req) +static void io_dismantle_req(struct io_kiocb *req) { if (req->flags & REQ_F_NEED_CLEANUP) io_cleanup_req(req); @@ -1514,11 +1514,6 @@ static void __io_req_aux_free(struct io_kiocb *req) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); __io_put_req_task(req); io_req_work_drop_env(req); -} - -static void __io_free_req(struct io_kiocb *req) -{ - __io_req_aux_free(req); if (req->flags & REQ_F_INFLIGHT) { struct io_ring_ctx *ctx = req->ctx; @@ -1530,7 +1525,11 @@ static void __io_free_req(struct io_kiocb *req) wake_up(&ctx->inflight_wait); spin_unlock_irqrestore(&ctx->inflight_lock, flags); } +} +static void __io_free_req(struct io_kiocb *req) +{ + io_dismantle_req(req); percpu_ref_put(&req->ctx->refs); if (likely(!io_is_fallback_req(req))) kmem_cache_free(req_cachep, req); @@ -1549,35 +1548,11 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) if (!rb->to_free) return; if (rb->need_iter) { - int i, inflight = 0; - unsigned long flags; + int i; - for (i = 0; i < rb->to_free; i++) { - struct io_kiocb *req = rb->reqs[i]; - - if (req->flags & REQ_F_INFLIGHT) - inflight++; - __io_req_aux_free(req); - } - if (!inflight) - goto do_free; - - spin_lock_irqsave(&ctx->inflight_lock, flags); - for (i = 0; i < rb->to_free; i++) { - struct io_kiocb *req = rb->reqs[i]; - - if (req->flags & REQ_F_INFLIGHT) { - list_del(&req->inflight_entry); - if (!--inflight) - break; - } - } - 
spin_unlock_irqrestore(&ctx->inflight_lock, flags); - - if (waitqueue_active(&ctx->inflight_wait)) - wake_up(&ctx->inflight_wait); + for (i = 0; i < rb->to_free; i++) + io_dismantle_req(rb->reqs[i]); } -do_free: kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); percpu_ref_put_many(&ctx->refs, rb->to_free); rb->to_free = rb->need_iter = 0; From 2757a23e7f6441eabf605ca59eeb88c34071757d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Jun 2020 12:52:31 +0300 Subject: [PATCH 062/502] io_uring: dismantle req early and remove need_iter Every request in io_req_multi_free() has ->file set. Instead of pointlessly deferring and counting reqs with file, dismantle it in place and save for batch dealloc. It also saves us from potentially skipping io_cleanup_req(), put_task(), etc. Never happens though, because ->file is always there. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 9863cec8020f..8cb5252269d7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1540,22 +1540,16 @@ static void __io_free_req(struct io_kiocb *req) struct req_batch { void *reqs[IO_IOPOLL_BATCH]; int to_free; - int need_iter; }; static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) { if (!rb->to_free) return; - if (rb->need_iter) { - int i; - for (i = 0; i < rb->to_free; i++) - io_dismantle_req(rb->reqs[i]); - } kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); percpu_ref_put_many(&ctx->refs, rb->to_free); - rb->to_free = rb->need_iter = 0; + rb->to_free = 0; } static bool io_link_cancel_timeout(struct io_kiocb *req) @@ -1846,9 +1840,7 @@ static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req)) return false; - if (req->file || req->io) - rb->need_iter++; - + io_dismantle_req(req); rb->reqs[rb->to_free++] = req; if
(unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) io_free_req_many(req->ctx, rb); @@ -1900,7 +1892,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, /* order with ->result store in io_complete_rw_iopoll() */ smp_rmb(); - rb.to_free = rb.need_iter = 0; + rb.to_free = 0; while (!list_empty(done)) { int cflags = 0; From c3524383333e4ff2f720ab0c02b3a329f72de78b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Jun 2020 12:52:32 +0300 Subject: [PATCH 063/502] io_uring: batch-free linked requests as well There is no reason to not batch deallocation of linked requests. Take away its next req first and handle it as everything else in io_req_multi_free(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8cb5252269d7..af8d1d64f858 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1728,17 +1728,21 @@ static void io_req_task_queue(struct io_kiocb *req) wake_up_process(tsk); } -static void io_free_req(struct io_kiocb *req) +static void io_queue_next(struct io_kiocb *req) { struct io_kiocb *nxt = NULL; io_req_find_next(req, &nxt); - __io_free_req(req); - if (nxt) io_req_task_queue(nxt); } +static void io_free_req(struct io_kiocb *req) +{ + io_queue_next(req); + __io_free_req(req); +} + /* * Drop reference to request, return next in chain (if there is one) if this * was the last reference to this request. 
@@ -1835,16 +1839,19 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; } -static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) +static inline void io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) { - if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req)) - return false; + if (unlikely(io_is_fallback_req(req))) { + io_free_req(req); + return; + } + if (req->flags & REQ_F_LINK_HEAD) + io_queue_next(req); io_dismantle_req(req); rb->reqs[rb->to_free++] = req; if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) io_free_req_many(req->ctx, rb); - return true; } static int io_put_kbuf(struct io_kiocb *req) @@ -1910,9 +1917,8 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, __io_cqring_fill_event(req, req->result, cflags); (*nr_events)++; - if (refcount_dec_and_test(&req->refs) && - !io_req_multi_free(&rb, req)) - io_free_req(req); + if (refcount_dec_and_test(&req->refs)) + io_req_multi_free(&rb, req); } io_commit_cqring(ctx); From 2d6500d44c1374808040d120e625a22b013c9f0d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Jun 2020 12:52:33 +0300 Subject: [PATCH 064/502] io_uring: cosmetic changes for batch free Move all batch free bits close to each other and rename in a consistent way. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 69 +++++++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 32 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index af8d1d64f858..18a452ac81cc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1537,21 +1537,6 @@ static void __io_free_req(struct io_kiocb *req) clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req); } -struct req_batch { - void *reqs[IO_IOPOLL_BATCH]; - int to_free; -}; - -static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) -{ - if (!rb->to_free) - return; - - kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); - percpu_ref_put_many(&ctx->refs, rb->to_free); - rb->to_free = 0; -} - static bool io_link_cancel_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1743,6 +1728,41 @@ static void io_free_req(struct io_kiocb *req) __io_free_req(req); } +struct req_batch { + void *reqs[IO_IOPOLL_BATCH]; + int to_free; +}; + +static void __io_req_free_batch_flush(struct io_ring_ctx *ctx, + struct req_batch *rb) +{ + kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); + percpu_ref_put_many(&ctx->refs, rb->to_free); + rb->to_free = 0; +} + +static void io_req_free_batch_finish(struct io_ring_ctx *ctx, + struct req_batch *rb) +{ + if (rb->to_free) + __io_req_free_batch_flush(ctx, rb); +} + +static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) +{ + if (unlikely(io_is_fallback_req(req))) { + io_free_req(req); + return; + } + if (req->flags & REQ_F_LINK_HEAD) + io_queue_next(req); + + io_dismantle_req(req); + rb->reqs[rb->to_free++] = req; + if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) + __io_req_free_batch_flush(req->ctx, rb); +} + /* * Drop reference to request, return next in chain (if there is one) if this * was the last reference to this request. 
@@ -1839,21 +1859,6 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; } -static inline void io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) -{ - if (unlikely(io_is_fallback_req(req))) { - io_free_req(req); - return; - } - if (req->flags & REQ_F_LINK_HEAD) - io_queue_next(req); - - io_dismantle_req(req); - rb->reqs[rb->to_free++] = req; - if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) - io_free_req_many(req->ctx, rb); -} - static int io_put_kbuf(struct io_kiocb *req) { struct io_buffer *kbuf; @@ -1918,13 +1923,13 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, (*nr_events)++; if (refcount_dec_and_test(&req->refs)) - io_req_multi_free(&rb, req); + io_req_free_batch(&rb, req); } io_commit_cqring(ctx); if (ctx->flags & IORING_SETUP_SQPOLL) io_cqring_ev_posted(ctx); - io_free_req_many(ctx, &rb); + io_req_free_batch_finish(ctx, &rb); if (!list_empty(&again)) io_iopoll_queue(&again); From 9b0d911acce00b67f7e7336f838b732de7d917d6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Jun 2020 12:52:34 +0300 Subject: [PATCH 065/502] io_uring: kill REQ_F_LINK_NEXT After pulling nxt from a request, it's no more a links head, so clear REQ_F_LINK_HEAD. Absence of this flag also indicates that there are no linked requests, so replacing REQ_F_LINK_NEXT, which can be killed. Linked timeouts also behave leaving the flag intact when necessary. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 18a452ac81cc..14c5655c0434 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -526,7 +526,6 @@ enum { REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, REQ_F_LINK_HEAD_BIT, - REQ_F_LINK_NEXT_BIT, REQ_F_FAIL_LINK_BIT, REQ_F_INFLIGHT_BIT, REQ_F_CUR_POS_BIT, @@ -565,8 +564,6 @@ enum { /* head of a link */ REQ_F_LINK_HEAD = BIT(REQ_F_LINK_HEAD_BIT), - /* already grabbed next link */ - REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT), /* fail rest of links */ REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT), /* on inflight list */ @@ -1559,10 +1556,6 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) struct io_ring_ctx *ctx = req->ctx; bool wake_ev = false; - /* Already got next link */ - if (req->flags & REQ_F_LINK_NEXT) - return; - /* * The list should never be empty when we are called here. But could * potentially happen if the chain is messed up, check to be on the @@ -1587,7 +1580,6 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) break; } - req->flags |= REQ_F_LINK_NEXT; if (wake_ev) io_cqring_ev_posted(ctx); } @@ -1628,6 +1620,7 @@ static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) { if (likely(!(req->flags & REQ_F_LINK_HEAD))) return; + req->flags &= ~REQ_F_LINK_HEAD; /* * If LINK is set, we have dependent requests in this chain. If we From 6795c5aba247653f99d1f336ff496dd74659b322 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Jun 2020 12:52:35 +0300 Subject: [PATCH 066/502] io_uring: clean up req->result setting by rw Assign req->result to io_size early in io_{read,write}(), it's enough and makes it more straightforward. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 14c5655c0434..f283d111666b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2384,7 +2384,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, kiocb->ki_flags |= IOCB_HIPRI; kiocb->ki_complete = io_complete_rw_iopoll; - req->result = 0; req->iopoll_completed = 0; } else { if (kiocb->ki_flags & IOCB_HIPRI) @@ -2957,10 +2956,8 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, if (!force_nonblock) kiocb->ki_flags &= ~IOCB_NOWAIT; - req->result = 0; io_size = ret; - if (req->flags & REQ_F_LINK_HEAD) - req->result = io_size; + req->result = io_size; /* If the file doesn't support async, just async punt */ if (force_nonblock && !io_file_supports_async(req->file, READ)) @@ -3054,10 +3051,8 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, if (!force_nonblock) req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; - req->result = 0; io_size = ret; - if (req->flags & REQ_F_LINK_HEAD) - req->result = io_size; + req->result = io_size; /* If the file doesn't support async, just async punt */ if (force_nonblock && !io_file_supports_async(req->file, WRITE)) From 3adfecaa647ff8afa4b6f5907193cf751a0f8351 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Jun 2020 12:52:37 +0300 Subject: [PATCH 067/502] io_uring: do task_work_run() during iopoll There are a lot of new users of task_work, and some of task_work_add() may happen while we do io polling, thus make iopoll from time to time to do task_work_run(), so it doesn't poll for sitting there reqs. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index f283d111666b..c514a5209703 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2052,6 +2052,8 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, */ if (!(++iters & 7)) { mutex_unlock(&ctx->uring_lock); + if (current->task_works) + task_work_run(); mutex_lock(&ctx->uring_lock); } From f3a6fa2267480d7f19fbde8316372be46055e548 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Jun 2020 12:52:38 +0300 Subject: [PATCH 068/502] io_uring: fix iopoll -EAGAIN handling req->iopoll() is not necessarily called by a task that submitted a request. Because of that, it's dangerous to grab_env() and punt async on -EAGAIN, potentially grabbing another task's mm and corrupting its memory. Do resubmit from the submitter task context. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c514a5209703..9d3d8d3866cc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -892,6 +892,7 @@ enum io_mem_account { ACCT_PINNED, }; +static bool io_rw_reissue(struct io_kiocb *req, long res); static void io_cqring_fill_event(struct io_kiocb *req, long res); static void io_put_req(struct io_kiocb *req); static void io_double_put_req(struct io_kiocb *req); @@ -1873,14 +1874,9 @@ static void io_iopoll_queue(struct list_head *again) req = list_first_entry(again, struct io_kiocb, list); list_del(&req->list); - /* shouldn't happen unless io_uring is dying, cancel reqs */ - if (unlikely(!current->mm)) { + /* should have ->mm unless io_uring is dying, kill reqs then */ + if (unlikely(!current->mm) || !io_rw_reissue(req, -EAGAIN)) io_complete_rw_common(&req->rw.kiocb, -EAGAIN, NULL); - continue; - } - - refcount_inc(&req->refs); - io_queue_async_work(req); } while (!list_empty(again)); }
@@ -2387,6 +2383,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, kiocb->ki_flags |= IOCB_HIPRI; kiocb->ki_complete = io_complete_rw_iopoll; req->iopoll_completed = 0; + io_get_req_task(req); } else { if (kiocb->ki_flags & IOCB_HIPRI) return -EINVAL; From fb49278624f75e15d36c3c43d322ca8961fb40e9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Jun 2020 12:59:48 +0300 Subject: [PATCH 069/502] io_uring: fix missing wake_up io_rw_reissue() Don't forget to wake up a process to which io_rw_reissue() added task_work. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 9d3d8d3866cc..92c7e2a96912 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2168,8 +2168,10 @@ static bool io_rw_reissue(struct io_kiocb *req, long res) tsk = req->task; init_task_work(&req->task_work, io_rw_resubmit); ret = task_work_add(tsk, &req->task_work, true); - if (!ret) + if (!ret) { + wake_up_process(tsk); return true; + } #endif return false; } From 0188d08a46ffe4a39c6b463451a41d8b503d04d6 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Fri, 12 Jun 2020 13:06:07 +0200 Subject: [PATCH 070/502] s390: convert to msecs_to_jiffies() Instead of using the old 'jiffies + HZ {/,*} something' calculation use msecs_to_jiffies() as that makes the code more readable. 
Signed-off-by: Sven Schnelle Reviewed-by: Heiko Carstens Signed-off-by: Heiko Carstens --- arch/s390/kernel/lgr.c | 2 +- arch/s390/kernel/time.c | 2 +- arch/s390/kernel/topology.c | 4 ++-- arch/s390/mm/cmm.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/s390/kernel/lgr.c b/arch/s390/kernel/lgr.c index 452502f9a0d9..3b895971c3d0 100644 --- a/arch/s390/kernel/lgr.c +++ b/arch/s390/kernel/lgr.c @@ -167,7 +167,7 @@ static struct timer_list lgr_timer; */ static void lgr_timer_set(void) { - mod_timer(&lgr_timer, jiffies + LGR_TIMER_INTERVAL_SECS * HZ); + mod_timer(&lgr_timer, jiffies + msecs_to_jiffies(LGR_TIMER_INTERVAL_SECS * MSEC_PER_SEC)); } /* diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index b1113b519432..6bc20861fff9 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -669,7 +669,7 @@ static void stp_work_fn(struct work_struct *work) * There is a usable clock but the synchonization failed. * Retry after a second. */ - mod_timer(&stp_timer, jiffies + HZ); + mod_timer(&stp_timer, jiffies + msecs_to_jiffies(MSEC_PER_SEC)); out_unlock: mutex_unlock(&stp_work_mutex); diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 332b542548cd..ca47141a5be9 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -356,9 +356,9 @@ static atomic_t topology_poll = ATOMIC_INIT(0); static void set_topology_timer(void) { if (atomic_add_unless(&topology_poll, -1, 0)) - mod_timer(&topology_timer, jiffies + HZ / 10); + mod_timer(&topology_timer, jiffies + msecs_to_jiffies(100)); else - mod_timer(&topology_timer, jiffies + HZ * 60); + mod_timer(&topology_timer, jiffies + msecs_to_jiffies(60 * MSEC_PER_SEC)); } void topology_expect_change(void) diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index 36bce727897b..5c15ae3daf71 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -189,7 +189,7 @@ static void cmm_set_timer(void) del_timer(&cmm_timer); return; } - 
mod_timer(&cmm_timer, jiffies + cmm_timeout_seconds * HZ); + mod_timer(&cmm_timer, jiffies + msecs_to_jiffies(cmm_timeout_seconds * MSEC_PER_SEC)); } static void cmm_timer_fn(struct timer_list *unused) From b39e7724b0c28d569e9bd7e95f1b839f64e154bd Mon Sep 17 00:00:00 2001 From: Alexander Egorenkov Date: Fri, 19 Jun 2020 10:38:46 +0200 Subject: [PATCH 071/502] s390/zcore: remove memmap device Remove unused /sys/kernel/debug/zcore/memmap device. Since at least version 1.24.0 of s390-tools zfcpdump no longer needs it and reads /proc/vmcore instead. Signed-off-by: Alexander Egorenkov Reviewed-by: Philipp Rudo Signed-off-by: Heiko Carstens --- drivers/s390/char/zcore.c | 57 ++------------------------------------- 1 file changed, 2 insertions(+), 55 deletions(-) diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c index 08f812475f5e..d29f1b71618e 100644 --- a/drivers/s390/char/zcore.c +++ b/drivers/s390/char/zcore.c @@ -1,8 +1,7 @@ // SPDX-License-Identifier: GPL-1.0+ /* * zcore module to export memory content and register sets for creating system - * dumps on SCSI disks (zfcpdump). The "zcore/mem" debugfs file shows the same - * dump format as s390 standalone dumps. + * dumps on SCSI disks (zfcpdump). * * For more information please refer to Documentation/s390/zfcpdump.rst * @@ -16,7 +15,6 @@ #include #include #include -#include #include #include @@ -33,8 +31,6 @@ #define TRACE(x...) 
debug_sprintf_event(zcore_dbf, 1, x) -#define CHUNK_INFO_SIZE 34 /* 2 16-byte char, each followed by blank */ - enum arch_id { ARCH_S390 = 0, ARCH_S390X = 1, @@ -48,7 +44,6 @@ struct ipib_info { static struct debug_info *zcore_dbf; static int hsa_available; static struct dentry *zcore_dir; -static struct dentry *zcore_memmap_file; static struct dentry *zcore_reipl_file; static struct dentry *zcore_hsa_file; static struct ipl_parameter_block *zcore_ipl_block; @@ -139,46 +134,6 @@ static void release_hsa(void) hsa_available = 0; } -static ssize_t zcore_memmap_read(struct file *filp, char __user *buf, - size_t count, loff_t *ppos) -{ - return simple_read_from_buffer(buf, count, ppos, filp->private_data, - memblock.memory.cnt * CHUNK_INFO_SIZE); -} - -static int zcore_memmap_open(struct inode *inode, struct file *filp) -{ - struct memblock_region *reg; - char *buf; - int i = 0; - - buf = kcalloc(memblock.memory.cnt, CHUNK_INFO_SIZE, GFP_KERNEL); - if (!buf) { - return -ENOMEM; - } - for_each_memblock(memory, reg) { - sprintf(buf + (i++ * CHUNK_INFO_SIZE), "%016llx %016llx ", - (unsigned long long) reg->base, - (unsigned long long) reg->size); - } - filp->private_data = buf; - return nonseekable_open(inode, filp); -} - -static int zcore_memmap_release(struct inode *inode, struct file *filp) -{ - kfree(filp->private_data); - return 0; -} - -static const struct file_operations zcore_memmap_fops = { - .owner = THIS_MODULE, - .read = zcore_memmap_read, - .open = zcore_memmap_open, - .release = zcore_memmap_release, - .llseek = no_llseek, -}; - static ssize_t zcore_reipl_write(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) { @@ -335,17 +290,11 @@ static int __init zcore_init(void) rc = -ENOMEM; goto fail; } - zcore_memmap_file = debugfs_create_file("memmap", S_IRUSR, zcore_dir, - NULL, &zcore_memmap_fops); - if (!zcore_memmap_file) { - rc = -ENOMEM; - goto fail_dir; - } zcore_reipl_file = debugfs_create_file("reipl", S_IRUSR, zcore_dir, NULL, 
&zcore_reipl_fops); if (!zcore_reipl_file) { rc = -ENOMEM; - goto fail_memmap_file; + goto fail_dir; } zcore_hsa_file = debugfs_create_file("hsa", S_IRUSR|S_IWUSR, zcore_dir, NULL, &zcore_hsa_fops); @@ -357,8 +306,6 @@ static int __init zcore_init(void) fail_reipl_file: debugfs_remove(zcore_reipl_file); -fail_memmap_file: - debugfs_remove(zcore_memmap_file); fail_dir: debugfs_remove(zcore_dir); fail: From 90ce70f06546e646713d036cfdec39427df296f7 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 12 May 2020 09:54:58 +0200 Subject: [PATCH 072/502] s390/pci: remove unused functions Signed-off-by: Sven Schnelle Acked-by: Niklas Schnelle Signed-off-by: Heiko Carstens --- arch/s390/include/asm/pci_dma.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/arch/s390/include/asm/pci_dma.h b/arch/s390/include/asm/pci_dma.h index 419fac7a62c0..f62cd3ed2d44 100644 --- a/arch/s390/include/asm/pci_dma.h +++ b/arch/s390/include/asm/pci_dma.h @@ -131,12 +131,6 @@ static inline void validate_st_entry(unsigned long *entry) *entry |= ZPCI_TABLE_VALID; } -static inline void invalidate_table_entry(unsigned long *entry) -{ - *entry &= ~ZPCI_TABLE_VALID_MASK; - *entry |= ZPCI_TABLE_INVALID; -} - static inline void invalidate_pt_entry(unsigned long *entry) { WARN_ON_ONCE((*entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_INVALID); @@ -173,11 +167,6 @@ static inline int pt_entry_isvalid(unsigned long entry) return (entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID; } -static inline int entry_isprotected(unsigned long entry) -{ - return (entry & ZPCI_TABLE_PROT_MASK) == ZPCI_TABLE_PROTECTED; -} - static inline unsigned long *get_rt_sto(unsigned long entry) { return ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_RTX) From 7fa0d6ff35cfaae9cc7012d9220cd24400c650f1 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 12 May 2020 09:55:18 +0200 Subject: [PATCH 073/502] s390/time: remove unused function Signed-off-by: Sven Schnelle Acked-by: Heiko Carstens Signed-off-by: Heiko 
Carstens --- arch/s390/include/asm/timex.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h index 6bf3a45ccfec..289aaff4d365 100644 --- a/arch/s390/include/asm/timex.h +++ b/arch/s390/include/asm/timex.h @@ -49,11 +49,6 @@ static inline void set_clock_comparator(__u64 time) asm volatile("sckc %0" : : "Q" (time)); } -static inline void store_clock_comparator(__u64 *time) -{ - asm volatile("stckc %0" : "=Q" (*time)); -} - void clock_comparator_work(void); void __init time_early_init(void); From ecb1ff6833c461ea3bcf16396cd4f1eb50b119c2 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 18 Jun 2020 07:09:57 +0200 Subject: [PATCH 074/502] s390/debug: remove raw view There is not a single user of the debug raw view. Therefore remove it before anybody uses it. If anybody would make use of the view it would expose the struct __debug_entry definition to userspace and really would make it uapi. This wouldn't be good, since the definition is suboptimal and needs to be changed. Right now the structure definition is only defined to be uapi, however there is no user. Signed-off-by: Heiko Carstens --- Documentation/s390/s390dbf.rst | 17 ++++---------- arch/s390/include/asm/debug.h | 1 - arch/s390/kernel/debug.c | 42 ---------------------------------- 3 files changed, 4 insertions(+), 56 deletions(-) diff --git a/Documentation/s390/s390dbf.rst b/Documentation/s390/s390dbf.rst index cdb36842b898..af8bdc3629e7 100644 --- a/Documentation/s390/s390dbf.rst +++ b/Documentation/s390/s390dbf.rst @@ -67,7 +67,7 @@ corresponding component. The debugfs normally should be mounted to The content of the directories are files which represent different views to the debug log. Each component can decide which views should be used through registering them with the function :c:func:`debug_register_view()`. -Predefined views for hex/ascii, sprintf and raw binary data are provided. 
+Predefined views for hex/ascii and sprintf data are provided. It is also possible to define other views. The content of a view can be inspected simply by reading the corresponding debugfs file. @@ -119,8 +119,6 @@ Predefined views: extern struct debug_view debug_hex_ascii_view; - extern struct debug_view debug_raw_view; - extern struct debug_view debug_sprintf_view; Examples @@ -129,7 +127,7 @@ Examples .. code-block:: c /* - * hex_ascii- + raw-view Example + * hex_ascii-view Example */ #include @@ -143,7 +141,6 @@ Examples debug_info = debug_register("test", 1, 4, 4 ); debug_register_view(debug_info, &debug_hex_ascii_view); - debug_register_view(debug_info, &debug_raw_view); debug_text_event(debug_info, 4 , "one "); debug_int_exception(debug_info, 4, 4711); @@ -201,7 +198,7 @@ debugfs-files: Example:: > ls /sys/kernel/debug/s390dbf/dasd - flush hex_ascii level pages raw + flush hex_ascii level pages > cat /sys/kernel/debug/s390dbf/dasd/hex_ascii | sort -k2,2 -s 00 00974733272:680099 2 - 02 0006ad7e 07 ea 4a 90 | .... 00 00974733272:682210 2 - 02 0006ade6 46 52 45 45 | FREE @@ -298,10 +295,9 @@ order to see the debug entries well formatted. Predefined Views ---------------- -There are three predefined views: hex_ascii, raw and sprintf. +There are two predefined views: hex_ascii and sprintf. The hex_ascii view shows the data field in hex and ascii representation (e.g. ``45 43 4b 44 | ECKD``). -The raw view returns a bytestream as the debug areas are stored in memory. The sprintf view formats the debug entries in the same way as the sprintf function would do. 
The sprintf event/exception functions write to the @@ -334,11 +330,6 @@ The format of the hex_ascii and sprintf view is as follows: - Return Address to caller - data field -The format of the raw view is: - -- Header as described in debug.h -- datafield - A typical line of the hex_ascii view will look like the following (first line is only for explanation and will not be displayed when 'cating' the view):: diff --git a/arch/s390/include/asm/debug.h b/arch/s390/include/asm/debug.h index 310134015541..d39da8f3130e 100644 --- a/arch/s390/include/asm/debug.h +++ b/arch/s390/include/asm/debug.h @@ -82,7 +82,6 @@ struct debug_view { }; extern struct debug_view debug_hex_ascii_view; -extern struct debug_view debug_raw_view; extern struct debug_view debug_sprintf_view; /* do NOT use the _common functions */ diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index 263075a1af36..beb4b44a11d1 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -90,27 +90,11 @@ static int debug_input_flush_fn(debug_info_t *id, struct debug_view *view, size_t user_buf_size, loff_t *offset); static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view, char *out_buf, const char *in_buf); -static int debug_raw_format_fn(debug_info_t *id, - struct debug_view *view, char *out_buf, - const char *in_buf); -static int debug_raw_header_fn(debug_info_t *id, struct debug_view *view, - int area, debug_entry_t *entry, char *out_buf); - static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view, char *out_buf, debug_sprintf_entry_t *curr_event); /* globals */ -struct debug_view debug_raw_view = { - "raw", - NULL, - &debug_raw_header_fn, - &debug_raw_format_fn, - NULL, - NULL -}; -EXPORT_SYMBOL(debug_raw_view); - struct debug_view debug_hex_ascii_view = { "hex_ascii", NULL, @@ -1385,32 +1369,6 @@ out: return rc; /* number of input characters */ } -/* - * prints debug header in raw format - */ -static int debug_raw_header_fn(debug_info_t 
*id, struct debug_view *view, - int area, debug_entry_t *entry, char *out_buf) -{ - int rc; - - rc = sizeof(debug_entry_t); - memcpy(out_buf, entry, sizeof(debug_entry_t)); - return rc; -} - -/* - * prints debug data in raw format - */ -static int debug_raw_format_fn(debug_info_t *id, struct debug_view *view, - char *out_buf, const char *in_buf) -{ - int rc; - - rc = id->buf_size; - memcpy(out_buf, in_buf, id->buf_size); - return rc; -} - /* * prints debug data in hex/ascii format */ From 6ffb3f6b46d0d02c318946047dc5ce6553495848 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 18 Jun 2020 07:41:18 +0200 Subject: [PATCH 075/502] s390/debug: remove struct __debug_entry from uapi There is no interface to userspace which exposes anything that would require the struct __debug_entry definition. Therefore remove it from uapi. This allows to change the definition, since it is only kernel internally used. The only exception is the crash utility, however that tool must handle changes all the time anyway. 
Signed-off-by: Heiko Carstens --- arch/s390/include/asm/debug.h | 17 ++++++++++++++- arch/s390/include/uapi/asm/debug.h | 35 ------------------------------ 2 files changed, 16 insertions(+), 36 deletions(-) delete mode 100644 arch/s390/include/uapi/asm/debug.h diff --git a/arch/s390/include/asm/debug.h b/arch/s390/include/asm/debug.h index d39da8f3130e..17a26261f288 100644 --- a/arch/s390/include/asm/debug.h +++ b/arch/s390/include/asm/debug.h @@ -12,7 +12,7 @@ #include #include #include -#include +#include #define DEBUG_MAX_LEVEL 6 /* debug levels range from 0 to 6 */ #define DEBUG_OFF_LEVEL -1 /* level where debug is switched off */ @@ -26,6 +26,21 @@ #define DEBUG_DATA(entry) (char *)(entry + 1) /* data is stored behind */ /* the entry information */ +#define __DEBUG_FEATURE_VERSION 2 /* version of debug feature */ + +struct __debug_entry { + union { + struct { + unsigned long clock : 52; + unsigned long exception : 1; + unsigned long level : 3; + unsigned long cpuid : 8; + } fields; + unsigned long stck; + } id; + void *caller; +} __packed; + typedef struct __debug_entry debug_entry_t; struct debug_view; diff --git a/arch/s390/include/uapi/asm/debug.h b/arch/s390/include/uapi/asm/debug.h deleted file mode 100644 index c7c564d9aea4..000000000000 --- a/arch/s390/include/uapi/asm/debug.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * S/390 debug facility - * - * Copyright IBM Corp. 1999, 2000 - */ - -#ifndef _UAPIDEBUG_H -#define _UAPIDEBUG_H - -#include - -/* Note: - * struct __debug_entry must be defined outside of #ifdef __KERNEL__ - * in order to allow a user program to analyze the 'raw'-view. 
- */ - -struct __debug_entry{ - union { - struct { - unsigned long long clock:52; - unsigned long long exception:1; - unsigned long long level:3; - unsigned long long cpuid:8; - } fields; - - unsigned long long stck; - } id; - void* caller; -} __attribute__((packed)); - - -#define __DEBUG_FEATURE_VERSION 2 /* version of debug feature */ - -#endif /* _UAPIDEBUG_H */ From 28ccce5f50af2e9484d6b74b22ff9eb54bb775a2 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 17 Jun 2020 16:29:30 -0500 Subject: [PATCH 076/502] s390/appldata: use struct_size() helper Make use of the struct_size() helper instead of an open-coded version in order to avoid any potential type mistakes. This code was detected with the help of Coccinelle and, audited and fixed manually. Signed-off-by: Gustavo A. R. Silva Message-Id: <20200617212930.GA11728@embeddedor> Signed-off-by: Heiko Carstens --- arch/s390/appldata/appldata_os.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c index 5503217366ec..a363d30ce739 100644 --- a/arch/s390/appldata/appldata_os.c +++ b/arch/s390/appldata/appldata_os.c @@ -129,8 +129,7 @@ static void appldata_get_os_data(void *data) os_data->nr_cpus = j; - new_size = sizeof(struct appldata_os_data) + - (os_data->nr_cpus * sizeof(struct appldata_os_per_cpu)); + new_size = struct_size(os_data, os_cpu, os_data->nr_cpus); if (ops.size != new_size) { if (ops.active) { rc = appldata_diag(APPLDATA_RECORD_OS_ID, @@ -165,8 +164,7 @@ static int __init appldata_os_init(void) { int rc, max_size; - max_size = sizeof(struct appldata_os_data) + - (num_possible_cpus() * sizeof(struct appldata_os_per_cpu)); + max_size = struct_size(appldata_os_data, os_cpu, num_possible_cpus()); if (max_size > APPLDATA_MAX_REC_SIZE) { pr_err("Maximum OS record size %i exceeds the maximum " "record size %i\n", max_size, APPLDATA_MAX_REC_SIZE); From 6b05dfacd761c6ace11def4b3b42fc6a7583fec3 Mon Sep 17 00:00:00 
2001 From: Mauro Carvalho Chehab Date: Tue, 21 Apr 2020 19:04:02 +0200 Subject: [PATCH 077/502] docs: RCU: Convert checklist.txt to ReST - Add a SPDX header; - Adjust document title; - Some whitespace fixes and new line breaks; - Use the right list markups; - Add it to RCU/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paul E. McKenney --- .../RCU/{checklist.txt => checklist.rst} | 17 ++++++++++++----- Documentation/RCU/index.rst | 3 +++ 2 files changed, 15 insertions(+), 5 deletions(-) rename Documentation/RCU/{checklist.txt => checklist.rst} (98%) diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.rst similarity index 98% rename from Documentation/RCU/checklist.txt rename to Documentation/RCU/checklist.rst index e98ff261a438..2efed9926c3f 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.rst @@ -1,4 +1,8 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================ Review Checklist for RCU Patches +================================ This document contains a checklist for producing and reviewing patches @@ -411,18 +415,21 @@ over a rather long period of time, but improvements are always welcome! __rcu sparse checks to validate your RCU code. These can help find problems as follows: - CONFIG_PROVE_LOCKING: check that accesses to RCU-protected data + CONFIG_PROVE_LOCKING: + check that accesses to RCU-protected data structures are carried out under the proper RCU read-side critical section, while holding the right combination of locks, or whatever other conditions are appropriate. - CONFIG_DEBUG_OBJECTS_RCU_HEAD: check that you don't pass the + CONFIG_DEBUG_OBJECTS_RCU_HEAD: + check that you don't pass the same object to call_rcu() (or friends) before an RCU grace period has elapsed since the last time that you passed that same object to call_rcu() (or friends). 
- __rcu sparse checks: tag the pointer to the RCU-protected data + __rcu sparse checks: + tag the pointer to the RCU-protected data structure with __rcu, and sparse will warn you if you access that pointer without the services of one of the variants of rcu_dereference(). @@ -442,8 +449,8 @@ over a rather long period of time, but improvements are always welcome! You instead need to use one of the barrier functions: - o call_rcu() -> rcu_barrier() - o call_srcu() -> srcu_barrier() + - call_rcu() -> rcu_barrier() + - call_srcu() -> srcu_barrier() However, these barrier functions are absolutely -not- guaranteed to wait for a grace period. In fact, if there are no call_rcu() diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst index 81a0a1e5f767..c1ba4d130bb0 100644 --- a/Documentation/RCU/index.rst +++ b/Documentation/RCU/index.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + .. _rcu_concepts: ============ @@ -8,6 +10,7 @@ RCU concepts :maxdepth: 3 arrayRCU + checklist rcubarrier rcu_dereference whatisRCU From a3b0a79f8903f955250505f99d1e37b6c7d7b060 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 21 Apr 2020 19:04:03 +0200 Subject: [PATCH 078/502] docs: RCU: Convert lockdep-splat.txt to ReST - Add a SPDX header; - Add a document title; - Some whitespace fixes and new line breaks; - Mark literal blocks as such; - Add it to RCU/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paul E. 
McKenney --- Documentation/RCU/index.rst | 1 + .../{lockdep-splat.txt => lockdep-splat.rst} | 99 ++++++++++--------- 2 files changed, 53 insertions(+), 47 deletions(-) rename Documentation/RCU/{lockdep-splat.txt => lockdep-splat.rst} (54%) diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst index c1ba4d130bb0..430a37132b2c 100644 --- a/Documentation/RCU/index.rst +++ b/Documentation/RCU/index.rst @@ -11,6 +11,7 @@ RCU concepts arrayRCU checklist + lockdep-splat rcubarrier rcu_dereference whatisRCU diff --git a/Documentation/RCU/lockdep-splat.txt b/Documentation/RCU/lockdep-splat.rst similarity index 54% rename from Documentation/RCU/lockdep-splat.txt rename to Documentation/RCU/lockdep-splat.rst index b8096316fd11..2a5c79db57dc 100644 --- a/Documentation/RCU/lockdep-splat.txt +++ b/Documentation/RCU/lockdep-splat.rst @@ -1,3 +1,9 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================= +Lockdep-RCU Splat +================= + Lockdep-RCU was added to the Linux kernel in early 2010 (http://lwn.net/Articles/371986/). This facility checks for some common misuses of the RCU API, most notably using one of the rcu_dereference() @@ -12,55 +18,54 @@ overwriting or worse. There can of course be false positives, this being the real world and all that. So let's look at an example RCU lockdep splat from 3.0-rc5, one that -has long since been fixed: +has long since been fixed:: -============================= -WARNING: suspicious RCU usage ------------------------------ -block/cfq-iosched.c:2776 suspicious rcu_dereference_protected() usage! + ============================= + WARNING: suspicious RCU usage + ----------------------------- + block/cfq-iosched.c:2776 suspicious rcu_dereference_protected() usage! 
-other info that might help us debug this: +other info that might help us debug this:: + rcu_scheduler_active = 1, debug_locks = 0 + 3 locks held by scsi_scan_6/1552: + #0: (&shost->scan_mutex){+.+.}, at: [] + scsi_scan_host_selected+0x5a/0x150 + #1: (&eq->sysfs_lock){+.+.}, at: [] + elevator_exit+0x22/0x60 + #2: (&(&q->__queue_lock)->rlock){-.-.}, at: [] + cfq_exit_queue+0x43/0x190 -rcu_scheduler_active = 1, debug_locks = 0 -3 locks held by scsi_scan_6/1552: - #0: (&shost->scan_mutex){+.+.}, at: [] -scsi_scan_host_selected+0x5a/0x150 - #1: (&eq->sysfs_lock){+.+.}, at: [] -elevator_exit+0x22/0x60 - #2: (&(&q->__queue_lock)->rlock){-.-.}, at: [] -cfq_exit_queue+0x43/0x190 + stack backtrace: + Pid: 1552, comm: scsi_scan_6 Not tainted 3.0.0-rc5 #17 + Call Trace: + [] lockdep_rcu_dereference+0xbb/0xc0 + [] __cfq_exit_single_io_context+0xe9/0x120 + [] cfq_exit_queue+0x7c/0x190 + [] elevator_exit+0x36/0x60 + [] blk_cleanup_queue+0x4a/0x60 + [] scsi_free_queue+0x9/0x10 + [] __scsi_remove_device+0x84/0xd0 + [] scsi_probe_and_add_lun+0x353/0xb10 + [] ? error_exit+0x29/0xb0 + [] ? _raw_spin_unlock_irqrestore+0x3d/0x80 + [] __scsi_scan_target+0x112/0x680 + [] ? trace_hardirqs_off_thunk+0x3a/0x3c + [] ? error_exit+0x29/0xb0 + [] ? kobject_del+0x40/0x40 + [] scsi_scan_channel+0x86/0xb0 + [] scsi_scan_host_selected+0x140/0x150 + [] do_scsi_scan_host+0x89/0x90 + [] do_scan_async+0x20/0x160 + [] ? do_scsi_scan_host+0x90/0x90 + [] kthread+0xa6/0xb0 + [] kernel_thread_helper+0x4/0x10 + [] ? finish_task_switch+0x80/0x110 + [] ? retint_restore_args+0xe/0xe + [] ? __kthread_init_worker+0x70/0x70 + [] ? 
gs_change+0xb/0xb -stack backtrace: -Pid: 1552, comm: scsi_scan_6 Not tainted 3.0.0-rc5 #17 -Call Trace: - [] lockdep_rcu_dereference+0xbb/0xc0 - [] __cfq_exit_single_io_context+0xe9/0x120 - [] cfq_exit_queue+0x7c/0x190 - [] elevator_exit+0x36/0x60 - [] blk_cleanup_queue+0x4a/0x60 - [] scsi_free_queue+0x9/0x10 - [] __scsi_remove_device+0x84/0xd0 - [] scsi_probe_and_add_lun+0x353/0xb10 - [] ? error_exit+0x29/0xb0 - [] ? _raw_spin_unlock_irqrestore+0x3d/0x80 - [] __scsi_scan_target+0x112/0x680 - [] ? trace_hardirqs_off_thunk+0x3a/0x3c - [] ? error_exit+0x29/0xb0 - [] ? kobject_del+0x40/0x40 - [] scsi_scan_channel+0x86/0xb0 - [] scsi_scan_host_selected+0x140/0x150 - [] do_scsi_scan_host+0x89/0x90 - [] do_scan_async+0x20/0x160 - [] ? do_scsi_scan_host+0x90/0x90 - [] kthread+0xa6/0xb0 - [] kernel_thread_helper+0x4/0x10 - [] ? finish_task_switch+0x80/0x110 - [] ? retint_restore_args+0xe/0xe - [] ? __kthread_init_worker+0x70/0x70 - [] ? gs_change+0xb/0xb - -Line 2776 of block/cfq-iosched.c in v3.0-rc5 is as follows: +Line 2776 of block/cfq-iosched.c in v3.0-rc5 is as follows:: if (rcu_dereference(ioc->ioc_data) == cic) { @@ -70,7 +75,7 @@ case. Instead, we hold three locks, one of which might be RCU related. And maybe that lock really does protect this reference. If so, the fix is to inform RCU, perhaps by changing __cfq_exit_single_io_context() to take the struct request_queue "q" from cfq_exit_queue() as an argument, -which would permit us to invoke rcu_dereference_protected as follows: +which would permit us to invoke rcu_dereference_protected as follows:: if (rcu_dereference_protected(ioc->ioc_data, lockdep_is_held(&q->queue_lock)) == cic) { @@ -85,7 +90,7 @@ On the other hand, perhaps we really do need an RCU read-side critical section. In this case, the critical section must span the use of the return value from rcu_dereference(), or at least until there is some reference count incremented or some such. 
One way to handle this is to -add rcu_read_lock() and rcu_read_unlock() as follows: +add rcu_read_lock() and rcu_read_unlock() as follows:: rcu_read_lock(); if (rcu_dereference(ioc->ioc_data) == cic) { @@ -102,7 +107,7 @@ above lockdep-RCU splat. But in this particular case, we don't actually dereference the pointer returned from rcu_dereference(). Instead, that pointer is just compared to the cic pointer, which means that the rcu_dereference() can be replaced -by rcu_access_pointer() as follows: +by rcu_access_pointer() as follows:: if (rcu_access_pointer(ioc->ioc_data) == cic) { From 058cc23bcad08aca62987cc795fe406ac39146d0 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 21 Apr 2020 19:04:04 +0200 Subject: [PATCH 079/502] docs: RCU: Convert lockdep.txt to ReST - Add a SPDX header; - Adjust document title; - Mark literal blocks as such; - Add it to RCU/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paul E. McKenney --- Documentation/RCU/index.rst | 1 + Documentation/RCU/{lockdep.txt => lockdep.rst} | 12 ++++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) rename Documentation/RCU/{lockdep.txt => lockdep.rst} (96%) diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst index 430a37132b2c..fa7a2a8949b7 100644 --- a/Documentation/RCU/index.rst +++ b/Documentation/RCU/index.rst @@ -11,6 +11,7 @@ RCU concepts arrayRCU checklist + lockdep lockdep-splat rcubarrier rcu_dereference diff --git a/Documentation/RCU/lockdep.txt b/Documentation/RCU/lockdep.rst similarity index 96% rename from Documentation/RCU/lockdep.txt rename to Documentation/RCU/lockdep.rst index 89db949eeca0..f1fc8ae3846a 100644 --- a/Documentation/RCU/lockdep.txt +++ b/Documentation/RCU/lockdep.rst @@ -1,4 +1,8 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +======================== RCU and lockdep checking +======================== All flavors of RCU have lockdep checking available, so that lockdep is aware of when each task enters and leaves any flavor of RCU read-side @@ -8,7 +12,7 @@ tracking to include RCU state, which can sometimes help when debugging deadlocks and the like. In addition, RCU provides the following primitives that check lockdep's -state: +state:: rcu_read_lock_held() for normal RCU. rcu_read_lock_bh_held() for RCU-bh. @@ -63,7 +67,7 @@ checking of rcu_dereference() primitives: The rcu_dereference_check() check expression can be any boolean expression, but would normally include a lockdep expression. However, any boolean expression can be used. For a moderately ornate example, -consider the following: +consider the following:: file = rcu_dereference_check(fdt->fd[fd], lockdep_is_held(&files->file_lock) || @@ -82,7 +86,7 @@ RCU read-side critical sections, in case (2) the ->file_lock prevents any change from taking place, and finally, in case (3) the current task is the only task accessing the file_struct, again preventing any change from taking place. If the above statement was invoked only from updater -code, it could instead be written as follows: +code, it could instead be written as follows:: file = rcu_dereference_protected(fdt->fd[fd], lockdep_is_held(&files->file_lock) || @@ -105,7 +109,7 @@ false and they are called from outside any RCU read-side critical section. For example, the workqueue for_each_pwq() macro is intended to be used either within an RCU read-side critical section or with wq->mutex held. 
-It is thus implemented as follows: +It is thus implemented as follows:: #define for_each_pwq(pwq, wq) list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node, From 2cdb54c93a7e5beb6f3f8b63575d9fb664dfc603 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 21 Apr 2020 19:04:05 +0200 Subject: [PATCH 080/502] docs: RCU: Convert rculist_nulls.txt to ReST - Add a SPDX header; - Adjust document title; - Some whitespace fixes and new line breaks; - Mark literal blocks as such; - Add it to RCU/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paul E. McKenney --- Documentation/RCU/index.rst | 1 + Documentation/RCU/rculist_nulls.rst | 194 ++++++++++++++++++++++++++++ Documentation/RCU/rculist_nulls.txt | 172 ------------------------ include/linux/rculist_nulls.h | 2 +- net/core/sock.c | 4 +- 5 files changed, 198 insertions(+), 175 deletions(-) create mode 100644 Documentation/RCU/rculist_nulls.rst delete mode 100644 Documentation/RCU/rculist_nulls.txt diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst index fa7a2a8949b7..577a47e27f5d 100644 --- a/Documentation/RCU/index.rst +++ b/Documentation/RCU/index.rst @@ -17,6 +17,7 @@ RCU concepts rcu_dereference whatisRCU rcu + rculist_nulls listRCU NMI-RCU UP diff --git a/Documentation/RCU/rculist_nulls.rst b/Documentation/RCU/rculist_nulls.rst new file mode 100644 index 000000000000..d40374221d69 --- /dev/null +++ b/Documentation/RCU/rculist_nulls.rst @@ -0,0 +1,194 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================================= +Using RCU hlist_nulls to protect list and objects +================================================= + +This section describes how to use hlist_nulls to +protect read-mostly linked lists and +objects using SLAB_TYPESAFE_BY_RCU allocations. 
+ +Please read the basics in Documentation/RCU/listRCU.rst + +Using special markers (called 'nulls') is a convenient way +to solve following problem : + +A typical RCU linked list managing objects which are +allocated with SLAB_TYPESAFE_BY_RCU kmem_cache can +use following algos : + +1) Lookup algo +-------------- + +:: + + rcu_read_lock() + begin: + obj = lockless_lookup(key); + if (obj) { + if (!try_get_ref(obj)) // might fail for free objects + goto begin; + /* + * Because a writer could delete object, and a writer could + * reuse these object before the RCU grace period, we + * must check key after getting the reference on object + */ + if (obj->key != key) { // not the object we expected + put_ref(obj); + goto begin; + } + } + rcu_read_unlock(); + +Beware that lockless_lookup(key) cannot use traditional hlist_for_each_entry_rcu() +but a version with an additional memory barrier (smp_rmb()) + +:: + + lockless_lookup(key) + { + struct hlist_node *node, *next; + for (pos = rcu_dereference((head)->first); + pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) && + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); + pos = rcu_dereference(next)) + if (obj->key == key) + return obj; + return NULL; + } + +And note the traditional hlist_for_each_entry_rcu() misses this smp_rmb():: + + struct hlist_node *node; + for (pos = rcu_dereference((head)->first); + pos && ({ prefetch(pos->next); 1; }) && + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); + pos = rcu_dereference(pos->next)) + if (obj->key == key) + return obj; + return NULL; + +Quoting Corey Minyard:: + + "If the object is moved from one list to another list in-between the + time the hash is calculated and the next field is accessed, and the + object has moved to the end of a new list, the traversal will not + complete properly on the list it should have, since the object will + be on the end of the new list and there's not a way to tell it's on a + new list and restart the list traversal.
I think that this can be + solved by pre-fetching the "next" field (with proper barriers) before + checking the key." + +2) Insert algo +-------------- + +We need to make sure a reader cannot read the new 'obj->obj_next' value +and previous value of 'obj->key'. Or else, an item could be deleted +from a chain, and inserted into another chain. If new chain was empty +before the move, 'next' pointer is NULL, and lockless reader can +not detect it missed following items in original chain. + +:: + + /* + * Please note that new inserts are done at the head of list, + * not in the middle or end. + */ + obj = kmem_cache_alloc(...); + lock_chain(); // typically a spin_lock() + obj->key = key; + /* + * we need to make sure obj->key is updated before obj->next + * or obj->refcnt + */ + smp_wmb(); + atomic_set(&obj->refcnt, 1); + hlist_add_head_rcu(&obj->obj_node, list); + unlock_chain(); // typically a spin_unlock() + + +3) Remove algo +-------------- +Nothing special here, we can use a standard RCU hlist deletion. +But thanks to SLAB_TYPESAFE_BY_RCU, beware a deleted object can be reused +very very fast (before the end of RCU grace period) + +:: + + if (put_last_reference_on(obj)) { + lock_chain(); // typically a spin_lock() + hlist_del_init_rcu(&obj->obj_node); + unlock_chain(); // typically a spin_unlock() + kmem_cache_free(cachep, obj); + } + + + +-------------------------------------------------------------------------- + +With hlist_nulls we can avoid extra smp_rmb() in lockless_lookup() +and extra smp_wmb() in insert function. + +For example, if we choose to store the slot number as the 'nulls' +end-of-list marker for each slot of the hash table, we can detect +a race (some writer did a delete and/or a move of an object +to another chain) checking the final 'nulls' value if +the lookup met the end of chain. If final 'nulls' value +is not the slot number, then we must restart the lookup at +the beginning.
If the object was moved to the same chain, +then the reader doesn't care : It might eventually +scan the list again without harm. + + +1) lookup algo +-------------- + +:: + + head = &table[slot]; + rcu_read_lock(); + begin: + hlist_nulls_for_each_entry_rcu(obj, node, head, member) { + if (obj->key == key) { + if (!try_get_ref(obj)) // might fail for free objects + goto begin; + if (obj->key != key) { // not the object we expected + put_ref(obj); + goto begin; + } + goto out; + } + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != slot) + goto begin; + obj = NULL; + + out: + rcu_read_unlock(); + +2) Insert function +------------------ + +:: + + /* + * Please note that new inserts are done at the head of list, + * not in the middle or end. + */ + obj = kmem_cache_alloc(cachep); + lock_chain(); // typically a spin_lock() + obj->key = key; + /* + * changes to obj->key must be visible before refcnt one + */ + smp_wmb(); + atomic_set(&obj->refcnt, 1); + /* + * insert obj in RCU way (readers might be traversing chain) + */ + hlist_nulls_add_head_rcu(&obj->obj_node, list); + unlock_chain(); // typically a spin_unlock() diff --git a/Documentation/RCU/rculist_nulls.txt b/Documentation/RCU/rculist_nulls.txt deleted file mode 100644 index 23f115dc87cf..000000000000 --- a/Documentation/RCU/rculist_nulls.txt +++ /dev/null @@ -1,172 +0,0 @@ -Using hlist_nulls to protect read-mostly linked lists and -objects using SLAB_TYPESAFE_BY_RCU allocations. 
- -Please read the basics in Documentation/RCU/listRCU.rst - -Using special makers (called 'nulls') is a convenient way -to solve following problem : - -A typical RCU linked list managing objects which are -allocated with SLAB_TYPESAFE_BY_RCU kmem_cache can -use following algos : - -1) Lookup algo --------------- -rcu_read_lock() -begin: -obj = lockless_lookup(key); -if (obj) { - if (!try_get_ref(obj)) // might fail for free objects - goto begin; - /* - * Because a writer could delete object, and a writer could - * reuse these object before the RCU grace period, we - * must check key after getting the reference on object - */ - if (obj->key != key) { // not the object we expected - put_ref(obj); - goto begin; - } -} -rcu_read_unlock(); - -Beware that lockless_lookup(key) cannot use traditional hlist_for_each_entry_rcu() -but a version with an additional memory barrier (smp_rmb()) - -lockless_lookup(key) -{ - struct hlist_node *node, *next; - for (pos = rcu_dereference((head)->first); - pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) && - ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); - pos = rcu_dereference(next)) - if (obj->key == key) - return obj; - return NULL; - -And note the traditional hlist_for_each_entry_rcu() misses this smp_rmb() : - - struct hlist_node *node; - for (pos = rcu_dereference((head)->first); - pos && ({ prefetch(pos->next); 1; }) && - ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); - pos = rcu_dereference(pos->next)) - if (obj->key == key) - return obj; - return NULL; -} - -Quoting Corey Minyard : - -"If the object is moved from one list to another list in-between the - time the hash is calculated and the next field is accessed, and the - object has moved to the end of a new list, the traversal will not - complete properly on the list it should have, since the object will - be on the end of the new list and there's not a way to tell it's on a - new list and restart the list traversal. 
I think that this can be - solved by pre-fetching the "next" field (with proper barriers) before - checking the key." - -2) Insert algo : ----------------- - -We need to make sure a reader cannot read the new 'obj->obj_next' value -and previous value of 'obj->key'. Or else, an item could be deleted -from a chain, and inserted into another chain. If new chain was empty -before the move, 'next' pointer is NULL, and lockless reader can -not detect it missed following items in original chain. - -/* - * Please note that new inserts are done at the head of list, - * not in the middle or end. - */ -obj = kmem_cache_alloc(...); -lock_chain(); // typically a spin_lock() -obj->key = key; -/* - * we need to make sure obj->key is updated before obj->next - * or obj->refcnt - */ -smp_wmb(); -atomic_set(&obj->refcnt, 1); -hlist_add_head_rcu(&obj->obj_node, list); -unlock_chain(); // typically a spin_unlock() - - -3) Remove algo --------------- -Nothing special here, we can use a standard RCU hlist deletion. -But thanks to SLAB_TYPESAFE_BY_RCU, beware a deleted object can be reused -very very fast (before the end of RCU grace period) - -if (put_last_reference_on(obj) { - lock_chain(); // typically a spin_lock() - hlist_del_init_rcu(&obj->obj_node); - unlock_chain(); // typically a spin_unlock() - kmem_cache_free(cachep, obj); -} - - - --------------------------------------------------------------------------- -With hlist_nulls we can avoid extra smp_rmb() in lockless_lookup() -and extra smp_wmb() in insert function. - -For example, if we choose to store the slot number as the 'nulls' -end-of-list marker for each slot of the hash table, we can detect -a race (some writer did a delete and/or a move of an object -to another chain) checking the final 'nulls' value if -the lookup met the end of chain. If final 'nulls' value -is not the slot number, then we must restart the lookup at -the beginning. 
If the object was moved to the same chain, -then the reader doesn't care : It might eventually -scan the list again without harm. - - -1) lookup algo - - head = &table[slot]; - rcu_read_lock(); -begin: - hlist_nulls_for_each_entry_rcu(obj, node, head, member) { - if (obj->key == key) { - if (!try_get_ref(obj)) // might fail for free objects - goto begin; - if (obj->key != key) { // not the object we expected - put_ref(obj); - goto begin; - } - goto out; - } -/* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. - */ - if (get_nulls_value(node) != slot) - goto begin; - obj = NULL; - -out: - rcu_read_unlock(); - -2) Insert function : --------------------- - -/* - * Please note that new inserts are done at the head of list, - * not in the middle or end. - */ -obj = kmem_cache_alloc(cachep); -lock_chain(); // typically a spin_lock() -obj->key = key; -/* - * changes to obj->key must be visible before refcnt one - */ -smp_wmb(); -atomic_set(&obj->refcnt, 1); -/* - * insert obj in RCU way (readers might be traversing chain) - */ -hlist_nulls_add_head_rcu(&obj->obj_node, list); -unlock_chain(); // typically a spin_unlock() diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index 9670b54b484a..ff3e94779e73 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -162,7 +162,7 @@ static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n) * The barrier() is needed to make sure compiler doesn't cache first element [1], * as this loop can be restarted [2] * [1] Documentation/core-api/atomic_ops.rst around line 114 - * [2] Documentation/RCU/rculist_nulls.txt around line 146 + * [2] Documentation/RCU/rculist_nulls.rst around line 146 */ #define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \ for (({barrier();}), \ diff --git a/net/core/sock.c b/net/core/sock.c index d832c650287c..6921a85a1177 
100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1973,7 +1973,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) /* * Before updating sk_refcnt, we must commit prior changes to memory - * (Documentation/RCU/rculist_nulls.txt for details) + * (Documentation/RCU/rculist_nulls.rst for details) */ smp_wmb(); refcount_set(&newsk->sk_refcnt, 2); @@ -3035,7 +3035,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk_rx_queue_clear(sk); /* * Before updating sk_refcnt, we must commit prior changes to memory - * (Documentation/RCU/rculist_nulls.txt for details) + * (Documentation/RCU/rculist_nulls.rst for details) */ smp_wmb(); refcount_set(&sk->sk_refcnt, 1); From 43cb5451dffe0bc5d59688d4898c9a1f7c40d3b4 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 21 Apr 2020 19:04:06 +0200 Subject: [PATCH 081/502] docs: RCU: Convert torture.txt to ReST - Add a SPDX header; - Adjust document and section titles; - Some whitespace fixes and new line breaks; - Mark literal blocks as such; - Add it to RCU/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paul E. 
McKenney --- Documentation/RCU/index.rst | 1 + .../RCU/{torture.txt => torture.rst} | 115 ++++++++++-------- Documentation/locking/locktorture.rst | 2 +- MAINTAINERS | 4 +- kernel/rcu/rcutorture.c | 2 +- 5 files changed, 68 insertions(+), 56 deletions(-) rename Documentation/RCU/{torture.txt => torture.rst} (76%) diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst index 577a47e27f5d..5d5f9a1ab8f9 100644 --- a/Documentation/RCU/index.rst +++ b/Documentation/RCU/index.rst @@ -18,6 +18,7 @@ RCU concepts whatisRCU rcu rculist_nulls + torture listRCU NMI-RCU UP diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.rst similarity index 76% rename from Documentation/RCU/torture.txt rename to Documentation/RCU/torture.rst index af712a3c5b6a..a90147713062 100644 --- a/Documentation/RCU/torture.txt +++ b/Documentation/RCU/torture.rst @@ -1,7 +1,12 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================== RCU Torture Test Operation +========================== CONFIG_RCU_TORTURE_TEST +======================= The CONFIG_RCU_TORTURE_TEST config option is available for all RCU implementations. It creates an rcutorture kernel module that can @@ -13,9 +18,10 @@ when the module is loaded, and stops when the module is unloaded. Module parameters are prefixed by "rcutorture." in Documentation/admin-guide/kernel-parameters.txt. -OUTPUT +Output +====== -The statistics output is as follows: +The statistics output is as follows:: rcu-torture:--- Start of test: nreaders=16 nfakewriters=4 stat_interval=30 verbose=0 test_no_idle_hz=1 shuffle_interval=3 stutter=5 irqreader=1 fqs_duration=0 fqs_holdoff=0 fqs_stutter=3 test_boost=1/0 test_boost_interval=7 test_boost_duration=4 rcu-torture: rtc: (null) ver: 155441 tfle: 0 rta: 155441 rtaf: 8884 rtf: 155440 rtmbe: 0 rtbe: 0 rtbke: 0 rtbre: 0 rtbf: 0 rtb: 0 nt: 3055767 @@ -36,53 +42,53 @@ automatic determination as to whether RCU operated correctly. 
The entries are as follows: -o "rtc": The hexadecimal address of the structure currently visible +* "rtc": The hexadecimal address of the structure currently visible to readers. -o "ver": The number of times since boot that the RCU writer task +* "ver": The number of times since boot that the RCU writer task has changed the structure visible to readers. -o "tfle": If non-zero, indicates that the "torture freelist" +* "tfle": If non-zero, indicates that the "torture freelist" containing structures to be placed into the "rtc" area is empty. This condition is important, since it can fool you into thinking that RCU is working when it is not. :-/ -o "rta": Number of structures allocated from the torture freelist. +* "rta": Number of structures allocated from the torture freelist. -o "rtaf": Number of allocations from the torture freelist that have +* "rtaf": Number of allocations from the torture freelist that have failed due to the list being empty. It is not unusual for this to be non-zero, but it is bad for it to be a large fraction of the value indicated by "rta". -o "rtf": Number of frees into the torture freelist. +* "rtf": Number of frees into the torture freelist. -o "rtmbe": A non-zero value indicates that rcutorture believes that +* "rtmbe": A non-zero value indicates that rcutorture believes that rcu_assign_pointer() and rcu_dereference() are not working correctly. This value should be zero. -o "rtbe": A non-zero value indicates that one of the rcu_barrier() +* "rtbe": A non-zero value indicates that one of the rcu_barrier() family of functions is not working correctly. -o "rtbke": rcutorture was unable to create the real-time kthreads +* "rtbke": rcutorture was unable to create the real-time kthreads used to force RCU priority inversion. This value should be zero. 
-o "rtbre": Although rcutorture successfully created the kthreads +* "rtbre": Although rcutorture successfully created the kthreads used to force RCU priority inversion, it was unable to set them to the real-time priority level of 1. This value should be zero. -o "rtbf": The number of times that RCU priority boosting failed +* "rtbf": The number of times that RCU priority boosting failed to resolve RCU priority inversion. -o "rtb": The number of times that rcutorture attempted to force +* "rtb": The number of times that rcutorture attempted to force an RCU priority inversion condition. If you are testing RCU priority boosting via the "test_boost" module parameter, this value should be non-zero. -o "nt": The number of times rcutorture ran RCU read-side code from +* "nt": The number of times rcutorture ran RCU read-side code from within a timer handler. This value should be non-zero only if you specified the "irqreader" module parameter. -o "Reader Pipe": Histogram of "ages" of structures seen by readers. +* "Reader Pipe": Histogram of "ages" of structures seen by readers. If any entries past the first two are non-zero, RCU is broken. And rcutorture prints the error flag string "!!!" to make sure you notice. The age of a newly allocated structure is zero, @@ -94,14 +100,14 @@ o "Reader Pipe": Histogram of "ages" of structures seen by readers. RCU. If you want to see what it looks like when broken, break it yourself. ;-) -o "Reader Batch": Another histogram of "ages" of structures seen +* "Reader Batch": Another histogram of "ages" of structures seen by readers, but in terms of counter flips (or batches) rather than in terms of grace periods. The legal number of non-zero entries is again two. The reason for this separate view is that it is sometimes easier to get the third entry to show up in the "Reader Batch" list than in the "Reader Pipe" list. 
-o "Free-Block Circulation": Shows the number of torture structures +* "Free-Block Circulation": Shows the number of torture structures that have reached a given point in the pipeline. The first element should closely correspond to the number of structures allocated, the second to the number that have been removed from reader view, @@ -112,7 +118,7 @@ o "Free-Block Circulation": Shows the number of torture structures Different implementations of RCU can provide implementation-specific additional information. For example, Tree SRCU provides the following -additional line: +additional line:: srcud-torture: Tree SRCU per-CPU(idx=0): 0(35,-21) 1(-4,24) 2(1,1) 3(-26,20) 4(28,-47) 5(-9,4) 6(-10,14) 7(-14,11) T(1,6) @@ -123,15 +129,15 @@ using a dynamically allocated srcu_struct (hence "srcud-" rather than "old" and "current" values to the underlying array, and is useful for debugging. The final "T" entry contains the totals of the counters. - -USAGE ON SPECIFIC KERNEL BUILDS +Usage on Specific Kernel Builds +=============================== It is sometimes desirable to torture RCU on a specific kernel build, for example, when preparing to put that kernel build into production. In that case, the kernel should be built with CONFIG_RCU_TORTURE_TEST=m so that the test can be started using modprobe and terminated using rmmod. -For example, the following script may be used to torture RCU: +For example, the following script may be used to torture RCU:: #!/bin/sh @@ -148,7 +154,8 @@ two are self-explanatory, while the last indicates that while there were no RCU failures, CPU-hotplug problems were detected. -USAGE ON MAINLINE KERNELS +Usage on Mainline Kernels +========================= When using rcutorture to test changes to RCU itself, it is often necessary to build a number of kernels in order to test that change @@ -180,16 +187,16 @@ to Tree SRCU might run only the SRCU-N and SRCU-P scenarios using the --configs argument to kvm.sh as follows: "--configs 'SRCU-N SRCU-P'". 
Large systems can run multiple copies of of the full set of scenarios, for example, a system with 448 hardware threads can run five instances -of the full set concurrently. To make this happen: +of the full set concurrently. To make this happen:: kvm.sh --cpus 448 --configs '5*CFLIST' Alternatively, such a system can run 56 concurrent instances of a single -eight-CPU scenario: +eight-CPU scenario:: kvm.sh --cpus 448 --configs '56*TREE04' -Or 28 concurrent instances of each of two eight-CPU scenarios: +Or 28 concurrent instances of each of two eight-CPU scenarios:: kvm.sh --cpus 448 --configs '28*TREE03 28*TREE04' @@ -199,14 +206,14 @@ values for memory may require disabling the callback-flooding tests using the --bootargs parameter discussed below. Sometimes additional debugging is useful, and in such cases the --kconfig -parameter to kvm.sh may be used, for example, "--kconfig 'CONFIG_KASAN=y'". +parameter to kvm.sh may be used, for example, ``--kconfig 'CONFIG_KASAN=y'``. Kernel boot arguments can also be supplied, for example, to control rcutorture's module parameters. For example, to test a change to RCU's CPU stall-warning code, use "--bootargs 'rcutorture.stall_cpu=30'". This will of course result in the scripting reporting a failure, namely the resuling RCU CPU stall warning. As noted above, reducing memory may -require disabling rcutorture's callback-flooding tests: +require disabling rcutorture's callback-flooding tests:: kvm.sh --cpus 448 --configs '56*TREE04' --memory 128M \ --bootargs 'rcutorture.fwd_progress=0' @@ -225,7 +232,7 @@ is listed at the end of the kvm.sh output, which you really should redirect to a file. The build products and console output of each run is kept in tools/testing/selftests/rcutorture/res in timestamped directories. A given directory can be supplied to kvm-find-errors.sh in order to have -it cycle you through summaries of errors and full error logs. For example: +it cycle you through summaries of errors and full error logs. 
For example:: tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh \ tools/testing/selftests/rcutorture/res/2020.01.20-15.54.23 @@ -245,38 +252,42 @@ that was tested and any uncommitted changes in diff format. The most frequently used files in each per-scenario-run directory are: -.config: This file contains the Kconfig options. +.config: + This file contains the Kconfig options. -Make.out: This contains build output for a specific scenario. +Make.out: + This contains build output for a specific scenario. -console.log: This contains the console output for a specific scenario. +console.log: + This contains the console output for a specific scenario. This file may be examined once the kernel has booted, but it might not exist if the build failed. -vmlinux: This contains the kernel, which can be useful with tools like +vmlinux: + This contains the kernel, which can be useful with tools like objdump and gdb. A number of additional files are available, but are less frequently used. Many are intended for debugging of rcutorture itself or of its scripting. 
As of v5.4, a successful run with the default set of scenarios produces -the following summary at the end of the run on a 12-CPU system: +the following summary at the end of the run on a 12-CPU system:: -SRCU-N ------- 804233 GPs (148.932/s) [srcu: g10008272 f0x0 ] -SRCU-P ------- 202320 GPs (37.4667/s) [srcud: g1809476 f0x0 ] -SRCU-t ------- 1122086 GPs (207.794/s) [srcu: g0 f0x0 ] -SRCU-u ------- 1111285 GPs (205.794/s) [srcud: g1 f0x0 ] -TASKS01 ------- 19666 GPs (3.64185/s) [tasks: g0 f0x0 ] -TASKS02 ------- 20541 GPs (3.80389/s) [tasks: g0 f0x0 ] -TASKS03 ------- 19416 GPs (3.59556/s) [tasks: g0 f0x0 ] -TINY01 ------- 836134 GPs (154.84/s) [rcu: g0 f0x0 ] n_max_cbs: 34198 -TINY02 ------- 850371 GPs (157.476/s) [rcu: g0 f0x0 ] n_max_cbs: 2631 -TREE01 ------- 162625 GPs (30.1157/s) [rcu: g1124169 f0x0 ] -TREE02 ------- 333003 GPs (61.6672/s) [rcu: g2647753 f0x0 ] n_max_cbs: 35844 -TREE03 ------- 306623 GPs (56.782/s) [rcu: g2975325 f0x0 ] n_max_cbs: 1496497 -CPU count limited from 16 to 12 -TREE04 ------- 246149 GPs (45.5831/s) [rcu: g1695737 f0x0 ] n_max_cbs: 434961 -TREE05 ------- 314603 GPs (58.2598/s) [rcu: g2257741 f0x2 ] n_max_cbs: 193997 -TREE07 ------- 167347 GPs (30.9902/s) [rcu: g1079021 f0x0 ] n_max_cbs: 478732 -CPU count limited from 16 to 12 -TREE09 ------- 752238 GPs (139.303/s) [rcu: g13075057 f0x0 ] n_max_cbs: 99011 + SRCU-N ------- 804233 GPs (148.932/s) [srcu: g10008272 f0x0 ] + SRCU-P ------- 202320 GPs (37.4667/s) [srcud: g1809476 f0x0 ] + SRCU-t ------- 1122086 GPs (207.794/s) [srcu: g0 f0x0 ] + SRCU-u ------- 1111285 GPs (205.794/s) [srcud: g1 f0x0 ] + TASKS01 ------- 19666 GPs (3.64185/s) [tasks: g0 f0x0 ] + TASKS02 ------- 20541 GPs (3.80389/s) [tasks: g0 f0x0 ] + TASKS03 ------- 19416 GPs (3.59556/s) [tasks: g0 f0x0 ] + TINY01 ------- 836134 GPs (154.84/s) [rcu: g0 f0x0 ] n_max_cbs: 34198 + TINY02 ------- 850371 GPs (157.476/s) [rcu: g0 f0x0 ] n_max_cbs: 2631 + TREE01 ------- 162625 GPs (30.1157/s) [rcu: g1124169 f0x0 ] + TREE02 ------- 
333003 GPs (61.6672/s) [rcu: g2647753 f0x0 ] n_max_cbs: 35844 + TREE03 ------- 306623 GPs (56.782/s) [rcu: g2975325 f0x0 ] n_max_cbs: 1496497 + CPU count limited from 16 to 12 + TREE04 ------- 246149 GPs (45.5831/s) [rcu: g1695737 f0x0 ] n_max_cbs: 434961 + TREE05 ------- 314603 GPs (58.2598/s) [rcu: g2257741 f0x2 ] n_max_cbs: 193997 + TREE07 ------- 167347 GPs (30.9902/s) [rcu: g1079021 f0x0 ] n_max_cbs: 478732 + CPU count limited from 16 to 12 + TREE09 ------- 752238 GPs (139.303/s) [rcu: g13075057 f0x0 ] n_max_cbs: 99011 diff --git a/Documentation/locking/locktorture.rst b/Documentation/locking/locktorture.rst index 8012a74555e7..dfaf9fc883f4 100644 --- a/Documentation/locking/locktorture.rst +++ b/Documentation/locking/locktorture.rst @@ -166,4 +166,4 @@ checked for such errors. The "rmmod" command forces a "SUCCESS", two are self-explanatory, while the last indicates that while there were no locking failures, CPU-hotplug problems were detected. -Also see: Documentation/RCU/torture.txt +Also see: Documentation/RCU/torture.rst diff --git a/MAINTAINERS b/MAINTAINERS index 496fd4eafb68..4429ce965b3a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14437,7 +14437,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git dev F: Documentation/RCU/ F: include/linux/rcu* F: kernel/rcu/ -X: Documentation/RCU/torture.txt +X: Documentation/RCU/torture.rst X: include/linux/srcu*.h X: kernel/rcu/srcu*.c @@ -17288,7 +17288,7 @@ M: Josh Triplett L: linux-kernel@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git dev -F: Documentation/RCU/torture.txt +F: Documentation/RCU/torture.rst F: kernel/locking/locktorture.c F: kernel/rcu/rcuperf.c F: kernel/rcu/rcutorture.c diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index efb792e13fca..8205295fc33e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -7,7 +7,7 @@ * Authors: Paul E. 
McKenney * Josh Triplett * - * See also: Documentation/RCU/torture.txt + * See also: Documentation/RCU/torture.rst */ #define pr_fmt(fmt) fmt From 90c73cb2c65f9e78eb09a8cbcd4bcd4add2d3f4d Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 21 Apr 2020 19:04:07 +0200 Subject: [PATCH 082/502] docs: RCU: Convert rcuref.txt to ReST - Add a SPDX header; - Adjust document title; - Some whitespace fixes and new line breaks; - Mark literal blocks as such; - Add it to RCU/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paul E. McKenney --- Documentation/RCU/index.rst | 1 + Documentation/RCU/{rcuref.txt => rcuref.rst} | 193 ++++++++++--------- 2 files changed, 101 insertions(+), 93 deletions(-) rename Documentation/RCU/{rcuref.txt => rcuref.rst} (50%) diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst index 5d5f9a1ab8f9..9a1d51f394dc 100644 --- a/Documentation/RCU/index.rst +++ b/Documentation/RCU/index.rst @@ -18,6 +18,7 @@ RCU concepts whatisRCU rcu rculist_nulls + rcuref torture listRCU NMI-RCU diff --git a/Documentation/RCU/rcuref.txt b/Documentation/RCU/rcuref.rst similarity index 50% rename from Documentation/RCU/rcuref.txt rename to Documentation/RCU/rcuref.rst index 5e6429d66c24..b33aeb14fde3 100644 --- a/Documentation/RCU/rcuref.txt +++ b/Documentation/RCU/rcuref.rst @@ -1,4 +1,8 @@ -Reference-count design for elements of lists/arrays protected by RCU. +.. SPDX-License-Identifier: GPL-2.0 + +==================================================================== +Reference-count design for elements of lists/arrays protected by RCU +==================================================================== Please note that the percpu-ref feature is likely your first @@ -12,32 +16,33 @@ please read on. Reference counting on elements of lists which are protected by traditional reader/writer spinlocks or semaphores are straightforward: -CODE LISTING A: -1. 2. 
-add() search_and_reference() -{ { - alloc_object read_lock(&list_lock); - ... search_for_element - atomic_set(&el->rc, 1); atomic_inc(&el->rc); - write_lock(&list_lock); ... - add_element read_unlock(&list_lock); - ... ... - write_unlock(&list_lock); } -} +CODE LISTING A:: -3. 4. -release_referenced() delete() -{ { - ... write_lock(&list_lock); - if(atomic_dec_and_test(&el->rc)) ... - kfree(el); - ... remove_element -} write_unlock(&list_lock); - ... - if (atomic_dec_and_test(&el->rc)) - kfree(el); - ... - } + 1. 2. + add() search_and_reference() + { { + alloc_object read_lock(&list_lock); + ... search_for_element + atomic_set(&el->rc, 1); atomic_inc(&el->rc); + write_lock(&list_lock); ... + add_element read_unlock(&list_lock); + ... ... + write_unlock(&list_lock); } + } + + 3. 4. + release_referenced() delete() + { { + ... write_lock(&list_lock); + if(atomic_dec_and_test(&el->rc)) ... + kfree(el); + ... remove_element + } write_unlock(&list_lock); + ... + if (atomic_dec_and_test(&el->rc)) + kfree(el); + ... + } If this list/array is made lock free using RCU as in changing the write_lock() in add() and delete() to spin_lock() and changing read_lock() @@ -46,34 +51,35 @@ search_and_reference() could potentially hold reference to an element which has already been deleted from the list/array. Use atomic_inc_not_zero() in this scenario as follows: -CODE LISTING B: -1. 2. -add() search_and_reference() -{ { - alloc_object rcu_read_lock(); - ... search_for_element - atomic_set(&el->rc, 1); if (!atomic_inc_not_zero(&el->rc)) { - spin_lock(&list_lock); rcu_read_unlock(); - return FAIL; - add_element } - ... ... - spin_unlock(&list_lock); rcu_read_unlock(); -} } -3. 4. -release_referenced() delete() -{ { - ... spin_lock(&list_lock); - if (atomic_dec_and_test(&el->rc)) ... - call_rcu(&el->head, el_free); remove_element - ... spin_unlock(&list_lock); -} ... - if (atomic_dec_and_test(&el->rc)) - call_rcu(&el->head, el_free); - ... - } +CODE LISTING B:: + + 1. 2. 
+ add() search_and_reference() + { { + alloc_object rcu_read_lock(); + ... search_for_element + atomic_set(&el->rc, 1); if (!atomic_inc_not_zero(&el->rc)) { + spin_lock(&list_lock); rcu_read_unlock(); + return FAIL; + add_element } + ... ... + spin_unlock(&list_lock); rcu_read_unlock(); + } } + 3. 4. + release_referenced() delete() + { { + ... spin_lock(&list_lock); + if (atomic_dec_and_test(&el->rc)) ... + call_rcu(&el->head, el_free); remove_element + ... spin_unlock(&list_lock); + } ... + if (atomic_dec_and_test(&el->rc)) + call_rcu(&el->head, el_free); + ... + } Sometimes, a reference to the element needs to be obtained in the -update (write) stream. In such cases, atomic_inc_not_zero() might be +update (write) stream. In such cases, atomic_inc_not_zero() might be overkill, since we hold the update-side spinlock. One might instead use atomic_inc() in such cases. @@ -82,39 +88,40 @@ search_and_reference() code path. In such cases, the atomic_dec_and_test() may be moved from delete() to el_free() as follows: -CODE LISTING C: -1. 2. -add() search_and_reference() -{ { - alloc_object rcu_read_lock(); - ... search_for_element - atomic_set(&el->rc, 1); atomic_inc(&el->rc); - spin_lock(&list_lock); ... +CODE LISTING C:: - add_element rcu_read_unlock(); - ... } - spin_unlock(&list_lock); 4. -} delete() -3. { -release_referenced() spin_lock(&list_lock); -{ ... - ... remove_element - if (atomic_dec_and_test(&el->rc)) spin_unlock(&list_lock); - kfree(el); ... - ... call_rcu(&el->head, el_free); -} ... -5. } -void el_free(struct rcu_head *rhp) -{ - release_referenced(); -} + 1. 2. + add() search_and_reference() + { { + alloc_object rcu_read_lock(); + ... search_for_element + atomic_set(&el->rc, 1); atomic_inc(&el->rc); + spin_lock(&list_lock); ... + + add_element rcu_read_unlock(); + ... } + spin_unlock(&list_lock); 4. + } delete() + 3. { + release_referenced() spin_lock(&list_lock); + { ... + ... 
remove_element + if (atomic_dec_and_test(&el->rc)) spin_unlock(&list_lock); + kfree(el); ... + ... call_rcu(&el->head, el_free); + } ... + 5. } + void el_free(struct rcu_head *rhp) + { + release_referenced(); + } The key point is that the initial reference added by add() is not removed until after a grace period has elapsed following removal. This means that search_and_reference() cannot find this element, which means that the value of el->rc cannot increase. Thus, once it reaches zero, there are no -readers that can or ever will be able to reference the element. The -element can therefore safely be freed. This in turn guarantees that if +readers that can or ever will be able to reference the element. The +element can therefore safely be freed. This in turn guarantees that if any reader finds the element, that reader may safely acquire a reference without checking the value of the reference counter. @@ -130,21 +137,21 @@ the eventual invocation of kfree(), which is usually not a problem on modern computer systems, even the small ones. In cases where delete() can sleep, synchronize_rcu() can be called from -delete(), so that el_free() can be subsumed into delete as follows: +delete(), so that el_free() can be subsumed into delete as follows:: -4. -delete() -{ - spin_lock(&list_lock); - ... - remove_element - spin_unlock(&list_lock); - ... - synchronize_rcu(); - if (atomic_dec_and_test(&el->rc)) - kfree(el); - ... -} + 4. + delete() + { + spin_lock(&list_lock); + ... + remove_element + spin_unlock(&list_lock); + ... + synchronize_rcu(); + if (atomic_dec_and_test(&el->rc)) + kfree(el); + ... 
+ } As additional examples in the kernel, the pattern in listing C is used by reference counting of struct pid, while the pattern in listing B is used by From f2286ab99549271f3cec73e305b9ecca95d91394 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 21 Apr 2020 19:04:10 +0200 Subject: [PATCH 083/502] docs: RCU: Convert stallwarn.txt to ReST - Add a SPDX header; - Adjust document and section titles; - Fix list markups; - Some whitespace fixes and new line breaks; - Mark literal blocks as such; - Add it to RCU/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paul E. McKenney --- Documentation/RCU/index.rst | 1 + .../RCU/{stallwarn.txt => stallwarn.rst} | 55 ++++++++++++------- kernel/rcu/tree_stall.h | 4 +- 3 files changed, 37 insertions(+), 23 deletions(-) rename Documentation/RCU/{stallwarn.txt => stallwarn.rst} (90%) diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst index 9a1d51f394dc..e703d3dbe60c 100644 --- a/Documentation/RCU/index.rst +++ b/Documentation/RCU/index.rst @@ -20,6 +20,7 @@ RCU concepts rculist_nulls rcuref torture + stallwarn listRCU NMI-RCU UP diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.rst similarity index 90% rename from Documentation/RCU/stallwarn.txt rename to Documentation/RCU/stallwarn.rst index a360a8796710..08bc9aec4606 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.rst @@ -1,4 +1,8 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================== Using RCU's CPU Stall Detector +============================== This document first discusses what sorts of issues RCU's CPU stall detector can locate, and then discusses kernel parameters and Kconfig @@ -7,39 +11,40 @@ this document explains the stall detector's "splat" format. What Causes RCU CPU Stall Warnings? +=================================== So your kernel printed an RCU CPU stall warning. The next question is "What caused it?" 
The following problems can result in RCU CPU stall warnings: -o A CPU looping in an RCU read-side critical section. +- A CPU looping in an RCU read-side critical section. -o A CPU looping with interrupts disabled. +- A CPU looping with interrupts disabled. -o A CPU looping with preemption disabled. +- A CPU looping with preemption disabled. -o A CPU looping with bottom halves disabled. +- A CPU looping with bottom halves disabled. -o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel +- For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel without invoking schedule(). If the looping in the kernel is really expected and desirable behavior, you might need to add some calls to cond_resched(). -o Booting Linux using a console connection that is too slow to +- Booting Linux using a console connection that is too slow to keep up with the boot-time console-message rate. For example, a 115Kbaud serial console can be -way- too slow to keep up with boot-time message rates, and will frequently result in RCU CPU stall warning messages. Especially if you have added debug printk()s. -o Anything that prevents RCU's grace-period kthreads from running. +- Anything that prevents RCU's grace-period kthreads from running. This can result in the "All QSes seen" console-log message. This message will include information on when the kthread last ran and how often it should be expected to run. It can also - result in the "rcu_.*kthread starved for" console-log message, + result in the ``rcu_.*kthread starved for`` console-log message, which will include additional debugging information. -o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might +- A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might happen to preempt a low-priority task in the middle of an RCU read-side critical section. 
This is especially damaging if that low-priority task is not permitted to run on any other CPU, @@ -48,7 +53,7 @@ o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might While the system is in the process of running itself out of memory, you might see stall-warning messages. -o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that +- A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that is running at a higher priority than the RCU softirq threads. This will prevent RCU callbacks from ever being invoked, and in a CONFIG_PREEMPT_RCU kernel will further prevent @@ -63,7 +68,7 @@ o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that can increase your system's context-switch rate and thus degrade performance. -o A periodic interrupt whose handler takes longer than the time +- A periodic interrupt whose handler takes longer than the time interval between successive pairs of interrupts. This can prevent RCU's kthreads and softirq handlers from running. Note that certain high-overhead debugging options, for example @@ -71,20 +76,20 @@ o A periodic interrupt whose handler takes longer than the time considerably longer than normal, which can in turn result in RCU CPU stall warnings. -o Testing a workload on a fast system, tuning the stall-warning +- Testing a workload on a fast system, tuning the stall-warning timeout down to just barely avoid RCU CPU stall warnings, and then running the same workload with the same stall-warning timeout on a slow system. Note that thermal throttling and on-demand governors can cause a single system to be sometimes fast and sometimes slow! -o A hardware or software issue shuts off the scheduler-clock +- A hardware or software issue shuts off the scheduler-clock interrupt on a CPU that is not in dyntick-idle mode. This problem really has happened, and seems to be most likely to result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels. -o A bug in the RCU implementation. 
+- A bug in the RCU implementation. -o A hardware failure. This is quite unlikely, but has occurred +- A hardware failure. This is quite unlikely, but has occurred at least once in real life. A CPU failed in a running system, becoming unresponsive, but not causing an immediate crash. This resulted in a series of RCU CPU stall warnings, eventually @@ -109,6 +114,7 @@ see include/trace/events/rcu.h. Fine-Tuning the RCU CPU Stall Detector +====================================== The rcuupdate.rcu_cpu_stall_suppress module parameter disables RCU's CPU stall detector, which detects conditions that unduly delay RCU grace @@ -118,6 +124,7 @@ The stall detector's idea of what constitutes "unduly delayed" is controlled by a set of kernel configuration variables and cpp macros: CONFIG_RCU_CPU_STALL_TIMEOUT +---------------------------- This kernel configuration parameter defines the period of time that RCU will wait from the beginning of a grace period until it @@ -137,6 +144,7 @@ CONFIG_RCU_CPU_STALL_TIMEOUT /sys/module/rcupdate/parameters/rcu_cpu_stall_suppress. RCU_STALL_DELAY_DELTA +--------------------- Although the lockdep facility is extremely useful, it does add some overhead. Therefore, under CONFIG_PROVE_RCU, the @@ -145,6 +153,7 @@ RCU_STALL_DELAY_DELTA macro, not a kernel configuration parameter.) RCU_STALL_RAT_DELAY +------------------- The CPU stall detector tries to make the offending CPU print its own warnings, as this often gives better-quality stack traces. @@ -155,6 +164,7 @@ RCU_STALL_RAT_DELAY parameter.) rcupdate.rcu_task_stall_timeout +------------------------------- This boot/sysfs parameter controls the RCU-tasks stall warning interval. 
A value of zero or less suppresses RCU-tasks stall @@ -168,9 +178,10 @@ rcupdate.rcu_task_stall_timeout Interpreting RCU's CPU Stall-Detector "Splats" +============================================== For non-RCU-tasks flavors of RCU, when a CPU detects that it is stalling, -it will print a message similar to the following: +it will print a message similar to the following:: INFO: rcu_sched detected stalls on CPUs/tasks: 2-...: (3 GPs behind) idle=06c/0/0 softirq=1453/1455 fqs=0 @@ -223,7 +234,7 @@ an estimate of the total number of RCU callbacks queued across all CPUs (625 in this case). In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed -for each CPU: +for each CPU:: 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 dyntick_enabled: 1 @@ -235,7 +246,7 @@ processing is enabled. If the grace period ends just as the stall warning starts printing, there will be a spurious stall-warning message, which will include -the following: +the following:: INFO: Stall ended before state dump start @@ -248,7 +259,7 @@ which is overkill for this sort of problem. If all CPUs and tasks have passed through quiescent states, but the grace period has nevertheless failed to end, the stall-warning splat -will include something like the following: +will include something like the following:: All QSes seen, last rcu_preempt kthread activity 23807 (4297905177-4297881370), jiffies_till_next_fqs=3, root ->qsmask 0x0 @@ -261,7 +272,7 @@ which is way less than 23807. Finally, the root rcu_node structure's If the relevant grace-period kthread has been unable to run prior to the stall warning, as was the case in the "All QSes seen" line above, -the following additional line is printed: +the following additional line is printed:: kthread starved for 23807 jiffies! g7075 f0x0 RCU_GP_WAIT_FQS(3) ->state=0x1 ->cpu=5 @@ -276,6 +287,7 @@ kthread last ran on CPU 5. 
Multiple Warnings From One Stall +================================ If a stall lasts long enough, multiple stall-warning messages will be printed for it. The second and subsequent messages are printed at @@ -285,9 +297,10 @@ of the stall and the first message. Stall Warnings for Expedited Grace Periods +========================================== If an expedited grace period detects a stall, it will place a message -like the following in dmesg: +like the following in dmesg:: INFO: rcu_sched detected expedited stalls on CPUs/tasks: { 7-... } 21119 jiffies s: 73 root: 0x2/. diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 54a6dba0280d..b04256cd7e12 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -468,7 +468,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) /* * OK, time to rat on our buddy... - * See Documentation/RCU/stallwarn.txt for info on how to debug + * See Documentation/RCU/stallwarn.rst for info on how to debug * RCU CPU stall warnings. */ pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name); @@ -535,7 +535,7 @@ static void print_cpu_stall(unsigned long gps) /* * OK, time to rat on ourselves... - * See Documentation/RCU/stallwarn.txt for info on how to debug + * See Documentation/RCU/stallwarn.rst for info on how to debug * RCU CPU stall warnings. */ pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name); From 2d9c318bfd15394da014737bee30e7b2e22c5eac Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 21 Apr 2020 19:04:11 +0200 Subject: [PATCH 084/502] docs: RCU: Don't duplicate chapter names in rculist_nulls.rst Since changeset 58ad30cf91f0 ("docs: fix reference to core-api/namespaces.rst"), auto-references for chapters are generated. This is a nice feature, but has a drawback: no chapters can have the same name. So, we need to add two higher hierarchy chapters on this document, in order to avoid such duplication. 
Fixes: 58ad30cf91f0 ("docs: fix reference to core-api/namespaces.rst") Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paul E. McKenney --- Documentation/RCU/rculist_nulls.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/RCU/rculist_nulls.rst b/Documentation/RCU/rculist_nulls.rst index d40374221d69..a9fc774bc400 100644 --- a/Documentation/RCU/rculist_nulls.rst +++ b/Documentation/RCU/rculist_nulls.rst @@ -10,6 +10,9 @@ objects using SLAB_TYPESAFE_BY_RCU allocations. Please read the basics in Documentation/RCU/listRCU.rst +Using 'nulls' +============= + Using special makers (called 'nulls') is a convenient way to solve following problem : @@ -126,6 +129,9 @@ very very fast (before the end of RCU grace period) -------------------------------------------------------------------------- +Avoiding extra smp_rmb() +======================== + With hlist_nulls we can avoid extra smp_rmb() in lockless_lookup() and extra smp_wmb() in insert function. From b81898e3d2133715e4475d25757595a3e18502ed Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Apr 2020 12:23:11 -0700 Subject: [PATCH 085/502] doc: Timer problems can cause RCU CPU stall warnings Over the past few years, there have been several cases where timekeeping bugs have caused RCU CPU stall warnings, particularly during hardware bringup. This commit therefore adds such bugs to the list of things that can result in RCU CPU stall warnings. Signed-off-by: Paul E. McKenney --- Documentation/RCU/stallwarn.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/RCU/stallwarn.rst b/Documentation/RCU/stallwarn.rst index 08bc9aec4606..c9ab6af4d3be 100644 --- a/Documentation/RCU/stallwarn.rst +++ b/Documentation/RCU/stallwarn.rst @@ -87,6 +87,13 @@ warnings: problem really has happened, and seems to be most likely to result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels. +- A hardware or software issue that prevents time-based wakeups + from occurring. 
These issues can range from misconfigured or + buggy timer hardware through bugs in the interrupt or exception + path (whether hardware, firmware, or software) through bugs + in Linux's timer subsystem through bugs in the scheduler, and, + yes, even including bugs in RCU itself. + - A bug in the RCU implementation. - A hardware failure. This is quite unlikely, but has occurred From d93d97cbe0d4369153fb04954f1481a9f42aa5b6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 11 May 2020 19:52:34 -0700 Subject: [PATCH 086/502] doc: Tasks RCU must protect instructions before trampoline Protecting the code in a trampoline can also require protecting a number of instructions prior to actually entering the trampoline. For example, these earlier instructions might be computing the address of the trampoline. This commit therefore updates RCU's requirements to record this for posterity. Link: https://lore.kernel.org/lkml/20200511154824.09a18c46@gandalf.local.home/ Reported-by: Lai Jiangshan Reported-by: Steven Rostedt Signed-off-by: Paul E. McKenney --- Documentation/RCU/Design/Requirements/Requirements.rst | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst index 75b8ca007a11..a69b5c43a10c 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.rst +++ b/Documentation/RCU/Design/Requirements/Requirements.rst @@ -2583,7 +2583,12 @@ not work to have these markers in the trampoline itself, because there would need to be instructions following ``rcu_read_unlock()``. Although ``synchronize_rcu()`` would guarantee that execution reached the ``rcu_read_unlock()``, it would not be able to guarantee that execution -had completely left the trampoline. +had completely left the trampoline. Worse yet, in some situations +the trampoline's protection must extend a few instructions *prior* to +execution reaching the trampoline. 
For example, these few instructions +might calculate the address of the trampoline, so that entering the +trampoline would be pre-ordained a surprisingly long time before execution +actually reached the trampoline itself. The solution, in the form of `Tasks RCU `__, is to have implicit read-side From 7ee880b7bf1dea88d0a472b775aebdb4fb6bf860 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 15 Apr 2020 22:26:55 +0000 Subject: [PATCH 087/502] rcu: Initialize and destroy rcu_synchronize only when necessary The __wait_rcu_gp() function unconditionally initializes and cleans up each element of rs_array[], whether used or not. This is slightly wasteful and rather confusing, so this commit skips both initialization and cleanup for duplicate callback functions. Signed-off-by: Wei Yang Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 84843adfd939..f5a82e107bcb 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -390,13 +390,14 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, might_sleep(); continue; } - init_rcu_head_on_stack(&rs_array[i].head); - init_completion(&rs_array[i].completion); for (j = 0; j < i; j++) if (crcu_array[j] == crcu_array[i]) break; - if (j == i) + if (j == i) { + init_rcu_head_on_stack(&rs_array[i].head); + init_completion(&rs_array[i].completion); (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu); + } } /* Wait for all callbacks to be invoked. 
*/ @@ -407,9 +408,10 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, for (j = 0; j < i; j++) if (crcu_array[j] == crcu_array[i]) break; - if (j == i) + if (j == i) { wait_for_completion(&rs_array[i].completion); - destroy_rcu_head_on_stack(&rs_array[i].head); + destroy_rcu_head_on_stack(&rs_array[i].head); + } } } EXPORT_SYMBOL_GPL(__wait_rcu_gp); From 0a3b3c253a1eb2c7fe7f34086d46660c909abeb3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 16 Apr 2020 16:46:10 -0700 Subject: [PATCH 088/502] mm/mmap.c: Add cond_resched() for exit_mmap() CPU stalls A large process running on a heavily loaded system can encounter the following RCU CPU stall warning: rcu: INFO: rcu_sched self-detected stall on CPU rcu: 3-....: (20998 ticks this GP) idle=4ea/1/0x4000000000000002 softirq=556558/556558 fqs=5190 (t=21013 jiffies g=1005461 q=132576) NMI backtrace for cpu 3 CPU: 3 PID: 501900 Comm: aio-free-ring-w Kdump: loaded Not tainted 5.2.9-108_fbk12_rc3_3858_gb83b75af7909 #1 Hardware name: Wiwynn HoneyBadger/PantherPlus, BIOS HBM6.71 02/03/2016 Call Trace: dump_stack+0x46/0x60 nmi_cpu_backtrace.cold.3+0x13/0x50 ? lapic_can_unplug_cpu.cold.27+0x34/0x34 nmi_trigger_cpumask_backtrace+0xba/0xca rcu_dump_cpu_stacks+0x99/0xc7 rcu_sched_clock_irq.cold.87+0x1aa/0x397 ? 
tick_sched_do_timer+0x60/0x60 update_process_times+0x28/0x60 tick_sched_timer+0x37/0x70 __hrtimer_run_queues+0xfe/0x270 hrtimer_interrupt+0xf4/0x210 smp_apic_timer_interrupt+0x5e/0x120 apic_timer_interrupt+0xf/0x20 RIP: 0010:kmem_cache_free+0x223/0x300 Code: 88 00 00 00 0f 85 ca 00 00 00 41 8b 55 18 31 f6 f7 da 41 f6 45 0a 02 40 0f 94 c6 83 c6 05 9c 41 5e fa e8 a0 a7 01 00 41 56 9d <49> 8b 47 08 a8 03 0f 85 87 00 00 00 65 48 ff 08 e9 3d fe ff ff 65 RSP: 0018:ffffc9000e8e3da8 EFLAGS: 00000206 ORIG_RAX: ffffffffffffff13 RAX: 0000000000020000 RBX: ffff88861b9de960 RCX: 0000000000000030 RDX: fffffffffffe41e8 RSI: 000060777fe3a100 RDI: 000000000001be18 RBP: ffffea00186e7780 R08: ffffffffffffffff R09: ffffffffffffffff R10: ffff88861b9dea28 R11: ffff88887ffde000 R12: ffffffff81230a1f R13: ffff888854684dc0 R14: 0000000000000206 R15: ffff8888547dbc00 ? remove_vma+0x4f/0x60 remove_vma+0x4f/0x60 exit_mmap+0xd6/0x160 mmput+0x4a/0x110 do_exit+0x278/0xae0 ? syscall_trace_enter+0x1d3/0x2b0 ? handle_mm_fault+0xaa/0x1c0 do_group_exit+0x3a/0xa0 __x64_sys_exit_group+0x14/0x20 do_syscall_64+0x42/0x100 entry_SYSCALL_64_after_hwframe+0x44/0xa9 And on a PREEMPT=n kernel, the "while (vma)" loop in exit_mmap() can run for a very long time given a large process. This commit therefore adds a cond_resched() to this loop, providing RCU any needed quiescent states. Cc: Andrew Morton Cc: Reviewed-by: Shakeel Butt Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- mm/mmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/mmap.c b/mm/mmap.c index 59a4682ebf3f..972f839c6ec8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3159,6 +3159,7 @@ void exit_mmap(struct mm_struct *mm) if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); vma = remove_vma(vma); + cond_resched(); } vm_unacct_memory(nr_accounted); } From abfce0414814149f716e1d30da1fb3140d1b3473 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sun, 19 Apr 2020 21:57:15 +0000 Subject: [PATCH 089/502] rcu: Simplify the calculation of rcu_state.ncpus There is only 1 bit set in mask, which means that the only difference between oldmask and the new one will be at the position where the bit is set in mask. This commit therefore updates rcu_state.ncpus by checking whether the bit in mask is already set in rnp->expmaskinitnext. Signed-off-by: Wei Yang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6c6569e0586c..bef1dc91bfbe 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3842,10 +3842,9 @@ void rcu_cpu_starting(unsigned int cpu) { unsigned long flags; unsigned long mask; - int nbits; - unsigned long oldmask; struct rcu_data *rdp; struct rcu_node *rnp; + bool newcpu; if (per_cpu(rcu_cpu_started, cpu)) return; @@ -3857,12 +3856,10 @@ void rcu_cpu_starting(unsigned int cpu) mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask); - oldmask = rnp->expmaskinitnext; + newcpu = !(rnp->expmaskinitnext & mask); rnp->expmaskinitnext |= mask; - oldmask ^= rnp->expmaskinitnext; - nbits = bitmap_weight(&oldmask, BITS_PER_LONG); /* Allow lockless access for expedited grace periods. 
*/ - smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + nbits); /* ^^^ */ + smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + newcpu); /* ^^^ */ ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus); rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq); From e816d56fad57ba9817cef6606b12f5e14647c3bf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 1 May 2020 16:49:48 -0700 Subject: [PATCH 090/502] rcu: Add callbacks-invoked counters This commit adds a count of the callbacks invoked to the per-CPU rcu_data structure. This count is printed by the show_rcu_gp_kthreads() that is invoked by rcutorture and the RCU CPU stall-warning code. It is also intended for use by drgn. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + kernel/rcu/tree.h | 1 + kernel/rcu/tree_stall.h | 3 +++ 3 files changed, 5 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bef1dc91bfbe..874c831bcc45 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2443,6 +2443,7 @@ static void rcu_do_batch(struct rcu_data *rdp) local_irq_save(flags); rcu_nocb_lock(rdp); count = -rcl.len; + rdp->n_cbs_invoked += count; trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 43991a40b084..9c6f7343bec0 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -171,6 +171,7 @@ struct rcu_data { /* different grace periods. */ long qlen_last_fqs_check; /* qlen at last check for QS forcing */ + unsigned long n_cbs_invoked; /* # callbacks invoked since boot. */ unsigned long n_force_qs_snap; /* did other CPU force QS recently? 
*/ long blimit; /* Upper limit on a processed batch */ diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 54a6dba0280d..2768ce6bf657 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -649,6 +649,7 @@ static void check_cpu_stall(struct rcu_data *rdp) */ void show_rcu_gp_kthreads(void) { + unsigned long cbs = 0; int cpu; unsigned long j; unsigned long ja; @@ -690,9 +691,11 @@ void show_rcu_gp_kthreads(void) } for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); + cbs += data_race(rdp->n_cbs_invoked); if (rcu_segcblist_is_offloaded(&rdp->cblist)) show_rcu_nocb_state(rdp); } + pr_info("RCU callbacks invoked since boot: %lu\n", cbs); show_rcu_tasks_gp_kthreads(); } EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); From f8466f94685b5bd931384526cf51e090fd2ac706 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 3 May 2020 19:16:09 -0700 Subject: [PATCH 091/502] rcu: Add comment documenting rcu_callback_map's purpose The rcu_callback_map lockdep_map structure was added back in 2013, but its purpose has become obscure. This commit therefore documents that the purpose of rcu_callback_map is, in the words of commit 24ef659a857 ("rcu: Provide better diagnostics for blocking in RCU callback functions"), to help lockdep to tie an "inappropriate voluntary context switch back to the fact that the function is being invoked from within a callback." Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index f5a82e107bcb..ca17b771ad60 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -279,6 +279,7 @@ struct lockdep_map rcu_sched_lock_map = { }; EXPORT_SYMBOL_GPL(rcu_sched_lock_map); +// Tell lockdep when RCU callbacks are being invoked. 
static struct lock_class_key rcu_callback_key; struct lockdep_map rcu_callback_map = STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key); From 88748e330040ecf4681a2c8f344fd386862bf913 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Mon, 4 May 2020 08:05:05 -0400 Subject: [PATCH 092/502] trace: events: rcu: Change description of rcu_dyntick trace event The different strings used for describing the polarity are Start, End and StillNonIdle. Since StillIdle is not used in any trace point for rcu_dyntick, it can be removed and StillNonIdle can be added in the description. Because StillNonIdle is used in a few tracepoints for rcu_dyntick. Similarly, USER, IDLE and IRQ are used for describing context in the rcu_dyntick tracepoints. Since, "KERNEL" is not used for any of the rcu_dyntick tracepoints, remove it from the description. Signed-off-by: Madhuparna Bhowmik Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index f9a7811148e2..af274d1532bf 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -435,11 +435,12 @@ TRACE_EVENT_RCU(rcu_fqs, #endif /* #if defined(CONFIG_TREE_RCU) */ /* - * Tracepoint for dyntick-idle entry/exit events. These take a string - * as argument: "Start" for entering dyntick-idle mode, "Startirq" for - * entering it from irq/NMI, "End" for leaving it, "Endirq" for leaving it - * to irq/NMI, "--=" for events moving towards idle, and "++=" for events - * moving away from idle. + * Tracepoint for dyntick-idle entry/exit events. These take 2 strings + * as argument: + * polarity: "Start", "End", "StillNonIdle" for entering, exiting or still not + * being in dyntick-idle mode. + * context: "USER" or "IDLE" or "IRQ". + * NMIs nested in IRQs are inferred with dynticks_nesting > 1 in IRQ context. 
* * These events also take a pair of numbers, which indicate the nesting * depth before and after the event of interest, and a third number that is From 77865dea25c4f45ce0c5bf61a8470af01fccd944 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 7 May 2020 15:44:46 -0700 Subject: [PATCH 093/502] rcu: Grace-period-kthread related sleeps to idle priority This commit converts the long-standing schedule_timeout_interruptible() and schedule_timeout_uninterruptible() calls used by RCU's grace-period kthread to schedule_timeout_idle(). This conversion avoids polluting the load-average with RCU-related sleeping. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 874c831bcc45..feb31c201dee 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1638,7 +1638,7 @@ static void rcu_gp_slow(int delay) if (delay > 0 && !(rcu_seq_ctr(rcu_state.gp_seq) % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) - schedule_timeout_uninterruptible(delay); + schedule_timeout_idle(delay); } static unsigned long sleep_duration; @@ -1661,7 +1661,7 @@ static void rcu_gp_torture_wait(void) duration = xchg(&sleep_duration, 0UL); if (duration > 0) { pr_alert("%s: Waiting %lu jiffies\n", __func__, duration); - schedule_timeout_uninterruptible(duration); + schedule_timeout_idle(duration); pr_alert("%s: Wait complete\n", __func__); } } @@ -2727,7 +2727,7 @@ static void rcu_cpu_kthread(unsigned int cpu) } *statusp = RCU_KTHREAD_YIELDING; trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); - schedule_timeout_interruptible(2); + schedule_timeout_idle(2); trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); *statusp = RCU_KTHREAD_WAITING; } From a9352f72d6a9e8fe4840b9f0d97af8f5a6c52c79 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 7 May 2020 16:34:38 -0700 Subject: [PATCH 094/502] rcu: Priority-boost-related sleeps to idle priority This commit converts the long-standing schedule_timeout_interruptible() call used by RCU's priority-boosting kthreads to schedule_timeout_idle(). This conversion avoids polluting the load-average with RCU-related sleeping. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 352223664ebd..25296c17a30d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1033,7 +1033,7 @@ static int rcu_boost_kthread(void *arg) if (spincnt > 10) { WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_YIELDING); trace_rcu_utilization(TPS("End boost kthread@rcu_yield")); - schedule_timeout_interruptible(2); + schedule_timeout_idle(2); trace_rcu_utilization(TPS("Start boost kthread@rcu_yield")); spincnt = 0; } From f5ca34643bbd84f514bdeee194c45dd1fb066ef2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 7 May 2020 16:36:10 -0700 Subject: [PATCH 095/502] rcu: No-CBs-related sleeps to idle priority This commit converts the schedule_timeout_interruptible() call used by RCU's no-CBs grace-period kthreads to schedule_timeout_idle(). This conversion avoids polluting the load-average with RCU-related sleeping. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 25296c17a30d..982fc5be5269 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2005,7 +2005,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) /* Polling, so trace if first poll in the series. */ if (gotcbs) trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll")); - schedule_timeout_interruptible(1); + schedule_timeout_idle(1); } else if (!needwait_gp) { /* Wait for callbacks to appear. 
*/ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep")); From 68c2f27e01f61760e6ae76fff9682e1ffe9bacb6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 7 May 2020 16:38:29 -0700 Subject: [PATCH 096/502] rcu: Expedited grace-period sleeps to idle priority This commit converts the schedule_timeout_uninterruptible() call used by RCU's expedited grace-period processing to schedule_timeout_idle(). This conversion avoids polluting the load-average with RCU-related sleeping. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 72952edad1e4..1888c0eb1216 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -403,7 +403,7 @@ retry_ipi: /* Online, so delay for a bit and try again. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("selectofl")); - schedule_timeout_uninterruptible(1); + schedule_timeout_idle(1); goto retry_ipi; } /* CPU really is offline, so we must report its QS. */ From 9f47eb5461aaeb6cb8696f9d11503ae90e4d5cb0 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 8 May 2020 14:15:37 -0700 Subject: [PATCH 097/502] fs/btrfs: Add cond_resched() for try_release_extent_mapping() stalls Very large I/Os can cause the following RCU CPU stall warning: RIP: 0010:rb_prev+0x8/0x50 Code: 49 89 c0 49 89 d1 48 89 c2 48 89 f8 e9 e5 fd ff ff 4c 89 48 10 c3 4c = 89 06 c3 4c 89 40 10 c3 0f 1f 00 48 8b 0f 48 39 cf 74 38 <48> 8b 47 10 48 85 c0 74 22 48 8b 50 08 48 85 d2 74 0c 48 89 d0 48 RSP: 0018:ffffc9002212bab0 EFLAGS: 00000287 ORIG_RAX: ffffffffffffff13 RAX: ffff888821f93630 RBX: ffff888821f93630 RCX: ffff888821f937e0 RDX: 0000000000000000 RSI: 0000000000102000 RDI: ffff888821f93630 RBP: 0000000000103000 R08: 000000000006c000 R09: 0000000000000238 R10: 0000000000102fff R11: ffffc9002212bac8 R12: 0000000000000001 R13: ffffffffffffffff R14: 0000000000102000 R15: ffff888821f937e0 __lookup_extent_mapping+0xa0/0x110 try_release_extent_mapping+0xdc/0x220 btrfs_releasepage+0x45/0x70 shrink_page_list+0xa39/0xb30 shrink_inactive_list+0x18f/0x3b0 shrink_lruvec+0x38e/0x6b0 shrink_node+0x14d/0x690 do_try_to_free_pages+0xc6/0x3e0 try_to_free_mem_cgroup_pages+0xe6/0x1e0 reclaim_high.constprop.73+0x87/0xc0 mem_cgroup_handle_over_high+0x66/0x150 exit_to_usermode_loop+0x82/0xd0 do_syscall_64+0xd4/0x100 entry_SYSCALL_64_after_hwframe+0x44/0xa9 On a PREEMPT=n kernel, the try_release_extent_mapping() function's "while" loop might run for a very long time on a large I/O. This commit therefore adds a cond_resched() to this loop, providing RCU any needed quiescent states. Signed-off-by: Paul E. McKenney --- fs/btrfs/extent_io.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 68c96057ad2d..704239546093 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4515,6 +4515,8 @@ int try_release_extent_mapping(struct page *page, gfp_t mask) /* once for us */ free_extent_map(em); + + cond_resched(); /* Allow large-extent preemption. 
*/ } } return try_release_extent_state(tree, page, mask); From 360fbbb4897c98971e8955b063c01250817a2191 Mon Sep 17 00:00:00 2001 From: Lihao Liang Date: Thu, 14 May 2020 21:34:34 +0100 Subject: [PATCH 098/502] rcu: Update comment from rsp->rcu_gp_seq to rsp->gp_seq Signed-off-by: Lihao Liang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9c6f7343bec0..575745f0a464 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -41,7 +41,7 @@ struct rcu_node { raw_spinlock_t __private lock; /* Root rcu_node's lock protects */ /* some rcu_state fields as well as */ /* following. */ - unsigned long gp_seq; /* Track rsp->rcu_gp_seq. */ + unsigned long gp_seq; /* Track rsp->gp_seq. */ unsigned long gp_seq_needed; /* Track furthest future GP request. */ unsigned long completedqs; /* All QSes done for this node. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ @@ -149,7 +149,7 @@ union rcu_noqs { /* Per-CPU data for read-copy update. */ struct rcu_data { /* 1) quiescent-state and grace-period handling : */ - unsigned long gp_seq; /* Track rsp->rcu_gp_seq counter. */ + unsigned long gp_seq; /* Track rsp->gp_seq counter. */ unsigned long gp_seq_needed; /* Track furthest future GP request. */ union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ bool core_needs_qs; /* Core waits for quiesc state. */ From 3c8920e2dbd1a55f72dc14d656df9d0097cf5c72 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 15 May 2020 02:34:29 +0200 Subject: [PATCH 099/502] tick/nohz: Narrow down noise while setting current task's tick dependency Setting a tick dependency on any task, including the case where a task sets that dependency on itself, triggers an IPI to all CPUs. 
That is of course suboptimal but it had previously not been an issue because it was only used by POSIX CPU timers on nohz_full, which apparently never occurs in latency-sensitive workloads in production. (Or users of such systems are suffering in silence on the one hand or venting their ire on the wrong people on the other.) But RCU now sets a task tick dependency on the current task in order to fix stall issues that can occur during RCU callback processing. Thus, RCU callback processing triggers frequent system-wide IPIs from nohz_full CPUs. This is quite counter-productive, after all, avoiding IPIs is what nohz_full is supposed to be all about. This commit therefore optimizes tasks' self-setting of a task tick dependency by using tick_nohz_full_kick() to avoid the system-wide IPI. Instead, only the execution of the one task is disturbed, which is acceptable given that this disturbance is well down into the noise compared to the degree to which the RCU callback processing itself disturbs execution. Fixes: 6a949b7af82d (rcu: Force on tick when invoking lots of callbacks) Reported-by: Matt Fleming Signed-off-by: Frederic Weisbecker Cc: stable@kernel.org Cc: Paul E. McKenney Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/time/tick-sched.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3e2dc9b8858c..f0199a4ba1ad 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -351,16 +351,24 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu); /* - * Set a per-task tick dependency. Posix CPU timers need this in order to elapse - * per task timers. + * Set a per-task tick dependency. RCU need this. Also posix CPU timers + * in order to elapse per task timers. 
*/ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) { - /* - * We could optimize this with just kicking the target running the task - * if that noise matters for nohz full users. - */ - tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit); + if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) { + if (tsk == current) { + preempt_disable(); + tick_nohz_full_kick(); + preempt_enable(); + } else { + /* + * Some future tick_nohz_full_kick_task() + * should optimize this. + */ + tick_nohz_full_kick_all(); + } + } } EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task); From 55fbe86ef303bc8ab040e579fba34a750c08200e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 May 2020 15:02:02 -0700 Subject: [PATCH 100/502] rcu: Remove initialized but unused rnp from check_slow_task() This commit removes the variable rnp from check_slow_task(), which is defined, assigned to, but not otherwise used. Reported-by: kbuild test robot Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 2768ce6bf657..d203f82a380a 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -237,14 +237,12 @@ struct rcu_stall_chk_rdr { */ static bool check_slow_task(struct task_struct *t, void *arg) { - struct rcu_node *rnp; struct rcu_stall_chk_rdr *rscrp = arg; if (task_curr(t)) return false; // It is running, so decline to inspect it. rscrp->nesting = t->rcu_read_lock_nesting; rscrp->rs = t->rcu_read_unlock_special; - rnp = t->rcu_blocked_node; rscrp->on_blkd_list = !list_empty(&t->rcu_node_entry); return true; } From 04b25a495bd68c1dad07263fb91e8b5a31c00a9e Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 19 May 2020 17:00:54 -0700 Subject: [PATCH 101/502] rcu: Mark rcu_nmi_enter() call to rcu_cleanup_after_idle() noinstr The objtool complains about the call to rcu_cleanup_after_idle() from rcu_nmi_enter(), so this commit adds instrumentation_begin() before that call and instrumentation_end() after it. Acked-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index feb31c201dee..d17e5a08bf43 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -990,8 +990,11 @@ noinstr void rcu_nmi_enter(void) rcu_dynticks_eqs_exit(); // ... but is watching here. - if (!in_nmi()) + if (!in_nmi()) { + instrumentation_begin(); rcu_cleanup_after_idle(); + instrumentation_end(); + } instrumentation_begin(); // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs() From d29e0b26b020422cc51b5b51733cc50fcf443965 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 May 2020 08:49:29 -0700 Subject: [PATCH 102/502] lockdep: Complain only once about RCU in extended quiescent state Currently, lockdep_rcu_suspicious() complains twice about RCU read-side critical sections being invoked from within extended quiescent states, for example: RCU used illegally from idle CPU! rcu_scheduler_active = 2, debug_locks = 1 RCU used illegally from extended quiescent state! This commit therefore saves a couple lines of code and one line of console-log output by eliminating the first of these two complaints. Link: https://lore.kernel.org/lkml/87wo4wnpzb.fsf@nanos.tec.linutronix.de Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Will Deacon Signed-off-by: Paul E. 
McKenney --- kernel/locking/lockdep.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 29a8de4c50b9..0a7549d159ed 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -5851,9 +5851,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n", !rcu_lockdep_current_cpu_online() ? "RCU used illegally from offline CPU!\n" - : !rcu_is_watching() - ? "RCU used illegally from idle CPU!\n" - : "", + : "", rcu_scheduler_active, debug_locks); /* From e40bb921119814c6f746891af9cd37eccda616a4 Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Mon, 1 Jun 2020 19:45:49 +0100 Subject: [PATCH 103/502] rcu: Replace 1 with true Coccinelle reports a warning WARNING: Assignment of 0/1 to bool variable The root cause is that the variable lastphase is a bool, but is initialised with integer 1. This commit therefore replaces the 1 with a true. Signed-off-by: Jules Irenge Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index ca17b771ad60..a0ba8858dd35 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -207,7 +207,7 @@ void rcu_end_inkernel_boot(void) rcu_unexpedite_gp(); if (rcu_normal_after_boot) WRITE_ONCE(rcu_normal, 1); - rcu_boot_ended = 1; + rcu_boot_ended = true; } /* From c6dfd72b7a3b70a2054db0f73245ea2f762a8452 Mon Sep 17 00:00:00 2001 From: Peter Enderborg Date: Thu, 4 Jun 2020 12:23:20 +0200 Subject: [PATCH 104/502] rcu: Stop shrinker loop The count and scan can be separated in time, and there is a fair chance that all work is already done when the scan starts, which might in turn result in a needless retry. This commit therefore avoids this retry by returning SHRINK_STOP. Reviewed-by: Uladzislau Rezki (Sony) Signed-off-by: Peter Enderborg Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d17e5a08bf43..c8196fab563c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3332,7 +3332,7 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) break; } - return freed; + return freed == 0 ? SHRINK_STOP : freed; } static struct shrinker kfree_rcu_shrinker = { From 00943a609d7ad0f08e58bc9c214f38b0ba163c88 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 12 Jun 2020 10:07:52 +0800 Subject: [PATCH 105/502] rcu: gp_max is protected by root rcu_node's lock Because gp_max is protected by root rcu_node's lock, this commit moves the gp_max definition to the region of the rcu_node structure containing fields protected by this lock. Signed-off-by: Wei Yang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 575745f0a464..09ec93b16f28 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -302,6 +302,8 @@ struct rcu_state { u8 boost ____cacheline_internodealigned_in_smp; /* Subject to priority boost. */ unsigned long gp_seq; /* Grace-period sequence #. */ + unsigned long gp_max; /* Maximum GP duration in */ + /* jiffies. */ struct task_struct *gp_kthread; /* Task for grace periods. */ struct swait_queue_head gp_wq; /* Where GP task waits. */ short gp_flags; /* Commands for GP task. */ @@ -347,8 +349,6 @@ struct rcu_state { /* a reluctant CPU. */ unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */ /* GP start. */ - unsigned long gp_max; /* Maximum GP duration in */ - /* jiffies. */ const char *name; /* Name of structure. */ char abbr; /* Abbreviated name. 
*/ From a2dae43088d51c4869e7fa91ca09bcc890e277fc Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 12 Jun 2020 10:07:53 +0800 Subject: [PATCH 106/502] rcu: grplo/grphi just records CPU number The ->grplo and ->grphi fields store the lowest and highest CPU number covered by a rcu_node structure, which is not the group number. This commit therefore adjusts these fields' comments to match reality. Signed-off-by: Wei Yang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 09ec93b16f28..9f903f5c9fa1 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -73,8 +73,8 @@ struct rcu_node { unsigned long ffmask; /* Fully functional CPUs. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ - int grplo; /* lowest-numbered CPU or group here. */ - int grphi; /* highest-numbered CPU or group here. */ + int grplo; /* lowest-numbered CPU here. */ + int grphi; /* highest-numbered CPU here. */ u8 grpnum; /* CPU/group number for next level up. */ u8 level; /* root is at level 0. */ bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */ From 7a0c2b0940c13a06573320ab7118375b35feef8b Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 12 Jun 2020 10:07:54 +0800 Subject: [PATCH 107/502] rcu: grpnum just records group number The ->grpnum field in the rcu_node structure contains the bit position in this structure's parent's bitmasks, which is not the CPU number. This commit therefore adjusts this field's comment accordingly. Signed-off-by: Wei Yang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9f903f5c9fa1..c96ae351688b 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -75,7 +75,7 @@ struct rcu_node { /* Only one bit will be set in this mask. 
*/ int grplo; /* lowest-numbered CPU here. */ int grphi; /* highest-numbered CPU here. */ - u8 grpnum; /* CPU/group number for next level up. */ + u8 grpnum; /* group number for next level up. */ u8 level; /* root is at level 0. */ bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */ /* exit RCU read-side critical sections */ From c3cb47a6cc74af0b79579ba167d7124eb669fbaa Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 15 Jun 2020 12:28:05 -0700 Subject: [PATCH 108/502] kernel/rcu/tree.c: Fix kernel-doc warnings Fix kernel-doc warning: ../kernel/rcu/tree.c:959: warning: Excess function parameter 'irq' description in 'rcu_nmi_enter' Fixes: cf7614e13c8f ("rcu: Refactor rcu_{nmi,irq}_{enter,exit}()") Signed-off-by: Randy Dunlap Cc: Byungchul Park Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c8196fab563c..ef05aac7f9d3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -954,7 +954,6 @@ void __rcu_irq_enter_check_tick(void) /** * rcu_nmi_enter - inform RCU of entry to NMI context - * @irq: Is this call from rcu_irq_enter? * * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and * rdp->dynticks_nmi_nesting to let the RCU grace-period handling know From 24692fa22c30cb8fcfcabdc07a3c82964475b639 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 15 Jun 2020 08:46:49 +0200 Subject: [PATCH 109/502] rcu: Fix some kernel-doc warnings The current code provokes some kernel-doc warnings: ./kernel/rcu/tree.c:2915: warning: Function parameter or member 'count' not described in 'kfree_rcu_cpu' ./include/linux/rculist.h:517: warning: bad line: [@right ][node2 ... ] ./include/linux/rculist.h:2: WARNING: Unexpected indentation. This commit therefore moves the comment for "count" to the kernel-doc markup and adds a missing "*" on one kernel-doc continuation line. 
Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paul E. McKenney --- include/linux/rculist.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/rculist.h b/include/linux/rculist.h index df587d181844..7eed65b5f713 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -512,7 +512,7 @@ static inline void hlist_replace_rcu(struct hlist_node *old, * @right: The hlist head on the right * * The lists start out as [@left ][node1 ... ] and - [@right ][node2 ... ] + * [@right ][node2 ... ] * The lists end up as [@left ][node2 ... ] * [@right ][node1 ... ] */ From 8e11690d2f5a9823d66f68918c3986b4e9e160ab Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 4 May 2020 14:35:00 +0200 Subject: [PATCH 110/502] rcu: Fix a kernel-doc warnings for "count" There are some kernel-doc warnings: ./kernel/rcu/tree.c:2915: warning: Function parameter or member 'count' not described in 'kfree_rcu_cpu' This commit therefore moves the comment for "count" to the kernel-doc markup. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6c6569e0586c..ba4c477495b5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3004,6 +3004,7 @@ struct kfree_rcu_cpu_work { * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES * @monitor_todo: Tracks whether a @monitor_work delayed work is pending * @initialized: The @lock and @rcu_work fields have been initialized + * @count: Number of objects for which GP not started * * This is a per-CPU structure. 
The reason that it is not included in * the rcu_data structure is to permit this code to be extracted from @@ -3019,7 +3020,6 @@ struct kfree_rcu_cpu { struct delayed_work monitor_work; bool monitor_todo; bool initialized; - // Number of objects for which GP not started int count; }; From 8ac88f7177c75bf9b7b8c29a8054115e1c712baf Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 25 May 2020 23:47:45 +0200 Subject: [PATCH 111/502] rcu/tree: Keep kfree_rcu() awake during lock contention On PREEMPT_RT kernels, the krcp spinlock gets converted to an rt-mutex and causes kfree_rcu() callers to sleep. This makes it unusable for callers in purely atomic sections such as non-threaded IRQ handlers and raw spinlock sections. Fix it by converting the spinlock to a raw spinlock. Vetting all code paths, there is no reason to believe that the raw spinlock will hurt RT latencies as it is not held for a long time. Cc: bigeasy@linutronix.de Cc: Uladzislau Rezki Reviewed-by: Uladzislau Rezki Signed-off-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ba4c477495b5..c5de5adca0dd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3016,7 +3016,7 @@ struct kfree_rcu_cpu { struct kfree_rcu_bulk_data *bhead; struct kfree_rcu_bulk_data *bcached; struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; - spinlock_t lock; + raw_spinlock_t lock; struct delayed_work monitor_work; bool monitor_todo; bool initialized; @@ -3049,12 +3049,12 @@ static void kfree_rcu_work(struct work_struct *work) krwp = container_of(to_rcu_work(work), struct kfree_rcu_cpu_work, rcu_work); krcp = krwp->krcp; - spin_lock_irqsave(&krcp->lock, flags); + raw_spin_lock_irqsave(&krcp->lock, flags); head = krwp->head_free; krwp->head_free = NULL; bhead = krwp->bhead_free; krwp->bhead_free = NULL; - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); /* "bhead" is now private, so traverse locklessly. */ for (; bhead; bhead = bnext) { @@ -3157,14 +3157,14 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, krcp->monitor_todo = false; if (queue_kfree_rcu_work(krcp)) { // Success! Our job is done here. - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); return; } // Previous RCU batch still in progress, try again later. 
krcp->monitor_todo = true; schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); } /* @@ -3177,11 +3177,11 @@ static void kfree_rcu_monitor(struct work_struct *work) struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu, monitor_work.work); - spin_lock_irqsave(&krcp->lock, flags); + raw_spin_lock_irqsave(&krcp->lock, flags); if (krcp->monitor_todo) kfree_rcu_drain_unlock(krcp, flags); else - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); } static inline bool @@ -3252,7 +3252,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) local_irq_save(flags); // For safely calling this_cpu_ptr(). krcp = this_cpu_ptr(&krc); if (krcp->initialized) - spin_lock(&krcp->lock); + raw_spin_lock(&krcp->lock); // Queue the object but don't yet schedule the batch. if (debug_rcu_head_queue(head)) { @@ -3283,7 +3283,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) unlock_return: if (krcp->initialized) - spin_unlock(&krcp->lock); + raw_spin_unlock(&krcp->lock); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(kfree_call_rcu); @@ -3315,11 +3315,11 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); count = krcp->count; - spin_lock_irqsave(&krcp->lock, flags); + raw_spin_lock_irqsave(&krcp->lock, flags); if (krcp->monitor_todo) kfree_rcu_drain_unlock(krcp, flags); else - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); sc->nr_to_scan -= count; freed += count; @@ -3346,15 +3346,15 @@ void __init kfree_rcu_scheduler_running(void) for_each_online_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - spin_lock_irqsave(&krcp->lock, flags); + raw_spin_lock_irqsave(&krcp->lock, flags); if (!krcp->head || krcp->monitor_todo) { - spin_unlock_irqrestore(&krcp->lock, 
flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); continue; } krcp->monitor_todo = true; schedule_delayed_work_on(cpu, &krcp->monitor_work, KFREE_DRAIN_JIFFIES); - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); } } @@ -4250,7 +4250,7 @@ static void __init kfree_rcu_batch_init(void) for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - spin_lock_init(&krcp->lock); + raw_spin_lock_init(&krcp->lock); for (i = 0; i < KFREE_N_BATCHES; i++) { INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); krcp->krw_arr[i].krcp = krcp; From 4d2919411867848fab78c7cb13139e17ad8b85bc Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 25 May 2020 23:47:46 +0200 Subject: [PATCH 112/502] rcu/tree: Skip entry into the page allocator for PREEMPT_RT To keep the kfree_rcu() code working in purely atomic sections on RT, such as non-threaded IRQ handlers and raw spinlock sections, avoid calling into the page allocator which uses sleeping locks on RT. In fact, even if the caller is preemptible, the kfree_rcu() code is not, as the krcp->lock is a raw spinlock. Calling into the page allocator is optional and avoiding it should be Ok, especially with the page pre-allocation support in future patches. Such pre-allocation would further avoid the need for a dynamically allocated page in the first place. Cc: Sebastian Andrzej Siewior Reviewed-by: Uladzislau Rezki Co-developed-by: Uladzislau Rezki Signed-off-by: Uladzislau Rezki Signed-off-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E.
McKenney --- kernel/rcu/tree.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c5de5adca0dd..e0425faf3b3b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3202,6 +3202,18 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, if (!bnode) { WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE); + /* + * To keep this path working on raw non-preemptible + * sections, prevent the optional entry into the + * allocator as it uses sleeping locks. In fact, even + * if the caller of kfree_rcu() is preemptible, this + * path still is not, as krcp->lock is a raw spinlock. + * With additional page pre-allocation in the works, + * hitting this return is going to be much less likely. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return false; + bnode = (struct kfree_rcu_bulk_data *) __get_free_page(GFP_NOWAIT | __GFP_NOWARN); } From 594aa5975b9b5cfe9edaec06170e43b8c0607377 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:47 +0200 Subject: [PATCH 113/502] rcu/tree: Repeat the monitor if any free channel is busy It is possible that one of the channels cannot be detached because its free channel is busy and previously queued data has not been processed yet. On the other hand, another channel can be successfully detached causing the monitor work to stop. Prevent that by rescheduling the monitor work if there are any channels in the pending state after a detach attempt. Fixes: 34c881745549e ("rcu: Support kfree_bulk() interface in kfree_rcu()") Acked-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e0425faf3b3b..5151fe4e1429 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3105,7 +3105,7 @@ static void kfree_rcu_work(struct work_struct *work) static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) { struct kfree_rcu_cpu_work *krwp; - bool queued = false; + bool repeat = false; int i; lockdep_assert_held(&krcp->lock); @@ -3143,11 +3143,14 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) * been detached following each other, one by one. */ queue_rcu_work(system_wq, &krwp->rcu_work); - queued = true; } + + /* Repeat if any "free" corresponding channel is still busy. */ + if (krcp->bhead || krcp->head) + repeat = true; } - return queued; + return !repeat; } static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, From 446044eb9c9c335d3ae1be4665193ab43ebb284e Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 25 May 2020 23:47:48 +0200 Subject: [PATCH 114/502] rcu/tree: Make debug_objects logic independent of rcu_head kfree_rcu()'s debug_objects logic uses the address of the object's embedded rcu_head to queue/unqueue. Instead of this, make use of the object's address itself as preparation for future headless kfree_rcu() support. Reviewed-by: Uladzislau Rezki Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5151fe4e1429..143c1e9265b6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2970,13 +2970,11 @@ EXPORT_SYMBOL_GPL(call_rcu); * @nr_records: Number of active pointers in the array * @records: Array of the kfree_rcu() pointers * @next: Next bulk object in the block chain - * @head_free_debug: For debug, when CONFIG_DEBUG_OBJECTS_RCU_HEAD is set */ struct kfree_rcu_bulk_data { unsigned long nr_records; void *records[KFREE_BULK_MAX_ENTR]; struct kfree_rcu_bulk_data *next; - struct rcu_head *head_free_debug; }; /** @@ -3026,11 +3024,13 @@ struct kfree_rcu_cpu { static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc); static __always_inline void -debug_rcu_head_unqueue_bulk(struct rcu_head *head) +debug_rcu_bhead_unqueue(struct kfree_rcu_bulk_data *bhead) { #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD - for (; head; head = head->next) - debug_rcu_head_unqueue(head); + int i; + + for (i = 0; i < bhead->nr_records; i++) + debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i])); #endif } @@ -3060,7 +3060,7 @@ static void kfree_rcu_work(struct work_struct *work) for (; bhead; bhead = bnext) { bnext = bhead->next; - debug_rcu_head_unqueue_bulk(bhead->head_free_debug); + debug_rcu_bhead_unqueue(bhead); rcu_lock_acquire(&rcu_callback_map); trace_rcu_invoke_kfree_bulk_callback(rcu_state.name, @@ -3082,14 +3082,15 @@ static void kfree_rcu_work(struct work_struct *work) */ for (; head; head = next) { unsigned long offset = (unsigned long)head->func; + void *ptr = (void *)head - offset; next = head->next; - debug_rcu_head_unqueue(head); + debug_rcu_head_unqueue((struct rcu_head *)ptr); rcu_lock_acquire(&rcu_callback_map); trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset); if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) - kfree((void *)head - offset); + kfree(ptr); 
rcu_lock_release(&rcu_callback_map); cond_resched_tasks_rcu_qs(); @@ -3228,18 +3229,11 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, /* Initialize the new block. */ bnode->nr_records = 0; bnode->next = krcp->bhead; - bnode->head_free_debug = NULL; /* Attach it to the head. */ krcp->bhead = bnode; } -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD - head->func = func; - head->next = krcp->bhead->head_free_debug; - krcp->bhead->head_free_debug = head; -#endif - /* Finally insert. */ krcp->bhead->records[krcp->bhead->nr_records++] = (void *) head - (unsigned long) func; @@ -3263,14 +3257,17 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { unsigned long flags; struct kfree_rcu_cpu *krcp; + void *ptr; local_irq_save(flags); // For safely calling this_cpu_ptr(). krcp = this_cpu_ptr(&krc); if (krcp->initialized) raw_spin_lock(&krcp->lock); + ptr = (void *)head - (unsigned long)func; + // Queue the object but don't yet schedule the batch. - if (debug_rcu_head_queue(head)) { + if (debug_rcu_head_queue(ptr)) { // Probable double kfree_rcu(), just leak. WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n", __func__, head); From 3af84862817403d317dc33312e7a88d76e79401a Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:49 +0200 Subject: [PATCH 115/502] rcu/tree: Simplify KFREE_BULK_MAX_ENTR macro We can simplify KFREE_BULK_MAX_ENTR macro and get rid of magic numbers which were used to make the structure to be exactly one page. Suggested-by: Boqun Feng Reviewed-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 143c1e9265b6..bcdc06364426 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2958,13 +2958,6 @@ EXPORT_SYMBOL_GPL(call_rcu); #define KFREE_DRAIN_JIFFIES (HZ / 50) #define KFREE_N_BATCHES 2 -/* - * This macro defines how many entries the "records" array - * will contain. It is based on the fact that the size of - * kfree_rcu_bulk_data structure becomes exactly one page. - */ -#define KFREE_BULK_MAX_ENTR ((PAGE_SIZE / sizeof(void *)) - 3) - /** * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers * @nr_records: Number of active pointers in the array @@ -2973,10 +2966,18 @@ EXPORT_SYMBOL_GPL(call_rcu); */ struct kfree_rcu_bulk_data { unsigned long nr_records; - void *records[KFREE_BULK_MAX_ENTR]; struct kfree_rcu_bulk_data *next; + void *records[]; }; +/* + * This macro defines how many entries the "records" array + * will contain. It is based on the fact that the size of + * kfree_rcu_bulk_data structure becomes exactly one page. + */ +#define KFREE_BULK_MAX_ENTR \ + ((PAGE_SIZE - sizeof(struct kfree_rcu_bulk_data)) / sizeof(void *)) + /** * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period From 952371d6fc0bc360d1d5780f86bb355836117ca2 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:50 +0200 Subject: [PATCH 116/502] rcu/tree: Move kfree_rcu_cpu locking/unlocking to separate functions Introduce helpers to lock and unlock per-cpu "kfree_rcu_cpu" structures. That will make kfree_call_rcu() more readable and prevent programming errors. Reviewed-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bcdc06364426..368bdc441ffb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3035,6 +3035,27 @@ debug_rcu_bhead_unqueue(struct kfree_rcu_bulk_data *bhead) #endif } +static inline struct kfree_rcu_cpu * +krc_this_cpu_lock(unsigned long *flags) +{ + struct kfree_rcu_cpu *krcp; + + local_irq_save(*flags); // For safely calling this_cpu_ptr(). + krcp = this_cpu_ptr(&krc); + if (likely(krcp->initialized)) + raw_spin_lock(&krcp->lock); + + return krcp; +} + +static inline void +krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) +{ + if (likely(krcp->initialized)) + raw_spin_unlock(&krcp->lock); + local_irq_restore(flags); +} + /* * This function is invoked in workqueue context after a grace period. * It frees all the objects queued on ->bhead_free or ->head_free. @@ -3260,11 +3281,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) struct kfree_rcu_cpu *krcp; void *ptr; - local_irq_save(flags); // For safely calling this_cpu_ptr(). - krcp = this_cpu_ptr(&krc); - if (krcp->initialized) - raw_spin_lock(&krcp->lock); - + krcp = krc_this_cpu_lock(&flags); ptr = (void *)head - (unsigned long)func; // Queue the object but don't yet schedule the batch. @@ -3295,9 +3312,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) } unlock_return: - if (krcp->initialized) - raw_spin_unlock(&krcp->lock); - local_irq_restore(flags); + krc_this_cpu_unlock(krcp, flags); } EXPORT_SYMBOL_GPL(kfree_call_rcu); From 69f08d3999dbef1553a3332b8055282dd3893b6c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 25 May 2020 23:47:51 +0200 Subject: [PATCH 117/502] rcu/tree: Use static initializer for krc.lock The per-CPU variable is initialized at runtime in kfree_rcu_batch_init(). 
This function is invoked before 'rcu_scheduler_active' is set to 'RCU_SCHEDULER_RUNNING'. After the initialisation, '->initialized' is set to true. The raw_spin_lock is only acquired if '->initialized' is set to true. The workqueue item is only used if 'rcu_scheduler_active' is set to RCU_SCHEDULER_RUNNING which happens after initialisation. Use a static initializer for krc.lock and remove the runtime initialisation of the lock. Since the lock can now be always acquired, remove the '->initialized' check. Cc: Sebastian Andrzej Siewior Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 368bdc441ffb..a42a4693f161 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3002,7 +3002,7 @@ struct kfree_rcu_cpu_work { * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES * @monitor_todo: Tracks whether a @monitor_work delayed work is pending - * @initialized: The @lock and @rcu_work fields have been initialized + * @initialized: The @rcu_work fields have been initialized * @count: Number of objects for which GP not started * * This is a per-CPU structure. The reason that it is not included in @@ -3022,7 +3022,9 @@ struct kfree_rcu_cpu { int count; }; -static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc); +static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = { + .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock), +}; static __always_inline void debug_rcu_bhead_unqueue(struct kfree_rcu_bulk_data *bhead) @@ -3042,8 +3044,7 @@ krc_this_cpu_lock(unsigned long *flags) local_irq_save(*flags); // For safely calling this_cpu_ptr().
krcp = this_cpu_ptr(&krc); - if (likely(krcp->initialized)) - raw_spin_lock(&krcp->lock); + raw_spin_lock(&krcp->lock); return krcp; } @@ -3051,8 +3052,7 @@ krc_this_cpu_lock(unsigned long *flags) static inline void krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) { - if (likely(krcp->initialized)) - raw_spin_unlock(&krcp->lock); + raw_spin_unlock(&krcp->lock); local_irq_restore(flags); } @@ -4278,7 +4278,6 @@ static void __init kfree_rcu_batch_init(void) for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - raw_spin_lock_init(&krcp->lock); for (i = 0; i < KFREE_N_BATCHES; i++) { INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); krcp->krw_arr[i].krcp = krcp; From 53c72b590b3a0afd6747d6f7957e6838003e90a4 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:52 +0200 Subject: [PATCH 118/502] rcu/tree: cache specified number of objects In order to reduce the dynamic need for pages in kfree_rcu(), pre-allocate a configurable number of pages per CPU and link them in a list. When kfree_rcu() reclaims objects, the object's container page is cached into a list instead of being released to the low-level page allocator. Such an approach provides O(1) access to free pages while also reducing the number of requests to the page allocator. It also makes the kfree_rcu() code to have free pages available during a low memory condition. A read-only sysfs parameter (rcu_min_cached_objs) reflects the minimum number of allowed cached pages per CPU. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. 
McKenney --- .../admin-guide/kernel-parameters.txt | 8 +++ kernel/rcu/tree.c | 66 +++++++++++++++++-- 2 files changed, 70 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index fb95fad81c79..befaa63652ff 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4038,6 +4038,14 @@ latencies, which will choose a value aligned with the appropriate hardware boundaries. + rcutree.rcu_min_cached_objs= [KNL] + Minimum number of objects which are cached and + maintained per one CPU. Object size is equal + to PAGE_SIZE. The cache allows to reduce the + pressure to page allocator, also it makes the + whole algorithm to behave better in low memory + condition. + rcutree.jiffies_till_first_fqs= [KNL] Set delay from grace-period initialization to first attempt to force quiescent states. diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a42a4693f161..37c0cd0332f8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -175,6 +175,15 @@ module_param(gp_init_delay, int, 0444); static int gp_cleanup_delay; module_param(gp_cleanup_delay, int, 0444); +/* + * This rcu parameter is runtime-read-only. It reflects + * a minimum allowed number of objects which can be cached + * per-CPU. Object size is equal to one page. This value + * can be changed at boot time. 
+ */ +static int rcu_min_cached_objs = 2; +module_param(rcu_min_cached_objs, int, 0444); + /* Retrieve RCU kthreads priority for rcutorture */ int rcu_get_gp_kthreads_prio(void) { @@ -2997,7 +3006,6 @@ struct kfree_rcu_cpu_work { * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period * @head: List of kfree_rcu() objects not yet waiting for a grace period * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period - * @bcached: Keeps at most one object for later reuse when build chain blocks * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES @@ -3013,13 +3021,22 @@ struct kfree_rcu_cpu_work { struct kfree_rcu_cpu { struct rcu_head *head; struct kfree_rcu_bulk_data *bhead; - struct kfree_rcu_bulk_data *bcached; struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; raw_spinlock_t lock; struct delayed_work monitor_work; bool monitor_todo; bool initialized; int count; + + /* + * A simple cache list that contains objects for + * reuse purpose. In order to save some per-cpu + * space the list is singular. Even though it is + * lockless an access has to be protected by the + * per-cpu lock. + */ + struct llist_head bkvcache; + int nr_bkv_objs; }; static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = { @@ -3056,6 +3073,31 @@ krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) local_irq_restore(flags); } +static inline struct kfree_rcu_bulk_data * +get_cached_bnode(struct kfree_rcu_cpu *krcp) +{ + if (!krcp->nr_bkv_objs) + return NULL; + + krcp->nr_bkv_objs--; + return (struct kfree_rcu_bulk_data *) + llist_del_first(&krcp->bkvcache); +} + +static inline bool +put_cached_bnode(struct kfree_rcu_cpu *krcp, + struct kfree_rcu_bulk_data *bnode) +{ + // Check the limit. 
+ if (krcp->nr_bkv_objs >= rcu_min_cached_objs) + return false; + + llist_add((struct llist_node *) bnode, &krcp->bkvcache); + krcp->nr_bkv_objs++; + return true; + +} + /* * This function is invoked in workqueue context after a grace period. * It frees all the objects queued on ->bhead_free or ->head_free. @@ -3091,7 +3133,12 @@ static void kfree_rcu_work(struct work_struct *work) kfree_bulk(bhead->nr_records, bhead->records); rcu_lock_release(&rcu_callback_map); - if (cmpxchg(&krcp->bcached, NULL, bhead)) + krcp = krc_this_cpu_lock(&flags); + if (put_cached_bnode(krcp, bhead)) + bhead = NULL; + krc_this_cpu_unlock(krcp, flags); + + if (bhead) free_page((unsigned long) bhead); cond_resched_tasks_rcu_qs(); @@ -3224,7 +3271,7 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, /* Check if a new block is required. */ if (!krcp->bhead || krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) { - bnode = xchg(&krcp->bcached, NULL); + bnode = get_cached_bnode(krcp); if (!bnode) { WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE); @@ -4277,12 +4324,23 @@ static void __init kfree_rcu_batch_init(void) for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + struct kfree_rcu_bulk_data *bnode; for (i = 0; i < KFREE_N_BATCHES; i++) { INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); krcp->krw_arr[i].krcp = krcp; } + for (i = 0; i < rcu_min_cached_objs; i++) { + bnode = (struct kfree_rcu_bulk_data *) + __get_free_page(GFP_NOWAIT | __GFP_NOWARN); + + if (bnode) + put_cached_bnode(krcp, bnode); + else + pr_err("Failed to preallocate for %d CPU!\n", cpu); + } + INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); krcp->initialized = true; } From 5f3c8d620447d509e534962e23f7edfb85f4e533 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:53 +0200 Subject: [PATCH 119/502] rcu/tree: Maintain separate array for vmalloc ptrs To do so, we use an array of kvfree_rcu_bulk_data structures. 
It consists of two elements: - index number 0 corresponds to slab pointers. - index number 1 corresponds to vmalloc pointers. Keeping vmalloc pointers separated from slab pointers makes it possible to invoke the right freeing API for the right kind of pointer. It also prepares us for future headless support for vmalloc and SLAB objects. Such objects cannot be queued on a linked list and are instead placed directly into an array. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Joel Fernandes (Google) Reviewed-by: Joel Fernandes (Google) Co-developed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 167 +++++++++++++++++++++++++++------------------- 1 file changed, 97 insertions(+), 70 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 37c0cd0332f8..67c4b984c499 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -57,6 +57,8 @@ #include #include #include +#include +#include #include "../time/tick-internal.h" #include "tree.h" @@ -2966,46 +2968,47 @@ EXPORT_SYMBOL_GPL(call_rcu); /* Maximum number of jiffies to wait before draining a batch. */ #define KFREE_DRAIN_JIFFIES (HZ / 50) #define KFREE_N_BATCHES 2 +#define FREE_N_CHANNELS 2 /** - * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers + * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers * @nr_records: Number of active pointers in the array - * @records: Array of the kfree_rcu() pointers * @next: Next bulk object in the block chain + * @records: Array of the kvfree_rcu() pointers */ -struct kfree_rcu_bulk_data { +struct kvfree_rcu_bulk_data { unsigned long nr_records; - struct kfree_rcu_bulk_data *next; + struct kvfree_rcu_bulk_data *next; void *records[]; }; /* * This macro defines how many entries the "records" array * will contain. It is based on the fact that the size of - * kfree_rcu_bulk_data structure becomes exactly one page. + * kvfree_rcu_bulk_data structure becomes exactly one page.
*/ -#define KFREE_BULK_MAX_ENTR \ - ((PAGE_SIZE - sizeof(struct kfree_rcu_bulk_data)) / sizeof(void *)) +#define KVFREE_BULK_MAX_ENTR \ + ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *)) /** * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period * @head_free: List of kfree_rcu() objects waiting for a grace period - * @bhead_free: Bulk-List of kfree_rcu() objects waiting for a grace period + * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period * @krcp: Pointer to @kfree_rcu_cpu structure */ struct kfree_rcu_cpu_work { struct rcu_work rcu_work; struct rcu_head *head_free; - struct kfree_rcu_bulk_data *bhead_free; + struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS]; struct kfree_rcu_cpu *krcp; }; /** * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period * @head: List of kfree_rcu() objects not yet waiting for a grace period - * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period + * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES @@ -3020,7 +3023,7 @@ struct kfree_rcu_cpu_work { */ struct kfree_rcu_cpu { struct rcu_head *head; - struct kfree_rcu_bulk_data *bhead; + struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS]; struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; raw_spinlock_t lock; struct delayed_work monitor_work; @@ -3044,7 +3047,7 @@ static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = { }; static __always_inline void -debug_rcu_bhead_unqueue(struct kfree_rcu_bulk_data *bhead) +debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead) { #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD int i; @@ -3073,20 +3076,20 @@ krc_this_cpu_unlock(struct kfree_rcu_cpu 
*krcp, unsigned long flags) local_irq_restore(flags); } -static inline struct kfree_rcu_bulk_data * +static inline struct kvfree_rcu_bulk_data * get_cached_bnode(struct kfree_rcu_cpu *krcp) { if (!krcp->nr_bkv_objs) return NULL; krcp->nr_bkv_objs--; - return (struct kfree_rcu_bulk_data *) + return (struct kvfree_rcu_bulk_data *) llist_del_first(&krcp->bkvcache); } static inline bool put_cached_bnode(struct kfree_rcu_cpu *krcp, - struct kfree_rcu_bulk_data *bnode) + struct kvfree_rcu_bulk_data *bnode) { // Check the limit. if (krcp->nr_bkv_objs >= rcu_min_cached_objs) @@ -3105,43 +3108,63 @@ put_cached_bnode(struct kfree_rcu_cpu *krcp, static void kfree_rcu_work(struct work_struct *work) { unsigned long flags; + struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext; struct rcu_head *head, *next; - struct kfree_rcu_bulk_data *bhead, *bnext; struct kfree_rcu_cpu *krcp; struct kfree_rcu_cpu_work *krwp; + int i, j; krwp = container_of(to_rcu_work(work), struct kfree_rcu_cpu_work, rcu_work); krcp = krwp->krcp; + raw_spin_lock_irqsave(&krcp->lock, flags); + // Channels 1 and 2. + for (i = 0; i < FREE_N_CHANNELS; i++) { + bkvhead[i] = krwp->bkvhead_free[i]; + krwp->bkvhead_free[i] = NULL; + } + + // Channel 3. head = krwp->head_free; krwp->head_free = NULL; - bhead = krwp->bhead_free; - krwp->bhead_free = NULL; raw_spin_unlock_irqrestore(&krcp->lock, flags); - /* "bhead" is now private, so traverse locklessly. */ - for (; bhead; bhead = bnext) { - bnext = bhead->next; + // Handle two first channels. + for (i = 0; i < FREE_N_CHANNELS; i++) { + for (; bkvhead[i]; bkvhead[i] = bnext) { + bnext = bkvhead[i]->next; + debug_rcu_bhead_unqueue(bkvhead[i]); - debug_rcu_bhead_unqueue(bhead); + rcu_lock_acquire(&rcu_callback_map); + if (i == 0) { // kmalloc() / kfree(). 
+ trace_rcu_invoke_kfree_bulk_callback( + rcu_state.name, bkvhead[i]->nr_records, + bkvhead[i]->records); - rcu_lock_acquire(&rcu_callback_map); - trace_rcu_invoke_kfree_bulk_callback(rcu_state.name, - bhead->nr_records, bhead->records); + kfree_bulk(bkvhead[i]->nr_records, + bkvhead[i]->records); + } else { // vmalloc() / vfree(). + for (j = 0; j < bkvhead[i]->nr_records; j++) { + trace_rcu_invoke_kfree_callback( + rcu_state.name, + bkvhead[i]->records[j], 0); - kfree_bulk(bhead->nr_records, bhead->records); - rcu_lock_release(&rcu_callback_map); + vfree(bkvhead[i]->records[j]); + } + } + rcu_lock_release(&rcu_callback_map); - krcp = krc_this_cpu_lock(&flags); - if (put_cached_bnode(krcp, bhead)) - bhead = NULL; - krc_this_cpu_unlock(krcp, flags); + krcp = krc_this_cpu_lock(&flags); + if (put_cached_bnode(krcp, bkvhead[i])) + bkvhead[i] = NULL; + krc_this_cpu_unlock(krcp, flags); - if (bhead) - free_page((unsigned long) bhead); + if (bkvhead[i]) + free_page((unsigned long) bkvhead[i]); - cond_resched_tasks_rcu_qs(); + cond_resched_tasks_rcu_qs(); + } } /* @@ -3159,7 +3182,7 @@ static void kfree_rcu_work(struct work_struct *work) trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset); if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) - kfree(ptr); + kvfree(ptr); rcu_lock_release(&rcu_callback_map); cond_resched_tasks_rcu_qs(); @@ -3176,7 +3199,7 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) { struct kfree_rcu_cpu_work *krwp; bool repeat = false; - int i; + int i, j; lockdep_assert_held(&krcp->lock); @@ -3184,21 +3207,25 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) krwp = &(krcp->krw_arr[i]); /* - * Try to detach bhead or head and attach it over any + * Try to detach bkvhead or head and attach it over any * available corresponding free channel. It can be that * a previous RCU batch is in progress, it means that * immediately to queue another one is not possible so * return false to tell caller to retry. 
*/ - if ((krcp->bhead && !krwp->bhead_free) || + if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) || + (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) || (krcp->head && !krwp->head_free)) { - /* Channel 1. */ - if (!krwp->bhead_free) { - krwp->bhead_free = krcp->bhead; - krcp->bhead = NULL; + // Channel 1 corresponds to SLAB ptrs. + // Channel 2 corresponds to vmalloc ptrs. + for (j = 0; j < FREE_N_CHANNELS; j++) { + if (!krwp->bkvhead_free[j]) { + krwp->bkvhead_free[j] = krcp->bkvhead[j]; + krcp->bkvhead[j] = NULL; + } } - /* Channel 2. */ + // Channel 3 corresponds to emergency path. if (!krwp->head_free) { krwp->head_free = krcp->head; krcp->head = NULL; @@ -3207,16 +3234,17 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) WRITE_ONCE(krcp->count, 0); /* - * One work is per one batch, so there are two "free channels", - * "bhead_free" and "head_free" the batch can handle. It can be - * that the work is in the pending state when two channels have - * been detached following each other, one by one. + * One work is per one batch, so there are three + * "free channels", the batch can handle. It can + * be that the work is in the pending state when + * channels have been detached following by each + * other. */ queue_rcu_work(system_wq, &krwp->rcu_work); } - /* Repeat if any "free" corresponding channel is still busy. */ - if (krcp->bhead || krcp->head) + // Repeat if any "free" corresponding channel is still busy. 
+ if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head) repeat = true; } @@ -3258,23 +3286,22 @@ static void kfree_rcu_monitor(struct work_struct *work) } static inline bool -kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, - struct rcu_head *head, rcu_callback_t func) +kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr) { - struct kfree_rcu_bulk_data *bnode; + struct kvfree_rcu_bulk_data *bnode; + int idx; if (unlikely(!krcp->initialized)) return false; lockdep_assert_held(&krcp->lock); + idx = !!is_vmalloc_addr(ptr); /* Check if a new block is required. */ - if (!krcp->bhead || - krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) { + if (!krcp->bkvhead[idx] || + krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) { bnode = get_cached_bnode(krcp); if (!bnode) { - WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE); - /* * To keep this path working on raw non-preemptible * sections, prevent the optional entry into the @@ -3287,7 +3314,7 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, if (IS_ENABLED(CONFIG_PREEMPT_RT)) return false; - bnode = (struct kfree_rcu_bulk_data *) + bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_NOWAIT | __GFP_NOWARN); } @@ -3297,30 +3324,30 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, /* Initialize the new block. */ bnode->nr_records = 0; - bnode->next = krcp->bhead; + bnode->next = krcp->bkvhead[idx]; /* Attach it to the head. */ - krcp->bhead = bnode; + krcp->bkvhead[idx] = bnode; } /* Finally insert. */ - krcp->bhead->records[krcp->bhead->nr_records++] = - (void *) head - (unsigned long) func; + krcp->bkvhead[idx]->records + [krcp->bkvhead[idx]->nr_records++] = ptr; return true; } /* - * Queue a request for lazy invocation of kfree_bulk()/kfree() after a grace - * period. 
Please note there are two paths are maintained, one is the main one - * that uses kfree_bulk() interface and second one is emergency one, that is - * used only when the main path can not be maintained temporary, due to memory - * pressure. + * Queue a request for lazy invocation of appropriate free routine after a + * grace period. Please note there are three paths are maintained, two are the + * main ones that use array of pointers interface and third one is emergency + * one, that is used only when the main path can not be maintained temporary, + * due to memory pressure. * * Each kfree_call_rcu() request is added to a batch. The batch will be drained * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will * be free'd in workqueue context. This allows us to: batch requests together to - * reduce the number of grace periods during heavy kfree_rcu() load. + * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load. */ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { @@ -3343,7 +3370,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) * Under high memory pressure GFP_NOWAIT can fail, * in that case the emergency path is maintained. 
*/ - if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func))) { + if (unlikely(!kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr))) { head->func = func; head->next = krcp->head; krcp->head = head; @@ -4324,7 +4351,7 @@ static void __init kfree_rcu_batch_init(void) for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - struct kfree_rcu_bulk_data *bnode; + struct kvfree_rcu_bulk_data *bnode; for (i = 0; i < KFREE_N_BATCHES; i++) { INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); @@ -4332,7 +4359,7 @@ static void __init kfree_rcu_batch_init(void) } for (i = 0; i < rcu_min_cached_objs; i++) { - bnode = (struct kfree_rcu_bulk_data *) + bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_NOWAIT | __GFP_NOWARN); if (bnode) From 64d1d06ccb1b7de245ccf781b91517f328bebd9f Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:54 +0200 Subject: [PATCH 120/502] rcu/tiny: support vmalloc in tiny-RCU Replace kfree() with kvfree() in rcu_reclaim_tiny(). This makes it possible to release either SLAB or vmalloc objects after a GP. Reviewed-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tiny.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index dd572ce7c747..4b99f7b88bee 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "rcu.h" @@ -86,7 +87,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head) rcu_lock_acquire(&rcu_callback_map); if (__is_kfree_rcu_offset(offset)) { trace_rcu_invoke_kfree_callback("", head, offset); - kfree((void *)head - offset); + kvfree((void *)head - offset); rcu_lock_release(&rcu_callback_map); return true; } From c408b215f58f7156bb6bafb64c0263ee907033df Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:55 +0200 Subject: [PATCH 121/502] rcu: Rename *_kfree_callback/*_kfree_rcu_offset/kfree_call_* The following changes are introduced: 1. Rename rcu_invoke_kfree_callback() to rcu_invoke_kvfree_callback(), as well as the associated trace events, so the rcu_kfree_callback(), becomes rcu_kvfree_callback(). The reason is to be aligned with kvfree() notation. 2. Rename __is_kfree_rcu_offset to __is_kvfree_rcu_offset. All RCU paths use kvfree() now instead of kfree(), thus rename it. 3. Rename kfree_call_rcu() to the kvfree_call_rcu(). The reason is, it is capable of freeing vmalloc() memory now. Do the same with __kfree_rcu() macro, it becomes __kvfree_rcu(), the goal is the same. Reviewed-by: Joel Fernandes (Google) Co-developed-by: Joel Fernandes (Google) Signed-off-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. 
McKenney --- include/linux/rcupdate.h | 14 +++++++------- include/linux/rcutiny.h | 2 +- include/linux/rcutree.h | 2 +- include/trace/events/rcu.h | 8 ++++---- kernel/rcu/tiny.c | 4 ++-- kernel/rcu/tree.c | 16 ++++++++-------- 6 files changed, 23 insertions(+), 23 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 659cbfa7581a..b344fc800a9b 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -828,17 +828,17 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) /* * Does the specified offset indicate that the corresponding rcu_head - * structure can be handled by kfree_rcu()? + * structure can be handled by kvfree_rcu()? */ -#define __is_kfree_rcu_offset(offset) ((offset) < 4096) +#define __is_kvfree_rcu_offset(offset) ((offset) < 4096) /* * Helper macro for kfree_rcu() to prevent argument-expansion eyestrain. */ -#define __kfree_rcu(head, offset) \ +#define __kvfree_rcu(head, offset) \ do { \ - BUILD_BUG_ON(!__is_kfree_rcu_offset(offset)); \ - kfree_call_rcu(head, (rcu_callback_t)(unsigned long)(offset)); \ + BUILD_BUG_ON(!__is_kvfree_rcu_offset(offset)); \ + kvfree_call_rcu(head, (rcu_callback_t)(unsigned long)(offset)); \ } while (0) /** @@ -857,7 +857,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) * Because the functions are not allowed in the low-order 4096 bytes of * kernel virtual memory, offsets up to 4095 bytes can be accommodated. * If the offset is larger than 4095 bytes, a compile-time error will - * be generated in __kfree_rcu(). If this error is triggered, you can + * be generated in __kvfree_rcu(). If this error is triggered, you can * either fall back to use of call_rcu() or rearrange the structure to * position the rcu_head structure into the first 4096 bytes. 
* @@ -872,7 +872,7 @@ do { \ typeof (ptr) ___p = (ptr); \ \ if (___p) \ - __kfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \ + __kvfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \ } while (0) /* diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 8512caeb7682..fb2eb39c484f 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -34,7 +34,7 @@ static inline void synchronize_rcu_expedited(void) synchronize_rcu(); } -static inline void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { call_rcu(head, func); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index d5cc9d675987..d2f4064ebd1d 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -33,7 +33,7 @@ static inline void rcu_virt_note_context_switch(int cpu) } void synchronize_rcu_expedited(void); -void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func); +void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func); void rcu_barrier(void); bool rcu_eqs_special_set(int cpu); diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index f9a7811148e2..0ee93d0b1daa 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -506,13 +506,13 @@ TRACE_EVENT_RCU(rcu_callback, /* * Tracepoint for the registration of a single RCU callback of the special - * kfree() form. The first argument is the RCU type, the second argument + * kvfree() form. The first argument is the RCU type, the second argument * is a pointer to the RCU callback, the third argument is the offset * of the callback within the enclosing RCU-protected data structure, * the fourth argument is the number of lazy callbacks queued, and the * fifth argument is the total number of callbacks queued. 
*/ -TRACE_EVENT_RCU(rcu_kfree_callback, +TRACE_EVENT_RCU(rcu_kvfree_callback, TP_PROTO(const char *rcuname, struct rcu_head *rhp, unsigned long offset, long qlen), @@ -596,12 +596,12 @@ TRACE_EVENT_RCU(rcu_invoke_callback, /* * Tracepoint for the invocation of a single RCU callback of the special - * kfree() form. The first argument is the RCU flavor, the second + * kvfree() form. The first argument is the RCU flavor, the second * argument is a pointer to the RCU callback, and the third argument * is the offset of the callback within the enclosing RCU-protected * data structure. */ -TRACE_EVENT_RCU(rcu_invoke_kfree_callback, +TRACE_EVENT_RCU(rcu_invoke_kvfree_callback, TP_PROTO(const char *rcuname, struct rcu_head *rhp, unsigned long offset), diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 4b99f7b88bee..aa897c3f2e92 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -85,8 +85,8 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head) unsigned long offset = (unsigned long)head->func; rcu_lock_acquire(&rcu_callback_map); - if (__is_kfree_rcu_offset(offset)) { - trace_rcu_invoke_kfree_callback("", head, offset); + if (__is_kvfree_rcu_offset(offset)) { + trace_rcu_invoke_kvfree_callback("", head, offset); kvfree((void *)head - offset); rcu_lock_release(&rcu_callback_map); return true; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 67c4b984c499..f22c47e72287 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2905,8 +2905,8 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) return; // Enqueued onto ->nocb_bypass, so just leave. // If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock. 
rcu_segcblist_enqueue(&rdp->cblist, head); - if (__is_kfree_rcu_offset((unsigned long)func)) - trace_rcu_kfree_callback(rcu_state.name, head, + if (__is_kvfree_rcu_offset((unsigned long)func)) + trace_rcu_kvfree_callback(rcu_state.name, head, (unsigned long)func, rcu_segcblist_n_cbs(&rdp->cblist)); else @@ -3146,7 +3146,7 @@ static void kfree_rcu_work(struct work_struct *work) bkvhead[i]->records); } else { // vmalloc() / vfree(). for (j = 0; j < bkvhead[i]->nr_records; j++) { - trace_rcu_invoke_kfree_callback( + trace_rcu_invoke_kvfree_callback( rcu_state.name, bkvhead[i]->records[j], 0); @@ -3179,9 +3179,9 @@ static void kfree_rcu_work(struct work_struct *work) next = head->next; debug_rcu_head_unqueue((struct rcu_head *)ptr); rcu_lock_acquire(&rcu_callback_map); - trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset); + trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset); - if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) + if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) kvfree(ptr); rcu_lock_release(&rcu_callback_map); @@ -3344,12 +3344,12 @@ kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr) * one, that is used only when the main path can not be maintained temporary, * due to memory pressure. * - * Each kfree_call_rcu() request is added to a batch. The batch will be drained + * Each kvfree_call_rcu() request is added to a batch. The batch will be drained * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will * be free'd in workqueue context. This allows us to: batch requests together to * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load. 
*/ -void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { unsigned long flags; struct kfree_rcu_cpu *krcp; @@ -3388,7 +3388,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) unlock_return: krc_this_cpu_unlock(krcp, flags); } -EXPORT_SYMBOL_GPL(kfree_call_rcu); +EXPORT_SYMBOL_GPL(kvfree_call_rcu); static unsigned long kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) From e0feed08ab41df0fedc38d35938891ef5715c1d3 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:56 +0200 Subject: [PATCH 122/502] mm/list_lru.c: Rename kvfree_rcu() to local variant Rename kvfree_rcu() function to the kvfree_rcu_local() one. The purpose is to prevent a conflict of two identical function declarations. The kvfree_rcu() will be globally visible which would lead to a build error. No functional change. Cc: linux-mm@kvack.org Cc: rcu@vger.kernel.org Cc: Andrew Morton Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Joel Fernandes (Google) Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- mm/list_lru.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index 9222910ab1cb..e825804b3928 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -373,14 +373,14 @@ static void memcg_destroy_list_lru_node(struct list_lru_node *nlru) struct list_lru_memcg *memcg_lrus; /* * This is called when shrinker has already been unregistered, - * and nobody can use it. So, there is no need to use kvfree_rcu(). + * and nobody can use it. So, there is no need to use kvfree_rcu_local().
*/ memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, true); __memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids); kvfree(memcg_lrus); } -static void kvfree_rcu(struct rcu_head *head) +static void kvfree_rcu_local(struct rcu_head *head) { struct list_lru_memcg *mlru; @@ -419,7 +419,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, rcu_assign_pointer(nlru->memcg_lrus, new); spin_unlock_irq(&nlru->lock); - call_rcu(&old->rcu, kvfree_rcu); + call_rcu(&old->rcu, kvfree_rcu_local); return 0; } From ce4dce123fdcb5f209752d13f9f06926be65fc78 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:57 +0200 Subject: [PATCH 123/502] rcu: Introduce 2 arg kvfree_rcu() interface kvmalloc() can allocate two types of objects: SLAB backed and vmalloc backed. How it behaves depends on requested object's size and memory pressure. Add a kvfree_rcu() interface that can free memory allocated via kvmalloc(). It is a simple alias to kfree_rcu() which can now handle either type of object. struct test_kvfree_rcu { struct rcu_head rcu; unsigned char array[100]; }; struct test_kvfree_rcu *p; p = kvmalloc(10 * PAGE_SIZE); if (p) kvfree_rcu(p, rcu); Signed-off-by: Uladzislau Rezki (Sony) Co-developed-by: Joel Fernandes (Google) Reviewed-by: Joel Fernandes (Google) Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index b344fc800a9b..51b26ab02878 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -875,6 +875,15 @@ do { \ __kvfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \ } while (0) +/** + * kvfree_rcu() - kvfree an object after a grace period. + * @ptr: pointer to kvfree + * @rhf: the name of the struct rcu_head within the type of @ptr. + * + * Same as kfree_rcu(), just simple alias. 
+ */ +#define kvfree_rcu(ptr, rhf) kfree_rcu(ptr, rhf) + /* * Place this after a lock-acquisition primitive to guarantee that * an UNLOCK+LOCK pair acts as a full barrier. This guarantee applies From 3042f83f19bec2e0cd356f72b39e4d816e8cd5ff Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:58 +0200 Subject: [PATCH 124/502] rcu: Support reclaim for head-less object Update the kvfree_call_rcu() function with head-less support. This allows RCU to reclaim objects without an embedded rcu_head. tree-RCU: We introduce two chains of arrays to store SLAB-backed and vmalloc pointers, each. Storage in either of these arrays does not require embedding an rcu_head within the object. Maintaining the arrays may become impossible due to high memory pressure. For such cases there is an emergency path. Objects with rcu_head inside are just queued on a backup rcu_head list. Later on that list is drained. As for the head-less variant, as the current context can sleep, the following emergency measures are applied: a) Synchronously wait until a grace period has elapsed. b) Call kvfree(). tiny-RCU: For double argument calls, there are no new changes in behavior. For single argument call, kvfree() is directly inlined on the current stack after a synchronize_rcu() call. Note that for tiny-RCU, any call to synchronize_rcu() is actually a quiescent state, therefore it does nothing. Reviewed-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Joel Fernandes (Google) Co-developed-by: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- include/linux/rcutiny.h | 18 ++++++++++++++++- kernel/rcu/tree.c | 45 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 60 insertions(+), 3 deletions(-) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index fb2eb39c484f..5cc9637cac16 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -34,9 +34,25 @@ static inline void synchronize_rcu_expedited(void) synchronize_rcu(); } +/* + * Add one more declaration of kvfree() here. It is + * not so straight forward to just include <linux/mm.h> + * where it is defined due to getting many compile + * errors caused by that include. + */ +extern void kvfree(const void *addr); + static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { - call_rcu(head, func); + if (head) { + call_rcu(head, func); + return; + } + + // kvfree_rcu(one_arg) call. + might_sleep(); + synchronize_rcu(); + kvfree((void *) func); } void rcu_qs(void); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f22c47e72287..01f29e4500ba 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3314,6 +3314,13 @@ kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr) if (IS_ENABLED(CONFIG_PREEMPT_RT)) return false; + /* + * NOTE: For one argument of kvfree_rcu() we can + * drop the lock and get the page in sleepable + * context. That would allow to maintain an array + * for the CONFIG_PREEMPT_RT as well if no cached + * pages are available. + */ bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_NOWAIT | __GFP_NOWARN); } @@ -3353,16 +3360,33 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { unsigned long flags; struct kfree_rcu_cpu *krcp; + bool success; void *ptr; + if (head) { + ptr = (void *) head - (unsigned long) func; + } else { + /* + * Please note there is a limitation for the head-less + * variant, that is why there is a clear rule for such + * objects: it can be used from might_sleep() context + * only.
For other places please embed an rcu_head to + * your data. + */ + might_sleep(); + ptr = (unsigned long *) func; + } + krcp = krc_this_cpu_lock(&flags); - ptr = (void *)head - (unsigned long)func; // Queue the object but don't yet schedule the batch. if (debug_rcu_head_queue(ptr)) { // Probable double kfree_rcu(), just leak. WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n", __func__, head); + + // Mark as success and leave. + success = true; goto unlock_return; } @@ -3370,10 +3394,16 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) * Under high memory pressure GFP_NOWAIT can fail, * in that case the emergency path is maintained. */ - if (unlikely(!kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr))) { + success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr); + if (!success) { + if (head == NULL) + // Inline if kvfree_rcu(one_arg) call. + goto unlock_return; + head->func = func; head->next = krcp->head; krcp->head = head; + success = true; } WRITE_ONCE(krcp->count, krcp->count + 1); @@ -3387,6 +3417,17 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) unlock_return: krc_this_cpu_unlock(krcp, flags); + + /* + * Inline kvfree() after synchronize_rcu(). We can do + * it from might_sleep() context only, so the current + * CPU can pass the QS state. + */ + if (!success) { + debug_rcu_head_unqueue((struct rcu_head *) ptr); + synchronize_rcu(); + kvfree(ptr); + } } EXPORT_SYMBOL_GPL(kvfree_call_rcu); From 1835f475e3518ade61e25a57572c78b953778656 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:59 +0200 Subject: [PATCH 125/502] rcu: Introduce single argument kvfree_rcu() interface Make kvfree_rcu() capable of freeing objects that will not embed an rcu_head within it. This saves storage overhead in such objects. Reclaiming headless objects this way requires only a single argument (pointer to the object). 
After this patch, there are two ways to use kvfree_rcu(): a) kvfree_rcu(ptr, rhf); struct X { struct rcu_head rhf; unsigned char data[100]; }; void *ptr = kvmalloc(sizeof(struct X), GFP_KERNEL); if (ptr) kvfree_rcu(ptr, rhf); b) kvfree_rcu(ptr); void *ptr = kvmalloc(some_bytes, GFP_KERNEL); if (ptr) kvfree_rcu(ptr); Note that the headless usage (example b) can only be used in a code that can sleep. This is enforced by the CONFIG_DEBUG_ATOMIC_SLEEP option. Co-developed-by: Joel Fernandes (Google) Reviewed-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 51b26ab02878..d15d46db61f7 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -877,12 +877,42 @@ do { \ /** * kvfree_rcu() - kvfree an object after a grace period. - * @ptr: pointer to kvfree - * @rhf: the name of the struct rcu_head within the type of @ptr. * - * Same as kfree_rcu(), just simple alias. + * This macro consists of one or two arguments and it is + * based on whether an object is head-less or not. If it + * has a head then a semantic stays the same as it used + * to be before: + * + * kvfree_rcu(ptr, rhf); + * + * where @ptr is a pointer to kvfree(), @rhf is the name + * of the rcu_head structure within the type of @ptr. + * + * When it comes to head-less variant, only one argument + * is passed and that is just a pointer which has to be + * freed after a grace period. Therefore the semantic is + * + * kvfree_rcu(ptr); + * + * where @ptr is a pointer to kvfree(). + * + * Please note, head-less way of freeing is permitted to + * use from a context that has to follow might_sleep() + * annotation. Otherwise, please switch and embed the + * rcu_head structure within the type of @ptr. 
*/ -#define kvfree_rcu(ptr, rhf) kfree_rcu(ptr, rhf) +#define kvfree_rcu(...) KVFREE_GET_MACRO(__VA_ARGS__, \ + kvfree_rcu_arg_2, kvfree_rcu_arg_1)(__VA_ARGS__) + +#define KVFREE_GET_MACRO(_1, _2, NAME, ...) NAME +#define kvfree_rcu_arg_2(ptr, rhf) kfree_rcu(ptr, rhf) +#define kvfree_rcu_arg_1(ptr) \ +do { \ + typeof(ptr) ___p = (ptr); \ + \ + if (___p) \ + kvfree_call_rcu(NULL, (rcu_callback_t) (___p)); \ +} while (0) /* * Place this after a lock-acquisition primitive to guarantee that From da4fc00abb97ce1269b0940abe86e25456e28424 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:48:00 +0200 Subject: [PATCH 126/502] lib/test_vmalloc.c: Add test cases for kvfree_rcu() Introduce four new test cases for testing the kvfree_rcu() interface. Two of them belong to single argument functionality and another two for 2-argument functionality. The aim is to stress and check how kvfree_rcu() behaves under different load and memory conditions and analyze its performance throughput. Reviewed-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. 
McKenney --- lib/test_vmalloc.c | 103 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 95 insertions(+), 8 deletions(-) diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index ddc9685702b1..5cf2fe9aab9e 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -15,6 +15,8 @@ #include #include #include +#include +#include #define __param(type, name, init, msg) \ static type name = init; \ @@ -35,14 +37,18 @@ __param(int, test_loop_count, 1000000, __param(int, run_test_mask, INT_MAX, "Set tests specified in the mask.\n\n" - "\t\tid: 1, name: fix_size_alloc_test\n" - "\t\tid: 2, name: full_fit_alloc_test\n" - "\t\tid: 4, name: long_busy_list_alloc_test\n" - "\t\tid: 8, name: random_size_alloc_test\n" - "\t\tid: 16, name: fix_align_alloc_test\n" - "\t\tid: 32, name: random_size_align_alloc_test\n" - "\t\tid: 64, name: align_shift_alloc_test\n" - "\t\tid: 128, name: pcpu_alloc_test\n" + "\t\tid: 1, name: fix_size_alloc_test\n" + "\t\tid: 2, name: full_fit_alloc_test\n" + "\t\tid: 4, name: long_busy_list_alloc_test\n" + "\t\tid: 8, name: random_size_alloc_test\n" + "\t\tid: 16, name: fix_align_alloc_test\n" + "\t\tid: 32, name: random_size_align_alloc_test\n" + "\t\tid: 64, name: align_shift_alloc_test\n" + "\t\tid: 128, name: pcpu_alloc_test\n" + "\t\tid: 256, name: kvfree_rcu_1_arg_vmalloc_test\n" + "\t\tid: 512, name: kvfree_rcu_2_arg_vmalloc_test\n" + "\t\tid: 1024, name: kvfree_rcu_1_arg_slab_test\n" + "\t\tid: 2048, name: kvfree_rcu_2_arg_slab_test\n" /* Add a new test case description here. 
*/ ); @@ -316,6 +322,83 @@ pcpu_alloc_test(void) return rv; } +struct test_kvfree_rcu { + struct rcu_head rcu; + unsigned char array[20]; +}; + +static int +kvfree_rcu_1_arg_vmalloc_test(void) +{ + struct test_kvfree_rcu *p; + int i; + + for (i = 0; i < test_loop_count; i++) { + p = vmalloc(1 * PAGE_SIZE); + if (!p) + return -1; + + p->array[0] = 'a'; + kvfree_rcu(p); + } + + return 0; +} + +static int +kvfree_rcu_2_arg_vmalloc_test(void) +{ + struct test_kvfree_rcu *p; + int i; + + for (i = 0; i < test_loop_count; i++) { + p = vmalloc(1 * PAGE_SIZE); + if (!p) + return -1; + + p->array[0] = 'a'; + kvfree_rcu(p, rcu); + } + + return 0; +} + +static int +kvfree_rcu_1_arg_slab_test(void) +{ + struct test_kvfree_rcu *p; + int i; + + for (i = 0; i < test_loop_count; i++) { + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -1; + + p->array[0] = 'a'; + kvfree_rcu(p); + } + + return 0; +} + +static int +kvfree_rcu_2_arg_slab_test(void) +{ + struct test_kvfree_rcu *p; + int i; + + for (i = 0; i < test_loop_count; i++) { + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -1; + + p->array[0] = 'a'; + kvfree_rcu(p, rcu); + } + + return 0; +} + struct test_case_desc { const char *test_name; int (*test_func)(void); @@ -330,6 +413,10 @@ static struct test_case_desc test_case_array[] = { { "random_size_align_alloc_test", random_size_align_alloc_test }, { "align_shift_alloc_test", align_shift_alloc_test }, { "pcpu_alloc_test", pcpu_alloc_test }, + { "kvfree_rcu_1_arg_vmalloc_test", kvfree_rcu_1_arg_vmalloc_test }, + { "kvfree_rcu_2_arg_vmalloc_test", kvfree_rcu_2_arg_vmalloc_test }, + { "kvfree_rcu_1_arg_slab_test", kvfree_rcu_1_arg_slab_test }, + { "kvfree_rcu_2_arg_slab_test", kvfree_rcu_2_arg_slab_test }, /* Add a new test case here. */ }; From ea6eed9f7d7382c7230202d4c3bf74185f193394 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 7 May 2020 16:47:13 -0700 Subject: [PATCH 127/502] rcu-tasks: Convert sleeps to idle priority This commit converts the long-standing schedule_timeout_interruptible() and schedule_timeout_uninterruptible() calls used by the various Tasks RCU's grace-period kthreads to schedule_timeout_idle(). This conversion avoids polluting the load-average with Tasks-RCU-related sleeping. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index ce23f6cc5043..91fee8122acd 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -205,7 +205,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) if (!rtp->cbs_head) { WARN_ON(signal_pending(current)); set_tasks_gp_state(rtp, RTGS_WAIT_WAIT_CBS); - schedule_timeout_interruptible(HZ/10); + schedule_timeout_idle(HZ/10); } continue; } @@ -227,7 +227,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) cond_resched(); } /* Paranoid sleep to keep this from entering a tight loop */ - schedule_timeout_uninterruptible(HZ/10); + schedule_timeout_idle(HZ/10); set_tasks_gp_state(rtp, RTGS_WAIT_CBS); } @@ -336,7 +336,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) /* Slowly back off waiting for holdouts */ set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS); - schedule_timeout_interruptible(HZ/fract); + schedule_timeout_idle(HZ/fract); if (fract > 1) fract--; From 04a3c5aa7a8cb2ce97f9beb627ba742bc8b0fe03 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 May 2020 19:27:06 -0700 Subject: [PATCH 128/502] rcu-tasks: Make rcu_tasks_postscan() be static The rcu_tasks_postscan() function is not used outside of RCU's tasks.h file, so this commit makes it be static. Reported-by: kbuild test robot Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tasks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 91fee8122acd..da200e53d60d 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -402,7 +402,7 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop) } /* Processing between scanning taskslist and draining the holdout list. */ -void rcu_tasks_postscan(struct list_head *hop) +static void rcu_tasks_postscan(struct list_head *hop) { /* * Wait for tasks that are in the process of exiting. This From 5b3cc99bedf5885055fbaf35fe63d205f06b5be5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 May 2020 19:33:47 -0700 Subject: [PATCH 129/502] rcu-tasks: Add #include of rcupdate_trace.h to update.c Although this is in some strict sense unnecessary, it is good to allow the compiler to compare the function declaration with its definition. This commit therefore adds a #include of linux/rcupdate_trace.h to kernel/rcu/update.c. Reported-by: kbuild test robot Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 84843adfd939..c0fea809d738 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -42,6 +42,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS From 8344496e8b49c4122c1808d6cd3f8dc71bccb595 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 May 2020 20:03:48 -0700 Subject: [PATCH 130/502] rcu-tasks: Conditionally compile show_rcu_tasks_gp_kthreads() The show_rcu_tasks_gp_kthreads() function is not invoked by Tiny RCU, but is nevertheless defined in Tiny RCU builds that enable Tasks Trace RCU. This commit therefore conditionally compiles this function so that it is defined only in builds that actually use it. Reported-by: kbuild test robot Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tasks.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index da200e53d60d..d5c003c1972c 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -103,6 +103,7 @@ module_param(rcu_task_stall_timeout, int, 0644); #define RTGS_WAIT_READERS 9 #define RTGS_INVOKE_CBS 10 #define RTGS_WAIT_CBS 11 +#ifndef CONFIG_TINY_RCU static const char * const rcu_tasks_gp_state_names[] = { "RTGS_INIT", "RTGS_WAIT_WAIT_CBS", @@ -117,6 +118,7 @@ static const char * const rcu_tasks_gp_state_names[] = { "RTGS_INVOKE_CBS", "RTGS_WAIT_CBS", }; +#endif /* #ifndef CONFIG_TINY_RCU */ //////////////////////////////////////////////////////////////////////// // @@ -129,6 +131,7 @@ static void set_tasks_gp_state(struct rcu_tasks *rtp, int newstate) rtp->gp_jiffies = jiffies; } +#ifndef CONFIG_TINY_RCU /* Return state name. */ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp) { @@ -139,6 +142,7 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp) return "???"; return rcu_tasks_gp_state_names[j]; } +#endif /* #ifndef CONFIG_TINY_RCU */ // Enqueue a callback for the specified flavor of Tasks RCU. static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, @@ -268,6 +272,7 @@ static void __init rcu_tasks_bootup_oddness(void) #endif /* #ifndef CONFIG_TINY_RCU */ +#ifndef CONFIG_TINY_RCU /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. 
*/ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { @@ -281,6 +286,7 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) ".C"[!!data_race(rtp->cbs_head)], s); } +#endif /* #ifndef CONFIG_TINY_RCU */ static void exit_tasks_rcu_finish_trace(struct task_struct *t); @@ -557,10 +563,12 @@ static int __init rcu_spawn_tasks_kthread(void) } core_initcall(rcu_spawn_tasks_kthread); +#ifndef CONFIG_TINY_RCU static void show_rcu_tasks_classic_gp_kthread(void) { show_rcu_tasks_generic_gp_kthread(&rcu_tasks, ""); } +#endif /* #ifndef CONFIG_TINY_RCU */ /* Do the srcu_read_lock() for the above synchronize_srcu(). */ void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) @@ -682,10 +690,12 @@ static int __init rcu_spawn_tasks_rude_kthread(void) } core_initcall(rcu_spawn_tasks_rude_kthread); +#ifndef CONFIG_TINY_RCU static void show_rcu_tasks_rude_gp_kthread(void) { show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, ""); } +#endif /* #ifndef CONFIG_TINY_RCU */ #else /* #ifdef CONFIG_TASKS_RUDE_RCU */ static void show_rcu_tasks_rude_gp_kthread(void) {} @@ -1164,6 +1174,7 @@ static int __init rcu_spawn_tasks_trace_kthread(void) } core_initcall(rcu_spawn_tasks_trace_kthread); +#ifndef CONFIG_TINY_RCU static void show_rcu_tasks_trace_gp_kthread(void) { char buf[64]; @@ -1174,18 +1185,21 @@ static void show_rcu_tasks_trace_gp_kthread(void) data_race(n_heavy_reader_attempts)); show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf); } +#endif /* #ifndef CONFIG_TINY_RCU */ #else /* #ifdef CONFIG_TASKS_TRACE_RCU */ static void exit_tasks_rcu_finish_trace(struct task_struct *t) { } static inline void show_rcu_tasks_trace_gp_kthread(void) {} #endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ +#ifndef CONFIG_TINY_RCU void show_rcu_tasks_gp_kthreads(void) { show_rcu_tasks_classic_gp_kthread(); show_rcu_tasks_rude_gp_kthread(); show_rcu_tasks_trace_gp_kthread(); } +#endif /* #ifndef CONFIG_TINY_RCU */ #else /* #ifdef 
CONFIG_TASKS_RCU_GENERIC */ static inline void rcu_tasks_bootup_oddness(void) {} From 30d8aa5128f12c9d781b67c9694c1abfa4f6ce6a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 9 Jun 2020 09:24:51 -0700 Subject: [PATCH 131/502] rcu-tasks: Fix code-style issues This commit declares trc_n_readers_need_end and trc_wait static and replaces a "&" with "&&". The "&" happened to work because the values are bool, but accidents waiting to happen and all that... Reported-by: kbuild test robot Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d5c003c1972c..828f222895f1 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -737,8 +737,8 @@ EXPORT_SYMBOL_GPL(rcu_trace_lock_map); #ifdef CONFIG_TASKS_TRACE_RCU -atomic_t trc_n_readers_need_end; // Number of waited-for readers. -DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks. +static atomic_t trc_n_readers_need_end; // Number of waited-for readers. +static DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks. // Record outstanding IPIs to each CPU. No point in sending two... static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); @@ -845,7 +845,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) bool ofl = cpu_is_offline(cpu); if (task_curr(t)) { - WARN_ON_ONCE(ofl & !is_idle_task(t)); + WARN_ON_ONCE(ofl && !is_idle_task(t)); // If no chance of heavyweight readers, do it the hard way. if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) From 7e866460cc18797b3a59360f5f8c444598a21729 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 25 May 2020 00:36:47 -0400 Subject: [PATCH 132/502] rcuperf: Remove useless while loops around wait_event wait_event() already retries if the condition for the wake up is not satisfied after wake up. Remove them from the rcuperf test. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcuperf.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 16dd1e6b7c09..246da8fe199e 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -576,11 +576,8 @@ static int compute_real(int n) static int rcu_perf_shutdown(void *arg) { - do { - wait_event(shutdown_wq, - atomic_read(&n_rcu_perf_writer_finished) >= - nrealwriters); - } while (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters); + wait_event(shutdown_wq, + atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters); smp_mb(); /* Wake before output. */ rcu_perf_cleanup(); kernel_power_off(); @@ -693,11 +690,8 @@ kfree_perf_cleanup(void) static int kfree_perf_shutdown(void *arg) { - do { - wait_event(shutdown_wq, - atomic_read(&n_kfree_perf_thread_ended) >= - kfree_nrealthreads); - } while (atomic_read(&n_kfree_perf_thread_ended) < kfree_nrealthreads); + wait_event(shutdown_wq, + atomic_read(&n_kfree_perf_thread_ended) >= kfree_nrealthreads); smp_mb(); /* Wake before output. */ From 653ed64b01dc5989f8f579d0038e987476c2c023 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 25 May 2020 00:36:48 -0400 Subject: [PATCH 133/502] refperf: Add a test to measure performance of read-side synchronization Add a test for comparing the performance of RCU with various read-side synchronization mechanisms. The test has proved useful for collecting data and performing these comparisons. Currently RCU, SRCU, reader-writer lock, reader-writer semaphore and reference counting can be measured using refperf.perf_type parameter. Each invocation of the test runs measures performance of a specific mechanism. The maximum number of CPUs to concurrently run readers on is chosen by the test itself and is 75% of the total number of CPUs. So if you had 24 CPUs, the test runs with a maximum of 18 parallel readers. 
A number of experiments are conducted, and in each experiment, the number of readers is increased by 1, upto the 75% of CPUs mark. During each experiment, all readers execute an empty loop with refperf.loops iterations and time the total loop duration. This is then averaged. Example output: Parameters "refperf.perf_type=srcu refperf.loops=2000000" looks like: [ 3.347133] srcu-ref-perf: [ 3.347133] Threads Time(ns) [ 3.347133] 1 36 [ 3.347133] 2 34 [ 3.347133] 3 34 [ 3.347133] 4 34 [ 3.347133] 5 33 [ 3.347133] 6 33 [ 3.347133] 7 33 [ 3.347133] 8 33 [ 3.347133] 9 33 [ 3.347133] 10 33 [ 3.347133] 11 33 [ 3.347133] 12 33 [ 3.347133] 13 33 [ 3.347133] 14 33 [ 3.347133] 15 32 [ 3.347133] 16 33 [ 3.347133] 17 33 [ 3.347133] 18 34 Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig.debug | 19 ++ kernel/rcu/Makefile | 1 + kernel/rcu/refperf.c | 558 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 578 insertions(+) create mode 100644 kernel/rcu/refperf.c diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 452feae8de20..858765b7f644 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -61,6 +61,25 @@ config RCU_TORTURE_TEST Say M if you want the RCU torture tests to build as a module. Say N if you are unsure. +config RCU_REF_PERF_TEST + tristate "Performance tests for read-side synchronization (RCU and others)" + depends on DEBUG_KERNEL + select TORTURE_TEST + select SRCU + select TASKS_RCU + select TASKS_RUDE_RCU + select TASKS_TRACE_RCU + default n + help + This option provides a kernel module that runs performance tests + useful comparing RCU with various read-side synchronization mechanisms. + The kernel module may be built after the fact on the running kernel to be + tested, if desired. + + Say Y here if you want these performance tests built into the kernel. + Say M if you want to build it as a module instead. + Say N if you are unsure. 
+ config RCU_CPU_STALL_TIMEOUT int "RCU CPU stall timeout in seconds" depends on RCU_STALL_COMMON diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index f91f2c2cf138..ba7d82609cbe 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o +obj-$(CONFIG_RCU_REF_PERF_TEST) += refperf.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_TINY_RCU) += tiny.o obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c new file mode 100644 index 000000000000..61161530acc8 --- /dev/null +++ b/kernel/rcu/refperf.c @@ -0,0 +1,558 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// Performance test comparing RCU vs other mechanisms +// for acquiring references on objects. +// +// Copyright (C) Google, 2020. +// +// Author: Joel Fernandes + +#define pr_fmt(fmt) fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rcu.h" + +#define PERF_FLAG "-ref-perf: " + +#define PERFOUT(s, x...) \ + pr_alert("%s" PERF_FLAG s, perf_type, ## x) + +#define VERBOSE_PERFOUT(s, x...) \ + do { if (verbose) pr_alert("%s" PERF_FLAG s, perf_type, ## x); } while (0) + +#define VERBOSE_PERFOUT_ERRSTRING(s, x...) \ + do { if (verbose) pr_alert("%s" PERF_FLAG "!!! 
" s, perf_type, ## x); } while (0) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Joel Fernandes (Google) "); + +static char *perf_type = "rcu"; +module_param(perf_type, charp, 0444); +MODULE_PARM_DESC(perf_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock."); + +torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); + +// Number of loops per experiment, all readers execute an operation concurrently +torture_param(long, loops, 10000000, "Number of loops per experiment."); + +#ifdef MODULE +# define REFPERF_SHUTDOWN 0 +#else +# define REFPERF_SHUTDOWN 1 +#endif + +torture_param(bool, shutdown, REFPERF_SHUTDOWN, + "Shutdown at end of performance tests."); + +struct reader_task { + struct task_struct *task; + atomic_t start; + wait_queue_head_t wq; + u64 last_duration_ns; + + // The average latency When 1.. are concurrently + // running an experiment. For example, if this reader_task is + // of index 5 in the reader_tasks array, then result is for + // 6 cores. + u64 result_avg; +}; + +static struct task_struct *shutdown_task; +static wait_queue_head_t shutdown_wq; + +static struct task_struct *main_task; +static wait_queue_head_t main_wq; +static int shutdown_start; + +static struct reader_task *reader_tasks; +static int nreaders; + +// Number of readers that are part of the current experiment. +static atomic_t nreaders_exp; + +// Use to wait for all threads to start. +static atomic_t n_init; + +// Track which experiment is currently running. +static int exp_idx; + +// Operations vector for selecting different types of tests. +struct ref_perf_ops { + void (*init)(void); + void (*cleanup)(void); + int (*readlock)(void); + void (*readunlock)(int idx); + const char *name; +}; + +static struct ref_perf_ops *cur_ops; + +// Definitions for RCU ref perf testing. 
+static int ref_rcu_read_lock(void) __acquires(RCU) +{ + rcu_read_lock(); + return 0; +} + +static void ref_rcu_read_unlock(int idx) __releases(RCU) +{ + rcu_read_unlock(); +} + +static void rcu_sync_perf_init(void) +{ +} + +static struct ref_perf_ops rcu_ops = { + .init = rcu_sync_perf_init, + .readlock = ref_rcu_read_lock, + .readunlock = ref_rcu_read_unlock, + .name = "rcu" +}; + + +// Definitions for SRCU ref perf testing. +DEFINE_STATIC_SRCU(srcu_refctl_perf); +static struct srcu_struct *srcu_ctlp = &srcu_refctl_perf; + +static int srcu_ref_perf_read_lock(void) __acquires(srcu_ctlp) +{ + return srcu_read_lock(srcu_ctlp); +} + +static void srcu_ref_perf_read_unlock(int idx) __releases(srcu_ctlp) +{ + srcu_read_unlock(srcu_ctlp, idx); +} + +static struct ref_perf_ops srcu_ops = { + .init = rcu_sync_perf_init, + .readlock = srcu_ref_perf_read_lock, + .readunlock = srcu_ref_perf_read_unlock, + .name = "srcu" +}; + +// Definitions for reference count +static atomic_t refcnt; + +static int srcu_ref_perf_refcnt_lock(void) +{ + atomic_inc(&refcnt); + return 0; +} + +static void srcu_ref_perf_refcnt_unlock(int idx) __releases(srcu_ctlp) +{ + atomic_dec(&refcnt); + srcu_read_unlock(srcu_ctlp, idx); +} + +static struct ref_perf_ops refcnt_ops = { + .init = rcu_sync_perf_init, + .readlock = srcu_ref_perf_refcnt_lock, + .readunlock = srcu_ref_perf_refcnt_unlock, + .name = "refcnt" +}; + +// Definitions for rwlock +static rwlock_t test_rwlock; + +static void ref_perf_rwlock_init(void) +{ + rwlock_init(&test_rwlock); +} + +static int ref_perf_rwlock_lock(void) +{ + read_lock(&test_rwlock); + return 0; +} + +static void ref_perf_rwlock_unlock(int idx) +{ + read_unlock(&test_rwlock); +} + +static struct ref_perf_ops rwlock_ops = { + .init = ref_perf_rwlock_init, + .readlock = ref_perf_rwlock_lock, + .readunlock = ref_perf_rwlock_unlock, + .name = "rwlock" +}; + +// Definitions for rwsem +static struct rw_semaphore test_rwsem; + +static void ref_perf_rwsem_init(void) +{ + 
init_rwsem(&test_rwsem); +} + +static int ref_perf_rwsem_lock(void) +{ + down_read(&test_rwsem); + return 0; +} + +static void ref_perf_rwsem_unlock(int idx) +{ + up_read(&test_rwsem); +} + +static struct ref_perf_ops rwsem_ops = { + .init = ref_perf_rwsem_init, + .readlock = ref_perf_rwsem_lock, + .readunlock = ref_perf_rwsem_unlock, + .name = "rwsem" +}; + +// Reader kthread. Repeatedly does empty RCU read-side +// critical section, minimizing update-side interference. +static int +ref_perf_reader(void *arg) +{ + unsigned long flags; + long me = (long)arg; + struct reader_task *rt = &(reader_tasks[me]); + unsigned long spincnt; + int idx; + u64 start; + s64 duration; + + VERBOSE_PERFOUT("ref_perf_reader %ld: task started", me); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + atomic_inc(&n_init); +repeat: + VERBOSE_PERFOUT("ref_perf_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id()); + + // Wait for signal that this reader can start. + wait_event(rt->wq, (atomic_read(&nreaders_exp) && atomic_read(&rt->start)) || + torture_must_stop()); + + if (torture_must_stop()) + goto end; + + // Make sure that the CPU is affinitized appropriately during testing. + WARN_ON_ONCE(smp_processor_id() != me); + + atomic_dec(&rt->start); + + // To prevent noise, keep interrupts disabled. This also has the + // effect of preventing entries into slow path for rcu_read_unlock(). + local_irq_save(flags); + start = ktime_get_mono_fast_ns(); + + VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx); + + for (spincnt = 0; spincnt < loops; spincnt++) { + idx = cur_ops->readlock(); + cur_ops->readunlock(idx); + } + + duration = ktime_get_mono_fast_ns() - start; + local_irq_restore(flags); + + rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 
0 : duration; + + atomic_dec(&nreaders_exp); + + VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d ended, (readers remaining=%d)", + me, exp_idx, atomic_read(&nreaders_exp)); + + if (!atomic_read(&nreaders_exp)) + wake_up(&main_wq); + + if (!torture_must_stop()) + goto repeat; +end: + torture_kthread_stopping("ref_perf_reader"); + return 0; +} + +void reset_readers(int n) +{ + int i; + struct reader_task *rt; + + for (i = 0; i < n; i++) { + rt = &(reader_tasks[i]); + + rt->last_duration_ns = 0; + } +} + +// Print the results of each reader and return the sum of all their durations. +u64 process_durations(int n) +{ + int i; + struct reader_task *rt; + char buf1[64]; + char buf[512]; + u64 sum = 0; + + buf[0] = 0; + sprintf(buf, "Experiment #%d (Format: :)", + exp_idx); + + for (i = 0; i <= n && !torture_must_stop(); i++) { + rt = &(reader_tasks[i]); + sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns); + + if (i % 5 == 0) + strcat(buf, "\n"); + strcat(buf, buf1); + + sum += rt->last_duration_ns; + } + strcat(buf, "\n"); + + PERFOUT("%s\n", buf); + + return sum; +} + +// The main_func is the main orchestrator, it performs a bunch of +// experiments. For every experiment, it orders all the readers +// involved to start and waits for them to finish the experiment. It +// then reads their timestamps and starts the next experiment. Each +// experiment progresses from 1 concurrent reader to N of them at which +// point all the timestamps are printed. +static int main_func(void *arg) +{ + int exp, r; + char buf1[64]; + char buf[512]; + + set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + + VERBOSE_PERFOUT("main_func task started"); + atomic_inc(&n_init); + + // Wait for all threads to start. 
+ wait_event(main_wq, atomic_read(&n_init) == (nreaders + 1)); + + // Start exp readers up per experiment + for (exp = 0; exp < nreaders && !torture_must_stop(); exp++) { + if (torture_must_stop()) + goto end; + + reset_readers(exp); + atomic_set(&nreaders_exp, exp + 1); + + exp_idx = exp; + + for (r = 0; r <= exp; r++) { + atomic_set(&reader_tasks[r].start, 1); + wake_up(&reader_tasks[r].wq); + } + + VERBOSE_PERFOUT("main_func: experiment started, waiting for %d readers", + exp); + + wait_event(main_wq, + !atomic_read(&nreaders_exp) || torture_must_stop()); + + VERBOSE_PERFOUT("main_func: experiment ended"); + + if (torture_must_stop()) + goto end; + + reader_tasks[exp].result_avg = process_durations(exp) / ((exp + 1) * loops); + } + + // Print the average of all experiments + PERFOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n"); + + buf[0] = 0; + strcat(buf, "\n"); + strcat(buf, "Threads\tTime(ns)\n"); + + for (exp = 0; exp < nreaders; exp++) { + sprintf(buf1, "%d\t%llu\n", exp + 1, reader_tasks[exp].result_avg); + strcat(buf, buf1); + } + + PERFOUT("%s", buf); + + // This will shutdown everything including us. 
+ if (shutdown) { + shutdown_start = 1; + wake_up(&shutdown_wq); + } + + // Wait for torture to stop us + while (!torture_must_stop()) + schedule_timeout_uninterruptible(1); + +end: + torture_kthread_stopping("main_func"); + return 0; +} + +static void +ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag) +{ + pr_alert("%s" PERF_FLAG + "--- %s: verbose=%d shutdown=%d loops=%ld\n", perf_type, tag, + verbose, shutdown, loops); +} + +static void +ref_perf_cleanup(void) +{ + int i; + + if (torture_cleanup_begin()) + return; + + if (!cur_ops) { + torture_cleanup_end(); + return; + } + + if (reader_tasks) { + for (i = 0; i < nreaders; i++) + torture_stop_kthread("ref_perf_reader", + reader_tasks[i].task); + } + kfree(reader_tasks); + + torture_stop_kthread("main_task", main_task); + kfree(main_task); + + // Do perf-type-specific cleanup operations. + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); + + torture_cleanup_end(); +} + +// Shutdown kthread. Just waits to be awakened, then shuts down system. +static int +ref_perf_shutdown(void *arg) +{ + wait_event(shutdown_wq, shutdown_start); + + smp_mb(); // Wake before output. 
+ ref_perf_cleanup(); + kernel_power_off(); + + return -EINVAL; +} + +static int __init +ref_perf_init(void) +{ + long i; + int firsterr = 0; + static struct ref_perf_ops *perf_ops[] = { + &rcu_ops, &srcu_ops, &refcnt_ops, &rwlock_ops, &rwsem_ops, + }; + + if (!torture_init_begin(perf_type, verbose)) + return -EBUSY; + + for (i = 0; i < ARRAY_SIZE(perf_ops); i++) { + cur_ops = perf_ops[i]; + if (strcmp(perf_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(perf_ops)) { + pr_alert("rcu-perf: invalid perf type: \"%s\"\n", perf_type); + pr_alert("rcu-perf types:"); + for (i = 0; i < ARRAY_SIZE(perf_ops); i++) + pr_cont(" %s", perf_ops[i]->name); + pr_cont("\n"); + WARN_ON(!IS_MODULE(CONFIG_RCU_REF_PERF_TEST)); + firsterr = -EINVAL; + cur_ops = NULL; + goto unwind; + } + if (cur_ops->init) + cur_ops->init(); + + ref_perf_print_module_parms(cur_ops, "Start of test"); + + // Shutdown task + if (shutdown) { + init_waitqueue_head(&shutdown_wq); + firsterr = torture_create_kthread(ref_perf_shutdown, NULL, + shutdown_task); + if (firsterr) + goto unwind; + schedule_timeout_uninterruptible(1); + } + + // Reader tasks (~75% of online CPUs). 
+ nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2); + reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]), + GFP_KERNEL); + if (!reader_tasks) { + VERBOSE_PERFOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + + VERBOSE_PERFOUT("Starting %d reader threads\n", nreaders); + + for (i = 0; i < nreaders; i++) { + firsterr = torture_create_kthread(ref_perf_reader, (void *)i, + reader_tasks[i].task); + if (firsterr) + goto unwind; + + init_waitqueue_head(&(reader_tasks[i].wq)); + } + + // Main Task + init_waitqueue_head(&main_wq); + firsterr = torture_create_kthread(main_func, NULL, main_task); + if (firsterr) + goto unwind; + schedule_timeout_uninterruptible(1); + + + // Wait until all threads start + while (atomic_read(&n_init) < nreaders + 1) + schedule_timeout_uninterruptible(1); + + wake_up(&main_wq); + + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + ref_perf_cleanup(); + return firsterr; +} + +module_init(ref_perf_init); +module_exit(ref_perf_cleanup); From 708cda31652c02e64adaeafafe7b996e4e14c3eb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 May 2020 09:22:24 -0700 Subject: [PATCH 134/502] rcuperf: Add comments explaining the high reader overhead This commit adds comments explaining why the readers have otherwise insane levels of measurement overhead, namely that they are intended as a test load for update-side performance measurements, not as a straight-up read-side performance test. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 246da8fe199e..d906ca987936 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -69,6 +69,11 @@ MODULE_AUTHOR("Paul E. McKenney "); * value specified by nr_cpus for a read-only test. * * Various other use cases may of course be specified. 
+ * + * Note that this test's readers are intended only as a test load for + * the writers. The reader performance statistics will be overly + * pessimistic due to the per-critical-section interrupt disabling, + * test-end checks, and the pair of calls through pointers. */ #ifdef MODULE @@ -309,8 +314,10 @@ static void rcu_perf_wait_shutdown(void) } /* - * RCU perf reader kthread. Repeatedly does empty RCU read-side - * critical section, minimizing update-side interference. + * RCU perf reader kthread. Repeatedly does empty RCU read-side critical + * section, minimizing update-side interference. However, the point of + * this test is not to evaluate reader performance, but instead to serve + * as a test load for update-side performance testing. */ static int rcu_perf_reader(void *arg) From f8b4bb23ec014a5d16663ad70b45d9f46c456ec4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 May 2020 14:07:52 -0700 Subject: [PATCH 135/502] torture: Add refperf to the rcutorture scripting This commit updates the rcutorture scripting to include the new refperf torture-test module. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- .../rcutorture/bin/kvm-recheck-refperf.sh | 67 +++++++++++++++++++ tools/testing/selftests/rcutorture/bin/kvm.sh | 9 +-- .../selftests/rcutorture/bin/parse-console.sh | 4 +- .../rcutorture/configs/refperf/CFLIST | 2 + .../rcutorture/configs/refperf/CFcommon | 2 + .../rcutorture/configs/refperf/NOPREEMPT | 18 +++++ .../rcutorture/configs/refperf/PREEMPT | 18 +++++ .../configs/refperf/ver_functions.sh | 16 +++++ 8 files changed, 130 insertions(+), 6 deletions(-) create mode 100755 tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh create mode 100644 tools/testing/selftests/rcutorture/configs/refperf/CFLIST create mode 100644 tools/testing/selftests/rcutorture/configs/refperf/CFcommon create mode 100644 tools/testing/selftests/rcutorture/configs/refperf/NOPREEMPT create mode 100644 tools/testing/selftests/rcutorture/configs/refperf/PREEMPT create mode 100644 tools/testing/selftests/rcutorture/configs/refperf/ver_functions.sh diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh new file mode 100755 index 000000000000..6fc06cd3538e --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Analyze a given results directory for refperf performance measurements. +# +# Usage: kvm-recheck-refperf.sh resdir +# +# Copyright (C) IBM Corporation, 2016 +# +# Authors: Paul E. McKenney + +i="$1" +if test -d "$i" -a -r "$i" +then + : +else + echo Unreadable results directory: $i + exit 1 +fi +PATH=`pwd`/tools/testing/selftests/rcutorture/bin:$PATH; export PATH +. 
functions.sh + +configfile=`echo $i | sed -e 's/^.*\///'` + +sed -e 's/^\[[^]]*]//' < $i/console.log | tr -d '\015' | +awk -v configfile="$configfile" ' +/^[ ]*Threads Time\(ns\) *$/ { + if (dataphase + 0 == 0) { + dataphase = 1; + # print configfile, $0; + } + next; +} + +/[^ ]*[0-9][0-9]* [0-9][0-9]*\.[0-9][0-9]*$/ { + if (dataphase == 1) { + # print $0; + readertimes[++n] = $2; + sum += $2; + } + next; +} + +{ + if (dataphase == 1) + dataphase == 2; + next; +} + +END { + print configfile " results:"; + newNR = asort(readertimes); + if (newNR <= 0) { + print "No refperf records found???" + exit; + } + medianidx = int(newNR / 2); + if (newNR == medianidx * 2) + medianvalue = (readertimes[medianidx - 1] + readertimes[medianidx]) / 2; + else + medianvalue = readertimes[medianidx]; + print "Average reader duration: " sum / newNR " nanoseconds"; + print "Minimum reader duration: " readertimes[1]; + print "Median reader duration: " medianvalue; + print "Maximum reader duration: " readertimes[newNR]; + print "Computed from refperf printk output."; +}' diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index c279cf9cb010..48b6a7248f50 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -180,13 +180,14 @@ do shift ;; --torture) - checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuperf\)$' '^--' + checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuperf\|refperf\)$' '^--' TORTURE_SUITE=$2 shift - if test "$TORTURE_SUITE" = rcuperf + if test "$TORTURE_SUITE" = rcuperf || test "$TORTURE_SUITE" = refperf then - # If you really want jitter for rcuperf, specify - # it after specifying rcuperf. (But why?) + # If you really want jitter for refperf or + # rcuperf, specify it after specifying the rcuperf + # or the refperf. (But why jitter in these cases?) 
jitter=0 fi ;; diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh index 4bf62d7b1cbc..85af11d2d0cb 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-console.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh @@ -33,8 +33,8 @@ then fi cat /dev/null > $file.diags -# Check for proper termination, except that rcuperf runs don't indicate this. -if test "$TORTURE_SUITE" != rcuperf +# Check for proper termination, except for rcuperf and refperf. +if test "$TORTURE_SUITE" != rcuperf && test "$TORTURE_SUITE" != refperf then # check for abject failure diff --git a/tools/testing/selftests/rcutorture/configs/refperf/CFLIST b/tools/testing/selftests/rcutorture/configs/refperf/CFLIST new file mode 100644 index 000000000000..4d62eb4a39f9 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/refperf/CFLIST @@ -0,0 +1,2 @@ +NOPREEMPT +PREEMPT diff --git a/tools/testing/selftests/rcutorture/configs/refperf/CFcommon b/tools/testing/selftests/rcutorture/configs/refperf/CFcommon new file mode 100644 index 000000000000..8ba5ba207503 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/refperf/CFcommon @@ -0,0 +1,2 @@ +CONFIG_RCU_REF_PERF_TEST=y +CONFIG_PRINTK_TIME=y diff --git a/tools/testing/selftests/rcutorture/configs/refperf/NOPREEMPT b/tools/testing/selftests/rcutorture/configs/refperf/NOPREEMPT new file mode 100644 index 000000000000..1cd25b7314e3 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/refperf/NOPREEMPT @@ -0,0 +1,18 @@ +CONFIG_SMP=y +CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=n +#CHECK#CONFIG_PREEMPT_RCU=n +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_RCU_FAST_NO_HZ=n +CONFIG_HOTPLUG_CPU=n +CONFIG_SUSPEND=n +CONFIG_HIBERNATION=n +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +CONFIG_RCU_BOOST=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y diff --git 
a/tools/testing/selftests/rcutorture/configs/refperf/PREEMPT b/tools/testing/selftests/rcutorture/configs/refperf/PREEMPT new file mode 100644 index 000000000000..d10bc694f42c --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/refperf/PREEMPT @@ -0,0 +1,18 @@ +CONFIG_SMP=y +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y +#CHECK#CONFIG_PREEMPT_RCU=y +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_RCU_FAST_NO_HZ=n +CONFIG_HOTPLUG_CPU=n +CONFIG_SUSPEND=n +CONFIG_HIBERNATION=n +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +CONFIG_RCU_BOOST=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/refperf/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/refperf/ver_functions.sh new file mode 100644 index 000000000000..489f05dd929a --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/refperf/ver_functions.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Torture-suite-dependent shell functions for the rest of the scripts. +# +# Copyright (C) IBM Corporation, 2015 +# +# Authors: Paul E. McKenney + +# per_version_boot_params bootparam-string config-file seconds +# +# Adds per-version torture-module parameters to kernels supporting them. +per_version_boot_params () { + echo $1 refperf.shutdown=1 \ + refperf.verbose=1 +} From 777a54c908ec69fa0eccab54068a49ecda38ffde Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 May 2020 14:16:44 -0700 Subject: [PATCH 136/502] refperf: Add holdoff parameter to allow CPUs to come online This commit adds a refperf module parameter named "holdoff" that defaults to 10 seconds if refperf is built in and to zero otherwise. The assumption is that all the CPUs are online by the time that the modprobe and insmod commands are going to do anything, and that normal systems will have all the CPUs online within ten seconds. 
Larger systems may take many tens of seconds or even minutes to get to this point, hence this being a module parameter instead of being a hard-coded constant. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 61161530acc8..4d686fdc3105 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -57,7 +57,10 @@ MODULE_PARM_DESC(perf_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock."); torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); -// Number of loops per experiment, all readers execute an operation concurrently +// Wait until there are multiple CPUs before starting test. +torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_PERF_TEST) ? 10 : 0, + "Holdoff time before test start (s)"); +// Number of loops per experiment, all readers execute operations concurrently. torture_param(long, loops, 10000000, "Number of loops per experiment."); #ifdef MODULE @@ -248,6 +251,8 @@ ref_perf_reader(void *arg) set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); set_user_nice(current, MAX_NICE); atomic_inc(&n_init); + if (holdoff) + schedule_timeout_interruptible(holdoff * HZ); repeat: VERBOSE_PERFOUT("ref_perf_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id()); @@ -357,6 +362,8 @@ static int main_func(void *arg) // Wait for all threads to start. 
wait_event(main_wq, atomic_read(&n_init) == (nreaders + 1)); + if (holdoff) + schedule_timeout_interruptible(holdoff * HZ); // Start exp readers up per experiment for (exp = 0; exp < nreaders && !torture_must_stop(); exp++) { @@ -420,8 +427,8 @@ static void ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag) { pr_alert("%s" PERF_FLAG - "--- %s: verbose=%d shutdown=%d loops=%ld\n", perf_type, tag, - verbose, shutdown, loops); + "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld\n", perf_type, tag, + verbose, shutdown, holdoff, loops); } static void From 75dd8efef56ed5959c398974c785026f84aa0d1a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 May 2020 14:59:06 -0700 Subject: [PATCH 137/502] refperf: Hoist function-pointer calls out of the loop Current runs show PREEMPT=n rcu_read_lock()/rcu_read_unlock() pairs consuming between 20 and 30 nanoseconds, when in fact the actual value is zero, give or take the barrier() asm's effect on compiler optimizations. The additional overhead is caused by function calls through pointers (especially in these days of Spectre mitigations) and perhaps also needless argument passing, a non-const loop limit, and an upcounting loop. This commit therefore combines the ->readlock() and ->readunlock() function pointers into a single ->readsection() function pointer that takes the loop count as a const parameter and keeps any data passed from the read-lock to the read-unlock internal to this new function. These changes reduce the measured overhead of the aforementioned PREEMPT=n rcu_read_lock()/rcu_read_unlock() pairs from between 20 and 30 nanoseconds to somewhere south of 500 picoseconds. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- kernel/rcu/refperf.c | 92 ++++++++++++++++++-------------------------- 1 file changed, 38 insertions(+), 54 deletions(-) diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 4d686fdc3105..57c7b7a40bd2 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -108,23 +108,20 @@ static int exp_idx; struct ref_perf_ops { void (*init)(void); void (*cleanup)(void); - int (*readlock)(void); - void (*readunlock)(int idx); + void (*readsection)(const int nloops); const char *name; }; static struct ref_perf_ops *cur_ops; -// Definitions for RCU ref perf testing. -static int ref_rcu_read_lock(void) __acquires(RCU) +static void ref_rcu_read_section(const int nloops) { - rcu_read_lock(); - return 0; -} + int i; -static void ref_rcu_read_unlock(int idx) __releases(RCU) -{ - rcu_read_unlock(); + for (i = nloops; i >= 0; i--) { + rcu_read_lock(); + rcu_read_unlock(); + } } static void rcu_sync_perf_init(void) @@ -133,8 +130,7 @@ static void rcu_sync_perf_init(void) static struct ref_perf_ops rcu_ops = { .init = rcu_sync_perf_init, - .readlock = ref_rcu_read_lock, - .readunlock = ref_rcu_read_unlock, + .readsection = ref_rcu_read_section, .name = "rcu" }; @@ -143,42 +139,39 @@ static struct ref_perf_ops rcu_ops = { DEFINE_STATIC_SRCU(srcu_refctl_perf); static struct srcu_struct *srcu_ctlp = &srcu_refctl_perf; -static int srcu_ref_perf_read_lock(void) __acquires(srcu_ctlp) +static void srcu_ref_perf_read_section(int nloops) { - return srcu_read_lock(srcu_ctlp); -} + int i; + int idx; -static void srcu_ref_perf_read_unlock(int idx) __releases(srcu_ctlp) -{ - srcu_read_unlock(srcu_ctlp, idx); + for (i = nloops; i >= 0; i--) { + idx = srcu_read_lock(srcu_ctlp); + srcu_read_unlock(srcu_ctlp, idx); + } } static struct ref_perf_ops srcu_ops = { .init = rcu_sync_perf_init, - .readlock = srcu_ref_perf_read_lock, - .readunlock = srcu_ref_perf_read_unlock, + .readsection = srcu_ref_perf_read_section, .name = "srcu" }; // Definitions for reference count static 
atomic_t refcnt; -static int srcu_ref_perf_refcnt_lock(void) +static void ref_perf_refcnt_section(const int nloops) { - atomic_inc(&refcnt); - return 0; -} + int i; -static void srcu_ref_perf_refcnt_unlock(int idx) __releases(srcu_ctlp) -{ - atomic_dec(&refcnt); - srcu_read_unlock(srcu_ctlp, idx); + for (i = nloops; i >= 0; i--) { + atomic_inc(&refcnt); + atomic_dec(&refcnt); + } } static struct ref_perf_ops refcnt_ops = { .init = rcu_sync_perf_init, - .readlock = srcu_ref_perf_refcnt_lock, - .readunlock = srcu_ref_perf_refcnt_unlock, + .readsection = ref_perf_refcnt_section, .name = "refcnt" }; @@ -190,21 +183,19 @@ static void ref_perf_rwlock_init(void) rwlock_init(&test_rwlock); } -static int ref_perf_rwlock_lock(void) +static void ref_perf_rwlock_section(const int nloops) { - read_lock(&test_rwlock); - return 0; -} + int i; -static void ref_perf_rwlock_unlock(int idx) -{ - read_unlock(&test_rwlock); + for (i = nloops; i >= 0; i--) { + read_lock(&test_rwlock); + read_unlock(&test_rwlock); + } } static struct ref_perf_ops rwlock_ops = { .init = ref_perf_rwlock_init, - .readlock = ref_perf_rwlock_lock, - .readunlock = ref_perf_rwlock_unlock, + .readsection = ref_perf_rwlock_section, .name = "rwlock" }; @@ -216,21 +207,19 @@ static void ref_perf_rwsem_init(void) init_rwsem(&test_rwsem); } -static int ref_perf_rwsem_lock(void) +static void ref_perf_rwsem_section(const int nloops) { - down_read(&test_rwsem); - return 0; -} + int i; -static void ref_perf_rwsem_unlock(int idx) -{ - up_read(&test_rwsem); + for (i = nloops; i >= 0; i--) { + down_read(&test_rwsem); + up_read(&test_rwsem); + } } static struct ref_perf_ops rwsem_ops = { .init = ref_perf_rwsem_init, - .readlock = ref_perf_rwsem_lock, - .readunlock = ref_perf_rwsem_unlock, + .readsection = ref_perf_rwsem_section, .name = "rwsem" }; @@ -242,8 +231,6 @@ ref_perf_reader(void *arg) unsigned long flags; long me = (long)arg; struct reader_task *rt = &(reader_tasks[me]); - unsigned long spincnt; - int idx; u64 
start; s64 duration; @@ -275,10 +262,7 @@ repeat: VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx); - for (spincnt = 0; spincnt < loops; spincnt++) { - idx = cur_ops->readlock(); - cur_ops->readunlock(idx); - } + cur_ops->readsection(loops); duration = ktime_get_mono_fast_ns() - start; local_irq_restore(flags); From 83b88c86da0e5f97faeac5a9bb19fe32f8c0394b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 May 2020 15:31:07 -0700 Subject: [PATCH 138/502] refperf: Allow decimal nanoseconds The CONFIG_PREEMPT=n rcu_read_lock()/rcu_read_unlock() pair's overhead, even including loop overhead, is far less than one nanosecond. Since logscale plots are not all that happy with zero values, provide picoseconds as decimals. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 57c7b7a40bd2..e991d4820f51 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -375,7 +375,7 @@ static int main_func(void *arg) if (torture_must_stop()) goto end; - reader_tasks[exp].result_avg = process_durations(exp) / ((exp + 1) * loops); + reader_tasks[exp].result_avg = 1000 * process_durations(exp) / ((exp + 1) * loops); } // Print the average of all experiments @@ -386,7 +386,7 @@ static int main_func(void *arg) strcat(buf, "Threads\tTime(ns)\n"); for (exp = 0; exp < nreaders; exp++) { - sprintf(buf1, "%d\t%llu\n", exp + 1, reader_tasks[exp].result_avg); + sprintf(buf1, "%d\t%llu.%03d\n", exp + 1, reader_tasks[exp].result_avg / 1000, (int)(reader_tasks[exp].result_avg % 1000)); strcat(buf, buf1); } From 8fc28783a0c3704ea27505a25dbde8333d75380c Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 25 May 2020 15:48:38 -0700 Subject: [PATCH 139/502] refperf: Convert nreaders to a module parameter This commit converts nreaders to a module parameter, with the default of -1 specifying the old behavior of using 75% of the CPUs. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index e991d4820f51..020e55a9a64b 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -62,6 +62,12 @@ torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_PERF_TEST) ? 10 : 0, "Holdoff time before test start (s)"); // Number of loops per experiment, all readers execute operations concurrently. torture_param(long, loops, 10000000, "Number of loops per experiment."); +// Number of readers, with -1 defaulting to about 75% of the CPUs. +torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs."); +// Number of runs. +torture_param(int, nruns, 30, "Number of experiments to run."); +// Reader delay in nanoseconds, 0 for no delay. +torture_param(int, readdelay, 0, "Read-side delay in nanoseconds."); #ifdef MODULE # define REFPERF_SHUTDOWN 0 @@ -93,7 +99,6 @@ static wait_queue_head_t main_wq; static int shutdown_start; static struct reader_task *reader_tasks; -static int nreaders; // Number of readers that are part of the current experiment. static atomic_t nreaders_exp; @@ -411,8 +416,8 @@ static void ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag) { pr_alert("%s" PERF_FLAG - "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld\n", perf_type, tag, - verbose, shutdown, holdoff, loops); + "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d\n", perf_type, tag, + verbose, shutdown, holdoff, loops, nreaders); } static void @@ -501,8 +506,9 @@ ref_perf_init(void) schedule_timeout_uninterruptible(1); } - // Reader tasks (~75% of online CPUs).
- nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2); + // Reader tasks (default to ~75% of online CPUs). + if (nreaders < 0) + nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2); reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]), GFP_KERNEL); if (!reader_tasks) { From dbf28efdae7bb51032eeb0fe1b6bd07d6f0f9b6c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 May 2020 17:22:24 -0700 Subject: [PATCH 140/502] refperf: Provide module parameter to specify number of experiments The current code uses the number of threads both to limit the number of threads and to specify the number of experiments, but also varies the number of threads as the experiments progress. This commit takes a different approach by adding an refperf.nruns module parameter that specifies the number of experiments, and furthermore uses the same number of threads for each experiment. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 020e55a9a64b..6324449db404 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -83,12 +83,6 @@ struct reader_task { atomic_t start; wait_queue_head_t wq; u64 last_duration_ns; - - // The average latency When 1.. are concurrently - // running an experiment. For example, if this reader_task is - // of index 5 in the reader_tasks array, then result is for - // 6 cores. 
- u64 result_avg; }; static struct task_struct *shutdown_task; @@ -289,12 +283,12 @@ end: return 0; } -void reset_readers(int n) +void reset_readers(void) { int i; struct reader_task *rt; - for (i = 0; i < n; i++) { + for (i = 0; i < nreaders; i++) { rt = &(reader_tasks[i]); rt->last_duration_ns = 0; @@ -314,7 +308,7 @@ u64 process_durations(int n) sprintf(buf, "Experiment #%d (Format: :)", exp_idx); - for (i = 0; i <= n && !torture_must_stop(); i++) { + for (i = 0; i < n && !torture_must_stop(); i++) { rt = &(reader_tasks[i]); sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns); @@ -342,11 +336,15 @@ static int main_func(void *arg) int exp, r; char buf1[64]; char buf[512]; + u64 *result_avg; set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids)); set_user_nice(current, MAX_NICE); VERBOSE_PERFOUT("main_func task started"); + result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL); + if (!result_avg) + VERBOSE_PERFOUT_ERRSTRING("out of memory"); atomic_inc(&n_init); // Wait for all threads to start. 
@@ -355,22 +353,24 @@ static int main_func(void *arg) schedule_timeout_interruptible(holdoff * HZ); // Start exp readers up per experiment - for (exp = 0; exp < nreaders && !torture_must_stop(); exp++) { + for (exp = 0; exp < nruns && !torture_must_stop(); exp++) { + if (!result_avg) + break; if (torture_must_stop()) goto end; - reset_readers(exp); - atomic_set(&nreaders_exp, exp + 1); + reset_readers(); + atomic_set(&nreaders_exp, nreaders); exp_idx = exp; - for (r = 0; r <= exp; r++) { + for (r = 0; r < nreaders; r++) { atomic_set(&reader_tasks[r].start, 1); wake_up(&reader_tasks[r].wq); } VERBOSE_PERFOUT("main_func: experiment started, waiting for %d readers", - exp); + nreaders); wait_event(main_wq, !atomic_read(&nreaders_exp) || torture_must_stop()); @@ -380,7 +380,7 @@ static int main_func(void *arg) if (torture_must_stop()) goto end; - reader_tasks[exp].result_avg = 1000 * process_durations(exp) / ((exp + 1) * loops); + result_avg[exp] = 1000 * process_durations(nreaders) / (nreaders * loops); } // Print the average of all experiments @@ -390,12 +390,15 @@ static int main_func(void *arg) strcat(buf, "\n"); strcat(buf, "Threads\tTime(ns)\n"); - for (exp = 0; exp < nreaders; exp++) { - sprintf(buf1, "%d\t%llu.%03d\n", exp + 1, reader_tasks[exp].result_avg / 1000, (int)(reader_tasks[exp].result_avg % 1000)); + for (exp = 0; exp < nruns; exp++) { + if (!result_avg) + break; + sprintf(buf1, "%d\t%llu.%03d\n", exp + 1, result_avg[exp] / 1000, (int)(result_avg[exp] % 1000)); strcat(buf, buf1); } - PERFOUT("%s", buf); + if (result_avg) + PERFOUT("%s", buf); // This will shutdown everything including us. 
if (shutdown) { @@ -416,8 +419,8 @@ static void ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag) { pr_alert("%s" PERF_FLAG - "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d\n", perf_type, tag, - verbose, shutdown, holdoff, loops, nreaders); + "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d\n", perf_type, tag, + verbose, shutdown, holdoff, loops, nreaders, nruns); } static void From f518f154ecef347777db33b7c9b0581f245159f0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 May 2020 17:32:56 -0700 Subject: [PATCH 141/502] refperf: Dynamically allocate experiment-summary output buffer Currently, the buffer used to accumulate the experiment-summary output is fixed size, which will cause problems if someone decides to run one hundred experiments. This commit therefore dynamically allocates this buffer. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 6324449db404..75b9cceaece1 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -333,9 +333,10 @@ u64 process_durations(int n) // point all the timestamps are printed. static int main_func(void *arg) { + bool errexit = false; int exp, r; char buf1[64]; - char buf[512]; + char *buf; u64 *result_avg; set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids)); @@ -343,8 +344,11 @@ static int main_func(void *arg) VERBOSE_PERFOUT("main_func task started"); result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL); - if (!result_avg) + buf = kzalloc(64 + nruns * 32, GFP_KERNEL); + if (!result_avg || !buf) { VERBOSE_PERFOUT_ERRSTRING("out of memory"); + errexit = true; + } atomic_inc(&n_init); // Wait for all threads to start. 
@@ -354,7 +358,7 @@ static int main_func(void *arg) // Start exp readers up per experiment for (exp = 0; exp < nruns && !torture_must_stop(); exp++) { - if (!result_avg) + if (errexit) break; if (torture_must_stop()) goto end; @@ -391,13 +395,13 @@ static int main_func(void *arg) strcat(buf, "Threads\tTime(ns)\n"); for (exp = 0; exp < nruns; exp++) { - if (!result_avg) + if (errexit) break; sprintf(buf1, "%d\t%llu.%03d\n", exp + 1, result_avg[exp] / 1000, (int)(result_avg[exp] % 1000)); strcat(buf, buf1); } - if (result_avg) + if (!errexit) PERFOUT("%s", buf); // This will shutdown everything including us. @@ -412,6 +416,8 @@ static int main_func(void *arg) end: torture_kthread_stopping("main_func"); + kfree(result_avg); + kfree(buf); return 0; } From 2e90de76f226f11fe26c871aa321be28152f565a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 May 2020 17:45:03 -0700 Subject: [PATCH 142/502] refperf: Dynamically allocate thread-summary output buffer Currently, the buffer used to accumulate the thread-summary output is fixed size, which will cause problems if someone decides to run on a large number of CPUs. This commit therefore dynamically allocates this buffer. [ paulmck: Fix memory allocation as suggested by KASAN. ] Cc: Joel Fernandes (Google) Signed-off-by: Paul E.
McKenney --- kernel/rcu/refperf.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 75b9cceaece1..fc940e3dba1f 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -301,9 +301,12 @@ u64 process_durations(int n) int i; struct reader_task *rt; char buf1[64]; - char buf[512]; + char *buf; u64 sum = 0; + buf = kmalloc(128 + nreaders * 32, GFP_KERNEL); + if (!buf) + return 0; buf[0] = 0; sprintf(buf, "Experiment #%d (Format: :)", exp_idx); @@ -322,6 +325,7 @@ u64 process_durations(int n) PERFOUT("%s\n", buf); + kfree(buf); return sum; } From 2990750bceb05c3cdeae3a6d2683cbc4ae4de15e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 May 2020 09:32:57 -0700 Subject: [PATCH 143/502] refperf: Make functions static Because the reset_readers() and process_durations() functions are used only within kernel/rcu/refperf.c, this commit makes them static. Reported-by: kbuild test robot Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index fc940e3dba1f..0a900f3ae151 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -283,7 +283,7 @@ end: return 0; } -void reset_readers(void) +static void reset_readers(void) { int i; struct reader_task *rt; @@ -296,7 +296,7 @@ void reset_readers(void) } // Print the results of each reader and return the sum of all their durations. -u64 process_durations(int n) +static u64 process_durations(int n) { int i; struct reader_task *rt; From b864f89ff61492f56b4e8c6713a5efec6540a0e2 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 26 May 2020 10:57:34 -0700 Subject: [PATCH 144/502] refperf: Tune reader measurement interval This commit moves a printk() out of the measurement interval, converts an atomic_dec()/atomic_read() pair to atomic_dec_and_test(), and adds a smp_mb__before_atomic() to avoid potential wake/wait hangs. These changes have the added benefit of reducing the number of loops required for amortizing loop overhead for CONFIG_PREEMPT=n RCU measurements from 1,000,000 to 10,000. This reduction in turn shortens the test, reducing the probability of interference. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 0a900f3ae151..8815ccfb6f98 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -252,15 +252,16 @@ repeat: // Make sure that the CPU is affinitized appropriately during testing. WARN_ON_ONCE(smp_processor_id() != me); + smp_mb__before_atomic(); atomic_dec(&rt->start); + VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx); + // To prevent noise, keep interrupts disabled. This also has the // effect of preventing entries into slow path for rcu_read_unlock(). local_irq_save(flags); start = ktime_get_mono_fast_ns(); - VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx); - cur_ops->readsection(loops); duration = ktime_get_mono_fast_ns() - start; @@ -268,14 +269,12 @@ repeat: rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration; - atomic_dec(&nreaders_exp); + if (atomic_dec_and_test(&nreaders_exp)) + wake_up(&main_wq); VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d ended, (readers remaining=%d)", me, exp_idx, atomic_read(&nreaders_exp)); - if (!atomic_read(&nreaders_exp)) - wake_up(&main_wq); - if (!torture_must_stop()) goto repeat; end: From af2789db13b8dc38d16e969f8c11b9468be42d46 Mon Sep 17 00:00:00 2001 From: "Paul E.
McKenney" Date: Tue, 26 May 2020 11:22:03 -0700 Subject: [PATCH 145/502] refperf: Convert reader_task structure's "start" field to int This commit converts the reader_task structure's "start" field to int in order to demote a full barrier to an smp_load_acquire() and also to simplify the code a bit. While in the area, and to enlist the compiler's help in ensuring that nothing was missed, the field's name was changed to start_reader. Also while in the area, change the main_func() store to use smp_store_release() to further fortify against wait/wake races. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 8815ccfb6f98..2fd3ed1a0d0d 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -80,7 +80,7 @@ torture_param(bool, shutdown, REFPERF_SHUTDOWN, struct reader_task { struct task_struct *task; - atomic_t start; + int start_reader; wait_queue_head_t wq; u64 last_duration_ns; }; @@ -243,7 +243,7 @@ repeat: VERBOSE_PERFOUT("ref_perf_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id()); // Wait for signal that this reader can start. - wait_event(rt->wq, (atomic_read(&nreaders_exp) && atomic_read(&rt->start)) || + wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) || torture_must_stop()); if (torture_must_stop()) @@ -252,8 +252,7 @@ repeat: // Make sure that the CPU is affinitized appropriately during testing. 
WARN_ON_ONCE(smp_processor_id() != me); - smp_mb__before_atomic(); - atomic_dec(&rt->start); + WRITE_ONCE(rt->start_reader, 0); VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx); @@ -372,7 +371,7 @@ static int main_func(void *arg) exp_idx = exp; for (r = 0; r < nreaders; r++) { - atomic_set(&reader_tasks[r].start, 1); + smp_store_release(&reader_tasks[r].start_reader, 1); wake_up(&reader_tasks[r].wq); } From 86e0da2bb8ed934d3dce5a337895f1118f59c087 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 May 2020 11:40:52 -0700 Subject: [PATCH 146/502] refperf: More closely synchronize reader start times Currently, readers are awakened individually. On most systems, this results in significant wakeup delay from one reader to the next, which can result in the first and last reader having sole access to the synchronization primitive in question. If that synchronization primitive involves shared memory, those readers will rack up a huge number of operations in a very short time, causing large perturbations in the results. This commit therefore has the readers busy-wait after being awakened, and uses a new n_started variable to synchronize their start times. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 2fd3ed1a0d0d..234bb0e84a8b 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -99,6 +99,7 @@ static atomic_t nreaders_exp; // Use to wait for all threads to start. static atomic_t n_init; +static atomic_t n_started; // Track which experiment is currently running. 
static int exp_idx; @@ -253,6 +254,9 @@ repeat: WARN_ON_ONCE(smp_processor_id() != me); WRITE_ONCE(rt->start_reader, 0); + if (!atomic_dec_return(&n_started)) + while (atomic_read_acquire(&n_started)) + cpu_relax(); VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx); @@ -367,6 +371,7 @@ static int main_func(void *arg) reset_readers(); atomic_set(&nreaders_exp, nreaders); + atomic_set(&n_started, nreaders); exp_idx = exp; From 2db0bda38453f472640f4ece1e2a495cbd44f892 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 May 2020 12:34:57 -0700 Subject: [PATCH 147/502] refperf: Add warmup and cooldown processing phases This commit causes all the readers to start running unmeasured load until all readers have done at least one such run (thus having warmed up), then run the measured load, and then run unmeasured load until all readers have completed their measured load. This approach avoids any thread running measured load while other readers are idle. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 234bb0e84a8b..445190b97b05 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -100,6 +100,8 @@ static atomic_t nreaders_exp; // Use to wait for all threads to start. static atomic_t n_init; static atomic_t n_started; +static atomic_t n_warmedup; +static atomic_t n_cooleddown; // Track which experiment is currently running. static int exp_idx; @@ -260,8 +262,15 @@ repeat: VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx); - // To prevent noise, keep interrupts disabled. This also has the - // effect of preventing entries into slow path for rcu_read_unlock(). + + // To reduce noise, do an initial cache-warming invocation, check + // in, and then keep warming until everyone has checked in. 
+ cur_ops->readsection(loops); + if (!atomic_dec_return(&n_warmedup)) + while (atomic_read_acquire(&n_warmedup)) + cur_ops->readsection(loops); + // Also keep interrupts disabled. This also has the effect + // of preventing entries into slow path for rcu_read_unlock(). local_irq_save(flags); start = ktime_get_mono_fast_ns(); @@ -271,6 +280,11 @@ repeat: local_irq_restore(flags); rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration; + // To reduce runtime-skew noise, do maintain-load invocations until + // everyone is done. + if (!atomic_dec_return(&n_cooleddown)) + while (atomic_read_acquire(&n_cooleddown)) + cur_ops->readsection(loops); if (atomic_dec_and_test(&nreaders_exp)) wake_up(&main_wq); @@ -372,6 +386,8 @@ static int main_func(void *arg) reset_readers(); atomic_set(&nreaders_exp, nreaders); atomic_set(&n_started, nreaders); + atomic_set(&n_warmedup, nreaders); + atomic_set(&n_cooleddown, nreaders); exp_idx = exp; From 6efb06340846c788336f402e3a472a24fabb431e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 May 2020 14:26:25 -0700 Subject: [PATCH 148/502] refperf: Label experiment-number column "Runs" The experiment-number column is currently labeled "Threads", which is misleading at best. This commit therefore relabels it as "Runs", and adjusts the scripts accordingly. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- kernel/rcu/refperf.c | 2 +- tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 445190b97b05..2d2d227d761a 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -415,7 +415,7 @@ static int main_func(void *arg) buf[0] = 0; strcat(buf, "\n"); - strcat(buf, "Threads\tTime(ns)\n"); + strcat(buf, "Runs\tTime(ns)\n"); for (exp = 0; exp < nruns; exp++) { if (errexit) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh index 6fc06cd3538e..0660f3fab215 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh @@ -24,7 +24,7 @@ configfile=`echo $i | sed -e 's/^.*\///'` sed -e 's/^\[[^]]*]//' < $i/console.log | tr -d '\015' | awk -v configfile="$configfile" ' -/^[ ]*Threads Time\(ns\) *$/ { +/^[ ]*Runs Time\(ns\) *$/ { if (dataphase + 0 == 0) { dataphase = 1; # print configfile, $0; From 9d1914d34cebe111a23ab1670633900fd770cec3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 May 2020 15:30:09 -0700 Subject: [PATCH 149/502] refperf: Output per-experiment data points Currently, it is necessary to manually edit the console output to see anything more than statistics, and sometimes the statistics can indicate outliers that need more investigation. This commit therefore dumps out the per-experiment measurements, sorted in ascending order, just before dumping out the statistics. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh index 0660f3fab215..0e29cfd9986c 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh @@ -59,6 +59,10 @@ END { medianvalue = (readertimes[medianidx - 1] + readertimes[medianidx]) / 2; else medianvalue = readertimes[medianidx]; + points = "Points:"; + for (i = 1; i <= newNR; i++) + points = points " " readertimes[i]; + print points; print "Average reader duration: " sum / newNR " nanoseconds"; print "Minimum reader duration: " readertimes[1]; print "Median reader duration: " medianvalue; From 96af8669591d740a1e2695c4d96e544409dbf896 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 May 2020 16:46:56 -0700 Subject: [PATCH 150/502] refperf: Simplify initialization-time wakeup protocol This commit moves the reader-launch wait loop from ref_perf_init() to main_func(), removing one layer of wakeup and allowing slightly faster system boot. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 2d2d227d761a..7839237ffc17 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -369,13 +369,14 @@ static int main_func(void *arg) VERBOSE_PERFOUT_ERRSTRING("out of memory"); errexit = true; } - atomic_inc(&n_init); - - // Wait for all threads to start. - wait_event(main_wq, atomic_read(&n_init) == (nreaders + 1)); if (holdoff) schedule_timeout_interruptible(holdoff * HZ); + // Wait for all threads to start. 
+ atomic_inc(&n_init); + while (atomic_read(&n_init) < nreaders + 1) + schedule_timeout_uninterruptible(1); + // Start exp readers up per experiment for (exp = 0; exp < nruns && !torture_must_stop(); exp++) { if (errexit) @@ -565,14 +566,6 @@ ref_perf_init(void) firsterr = torture_create_kthread(main_func, NULL, main_task); if (firsterr) goto unwind; - schedule_timeout_uninterruptible(1); - - - // Wait until all threads start - while (atomic_read(&n_init) < nreaders + 1) - schedule_timeout_uninterruptible(1); - - wake_up(&main_wq); torture_init_end(); return 0; From b4d1e34f6502a138e32275baabdb6d593d7ea432 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 May 2020 16:37:35 -0700 Subject: [PATCH 151/502] refperf: Add read-side delay module parameter This commit adds a refperf.readdelay module parameter that controls the duration of each critical section. This parameter allows gathering data showing how the performance differences between the various primitives vary with critical-section length. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 108 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 89 insertions(+), 19 deletions(-) diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 7839237ffc17..57a750bbcaca 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -66,8 +66,8 @@ torture_param(long, loops, 10000000, "Number of loops per experiment."); torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs."); // Number of runs. torture_param(int, nruns, 30, "Number of experiments to run."); -// Reader delay in nanoseconds, 0 for no delay. -torture_param(int, readdelay, 0, "Read-side delay in nanoseconds."); +// Reader delay in microseconds, 0 for no delay. 
+torture_param(int, readdelay, 0, "Read-side delay in microseconds."); #ifdef MODULE # define REFPERF_SHUTDOWN 0 @@ -111,6 +111,7 @@ struct ref_perf_ops { void (*init)(void); void (*cleanup)(void); void (*readsection)(const int nloops); + void (*delaysection)(const int nloops, const int ndelay); const char *name; }; @@ -126,6 +127,17 @@ static void ref_rcu_read_section(const int nloops) } } +static void ref_rcu_delay_section(const int nloops, const int ndelay) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock(); + udelay(ndelay); + rcu_read_unlock(); + } +} + static void rcu_sync_perf_init(void) { } @@ -133,6 +145,7 @@ static void rcu_sync_perf_init(void) static struct ref_perf_ops rcu_ops = { .init = rcu_sync_perf_init, .readsection = ref_rcu_read_section, + .delaysection = ref_rcu_delay_section, .name = "rcu" }; @@ -141,7 +154,7 @@ static struct ref_perf_ops rcu_ops = { DEFINE_STATIC_SRCU(srcu_refctl_perf); static struct srcu_struct *srcu_ctlp = &srcu_refctl_perf; -static void srcu_ref_perf_read_section(int nloops) +static void srcu_ref_perf_read_section(const int nloops) { int i; int idx; @@ -152,16 +165,29 @@ static void srcu_ref_perf_read_section(int nloops) } } +static void srcu_ref_perf_delay_section(const int nloops, const int ndelay) +{ + int i; + int idx; + + for (i = nloops; i >= 0; i--) { + idx = srcu_read_lock(srcu_ctlp); + udelay(ndelay); + srcu_read_unlock(srcu_ctlp, idx); + } +} + static struct ref_perf_ops srcu_ops = { .init = rcu_sync_perf_init, .readsection = srcu_ref_perf_read_section, + .delaysection = srcu_ref_perf_delay_section, .name = "srcu" }; // Definitions for reference count static atomic_t refcnt; -static void ref_perf_refcnt_section(const int nloops) +static void ref_refcnt_section(const int nloops) { int i; @@ -171,21 +197,33 @@ static void ref_perf_refcnt_section(const int nloops) } } +static void ref_refcnt_delay_section(const int nloops, const int ndelay) +{ + int i; + + for (i = nloops; i >= 0; i--) { + 
atomic_inc(&refcnt); + udelay(ndelay); + atomic_dec(&refcnt); + } +} + static struct ref_perf_ops refcnt_ops = { .init = rcu_sync_perf_init, - .readsection = ref_perf_refcnt_section, + .readsection = ref_refcnt_section, + .delaysection = ref_refcnt_delay_section, .name = "refcnt" }; // Definitions for rwlock static rwlock_t test_rwlock; -static void ref_perf_rwlock_init(void) +static void ref_rwlock_init(void) { rwlock_init(&test_rwlock); } -static void ref_perf_rwlock_section(const int nloops) +static void ref_rwlock_section(const int nloops) { int i; @@ -195,21 +233,33 @@ static void ref_perf_rwlock_section(const int nloops) } } +static void ref_rwlock_delay_section(const int nloops, const int ndelay) +{ + int i; + + for (i = nloops; i >= 0; i--) { + read_lock(&test_rwlock); + udelay(ndelay); + read_unlock(&test_rwlock); + } +} + static struct ref_perf_ops rwlock_ops = { - .init = ref_perf_rwlock_init, - .readsection = ref_perf_rwlock_section, + .init = ref_rwlock_init, + .readsection = ref_rwlock_section, + .delaysection = ref_rwlock_delay_section, .name = "rwlock" }; // Definitions for rwsem static struct rw_semaphore test_rwsem; -static void ref_perf_rwsem_init(void) +static void ref_rwsem_init(void) { init_rwsem(&test_rwsem); } -static void ref_perf_rwsem_section(const int nloops) +static void ref_rwsem_section(const int nloops) { int i; @@ -219,12 +269,32 @@ static void ref_perf_rwsem_section(const int nloops) } } +static void ref_rwsem_delay_section(const int nloops, const int ndelay) +{ + int i; + + for (i = nloops; i >= 0; i--) { + down_read(&test_rwsem); + udelay(ndelay); + up_read(&test_rwsem); + } +} + static struct ref_perf_ops rwsem_ops = { - .init = ref_perf_rwsem_init, - .readsection = ref_perf_rwsem_section, + .init = ref_rwsem_init, + .readsection = ref_rwsem_section, + .delaysection = ref_rwsem_delay_section, .name = "rwsem" }; +static void rcu_perf_one_reader(void) +{ + if (readdelay <= 0) + cur_ops->readsection(loops); + else + 
cur_ops->delaysection(loops, readdelay); +} + // Reader kthread. Repeatedly does empty RCU read-side // critical section, minimizing update-side interference. static int @@ -265,16 +335,16 @@ repeat: // To reduce noise, do an initial cache-warming invocation, check // in, and then keep warming until everyone has checked in. - cur_ops->readsection(loops); + rcu_perf_one_reader(); if (!atomic_dec_return(&n_warmedup)) while (atomic_read_acquire(&n_warmedup)) - cur_ops->readsection(loops); + rcu_perf_one_reader(); // Also keep interrupts disabled. This also has the effect // of preventing entries into slow path for rcu_read_unlock(). local_irq_save(flags); start = ktime_get_mono_fast_ns(); - cur_ops->readsection(loops); + rcu_perf_one_reader(); duration = ktime_get_mono_fast_ns() - start; local_irq_restore(flags); @@ -284,7 +354,7 @@ repeat: // everyone is done. if (!atomic_dec_return(&n_cooleddown)) while (atomic_read_acquire(&n_cooleddown)) - cur_ops->readsection(loops); + rcu_perf_one_reader(); if (atomic_dec_and_test(&nreaders_exp)) wake_up(&main_wq); @@ -449,8 +519,8 @@ static void ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag) { pr_alert("%s" PERF_FLAG - "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d\n", perf_type, tag, - verbose, shutdown, holdoff, loops, nreaders, nruns); + "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", perf_type, tag, + verbose, shutdown, holdoff, loops, nreaders, nruns, readdelay); } static void From 4dd72a338a07486823037a6b45334d05192c913a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 29 May 2020 13:11:26 -0700 Subject: [PATCH 152/502] refperf: Adjust refperf.loop default value With the various measurement optimizations, 10,000 loops normally suffices. This commit therefore reduces the refperf.loops default value from 10,000,000 to 10,000. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/refperf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 57a750bbcaca..063eeb0473a1 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -61,7 +61,7 @@ torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_PERF_TEST) ? 10 : 0, "Holdoff time before test start (s)"); // Number of loops per experiment, all readers execute operations concurrently. -torture_param(long, loops, 10000000, "Number of loops per experiment."); +torture_param(long, loops, 10000, "Number of loops per experiment."); // Number of readers, with -1 defaulting to about 75% of the CPUs. torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs."); // Number of runs. From 847dd70aa971a67b4dfdb8f131428dfb90d88714 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 29 May 2020 14:24:03 -0700 Subject: [PATCH 153/502] doc: Document refperf's module parameters This commit adds documentation for the refperf module parameters. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- .../admin-guide/kernel-parameters.txt | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index fb95fad81c79..20cd00b78fc4 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4407,6 +4407,42 @@ reboot_cpu is s[mp]#### with #### being the processor to be used for rebooting. + refperf.holdoff= [KNL] + Set test-start holdoff period. The purpose of + this parameter is to delay the start of the + test until boot completes in order to avoid + interference. + + refperf.loops= [KNL] + Set the number of loops over the synchronization + primitive under test.
Increasing this number + reduces noise due to loop start/end overhead, + but the default has already reduced the per-pass + noise to a handful of picoseconds on ca. 2020 + x86 laptops. + + refperf.nreaders= [KNL] + Set number of readers. The default value of -1 + selects N, where N is roughly 75% of the number + of CPUs. A value of zero is an interesting choice. + + refperf.nruns= [KNL] + Set number of runs, each of which is dumped onto + the console log. + + refperf.readdelay= [KNL] + Set the read-side critical-section duration, + measured in microseconds. + + refperf.shutdown= [KNL] + Shut down the system at the end of the performance + test. This defaults to 1 (shut it down) when + rcuperf is built into the kernel and to 0 (leave + it running) when rcuperf is built as a module. + + refperf.verbose= [KNL] + Enable additional printk() statements. + relax_domain_level= [KNL, SMP] Set scheduler's default relax_domain_level. See Documentation/admin-guide/cgroup-v1/cpusets.rst. From 7c944d7c67daee84e3c756bb74ad2f32b28c41cf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 29 May 2020 14:36:26 -0700 Subject: [PATCH 154/502] refperf: Work around 64-bit division MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A 64-bit division was introduced in refperf, breaking compilation on all 32-bit architectures: kernel/rcu/refperf.o: in function `main_func': refperf.c:(.text+0x57c): undefined reference to `__aeabi_uldivmod' Fix this by using div_u64 to mark the expensive operation. [ paulmck: Update primitive and format per Nathan Chancellor. ] Fixes: bd5b16d6c88d ("refperf: Allow decimal nanoseconds") Reported-by: kbuild test robot Reported-by: Valdis Klētnieks Acked-by: Randy Dunlap # build-tested Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. 
McKenney --- kernel/rcu/refperf.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 063eeb0473a1..80d449060bdf 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -478,7 +478,7 @@ static int main_func(void *arg) if (torture_must_stop()) goto end; - result_avg[exp] = 1000 * process_durations(nreaders) / (nreaders * loops); + result_avg[exp] = div_u64(1000 * process_durations(nreaders), nreaders * loops); } // Print the average of all experiments @@ -489,9 +489,13 @@ static int main_func(void *arg) strcat(buf, "Runs\tTime(ns)\n"); for (exp = 0; exp < nruns; exp++) { + u64 avg; + u32 rem; + if (errexit) break; - sprintf(buf1, "%d\t%llu.%03d\n", exp + 1, result_avg[exp] / 1000, (int)(result_avg[exp] % 1000)); + avg = div_u64_rem(result_avg[exp], 1000, &rem); + sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem); strcat(buf, buf1); } From 918b351d965560c7902ad482cf87049517843ff2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 31 May 2020 18:14:57 -0700 Subject: [PATCH 155/502] refperf: Change readdelay module parameter to nanoseconds The current units of microseconds are too coarse, so this commit changes the units to nanoseconds. However, ndelay is used only for the nanoseconds with udelay being used for whole microseconds. For example, setting refperf.readdelay=1500 results in a udelay(1) followed by an ndelay(500). Suggested-by: Akira Yokosawa [ paulmck: Abstracted delay per Akira feedback and move from 80 to 100 lines. ] [ paulmck: Fix names as suggested by kbuild test robot. ] Signed-off-by: Paul E. 
McKenney --- kernel/rcu/refperf.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 80d449060bdf..49fffb9bce77 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -66,8 +66,8 @@ torture_param(long, loops, 10000, "Number of loops per experiment."); torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs."); // Number of runs. torture_param(int, nruns, 30, "Number of experiments to run."); -// Reader delay in microseconds, 0 for no delay. -torture_param(int, readdelay, 0, "Read-side delay in microseconds."); +// Reader delay in nanoseconds, 0 for no delay. +torture_param(int, readdelay, 0, "Read-side delay in nanoseconds."); #ifdef MODULE # define REFPERF_SHUTDOWN 0 @@ -111,12 +111,20 @@ struct ref_perf_ops { void (*init)(void); void (*cleanup)(void); void (*readsection)(const int nloops); - void (*delaysection)(const int nloops, const int ndelay); + void (*delaysection)(const int nloops, const int udl, const int ndl); const char *name; }; static struct ref_perf_ops *cur_ops; +static void un_delay(const int udl, const int ndl) +{ + if (udl) + udelay(udl); + if (ndl) + ndelay(ndl); +} + static void ref_rcu_read_section(const int nloops) { int i; @@ -127,13 +135,13 @@ static void ref_rcu_read_section(const int nloops) } } -static void ref_rcu_delay_section(const int nloops, const int ndelay) +static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl) { int i; for (i = nloops; i >= 0; i--) { rcu_read_lock(); - udelay(ndelay); + un_delay(udl, ndl); rcu_read_unlock(); } } @@ -165,14 +173,14 @@ static void srcu_ref_perf_read_section(const int nloops) } } -static void srcu_ref_perf_delay_section(const int nloops, const int ndelay) +static void srcu_ref_perf_delay_section(const int nloops, const int udl, const int ndl) { int i; int idx; for (i = nloops; i >= 0; i--) { idx = srcu_read_lock(srcu_ctlp); - udelay(ndelay); + 
un_delay(udl, ndl); srcu_read_unlock(srcu_ctlp, idx); } } @@ -197,13 +205,13 @@ static void ref_refcnt_section(const int nloops) } } -static void ref_refcnt_delay_section(const int nloops, const int ndelay) +static void ref_refcnt_delay_section(const int nloops, const int udl, const int ndl) { int i; for (i = nloops; i >= 0; i--) { atomic_inc(&refcnt); - udelay(ndelay); + un_delay(udl, ndl); atomic_dec(&refcnt); } } @@ -233,13 +241,13 @@ static void ref_rwlock_section(const int nloops) } } -static void ref_rwlock_delay_section(const int nloops, const int ndelay) +static void ref_rwlock_delay_section(const int nloops, const int udl, const int ndl) { int i; for (i = nloops; i >= 0; i--) { read_lock(&test_rwlock); - udelay(ndelay); + un_delay(udl, ndl); read_unlock(&test_rwlock); } } @@ -269,13 +277,13 @@ static void ref_rwsem_section(const int nloops) } } -static void ref_rwsem_delay_section(const int nloops, const int ndelay) +static void ref_rwsem_delay_section(const int nloops, const int udl, const int ndl) { int i; for (i = nloops; i >= 0; i--) { down_read(&test_rwsem); - udelay(ndelay); + un_delay(udl, ndl); up_read(&test_rwsem); } } @@ -292,7 +300,7 @@ static void rcu_perf_one_reader(void) if (readdelay <= 0) cur_ops->readsection(loops); else - cur_ops->delaysection(loops, readdelay); + cur_ops->delaysection(loops, readdelay / 1000, readdelay % 1000); } // Reader kthread. Repeatedly does empty RCU read-side From 72bb749e7048d0a8d7663b59ec1a33bd56c51083 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 2 Jun 2020 08:34:41 -0700 Subject: [PATCH 156/502] refperf: Add test for RCU Tasks Trace readers. This commit adds testing for RCU Tasks Trace readers to the refperf module. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/refperf.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 49fffb9bce77..da7de9ac548d 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -157,7 +158,6 @@ static struct ref_perf_ops rcu_ops = { .name = "rcu" }; - // Definitions for SRCU ref perf testing. DEFINE_STATIC_SRCU(srcu_refctl_perf); static struct srcu_struct *srcu_ctlp = &srcu_refctl_perf; @@ -192,6 +192,35 @@ static struct ref_perf_ops srcu_ops = { .name = "srcu" }; +// Definitions for RCU Tasks Trace ref perf testing. +static void rcu_trace_ref_perf_read_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock_trace(); + rcu_read_unlock_trace(); + } +} + +static void rcu_trace_ref_perf_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock_trace(); + un_delay(udl, ndl); + rcu_read_unlock_trace(); + } +} + +static struct ref_perf_ops rcu_trace_ops = { + .init = rcu_sync_perf_init, + .readsection = rcu_trace_ref_perf_read_section, + .delaysection = rcu_trace_ref_perf_delay_section, + .name = "rcu-trace" +}; + // Definitions for reference count static atomic_t refcnt; @@ -584,7 +613,7 @@ ref_perf_init(void) long i; int firsterr = 0; static struct ref_perf_ops *perf_ops[] = { - &rcu_ops, &srcu_ops, &refcnt_ops, &rwlock_ops, &rwsem_ops, + &rcu_ops, &srcu_ops, &rcu_trace_ops, &refcnt_ops, &rwlock_ops, &rwsem_ops, }; if (!torture_init_begin(perf_type, verbose)) From e13ef442fe522fa1f604efec8c899a0e1fc3d426 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Jun 2020 11:56:34 -0700 Subject: [PATCH 157/502] refperf: Add test for RCU Tasks readers This commit adds testing for RCU Tasks readers to the refperf module. 
This also applies to RCU Rude readers, as both flavors have empty (as in non-existent) read-side markers. Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index da7de9ac548d..2bfdcdcb6bd1 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -192,6 +192,31 @@ static struct ref_perf_ops srcu_ops = { .name = "srcu" }; +// Definitions for RCU Tasks ref perf testing: Empty read markers. +// These definitions also work for RCU Rude readers. +static void rcu_tasks_ref_perf_read_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) + continue; +} + +static void rcu_tasks_ref_perf_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) + un_delay(udl, ndl); +} + +static struct ref_perf_ops rcu_tasks_ops = { + .init = rcu_sync_perf_init, + .readsection = rcu_tasks_ref_perf_read_section, + .delaysection = rcu_tasks_ref_perf_delay_section, + .name = "rcu-tasks" +}; + // Definitions for RCU Tasks Trace ref perf testing. static void rcu_trace_ref_perf_read_section(const int nloops) { @@ -613,7 +638,8 @@ ref_perf_init(void) long i; int firsterr = 0; static struct ref_perf_ops *perf_ops[] = { - &rcu_ops, &srcu_ops, &rcu_trace_ops, &refcnt_ops, &rwlock_ops, &rwsem_ops, + &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, + &refcnt_ops, &rwlock_ops, &rwsem_ops, }; if (!torture_init_begin(perf_type, verbose)) From c7dcf8106f7570b133b05ff68fd4100064965d9d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 12 Jun 2020 13:11:29 -0700 Subject: [PATCH 158/502] rcu-tasks: Fix synchronize_rcu_tasks_trace() header comment The synchronize_rcu_tasks_trace() header comment incorrectly claims that any number of things delimit RCU Tasks Trace read-side critical sections, when in fact only rcu_read_lock_trace() and rcu_read_unlock_trace() do so. 
This commit therefore fixes this comment, and, while in the area, fixes a typo in the rcu_read_lock_trace() header comment. Reported-by: Alexei Starovoitov Signed-off-by: Paul E. McKenney --- include/linux/rcupdate_trace.h | 4 ++-- kernel/rcu/tasks.h | 9 ++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h index 4c25a41f8b27..d9015aac78c6 100644 --- a/include/linux/rcupdate_trace.h +++ b/include/linux/rcupdate_trace.h @@ -36,8 +36,8 @@ void rcu_read_unlock_trace_special(struct task_struct *t, int nesting); /** * rcu_read_lock_trace - mark beginning of RCU-trace read-side critical section * - * When synchronize_rcu_trace() is invoked by one task, then that task - * is guaranteed to block until all other tasks exit their read-side + * When synchronize_rcu_tasks_trace() is invoked by one task, then that + * task is guaranteed to block until all other tasks exit their read-side * critical sections. Similarly, if call_rcu_trace() is invoked on one * task while other tasks are within RCU read-side critical sections, * invocation of the corresponding RCU callback is deferred until after diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index ce23f6cc5043..a77298c1d126 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1118,11 +1118,10 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks_trace); * synchronize_rcu_tasks_trace - wait for a trace rcu-tasks grace period * * Control will return to the caller some time after a trace rcu-tasks - * grace period has elapsed, in other words after all currently - * executing rcu-tasks read-side critical sections have elapsed. These - * read-side critical sections are delimited by calls to schedule(), - * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory, - * anyway) cond_resched(). + * grace period has elapsed, in other words after all currently executing + * rcu-tasks read-side critical sections have elapsed. 
These read-side + * critical sections are delimited by calls to rcu_read_lock_trace() + * and rcu_read_unlock_trace(). * * This is a very specialized primitive, intended only for a few uses in * tracing and other situations requiring manipulation of function preambles From 8e4ec3d02b549a731c94b4bcddff212bb92cdbaf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Jun 2020 11:33:54 -0700 Subject: [PATCH 159/502] refperf: Rename RCU_REF_PERF_TEST to RCU_REF_SCALE_TEST The old Kconfig option name is all too easy to conflate with the unrelated "perf" feature, so this commit renames RCU_REF_PERF_TEST to RCU_REF_SCALE_TEST. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig.debug | 4 ++-- kernel/rcu/Makefile | 2 +- kernel/rcu/refperf.c | 6 +++--- tools/testing/selftests/rcutorture/configs/refperf/CFcommon | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 858765b7f644..3cf6132a4bb9 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -61,8 +61,8 @@ config RCU_TORTURE_TEST Say M if you want the RCU torture tests to build as a module. Say N if you are unsure. 
-config RCU_REF_PERF_TEST - tristate "Performance tests for read-side synchronization (RCU and others)" +config RCU_REF_SCALE_TEST + tristate "Scalability tests for read-side synchronization (RCU and others)" depends on DEBUG_KERNEL select TORTURE_TEST select SRCU diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index ba7d82609cbe..45d562de279a 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -12,7 +12,7 @@ obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o -obj-$(CONFIG_RCU_REF_PERF_TEST) += refperf.o +obj-$(CONFIG_RCU_REF_SCALE_TEST) += refperf.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_TINY_RCU) += tiny.o obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 2bfdcdcb6bd1..7c980573acbe 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ // -// Performance test comparing RCU vs other mechanisms +// Scalability test comparing RCU vs other mechanisms // for acquiring references on objects. // // Copyright (C) Google, 2020. @@ -59,7 +59,7 @@ MODULE_PARM_DESC(perf_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock."); torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); // Wait until there are multiple CPUs before starting test. -torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_PERF_TEST) ? 10 : 0, +torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0, "Holdoff time before test start (s)"); // Number of loops per experiment, all readers execute operations concurrently. 
torture_param(long, loops, 10000, "Number of loops per experiment."); @@ -656,7 +656,7 @@ ref_perf_init(void) for (i = 0; i < ARRAY_SIZE(perf_ops); i++) pr_cont(" %s", perf_ops[i]->name); pr_cont("\n"); - WARN_ON(!IS_MODULE(CONFIG_RCU_REF_PERF_TEST)); + WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST)); firsterr = -EINVAL; cur_ops = NULL; goto unwind; diff --git a/tools/testing/selftests/rcutorture/configs/refperf/CFcommon b/tools/testing/selftests/rcutorture/configs/refperf/CFcommon index 8ba5ba207503..a98b58b54bb1 100644 --- a/tools/testing/selftests/rcutorture/configs/refperf/CFcommon +++ b/tools/testing/selftests/rcutorture/configs/refperf/CFcommon @@ -1,2 +1,2 @@ -CONFIG_RCU_REF_PERF_TEST=y +CONFIG_RCU_REF_SCALE_TEST=y CONFIG_PRINTK_TIME=y From 1fbeb3a8c4de29433a8d230ee600b13d369b6c0f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Jun 2020 11:53:53 -0700 Subject: [PATCH 160/502] refperf: Rename refperf.c to refscale.c and change internal names This commit further avoids conflation of refperf with the kernel's perf feature by renaming kernel/rcu/refperf.c to kernel/rcu/refscale.c, and also by similarly renaming the functions and variables inside this file. This has the side effect of changing the names of the kernel boot parameters, so kernel-parameters.txt and ver_functions.sh are also updated. The rcutorture --torture type remains refperf, and this will be addressed in a separate commit. Reported-by: Ingo Molnar Signed-off-by: Paul E. 
McKenney --- .../admin-guide/kernel-parameters.txt | 17 +- kernel/rcu/Makefile | 2 +- kernel/rcu/{refperf.c => refscale.c} | 182 +++++++++--------- .../configs/refperf/ver_functions.sh | 4 +- 4 files changed, 104 insertions(+), 101 deletions(-) rename kernel/rcu/{refperf.c => refscale.c} (74%) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 20cd00b78fc4..a4e4e0f6a550 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4407,13 +4407,13 @@ reboot_cpu is s[mp]#### with #### being the processor to be used for rebooting. - refperf.holdoff= [KNL] + refscale.holdoff= [KNL] Set test-start holdoff period. The purpose of this parameter is to delay the start of the test until boot completes in order to avoid interference. - refperf.loops= [KNL] + refscale.loops= [KNL] Set the number of loops over the synchronization primitive under test. Increasing this number reduces noise due to loop start/end overhead, @@ -4421,26 +4421,29 @@ noise to a handful of picoseconds on ca. 2020 x86 laptops. - refperf.nreaders= [KNL] + refscale.nreaders= [KNL] Set number of readers. The default value of -1 selects N, where N is roughly 75% of the number of CPUs. A value of zero is an interesting choice. - refperf.nruns= [KNL] + refscale.nruns= [KNL] Set number of runs, each of which is dumped onto the console log. - refperf.readdelay= [KNL] + refscale.readdelay= [KNL] Set the read-side critical-section duration, measured in microseconds. - refperf.shutdown= [KNL] + refscale.scale_type= [KNL] + Specify the read-protection implementation to test. + + refscale.shutdown= [KNL] Shut down the system at the end of the performance test. This defaults to 1 (shut it down) when rcuperf is built into the kernel and to 0 (leave it running) when rcuperf is built as a module. - refperf.verbose= [KNL] + refscale.verbose= [KNL] Enable additional printk() statements. 
relax_domain_level= diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 45d562de279a..95f5117ef8da 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -12,7 +12,7 @@ obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o -obj-$(CONFIG_RCU_REF_SCALE_TEST) += refperf.o +obj-$(CONFIG_RCU_REF_SCALE_TEST) += refscale.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_TINY_RCU) += tiny.o obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refscale.c similarity index 74% rename from kernel/rcu/refperf.c rename to kernel/rcu/refscale.c index 7c980573acbe..d9291f883b54 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refscale.c @@ -38,23 +38,23 @@ #include "rcu.h" -#define PERF_FLAG "-ref-perf: " +#define SCALE_FLAG "-ref-scale: " -#define PERFOUT(s, x...) \ - pr_alert("%s" PERF_FLAG s, perf_type, ## x) +#define SCALEOUT(s, x...) \ + pr_alert("%s" SCALE_FLAG s, scale_type, ## x) -#define VERBOSE_PERFOUT(s, x...) \ - do { if (verbose) pr_alert("%s" PERF_FLAG s, perf_type, ## x); } while (0) +#define VERBOSE_SCALEOUT(s, x...) \ + do { if (verbose) pr_alert("%s" SCALE_FLAG s, scale_type, ## x); } while (0) -#define VERBOSE_PERFOUT_ERRSTRING(s, x...) \ - do { if (verbose) pr_alert("%s" PERF_FLAG "!!! " s, perf_type, ## x); } while (0) +#define VERBOSE_SCALEOUT_ERRSTRING(s, x...) \ + do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! 
" s, scale_type, ## x); } while (0) MODULE_LICENSE("GPL"); MODULE_AUTHOR("Joel Fernandes (Google) "); -static char *perf_type = "rcu"; -module_param(perf_type, charp, 0444); -MODULE_PARM_DESC(perf_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock."); +static char *scale_type = "rcu"; +module_param(scale_type, charp, 0444); +MODULE_PARM_DESC(scale_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock."); torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); @@ -71,13 +71,13 @@ torture_param(int, nruns, 30, "Number of experiments to run."); torture_param(int, readdelay, 0, "Read-side delay in nanoseconds."); #ifdef MODULE -# define REFPERF_SHUTDOWN 0 +# define REFSCALE_SHUTDOWN 0 #else -# define REFPERF_SHUTDOWN 1 +# define REFSCALE_SHUTDOWN 1 #endif -torture_param(bool, shutdown, REFPERF_SHUTDOWN, - "Shutdown at end of performance tests."); +torture_param(bool, shutdown, REFSCALE_SHUTDOWN, + "Shutdown at end of scalability tests."); struct reader_task { struct task_struct *task; @@ -108,7 +108,7 @@ static atomic_t n_cooleddown; static int exp_idx; // Operations vector for selecting different types of tests. -struct ref_perf_ops { +struct ref_scale_ops { void (*init)(void); void (*cleanup)(void); void (*readsection)(const int nloops); @@ -116,7 +116,7 @@ struct ref_perf_ops { const char *name; }; -static struct ref_perf_ops *cur_ops; +static struct ref_scale_ops *cur_ops; static void un_delay(const int udl, const int ndl) { @@ -147,22 +147,22 @@ static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl } } -static void rcu_sync_perf_init(void) +static void rcu_sync_scale_init(void) { } -static struct ref_perf_ops rcu_ops = { - .init = rcu_sync_perf_init, +static struct ref_scale_ops rcu_ops = { + .init = rcu_sync_scale_init, .readsection = ref_rcu_read_section, .delaysection = ref_rcu_delay_section, .name = "rcu" }; -// Definitions for SRCU ref perf testing. 
-DEFINE_STATIC_SRCU(srcu_refctl_perf); -static struct srcu_struct *srcu_ctlp = &srcu_refctl_perf; +// Definitions for SRCU ref scale testing. +DEFINE_STATIC_SRCU(srcu_refctl_scale); +static struct srcu_struct *srcu_ctlp = &srcu_refctl_scale; -static void srcu_ref_perf_read_section(const int nloops) +static void srcu_ref_scale_read_section(const int nloops) { int i; int idx; @@ -173,7 +173,7 @@ static void srcu_ref_perf_read_section(const int nloops) } } -static void srcu_ref_perf_delay_section(const int nloops, const int udl, const int ndl) +static void srcu_ref_scale_delay_section(const int nloops, const int udl, const int ndl) { int i; int idx; @@ -185,16 +185,16 @@ static void srcu_ref_perf_delay_section(const int nloops, const int udl, const i } } -static struct ref_perf_ops srcu_ops = { - .init = rcu_sync_perf_init, - .readsection = srcu_ref_perf_read_section, - .delaysection = srcu_ref_perf_delay_section, +static struct ref_scale_ops srcu_ops = { + .init = rcu_sync_scale_init, + .readsection = srcu_ref_scale_read_section, + .delaysection = srcu_ref_scale_delay_section, .name = "srcu" }; -// Definitions for RCU Tasks ref perf testing: Empty read markers. +// Definitions for RCU Tasks ref scale testing: Empty read markers. // These definitions also work for RCU Rude readers. 
-static void rcu_tasks_ref_perf_read_section(const int nloops) +static void rcu_tasks_ref_scale_read_section(const int nloops) { int i; @@ -202,7 +202,7 @@ static void rcu_tasks_ref_perf_read_section(const int nloops) continue; } -static void rcu_tasks_ref_perf_delay_section(const int nloops, const int udl, const int ndl) +static void rcu_tasks_ref_scale_delay_section(const int nloops, const int udl, const int ndl) { int i; @@ -210,15 +210,15 @@ static void rcu_tasks_ref_perf_delay_section(const int nloops, const int udl, co un_delay(udl, ndl); } -static struct ref_perf_ops rcu_tasks_ops = { - .init = rcu_sync_perf_init, - .readsection = rcu_tasks_ref_perf_read_section, - .delaysection = rcu_tasks_ref_perf_delay_section, +static struct ref_scale_ops rcu_tasks_ops = { + .init = rcu_sync_scale_init, + .readsection = rcu_tasks_ref_scale_read_section, + .delaysection = rcu_tasks_ref_scale_delay_section, .name = "rcu-tasks" }; -// Definitions for RCU Tasks Trace ref perf testing. -static void rcu_trace_ref_perf_read_section(const int nloops) +// Definitions for RCU Tasks Trace ref scale testing. 
+static void rcu_trace_ref_scale_read_section(const int nloops) { int i; @@ -228,7 +228,7 @@ static void rcu_trace_ref_perf_read_section(const int nloops) } } -static void rcu_trace_ref_perf_delay_section(const int nloops, const int udl, const int ndl) +static void rcu_trace_ref_scale_delay_section(const int nloops, const int udl, const int ndl) { int i; @@ -239,10 +239,10 @@ static void rcu_trace_ref_perf_delay_section(const int nloops, const int udl, co } } -static struct ref_perf_ops rcu_trace_ops = { - .init = rcu_sync_perf_init, - .readsection = rcu_trace_ref_perf_read_section, - .delaysection = rcu_trace_ref_perf_delay_section, +static struct ref_scale_ops rcu_trace_ops = { + .init = rcu_sync_scale_init, + .readsection = rcu_trace_ref_scale_read_section, + .delaysection = rcu_trace_ref_scale_delay_section, .name = "rcu-trace" }; @@ -270,8 +270,8 @@ static void ref_refcnt_delay_section(const int nloops, const int udl, const int } } -static struct ref_perf_ops refcnt_ops = { - .init = rcu_sync_perf_init, +static struct ref_scale_ops refcnt_ops = { + .init = rcu_sync_scale_init, .readsection = ref_refcnt_section, .delaysection = ref_refcnt_delay_section, .name = "refcnt" @@ -306,7 +306,7 @@ static void ref_rwlock_delay_section(const int nloops, const int udl, const int } } -static struct ref_perf_ops rwlock_ops = { +static struct ref_scale_ops rwlock_ops = { .init = ref_rwlock_init, .readsection = ref_rwlock_section, .delaysection = ref_rwlock_delay_section, @@ -342,14 +342,14 @@ static void ref_rwsem_delay_section(const int nloops, const int udl, const int n } } -static struct ref_perf_ops rwsem_ops = { +static struct ref_scale_ops rwsem_ops = { .init = ref_rwsem_init, .readsection = ref_rwsem_section, .delaysection = ref_rwsem_delay_section, .name = "rwsem" }; -static void rcu_perf_one_reader(void) +static void rcu_scale_one_reader(void) { if (readdelay <= 0) cur_ops->readsection(loops); @@ -360,7 +360,7 @@ static void rcu_perf_one_reader(void) // Reader 
kthread. Repeatedly does empty RCU read-side // critical section, minimizing update-side interference. static int -ref_perf_reader(void *arg) +ref_scale_reader(void *arg) { unsigned long flags; long me = (long)arg; @@ -368,14 +368,14 @@ ref_perf_reader(void *arg) u64 start; s64 duration; - VERBOSE_PERFOUT("ref_perf_reader %ld: task started", me); + VERBOSE_SCALEOUT("ref_scale_reader %ld: task started", me); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); set_user_nice(current, MAX_NICE); atomic_inc(&n_init); if (holdoff) schedule_timeout_interruptible(holdoff * HZ); repeat: - VERBOSE_PERFOUT("ref_perf_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id()); + VERBOSE_SCALEOUT("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id()); // Wait for signal that this reader can start. wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) || @@ -392,21 +392,21 @@ repeat: while (atomic_read_acquire(&n_started)) cpu_relax(); - VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx); + VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d started", me, exp_idx); // To reduce noise, do an initial cache-warming invocation, check // in, and then keep warming until everyone has checked in. - rcu_perf_one_reader(); + rcu_scale_one_reader(); if (!atomic_dec_return(&n_warmedup)) while (atomic_read_acquire(&n_warmedup)) - rcu_perf_one_reader(); + rcu_scale_one_reader(); // Also keep interrupts disabled. This also has the effect // of preventing entries into slow path for rcu_read_unlock(). local_irq_save(flags); start = ktime_get_mono_fast_ns(); - rcu_perf_one_reader(); + rcu_scale_one_reader(); duration = ktime_get_mono_fast_ns() - start; local_irq_restore(flags); @@ -416,18 +416,18 @@ repeat: // everyone is done. 
if (!atomic_dec_return(&n_cooleddown)) while (atomic_read_acquire(&n_cooleddown)) - rcu_perf_one_reader(); + rcu_scale_one_reader(); if (atomic_dec_and_test(&nreaders_exp)) wake_up(&main_wq); - VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d ended, (readers remaining=%d)", + VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d ended, (readers remaining=%d)", me, exp_idx, atomic_read(&nreaders_exp)); if (!torture_must_stop()) goto repeat; end: - torture_kthread_stopping("ref_perf_reader"); + torture_kthread_stopping("ref_scale_reader"); return 0; } @@ -471,7 +471,7 @@ static u64 process_durations(int n) } strcat(buf, "\n"); - PERFOUT("%s\n", buf); + SCALEOUT("%s\n", buf); kfree(buf); return sum; @@ -494,11 +494,11 @@ static int main_func(void *arg) set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids)); set_user_nice(current, MAX_NICE); - VERBOSE_PERFOUT("main_func task started"); + VERBOSE_SCALEOUT("main_func task started"); result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL); buf = kzalloc(64 + nruns * 32, GFP_KERNEL); if (!result_avg || !buf) { - VERBOSE_PERFOUT_ERRSTRING("out of memory"); + VERBOSE_SCALEOUT_ERRSTRING("out of memory"); errexit = true; } if (holdoff) @@ -529,13 +529,13 @@ static int main_func(void *arg) wake_up(&reader_tasks[r].wq); } - VERBOSE_PERFOUT("main_func: experiment started, waiting for %d readers", + VERBOSE_SCALEOUT("main_func: experiment started, waiting for %d readers", nreaders); wait_event(main_wq, !atomic_read(&nreaders_exp) || torture_must_stop()); - VERBOSE_PERFOUT("main_func: experiment ended"); + VERBOSE_SCALEOUT("main_func: experiment ended"); if (torture_must_stop()) goto end; @@ -544,7 +544,7 @@ static int main_func(void *arg) } // Print the average of all experiments - PERFOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n"); + SCALEOUT("END OF TEST. 
Calculating average duration per loop (nanoseconds)...\n"); buf[0] = 0; strcat(buf, "\n"); @@ -562,7 +562,7 @@ static int main_func(void *arg) } if (!errexit) - PERFOUT("%s", buf); + SCALEOUT("%s", buf); // This will shutdown everything including us. if (shutdown) { @@ -582,15 +582,15 @@ end: } static void -ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag) +ref_scale_print_module_parms(struct ref_scale_ops *cur_ops, const char *tag) { - pr_alert("%s" PERF_FLAG - "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", perf_type, tag, + pr_alert("%s" SCALE_FLAG + "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, verbose, shutdown, holdoff, loops, nreaders, nruns, readdelay); } static void -ref_perf_cleanup(void) +ref_scale_cleanup(void) { int i; @@ -604,7 +604,7 @@ ref_perf_cleanup(void) if (reader_tasks) { for (i = 0; i < nreaders; i++) - torture_stop_kthread("ref_perf_reader", + torture_stop_kthread("ref_scale_reader", reader_tasks[i].task); } kfree(reader_tasks); @@ -612,7 +612,7 @@ ref_perf_cleanup(void) torture_stop_kthread("main_task", main_task); kfree(main_task); - // Do perf-type-specific cleanup operations. + // Do scale-type-specific cleanup operations. if (cur_ops->cleanup != NULL) cur_ops->cleanup(); @@ -621,40 +621,40 @@ ref_perf_cleanup(void) // Shutdown kthread. Just waits to be awakened, then shuts down system. static int -ref_perf_shutdown(void *arg) +ref_scale_shutdown(void *arg) { wait_event(shutdown_wq, shutdown_start); smp_mb(); // Wake before output. 
- ref_perf_cleanup(); + ref_scale_cleanup(); kernel_power_off(); return -EINVAL; } static int __init -ref_perf_init(void) +ref_scale_init(void) { long i; int firsterr = 0; - static struct ref_perf_ops *perf_ops[] = { + static struct ref_scale_ops *scale_ops[] = { &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, &refcnt_ops, &rwlock_ops, &rwsem_ops, }; - if (!torture_init_begin(perf_type, verbose)) + if (!torture_init_begin(scale_type, verbose)) return -EBUSY; - for (i = 0; i < ARRAY_SIZE(perf_ops); i++) { - cur_ops = perf_ops[i]; - if (strcmp(perf_type, cur_ops->name) == 0) + for (i = 0; i < ARRAY_SIZE(scale_ops); i++) { + cur_ops = scale_ops[i]; + if (strcmp(scale_type, cur_ops->name) == 0) break; } - if (i == ARRAY_SIZE(perf_ops)) { - pr_alert("rcu-perf: invalid perf type: \"%s\"\n", perf_type); - pr_alert("rcu-perf types:"); - for (i = 0; i < ARRAY_SIZE(perf_ops); i++) - pr_cont(" %s", perf_ops[i]->name); + if (i == ARRAY_SIZE(scale_ops)) { + pr_alert("rcu-scale: invalid scale type: \"%s\"\n", scale_type); + pr_alert("rcu-scale types:"); + for (i = 0; i < ARRAY_SIZE(scale_ops); i++) + pr_cont(" %s", scale_ops[i]->name); pr_cont("\n"); WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST)); firsterr = -EINVAL; @@ -664,12 +664,12 @@ ref_perf_init(void) if (cur_ops->init) cur_ops->init(); - ref_perf_print_module_parms(cur_ops, "Start of test"); + ref_scale_print_module_parms(cur_ops, "Start of test"); // Shutdown task if (shutdown) { init_waitqueue_head(&shutdown_wq); - firsterr = torture_create_kthread(ref_perf_shutdown, NULL, + firsterr = torture_create_kthread(ref_scale_shutdown, NULL, shutdown_task); if (firsterr) goto unwind; @@ -682,15 +682,15 @@ ref_perf_init(void) reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]), GFP_KERNEL); if (!reader_tasks) { - VERBOSE_PERFOUT_ERRSTRING("out of memory"); + VERBOSE_SCALEOUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } - VERBOSE_PERFOUT("Starting %d reader threads\n", nreaders); + 
VERBOSE_SCALEOUT("Starting %d reader threads\n", nreaders); for (i = 0; i < nreaders; i++) { - firsterr = torture_create_kthread(ref_perf_reader, (void *)i, + firsterr = torture_create_kthread(ref_scale_reader, (void *)i, reader_tasks[i].task); if (firsterr) goto unwind; @@ -709,9 +709,9 @@ ref_perf_init(void) unwind: torture_init_end(); - ref_perf_cleanup(); + ref_scale_cleanup(); return firsterr; } -module_init(ref_perf_init); -module_exit(ref_perf_cleanup); +module_init(ref_scale_init); +module_exit(ref_scale_cleanup); diff --git a/tools/testing/selftests/rcutorture/configs/refperf/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/refperf/ver_functions.sh index 489f05dd929a..321e82641287 100644 --- a/tools/testing/selftests/rcutorture/configs/refperf/ver_functions.sh +++ b/tools/testing/selftests/rcutorture/configs/refperf/ver_functions.sh @@ -11,6 +11,6 @@ # # Adds per-version torture-module parameters to kernels supporting them. per_version_boot_params () { - echo $1 refperf.shutdown=1 \ - refperf.verbose=1 + echo $1 refscale.shutdown=1 \ + refscale.verbose=1 } From f71d8311ec278525508dac211de700b2b682a15f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Jun 2020 12:06:47 -0700 Subject: [PATCH 161/502] refscale: Change --torture type from refperf to refscale This commit renames the rcutorture config/refperf to config/refscale to further avoid conflation with the Linux kernel's perf feature. Reported-by: Ingo Molnar Signed-off-by: Paul E. 
McKenney --- .../{kvm-recheck-refperf.sh => kvm-recheck-refscale.sh} | 8 ++++---- tools/testing/selftests/rcutorture/bin/kvm.sh | 8 ++++---- tools/testing/selftests/rcutorture/bin/parse-console.sh | 4 ++-- .../rcutorture/configs/{refperf => refscale}/CFLIST | 0 .../rcutorture/configs/{refperf => refscale}/CFcommon | 0 .../rcutorture/configs/{refperf => refscale}/NOPREEMPT | 0 .../rcutorture/configs/{refperf => refscale}/PREEMPT | 0 .../configs/{refperf => refscale}/ver_functions.sh | 0 8 files changed, 10 insertions(+), 10 deletions(-) rename tools/testing/selftests/rcutorture/bin/{kvm-recheck-refperf.sh => kvm-recheck-refscale.sh} (87%) rename tools/testing/selftests/rcutorture/configs/{refperf => refscale}/CFLIST (100%) rename tools/testing/selftests/rcutorture/configs/{refperf => refscale}/CFcommon (100%) rename tools/testing/selftests/rcutorture/configs/{refperf => refscale}/NOPREEMPT (100%) rename tools/testing/selftests/rcutorture/configs/{refperf => refscale}/PREEMPT (100%) rename tools/testing/selftests/rcutorture/configs/{refperf => refscale}/ver_functions.sh (100%) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-refscale.sh similarity index 87% rename from tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh rename to tools/testing/selftests/rcutorture/bin/kvm-recheck-refscale.sh index 0e29cfd9986c..35a463dddffe 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-refscale.sh @@ -1,9 +1,9 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0+ # -# Analyze a given results directory for refperf performance measurements. +# Analyze a given results directory for refscale performance measurements. 
# -# Usage: kvm-recheck-refperf.sh resdir +# Usage: kvm-recheck-refscale.sh resdir # # Copyright (C) IBM Corporation, 2016 # @@ -51,7 +51,7 @@ END { print configfile " results:"; newNR = asort(readertimes); if (newNR <= 0) { - print "No refperf records found???" + print "No refscale records found???" exit; } medianidx = int(newNR / 2); @@ -67,5 +67,5 @@ END { print "Minimum reader duration: " readertimes[1]; print "Median reader duration: " medianvalue; print "Maximum reader duration: " readertimes[newNR]; - print "Computed from refperf printk output."; + print "Computed from refscale printk output."; }' diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 48b6a7248f50..ce05db324057 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -180,14 +180,14 @@ do shift ;; --torture) - checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuperf\|refperf\)$' '^--' + checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuperf\|refscale\)$' '^--' TORTURE_SUITE=$2 shift - if test "$TORTURE_SUITE" = rcuperf || test "$TORTURE_SUITE" = refperf + if test "$TORTURE_SUITE" = rcuperf || test "$TORTURE_SUITE" = refscale then - # If you really want jitter for refperf or + # If you really want jitter for refscale or # rcuperf, specify it after specifying the rcuperf - # or the refperf. (But why jitter in these cases?) + # or the refscale. (But why jitter in these cases?) jitter=0 fi ;; diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh index 85af11d2d0cb..8cb908fb852b 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-console.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh @@ -33,8 +33,8 @@ then fi cat /dev/null > $file.diags -# Check for proper termination, except for rcuperf and refperf. 
-if test "$TORTURE_SUITE" != rcuperf && test "$TORTURE_SUITE" != refperf +# Check for proper termination, except for rcuperf and refscale. +if test "$TORTURE_SUITE" != rcuperf && test "$TORTURE_SUITE" != refscale then # check for abject failure diff --git a/tools/testing/selftests/rcutorture/configs/refperf/CFLIST b/tools/testing/selftests/rcutorture/configs/refscale/CFLIST similarity index 100% rename from tools/testing/selftests/rcutorture/configs/refperf/CFLIST rename to tools/testing/selftests/rcutorture/configs/refscale/CFLIST diff --git a/tools/testing/selftests/rcutorture/configs/refperf/CFcommon b/tools/testing/selftests/rcutorture/configs/refscale/CFcommon similarity index 100% rename from tools/testing/selftests/rcutorture/configs/refperf/CFcommon rename to tools/testing/selftests/rcutorture/configs/refscale/CFcommon diff --git a/tools/testing/selftests/rcutorture/configs/refperf/NOPREEMPT b/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT similarity index 100% rename from tools/testing/selftests/rcutorture/configs/refperf/NOPREEMPT rename to tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT diff --git a/tools/testing/selftests/rcutorture/configs/refperf/PREEMPT b/tools/testing/selftests/rcutorture/configs/refscale/PREEMPT similarity index 100% rename from tools/testing/selftests/rcutorture/configs/refperf/PREEMPT rename to tools/testing/selftests/rcutorture/configs/refscale/PREEMPT diff --git a/tools/testing/selftests/rcutorture/configs/refperf/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh similarity index 100% rename from tools/testing/selftests/rcutorture/configs/refperf/ver_functions.sh rename to tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh From 7fef6cff8f2814bf8eb632e2bb8f0a987ffd9ece Mon Sep 17 00:00:00 2001 From: Ethon Paul Date: Sat, 18 Apr 2020 19:46:47 +0800 Subject: [PATCH 162/502] srcu: Fix a typo in comment "amoritized"->"amortized" This commit fixes a 
typo in a comment. Signed-off-by: Ethon Paul Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 6d3ef700fb0e..8ff71e5d0fe8 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -766,7 +766,7 @@ static void srcu_flip(struct srcu_struct *ssp) * it, if this function was preempted for enough time for the counters * to wrap, it really doesn't matter whether or not we expedite the grace * period. The extra overhead of a needlessly expedited grace period is - * negligible when amoritized over that time period, and the extra latency + * negligible when amortized over that time period, and the extra latency * of a needlessly non-expedited grace period is similarly negligible. */ static bool srcu_might_be_idle(struct srcu_struct *ssp) From bde50d8ff83e4ce9e576f7c5ba1edb48a3610a5b Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 26 May 2020 15:41:34 +0200 Subject: [PATCH 163/502] srcu: Avoid local_irq_save() before acquiring spinlock_t SRCU disables interrupts to get a stable per-CPU pointer and then acquires the spinlock which is in the per-CPU data structure. The release uses spin_unlock_irqrestore(). While this is correct on a non-RT kernel, this conflicts with the RT semantics because the spinlock is converted to a 'sleeping' spinlock. Sleeping locks can obviously not be acquired with interrupts disabled. Acquire the per-CPU pointer `ssp->sda' without disabling preemption and then acquire the spinlock_t of the per-CPU data structure. The lock will ensure that the data is consistent. The added call to check_init_srcu_struct() is now needed because a statically defined srcu_struct may remain uninitialized until this point and the newly introduced locking operation requires an initialized spinlock_t. This change was tested for four hours with 8*SRCU-N and 8*SRCU-P without causing any warnings. 
Cc: Lai Jiangshan Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: rcu@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 8ff71e5d0fe8..c100acf332ed 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -777,14 +777,15 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp) unsigned long t; unsigned long tlast; + check_init_srcu_struct(ssp); /* If the local srcu_data structure has callbacks, not idle. */ - local_irq_save(flags); - sdp = this_cpu_ptr(ssp->sda); + sdp = raw_cpu_ptr(ssp->sda); + spin_lock_irqsave_rcu_node(sdp, flags); if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) { - local_irq_restore(flags); + spin_unlock_irqrestore_rcu_node(sdp, flags); return false; /* Callbacks already present, so not idle. */ } - local_irq_restore(flags); + spin_unlock_irqrestore_rcu_node(sdp, flags); /* * No local callbacks, so probabalistically probe global state. @@ -864,9 +865,8 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, } rhp->func = func; idx = srcu_read_lock(ssp); - local_irq_save(flags); - sdp = this_cpu_ptr(ssp->sda); - spin_lock_rcu_node(sdp); + sdp = raw_cpu_ptr(ssp->sda); + spin_lock_irqsave_rcu_node(sdp, flags); rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_gp_seq)); From 88513ae533756d10358e406743c21e8cf61fb72a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 28 Apr 2020 14:41:48 -0700 Subject: [PATCH 164/502] torture: Remove qemu dependency on EFI firmware On some (probably misconfigured) systems, the torture-test scripting will cause qemu to complain about missing EFI firmware, often because qemu is trying to traverse broken symbolic links to find that firmware. 
Which is a bit silly given that the default torture-test guest OS has but a single binary for its userspace, and thus is unlikely to do much in the way of networking in any case. This commit therefore avoids such problems by specifying "-net none" to qemu unless the TORTURE_QEMU_INTERACTIVE environment variable is set (for example, by having specified "--interactive" to kvm.sh), in which case "-net nic -net user" is specified to qemu instead. Either choice may be overridden by specifying the "-net" argument of your choice to the kvm.sh "--qemu-args" parameter. Link: https://lore.kernel.org/lkml/20190701141403.GA246562@google.com Reported-by: Joel Fernandes Signed-off-by: Paul E. McKenney Cc: Sebastian Andrzej Siewior --- .../selftests/rcutorture/bin/functions.sh | 21 ++++++++++++++++--- .../rcutorture/bin/kvm-test-1-run.sh | 1 + 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh index 12810229fddc..436b1542cf27 100644 --- a/tools/testing/selftests/rcutorture/bin/functions.sh +++ b/tools/testing/selftests/rcutorture/bin/functions.sh @@ -215,9 +215,6 @@ identify_qemu_args () { then echo -device spapr-vlan,netdev=net0,mac=$TORTURE_QEMU_MAC echo -netdev bridge,br=br0,id=net0 - elif test -n "$TORTURE_QEMU_INTERACTIVE" - then - echo -net nic -net user fi ;; esac @@ -275,3 +272,21 @@ specify_qemu_cpus () { esac fi } + +# specify_qemu_net qemu-args +# +# Appends a string containing "-net none" to qemu-args, unless the incoming +# qemu-args already contains "-net" or unless the TORTURE_QEMU_INTERACTIVE +# environment variable is set, in which case the string that is added is +# instead "-net nic -net user".
+specify_qemu_net () { + if echo $1 | grep -q -e -net + then + echo $1 + elif test -n "$TORTURE_QEMU_INTERACTIVE" + then + echo $1 -net nic -net user + else + echo $1 -net none + fi +} diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 6ff611c630d1..1b9aebd54cc9 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -141,6 +141,7 @@ then cpu_count=$TORTURE_ALLOTED_CPUS fi qemu_args="`specify_qemu_cpus "$QEMU" "$qemu_args" "$cpu_count"`" +qemu_args="`specify_qemu_net "$qemu_args"`" # Generate architecture-specific and interaction-specific qemu arguments qemu_args="$qemu_args `identify_qemu_args "$QEMU" "$resdir/console.log"`" From 6582e7f184e49a754ee09c996a886b89113d7354 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 4 May 2020 15:55:47 -0700 Subject: [PATCH 165/502] torture: Add script to smoke-test commits in a branch This commit adds a kvm-check-branches.sh script that takes a list of commits and commit ranges and runs a short rcutorture test on all scenarios on each specified commit. A summary is printed at the end, and the script returns success if all rcutorture runs completed without error. Signed-off-by: Paul E. McKenney --- .../rcutorture/bin/kvm-check-branches.sh | 108 ++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100755 tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh diff --git a/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh b/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh new file mode 100755 index 000000000000..6e65c134e5f1 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh @@ -0,0 +1,108 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Run a group of kvm.sh tests on the specified commits. 
This currently +# unconditionally does three-minute runs on each scenario in CFLIST, +# taking advantage of all available CPUs and trusting the "make" utility. +# In the short term, adjustments can be made by editing this script and +# CFLIST. If some adjustments appear to have ongoing value, this script +# might grow some command-line arguments. +# +# Usage: kvm-check-branches.sh commit1 commit2..commit3 commit4 ... +# +# This script considers its arguments one at a time. If more elaborate +# specification of commits is needed, please use "git rev-list" to +# produce something that this simple script can understand. The reason +# for retaining the simplicity is that it allows the user to more easily +# see which commit came from which branch. +# +# This script creates a yyyy.mm.dd-hh.mm.ss-group entry in the "res" +# directory. The calls to kvm.sh create the usual entries, but this script +# moves them under the yyyy.mm.dd-hh.mm.ss-group entry, each in its own +# directory numbered in run order, that is, "0001", "0002", and so on. +# For successful runs, the large build artifacts are removed. Doing this +# reduces the disk space required by about two orders of magnitude for +# successful runs. +# +# Copyright (C) Facebook, 2020 +# +# Authors: Paul E. McKenney + +if ! git status > /dev/null 2>&1 +then + echo '!!!' This script needs to run in a git archive. 1>&2 + echo '!!!' Giving up. 1>&2 + exit 1 +fi + +# Remember where we started so that we can get back at the end. +curcommit="`git status | head -1 | awk '{ print $NF }'`" + +nfail=0 +ntry=0 +resdir="tools/testing/selftests/rcutorture/res" +ds="`date +%Y.%m.%d-%H.%M.%S`-group" +if ! test -e $resdir +then + mkdir $resdir || : +fi +mkdir $resdir/$ds +echo Results directory: $resdir/$ds + +KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM +PATH=${KVM}/bin:$PATH; export PATH +. functions.sh +cpus="`identify_qemu_vcpus`" +echo Using up to $cpus CPUs.
+ +# Each pass through this loop does one command-line argument. +for gitbr in $@ +do + echo ' --- git branch ' $gitbr + + # Each pass through this loop tests one commit. + for i in `git rev-list "$gitbr"` + do + ntry=`expr $ntry + 1` + idir=`awk -v ntry="$ntry" 'END { printf "%04d", ntry; }' < /dev/null` + echo ' --- commit ' $i from branch $gitbr + date + mkdir $resdir/$ds/$idir + echo $gitbr > $resdir/$ds/$idir/gitbr + echo $i >> $resdir/$ds/$idir/gitbr + + # Test the specified commit. + git checkout $i > $resdir/$ds/$idir/git-checkout.out 2>&1 + echo git checkout return code: $? "(Commit $ntry: $i)" + kvm.sh --cpus $cpus --duration 3 --trust-make > $resdir/$ds/$idir/kvm.sh.out 2>&1 + ret=$? + echo kvm.sh return code $ret for commit $i from branch $gitbr + + # Move the build products to their resting place. + runresdir="`grep -m 1 '^Results directory:' < $resdir/$ds/$idir/kvm.sh.out | sed -e 's/^Results directory://'`" + mv $runresdir $resdir/$ds/$idir + rrd="`echo $runresdir | sed -e 's,^.*/,,'`" + echo Run results: $resdir/$ds/$idir/$rrd + if test "$ret" -ne 0 + then + # Failure, so leave all evidence intact. + nfail=`expr $nfail + 1` + else + # Success, so remove large files to save about 1GB. + ( cd $resdir/$ds/$idir/$rrd; rm -f */vmlinux */bzImage */System.map */Module.symvers ) + fi + done +done +date + +# Go back to the original commit. +git checkout "$curcommit" + +if test $nfail -ne 0 +then + echo '!!! ' $nfail failures in $ntry 'runs!!!' + exit 1 +else + echo No failures in $ntry runs. 
+ exit 0 +fi From d02c6b52d12fa30eeabfaf5aefe12078eacb94b2 Mon Sep 17 00:00:00 2001 From: Zou Wei Date: Mon, 13 Apr 2020 20:02:59 +0800 Subject: [PATCH 166/502] locktorture: Use true and false to assign to bool variables This commit fixes the following coccicheck warnings: kernel/locking/locktorture.c:689:6-10: WARNING: Assignment of 0/1 to bool variable kernel/locking/locktorture.c:907:2-20: WARNING: Assignment of 0/1 to bool variable kernel/locking/locktorture.c:938:3-20: WARNING: Assignment of 0/1 to bool variable kernel/locking/locktorture.c:668:2-19: WARNING: Assignment of 0/1 to bool variable kernel/locking/locktorture.c:674:2-19: WARNING: Assignment of 0/1 to bool variable kernel/locking/locktorture.c:634:2-20: WARNING: Assignment of 0/1 to bool variable kernel/locking/locktorture.c:640:2-20: WARNING: Assignment of 0/1 to bool variable Reported-by: Hulk Robot Signed-off-by: Zou Wei Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 5efbfc68ce99..8ff6f50e06a0 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -631,13 +631,13 @@ static int lock_torture_writer(void *arg) cxt.cur_ops->writelock(); if (WARN_ON_ONCE(lock_is_write_held)) lwsp->n_lock_fail++; - lock_is_write_held = 1; + lock_is_write_held = true; if (WARN_ON_ONCE(lock_is_read_held)) lwsp->n_lock_fail++; /* rare, but... */ lwsp->n_lock_acquired++; cxt.cur_ops->write_delay(&rand); - lock_is_write_held = 0; + lock_is_write_held = false; cxt.cur_ops->writeunlock(); stutter_wait("lock_torture_writer"); @@ -665,13 +665,13 @@ static int lock_torture_reader(void *arg) schedule_timeout_uninterruptible(1); cxt.cur_ops->readlock(); - lock_is_read_held = 1; + lock_is_read_held = true; if (WARN_ON_ONCE(lock_is_write_held)) lrsp->n_lock_fail++; /* rare, but... 
*/ lrsp->n_lock_acquired++; cxt.cur_ops->read_delay(&rand); - lock_is_read_held = 0; + lock_is_read_held = false; cxt.cur_ops->readunlock(); stutter_wait("lock_torture_reader"); @@ -686,7 +686,7 @@ static int lock_torture_reader(void *arg) static void __torture_print_stats(char *page, struct lock_stress_stats *statp, bool write) { - bool fail = 0; + bool fail = false; int i, n_stress; long max = 0, min = statp ? statp[0].n_lock_acquired : 0; long long sum = 0; @@ -904,7 +904,7 @@ static int __init lock_torture_init(void) /* Initialize the statistics so that each run gets its own numbers. */ if (nwriters_stress) { - lock_is_write_held = 0; + lock_is_write_held = false; cxt.lwsa = kmalloc_array(cxt.nrealwriters_stress, sizeof(*cxt.lwsa), GFP_KERNEL); @@ -935,7 +935,7 @@ static int __init lock_torture_init(void) } if (nreaders_stress) { - lock_is_read_held = 0; + lock_is_read_held = false; cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress, sizeof(*cxt.lrsa), GFP_KERNEL); From 4a5f133c15b77c4018e8d7996541868ac94afb4f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 24 Apr 2020 11:21:40 -0700 Subject: [PATCH 167/502] rcutorture: Add races with task-exit processing Several variants of Linux-kernel RCU interact with task-exit processing, including preemptible RCU, Tasks RCU, and Tasks Trace RCU. This commit therefore adds testing of this interaction to rcutorture by adding rcutorture.read_exit_burst and rcutorture.read_exit_delay kernel-boot parameters. These kernel parameters control the frequency and spacing of special read-then-exit kthreads that are spawned. [ paulmck: Apply feedback from Dan Carpenter's static checker. ] [ paulmck: Reduce latency to avoid false-positive shutdown hangs. ] Signed-off-by: Paul E. 
McKenney --- .../admin-guide/kernel-parameters.txt | 14 +++ include/linux/torture.h | 5 + kernel/rcu/rcutorture.c | 112 +++++++++++++++++- 3 files changed, 128 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index fb95fad81c79..a0dcc925c8a2 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4258,6 +4258,20 @@ Set time (jiffies) between CPU-hotplug operations, or zero to disable CPU-hotplug testing. + rcutorture.read_exit= [KNL] + Set the number of read-then-exit kthreads used + to test the interaction of RCU updaters and + task-exit processing. + + rcutorture.read_exit_burst= [KNL] + The number of times in a given read-then-exit + episode that a set of read-then-exit kthreads + is spawned. + + rcutorture.read_exit_delay= [KNL] + The delay, in seconds, between successive + read-then-exit testing episodes. + rcutorture.shuffle_interval= [KNL] Set task-shuffle interval (s). Shuffling tasks allows some CPUs to go into dyntick-idle mode diff --git a/include/linux/torture.h b/include/linux/torture.h index 629b66e6c161..7f65bd1dd307 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -55,6 +55,11 @@ struct torture_random_state { #define DEFINE_TORTURE_RANDOM_PERCPU(name) \ DEFINE_PER_CPU(struct torture_random_state, name) unsigned long torture_random(struct torture_random_state *trsp); +static inline void torture_random_init(struct torture_random_state *trsp) +{ + trsp->trs_state = 0; + trsp->trs_count = 0; +} /* Task shuffler, which causes CPUs to occasionally go idle. 
*/ void torture_shuffle_task_register(struct task_struct *tp); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index efb792e13fca..2621a339c8a4 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -109,6 +109,10 @@ torture_param(int, object_debug, 0, torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable"); +torture_param(int, read_exit_delay, 13, + "Delay between read-then-exit episodes (s)"); +torture_param(int, read_exit_burst, 16, + "# of read-then-exit bursts per episode, zero to disable"); torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles"); torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); @@ -146,6 +150,7 @@ static struct task_struct *stall_task; static struct task_struct *fwd_prog_task; static struct task_struct **barrier_cbs_tasks; static struct task_struct *barrier_task; +static struct task_struct *read_exit_task; #define RCU_TORTURE_PIPE_LEN 10 @@ -177,6 +182,7 @@ static long n_rcu_torture_boosts; static atomic_long_t n_rcu_torture_timers; static long n_barrier_attempts; static long n_barrier_successes; /* did rcu_barrier test succeed? 
*/ +static unsigned long n_read_exits; static struct list_head rcu_torture_removed; static unsigned long shutdown_jiffies; @@ -1539,10 +1545,11 @@ rcu_torture_stats_print(void) n_rcu_torture_boosts, atomic_long_read(&n_rcu_torture_timers)); torture_onoff_stats(); - pr_cont("barrier: %ld/%ld:%ld\n", + pr_cont("barrier: %ld/%ld:%ld ", data_race(n_barrier_successes), data_race(n_barrier_attempts), data_race(n_rcu_torture_barrier_error)); + pr_cont("read-exits: %ld\n", data_race(n_read_exits)); pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) || @@ -1634,7 +1641,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d " "stall_cpu_block=%d " "n_barrier_cbs=%d " - "onoff_interval=%d onoff_holdoff=%d\n", + "onoff_interval=%d onoff_holdoff=%d " + "read_exit_delay=%d read_exit_burst=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, @@ -1643,7 +1651,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff, stall_cpu_block, n_barrier_cbs, - onoff_interval, onoff_holdoff); + onoff_interval, onoff_holdoff, + read_exit_delay, read_exit_burst); } static int rcutorture_booster_cleanup(unsigned int cpu) @@ -2338,6 +2347,99 @@ static bool rcu_torture_can_boost(void) return true; } +static bool read_exit_child_stop; +static bool read_exit_child_stopped; +static wait_queue_head_t read_exit_wq; + +// Child kthread which just does an rcutorture reader and exits. +static int rcu_torture_read_exit_child(void *trsp_in) +{ + struct torture_random_state *trsp = trsp_in; + + set_user_nice(current, MAX_NICE); + // Minimize time between reading and exiting. 
+ while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + (void)rcu_torture_one_read(trsp); + return 0; +} + +// Parent kthread which creates and destroys read-exit child kthreads. +static int rcu_torture_read_exit(void *unused) +{ + int count = 0; + bool errexit = false; + int i; + struct task_struct *tsp; + DEFINE_TORTURE_RANDOM(trs); + + // Allocate and initialize. + set_user_nice(current, MAX_NICE); + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of test"); + + // Each pass through this loop does one read-exit episode. + do { + if (++count > read_exit_burst) { + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode"); + rcu_barrier(); // Wait for task_struct free, avoid OOM. + for (i = 0; i < read_exit_delay; i++) { + schedule_timeout_uninterruptible(HZ); + if (READ_ONCE(read_exit_child_stop)) + break; + } + if (!READ_ONCE(read_exit_child_stop)) + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode"); + count = 0; + } + if (READ_ONCE(read_exit_child_stop)) + break; + // Spawn child. + tsp = kthread_run(rcu_torture_read_exit_child, + &trs, "%s", + "rcu_torture_read_exit_child"); + if (IS_ERR(tsp)) { + VERBOSE_TOROUT_ERRSTRING("out of memory"); + errexit = true; + tsp = NULL; + break; + } + cond_resched(); + kthread_stop(tsp); + n_read_exits ++; + stutter_wait("rcu_torture_read_exit"); + } while (!errexit && !READ_ONCE(read_exit_child_stop)); + + // Clean up and exit. + smp_store_release(&read_exit_child_stopped, true); // After reaping. + smp_mb(); // Store before wakeup. 
+ wake_up(&read_exit_wq); + while (!torture_must_stop()) + schedule_timeout_uninterruptible(1); + torture_kthread_stopping("rcu_torture_read_exit"); + return 0; +} + +static int rcu_torture_read_exit_init(void) +{ + if (read_exit_burst <= 0) + return -EINVAL; + init_waitqueue_head(&read_exit_wq); + read_exit_child_stop = false; + read_exit_child_stopped = false; + return torture_create_kthread(rcu_torture_read_exit, NULL, + read_exit_task); +} + +static void rcu_torture_read_exit_cleanup(void) +{ + if (!read_exit_task) + return; + WRITE_ONCE(read_exit_child_stop, true); + smp_mb(); // Above write before wait. + wait_event(read_exit_wq, smp_load_acquire(&read_exit_child_stopped)); + torture_stop_kthread(rcutorture_read_exit, read_exit_task); +} + static enum cpuhp_state rcutor_hp; static void @@ -2359,6 +2461,7 @@ rcu_torture_cleanup(void) } show_rcu_gp_kthreads(); + rcu_torture_read_exit_cleanup(); rcu_torture_barrier_cleanup(); torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); torture_stop_kthread(rcu_torture_stall, stall_task); @@ -2680,6 +2783,9 @@ rcu_torture_init(void) if (firsterr) goto unwind; firsterr = rcu_torture_barrier_init(); + if (firsterr) + goto unwind; + firsterr = rcu_torture_read_exit_init(); if (firsterr) goto unwind; if (object_debug) From 61251d6899803594a108c3165aeb072c73e09cc8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 26 Apr 2020 16:48:46 -0700 Subject: [PATCH 168/502] torture: Set configfile variable to current scenario The torture-test recheck logic fails to set the configfile variable to the current scenario, so this commit properly initializes this variable. This change isn't critical given that all errors for a given scenario follow that scenario's heading, but it is easier on the eyes to repeat it. And this repetition also prevents confusion as to whether a given message goes with the previous heading or the next one. Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/rcutorture/bin/kvm-recheck.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh index 736f04749b90..2261aa676304 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh @@ -31,6 +31,7 @@ do head -1 $resdir/log fi TORTURE_SUITE="`cat $i/../TORTURE_SUITE`" + configfile=`echo $i | sed -e 's,^.*/,,'` rm -f $i/console.log.*.diags kvm-recheck-${TORTURE_SUITE}.sh $i if test -f "$i/qemu-retval" && test "`cat $i/qemu-retval`" -ne 0 && test "`cat $i/qemu-retval`" -ne 137 From 59359e4f2a0906920389ec1e33296ac9a19178ba Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 26 Apr 2020 16:51:56 -0700 Subject: [PATCH 169/502] rcutorture: Handle non-statistic bang-string error messages The current console parsing assumes that console lines containing "!!!" are statistics lines from which it can parse the number of rcutorture too-short grace-period failures. This prints confusing output for other problems, including memory exhaustion. This commit therefore differentiates between these cases and prints an appropriate error string. Signed-off-by: Paul E. 
McKenney --- .../selftests/rcutorture/bin/parse-console.sh | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh index 4bf62d7b1cbc..1c64ca85438c 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-console.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh @@ -44,11 +44,23 @@ then tail -1 | awk ' { - for (i=NF-8;i<=NF;i++) + normalexit = 1; + for (i=NF-8;i<=NF;i++) { + if (i <= 0 || i !~ /^[0-9]*$/) { + bangstring = $0; + gsub(/^\[[^]]*] /, "", bangstring); + print bangstring; + normalexit = 0; + exit 0; + } sum+=$i; + } } - END { print sum }'` - print_bug $title FAILURE, $nerrs instances + END { + if (normalexit) + print sum " instances" + }'` + print_bug $title FAILURE, $nerrs exit fi From cae7cc6ba5bad320c2055ac54f73affd051e76ca Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 26 Apr 2020 19:20:37 -0700 Subject: [PATCH 170/502] rcutorture: NULL rcu_torture_current earlier in cleanup code Currently, the rcu_torture_current variable remains non-NULL until after all readers have stopped. During this time, rcu_torture_stats_print() will think that the test is still ongoing, which can result in confusing dmesg output. This commit therefore NULLs rcu_torture_current immediately after the rcu_torture_writer() kthread has decided to stop, thus informing rcu_torture_stats_print() much sooner. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 2621a339c8a4..59112077a6da 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1172,6 +1172,7 @@ rcu_torture_writer(void *arg) WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); } } while (!torture_must_stop()); + rcu_torture_current = NULL; // Let stats task know that we are done. /* Reset expediting back to unexpedited. */ if (expediting > 0) expediting = -expediting; @@ -2473,7 +2474,6 @@ rcu_torture_cleanup(void) reader_tasks[i]); kfree(reader_tasks); } - rcu_torture_current = NULL; if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) { From d3cb26312ecfdb4ee8dedf931e24e60df1d7fbc9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 4 May 2020 16:40:53 -0700 Subject: [PATCH 171/502] torture: Remove whitespace from identify_qemu_vcpus output The identify_qemu_vcpus bash function can return numbers including whitespace characters, which can be a bit annoying in some bash dollar-sign substitutions. This commit therefore strips all spaces and tabs from the value that identify_qemu_vcpus outputs. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/functions.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh index 436b1542cf27..51f3464b96d3 100644 --- a/tools/testing/selftests/rcutorture/bin/functions.sh +++ b/tools/testing/selftests/rcutorture/bin/functions.sh @@ -231,7 +231,7 @@ identify_qemu_args () { # Returns the number of virtual CPUs available to the aggregate of the # guest OSes. 
identify_qemu_vcpus () { - lscpu | grep '^CPU(s):' | sed -e 's/CPU(s)://' + lscpu | grep '^CPU(s):' | sed -e 's/CPU(s)://' -e 's/[ ]*//g' } # print_bug From a3ba4972f2ef8408dcc8a2a3d433621d6c990594 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 4 May 2020 16:41:53 -0700 Subject: [PATCH 172/502] torture: Add --allcpus argument to the kvm.sh script Leaving off the kvm.sh script's --cpus argument results in the script testing the scenarios sequentially, which can be quite slow. However, having to specify the actual number of CPUs can be error-prone. This commit therefore adds a --allcpus argument that causes kvm.sh to use all available CPUs. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index c279cf9cb010..7dbce7a43413 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -73,6 +73,10 @@ usage () { while test $# -gt 0 do case "$1" in + --allcpus) + cpus=$TORTURE_ALLOTED_CPUS + max_cpus=$TORTURE_ALLOTED_CPUS + ;; --bootargs|--bootarg) checkarg --bootargs "(list of kernel boot arguments)" "$#" "$2" '.*' '^--' TORTURE_BOOTARGS="$2" From 8f43d5911b38f00dfa46169dcb1feb1e101dd906 Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Mon, 1 Jun 2020 19:45:48 +0100 Subject: [PATCH 173/502] rcu/rcutorture: Replace 0 with false Coccinelle reports a warning WARNING: Assignment of 0/1 to bool variable The root cause is that the variable lastphase is a bool, but is initialised with integer 0. This commit therefore replaces the 0 with a false. Signed-off-by: Jules Irenge Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 59112077a6da..37455a12898e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2185,7 +2185,7 @@ static void rcu_torture_barrier1cb(void *rcu_void) static int rcu_torture_barrier_cbs(void *arg) { long myid = (long)arg; - bool lastphase = 0; + bool lastphase = false; bool newphase; struct rcu_head rcu; From 3e93a51f191aa710760591961240f8910d952b5b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 5 Jun 2020 10:29:28 -0700 Subject: [PATCH 174/502] torture: Create qemu-cmd in --buildonly runs One reason to do a --buildonly run is to use the build products elsewhere, for example, to do the actual test on some other system. Part of doing the test is the actual qemu command, which is not currently produced by --buildonly runs. This commit therefore causes --buildonly runs to create this file. Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 1b9aebd54cc9..064dd735de39 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -153,6 +153,7 @@ qemu_append="`identify_qemu_append "$QEMU"`" boot_args="`configfrag_boot_params "$boot_args" "$config_template"`" # Generate kernel-version-specific boot parameters boot_args="`per_version_boot_params "$boot_args" $resdir/.config $seconds`" +echo $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd if test -n "$TORTURE_BUILDONLY" then @@ -161,7 +162,6 @@ then exit 0 fi echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log -echo $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd ( $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append "$qemu_append $boot_args" > $resdir/qemu-output 2>&1 & echo $! > $resdir/qemu_pid; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) & commandcompleted=0 sleep 10 # Give qemu's pid a chance to reach the file From 6387ecbc94bf5ac07239104b84d2304da6e79b51 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 9 Jun 2020 17:58:30 -0700 Subject: [PATCH 175/502] torture: Add a stop-run capability When bisecting RCU issues, it is often the case that the first error in an unsuccessful run will happen quickly, but that a successful run must go on for some time in order to obtain a sufficiently low false-negative error rate. In many cases, a bisection requires multiple concurrent runs, in which case the first failure in any run indicates failure, pure and simple. In such cases, it would speed things up greatly if the first failure terminated all runs. 
This commit therefore adds scripting that checks for a file named "STOP" in the top-level results directory, terminating the run when it appears. Note that in-progress builds will continue until completion, but future builds and all runs will be cut short. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/jitter.sh | 6 ++++++ tools/testing/selftests/rcutorture/bin/kvm-build.sh | 6 ++++++ .../selftests/rcutorture/bin/kvm-test-1-run.sh | 13 +++++++++++-- tools/testing/selftests/rcutorture/bin/kvm.sh | 2 ++ 4 files changed, 25 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh b/tools/testing/selftests/rcutorture/bin/jitter.sh index 30cb5b27d32e..188b864bc4bf 100755 --- a/tools/testing/selftests/rcutorture/bin/jitter.sh +++ b/tools/testing/selftests/rcutorture/bin/jitter.sh @@ -46,6 +46,12 @@ do exit 0; fi + # Check for stop request. + if test -f "$TORTURE_STOPFILE" + then + exit 1; + fi + # Set affinity to randomly selected online CPU if cpus=`grep 1 /sys/devices/system/cpu/*/online 2>&1 | sed -e 's,/[^/]*$,,' -e 's/^[^0-9]*//'` diff --git a/tools/testing/selftests/rcutorture/bin/kvm-build.sh b/tools/testing/selftests/rcutorture/bin/kvm-build.sh index 18d6518504ee..115e1822b26f 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-build.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-build.sh @@ -9,6 +9,12 @@ # # Authors: Paul E. McKenney +if test -f "$TORTURE_STOPFILE" +then + echo "kvm-build.sh early exit due to run STOP request" + exit 1 +fi + config_template=${1} if test -z "$config_template" -o ! -f "$config_template" -o ! 
-r "$config_template" then diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 064dd735de39..5ec095da095f 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -182,7 +182,7 @@ do kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` if test -z "$qemu_pid" || kill -0 "$qemu_pid" > /dev/null 2>&1 then - if test $kruntime -ge $seconds + if test $kruntime -ge $seconds -o -f "$TORTURE_STOPFILE" then break; fi @@ -211,10 +211,19 @@ then fi if test $commandcompleted -eq 0 -a -n "$qemu_pid" then - echo Grace period for qemu job at pid $qemu_pid + if ! test -f "$TORTURE_STOPFILE" + then + echo Grace period for qemu job at pid $qemu_pid + fi oldline="`tail $resdir/console.log`" while : do + if test -f "$TORTURE_STOPFILE" + then + echo "PID $qemu_pid killed due to run STOP request" >> $resdir/Warnings 2>&1 + kill -KILL $qemu_pid + break + fi kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` if kill -0 $qemu_pid > /dev/null 2>&1 then diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 7dbce7a43413..3578c85ea8c4 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -337,6 +337,8 @@ then mkdir -p "$resdir" || : fi mkdir $resdir/$ds +TORTURE_RESDIR="$resdir/$ds"; export TORTURE_RESDIR +TORTURE_STOPFILE="$resdir/$ds/STOP"; export TORTURE_STOPFILE echo Results directory: $resdir/$ds echo $scriptname $args touch $resdir/$ds/log From bc77a72cd188d44881ee1b9d0a9d65ca8108b508 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Jun 2020 14:08:19 -0700 Subject: [PATCH 176/502] torture: Abstract out console-log error detection This commit pulls the simple pattern-based error detection from the console log into a new console-badness.sh file. 
This will enable future commits to end a run on the first error. Signed-off-by: Paul E. McKenney --- .../selftests/rcutorture/bin/console-badness.sh | 16 ++++++++++++++++ .../selftests/rcutorture/bin/parse-console.sh | 5 +---- 2 files changed, 17 insertions(+), 4 deletions(-) create mode 100755 tools/testing/selftests/rcutorture/bin/console-badness.sh diff --git a/tools/testing/selftests/rcutorture/bin/console-badness.sh b/tools/testing/selftests/rcutorture/bin/console-badness.sh new file mode 100755 index 000000000000..0e4c0b2eb7f0 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/console-badness.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Scan standard input for error messages, dumping any found to standard +# output. +# +# Usage: console-badness.sh +# +# Copyright (C) 2020 Facebook, Inc. +# +# Authors: Paul E. McKenney + +egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state|rcu_.*kthread starved for|!!!' | +grep -v 'ODEBUG: ' | +grep -v 'This means that this is a DEBUG kernel and it is' | +grep -v 'Warning: unable to open an initial console' diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh index 1c64ca85438c..98478e12ac3d 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-console.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh @@ -116,10 +116,7 @@ then fi fi | tee -a $file.diags -egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? 
Writer stall state|rcu_.*kthread starved for' < $file | -grep -v 'ODEBUG: ' | -grep -v 'This means that this is a DEBUG kernel and it is' | -grep -v 'Warning: unable to open an initial console' > $T.diags +console-badness.sh < $file > $T.diags if test -s $T.diags then print_warning "Assertion failure in $file $title" From 775227511843202e65a7f194cbf64f38de01f004 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Jun 2020 16:43:14 -0700 Subject: [PATCH 177/502] rcutorture: Check for unwatched readers RCU is supposed to be watching all non-idle kernel code and also all softirq handlers. This commit adds some teeth to this statement by adding a WARN_ON_ONCE(). Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 37455a12898e..9c310016585b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1377,6 +1377,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) struct rt_read_seg *rtrsp1; unsigned long long ts; + WARN_ON_ONCE(!rcu_is_watching()); newstate = rcutorture_extend_mask(readstate, trsp); rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++); started = cur_ops->get_gp_seq(); From 603d11ad6976e1289f19c2a19e2f75a83d0dc296 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 16 Jun 2020 11:49:24 +0200 Subject: [PATCH 178/502] torture: Pass --kmake-arg to all make invocations We need to pass the arguments provided to --kmake-arg to all make invocations. In particular, the make invocations generating the configs need to see the final make arguments, e.g. if config variables depend on particular variables that are passed to make. For example, when using '--kcsan --kmake-arg CC=clang-11', we would lose CONFIG_KCSAN=y due to 'make oldconfig' not seeing that we want to use a compiler that supports KCSAN. Signed-off-by: Marco Elver Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/rcutorture/bin/configinit.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/configinit.sh b/tools/testing/selftests/rcutorture/bin/configinit.sh index 93e80a42249a..d6e5ce084b1c 100755 --- a/tools/testing/selftests/rcutorture/bin/configinit.sh +++ b/tools/testing/selftests/rcutorture/bin/configinit.sh @@ -32,11 +32,11 @@ if test -z "$TORTURE_TRUST_MAKE" then make clean > $resdir/Make.clean 2>&1 fi -make $TORTURE_DEFCONFIG > $resdir/Make.defconfig.out 2>&1 +make $TORTURE_KMAKE_ARG $TORTURE_DEFCONFIG > $resdir/Make.defconfig.out 2>&1 mv .config .config.sav sh $T/upd.sh < .config.sav > .config cp .config .config.new -yes '' | make oldconfig > $resdir/Make.oldconfig.out 2> $resdir/Make.oldconfig.err +yes '' | make $TORTURE_KMAKE_ARG oldconfig > $resdir/Make.oldconfig.out 2> $resdir/Make.oldconfig.err # verify new config matches specification. configcheck.sh .config $c From 6bcaf2a0876633b6a7c5e70ee88801e16280210a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 16 Jun 2020 10:02:54 -0700 Subject: [PATCH 179/502] torture: Correctly summarize build-only runs Currently, kvm-recheck.sh complains that qemu failed for --buildonly runs, which is sort of true given that qemu can hardly succeed if not invoked in the first place. Nevertheless, this commit swaps the order of checks in kvm-recheck.sh so that --buildonly runs will be summarized more straightforwardly. Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/rcutorture/bin/kvm-recheck.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh index 2261aa676304..357899cfe249 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh @@ -56,15 +56,15 @@ do cat $i/Warnings fi else - if test -f "$i/qemu-cmd" - then - print_bug qemu failed - echo " $i" - elif test -f "$i/buildonly" + if test -f "$i/buildonly" then echo Build-only run, no boot/test configcheck.sh $i/.config $i/ConfigFragment parse-build.sh $i/Make.out $configfile + elif test -f "$i/qemu-cmd" + then + print_bug qemu failed + echo " $i" else print_bug Build failed echo " $i" From 61b77be09e29e6dc152b1984691e5b1708e8a6ac Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 16 Jun 2020 10:38:57 -0700 Subject: [PATCH 180/502] torture: Improve diagnostic for KCSAN-incapable compilers Using --kcsan when the compiler does not support KCSAN results in this: :CONFIG_KCSAN=y: improperly set :CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000: improperly set :CONFIG_KCSAN_VERBOSE=y: improperly set :CONFIG_KCSAN_INTERRUPT_WATCHER=y: improperly set Clean KCSAN run in /home/git/linux-rcu/tools/testing/selftests/rcutorture/res/2020.06.16-09.53.16 This is a bit obtuse, so this commit adds checks resulting in this: :CONFIG_KCSAN=y: improperly set :CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000: improperly set :CONFIG_KCSAN_VERBOSE=y: improperly set :CONFIG_KCSAN_INTERRUPT_WATCHER=y: improperly set Compiler or architecture does not support KCSAN! Did you forget to switch your compiler with --kmake-arg CC=? Suggested-by: Marco Elver Signed-off-by: Paul E. 
McKenney Acked-by: Marco Elver --- tools/testing/selftests/rcutorture/bin/kvm-recheck.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh index 357899cfe249..840a4679a0d7 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh @@ -44,7 +44,8 @@ do then echo QEMU killed fi - configcheck.sh $i/.config $i/ConfigFragment + configcheck.sh $i/.config $i/ConfigFragment > $T 2>&1 + cat $T if test -r $i/Make.oldconfig.err then cat $i/Make.oldconfig.err @@ -73,7 +74,11 @@ do done if test -f "$rd/kcsan.sum" then - if test -s "$rd/kcsan.sum" + if grep -q CONFIG_KCSAN=y $T + then + echo "Compiler or architecture does not support KCSAN!" + echo Did you forget to switch your compiler with '--kmake-arg CC='? + elif test -s "$rd/kcsan.sum" then echo KCSAN summary in $rd/kcsan.sum else From 9ccba350bd824ecacbfd8965f4f3ac980b96f951 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 16 Jun 2020 11:16:18 -0700 Subject: [PATCH 181/502] torture: Add more tracing crib notes to kvm.sh This commit adds a few more hints about how to use tracing as comments at the end of kvm.sh. Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/rcutorture/bin/kvm.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 3578c85ea8c4..bdfa0c076ae6 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -503,3 +503,7 @@ fi # Tracing: trace_event=rcu:rcu_grace_period,rcu:rcu_future_grace_period,rcu:rcu_grace_period_init,rcu:rcu_nocb_wake,rcu:rcu_preempt_task,rcu:rcu_unlock_preempted_task,rcu:rcu_quiescent_state_report,rcu:rcu_fqs,rcu:rcu_callback,rcu:rcu_kfree_callback,rcu:rcu_batch_start,rcu:rcu_invoke_callback,rcu:rcu_invoke_kfree_callback,rcu:rcu_batch_end,rcu:rcu_torture_read,rcu:rcu_barrier # Function-graph tracing: ftrace=function_graph ftrace_graph_filter=sched_setaffinity,migration_cpu_stop # Also --kconfig "CONFIG_FUNCTION_TRACER=y CONFIG_FUNCTION_GRAPH_TRACER=y" +# Control buffer size: --bootargs trace_buf_size=3k +# Get trace-buffer dumps on all oopses: --bootargs ftrace_dump_on_oops +# Ditto, but dump only the oopsing CPU: --bootargs ftrace_dump_on_oops=orig_cpu +# Heavy-handed way to also dump on warnings: --bootargs panic_on_warn From 06efa9b4b27f926eeb8c935f430f8557eb8b106e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 16 Jun 2020 14:14:09 -0700 Subject: [PATCH 182/502] torture: Add kvm-transform.sh script for qemu-cmd files This commit adds a script that transforms qemu-cmd files to allow them and the corresponding kernels to be run in contexts other than the one that they were created for, including on systems other than the one that they were built on. For example, this allows the build products from a --buildonly run to be transformed to allow distributed rcutorture testing. Signed-off-by: Paul E.
McKenney --- .../selftests/rcutorture/bin/kvm-transform.sh | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100755 tools/testing/selftests/rcutorture/bin/kvm-transform.sh diff --git a/tools/testing/selftests/rcutorture/bin/kvm-transform.sh b/tools/testing/selftests/rcutorture/bin/kvm-transform.sh new file mode 100755 index 000000000000..c45a953ef393 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-transform.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Transform a qemu-cmd file to allow reuse. +# +# Usage: kvm-transform.sh bzImage console.log < qemu-cmd-in > qemu-cmd-out +# +# bzImage: Kernel and initrd from the same prior kvm.sh run. +# console.log: File into which to place console output. +# +# The original qemu-cmd file is provided on standard input. +# The transformed qemu-cmd file is on standard output. +# The transformation assumes that the qemu command is confined to a +# single line. It also assumes no whitespace in filenames. +# +# Copyright (C) 2020 Facebook, Inc. +# +# Authors: Paul E. McKenney + +image="$1" +if test -z "$image" +then + echo Need kernel image file. + exit 1 +fi +consolelog="$2" +if test -z "$consolelog" +then + echo "Need console log file name." + exit 1 +fi + +awk -v image="$image" -v consolelog="$consolelog" ' +{ + line = ""; + for (i = 1; i <= NF; i++) { + if (line == "") + line = $i; + else + line = line " " $i; + if ($i == "-serial") { + i++; + line = line " file:" consolelog; + } + if ($i == "-kernel") { + i++; + line = line " " image; + } + } + print line; +}' From 2102ad290af06119ccfb56ddc3a0e5011a91537e Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 16 Jun 2020 15:38:24 -0700 Subject: [PATCH 183/502] torture: Dump ftrace at shutdown only if requested If there is a large number of torture tests running concurrently, all of which are dumping large ftrace buffers at shutdown time, the resulting dumping can take a very long time, particularly on systems with rotating-rust storage. This commit therefore adds a default-off torture.ftrace_dump_at_shutdown module parameter that enables shutdown-time ftrace-buffer dumping. Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 7 +++++++ kernel/torture.c | 6 +++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a0dcc925c8a2..9f11ff80d4ad 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5096,6 +5096,13 @@ Prevent the CPU-hotplug component of torturing until after init has spawned. + torture.ftrace_dump_at_shutdown= [KNL] + Dump the ftrace buffer at torture-test shutdown, + even if there were no errors. This can be a + very costly operation when many torture tests + are running concurrently, especially on systems + with rotating-rust storage. + tp720= [HW,PS2] tpm_suspend_pcr=[HW,TPM] diff --git a/kernel/torture.c b/kernel/torture.c index a1a41484ff6d..1061492f14bd 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -45,6 +45,9 @@ MODULE_AUTHOR("Paul E. 
McKenney "); static bool disable_onoff_at_boot; module_param(disable_onoff_at_boot, bool, 0444); +static bool ftrace_dump_at_shutdown; +module_param(ftrace_dump_at_shutdown, bool, 0444); + static char *torture_type; static int verbose; @@ -527,7 +530,8 @@ static int torture_shutdown(void *arg) torture_shutdown_hook(); else VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping."); - rcu_ftrace_dump(DUMP_ALL); + if (ftrace_dump_at_shutdown) + rcu_ftrace_dump(DUMP_ALL); kernel_power_off(); /* Shut down the system. */ return 0; } From 316db5897ee5d7408f2adea4d5992ed380316928 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 16 Jun 2020 16:34:52 -0700 Subject: [PATCH 184/502] torture: Avoid duplicate specification of qemu command Currently, the qemu command is constructed twice, once to dump it to the qemu-cmd file and again to execute it. This is of course an accident waiting to happen, but is done to ensure that the remainder of the script has an accurate idea of the running qemu command's PID. This commit therefore places both the qemu command and the PID capture into a new temporary file and sources that temporary file. Thus the single construction of the qemu command into the qemu-cmd file suffices for both purposes. Signed-off-by: Paul E. McKenney --- .../testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 5ec095da095f..484445bd3010 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -161,8 +161,16 @@ then touch $resdir/buildonly exit 0 fi + +# Decorate qemu-cmd with redirection, backgrounding, and PID capture +sed -e 's/$/ 2>\&1 \&/' < $resdir/qemu-cmd > $T/qemu-cmd +echo 'echo $! > $resdir/qemu_pid' >> $T/qemu-cmd + +# In case qemu refuses to run... 
echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log -( $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append "$qemu_append $boot_args" > $resdir/qemu-output 2>&1 & echo $! > $resdir/qemu_pid; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) & + +# Attempt to run qemu +( . $T/qemu-cmd; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) & commandcompleted=0 sleep 10 # Give qemu's pid a chance to reach the file if test -s "$resdir/qemu_pid" From 7a6bbeaa01f71af2722fd775a4a4ff9593d12838 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 16 Jun 2020 17:07:15 -0700 Subject: [PATCH 185/502] torture: Remove obsolete "cd $KVM" In the dim distant past, qemu commands needed to be run from the rcutorture directory, but this is no longer the case. This commit therefore removes the now-useless "cd $KVM" from the kvm-test-1-run.sh script. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 484445bd3010..e07779a62634 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -124,7 +124,6 @@ seconds=$4 qemu_args=$5 boot_args=$6 -cd $KVM kstarttime=`gawk 'BEGIN { print systime() }' < /dev/null` if test -z "$TORTURE_BUILDONLY" then From cda099b37d7165fc73a63961739acf026444cde2 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Wed, 19 Feb 2020 11:00:54 -0800 Subject: [PATCH 186/502] fork: Annotate a data race in vm_area_dup() struct vm_area_struct could be accessed concurrently as noticed by KCSAN, write to 0xffff9cf8bba08ad8 of 8 bytes by task 14263 on cpu 35: vma_interval_tree_insert+0x101/0x150: rb_insert_augmented_cached at include/linux/rbtree_augmented.h:58 (inlined by) vma_interval_tree_insert at mm/interval_tree.c:23 
__vma_link_file+0x6e/0xe0 __vma_link_file at mm/mmap.c:629 vma_link+0xa2/0x120 mmap_region+0x753/0xb90 do_mmap+0x45c/0x710 vm_mmap_pgoff+0xc0/0x130 ksys_mmap_pgoff+0x1d1/0x300 __x64_sys_mmap+0x33/0x40 do_syscall_64+0x91/0xc44 entry_SYSCALL_64_after_hwframe+0x49/0xbe read to 0xffff9cf8bba08a80 of 200 bytes by task 14262 on cpu 122: vm_area_dup+0x6a/0xe0 vm_area_dup at kernel/fork.c:362 __split_vma+0x72/0x2a0 __split_vma at mm/mmap.c:2661 split_vma+0x5a/0x80 mprotect_fixup+0x368/0x3f0 do_mprotect_pkey+0x263/0x420 __x64_sys_mprotect+0x51/0x70 do_syscall_64+0x91/0xc44 entry_SYSCALL_64_after_hwframe+0x49/0xbe vm_area_dup() blindly copies all fields of original VMA to the new one. This includes copying vm_area_struct::shared.rb which is normally protected by i_mmap_lock. But this is fine because the read value will be overwritten on the following __vma_link_file() under proper protection. Thus, mark it as an intentional data race and insert a few assertions for the fields that should not be modified concurrently. Signed-off-by: Qian Cai Signed-off-by: Paul E. McKenney --- kernel/fork.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/fork.c b/kernel/fork.c index 142b23645d82..bba10fbcdce7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -359,7 +359,13 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (new) { - *new = *orig; + ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); + ASSERT_EXCLUSIVE_WRITER(orig->vm_file); + /* + * orig->shared.rb may be modified concurrently, but the clone + * will be reinitialized. 
+ */ + *new = data_race(*orig); INIT_LIST_HEAD(&new->anon_vma_chain); new->vm_next = new->vm_prev = NULL; } From cb38f82043d1641a27f96b58b402ca4b7a88f52d Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 10 Feb 2020 09:10:16 -0500 Subject: [PATCH 187/502] x86/mm/pat: Mark an intentional data race cpa_4k_install could be accessed concurrently as noticed by KCSAN, read to 0xffffffffaa59a000 of 8 bytes by interrupt on cpu 7: cpa_inc_4k_install arch/x86/mm/pat/set_memory.c:131 [inline] __change_page_attr+0x10cf/0x1840 arch/x86/mm/pat/set_memory.c:1514 __change_page_attr_set_clr+0xce/0x490 arch/x86/mm/pat/set_memory.c:1636 __set_pages_np+0xc4/0xf0 arch/x86/mm/pat/set_memory.c:2148 __kernel_map_pages+0xb0/0xc8 arch/x86/mm/pat/set_memory.c:2178 kernel_map_pages include/linux/mm.h:2719 [inline] write to 0xffffffffaa59a000 of 8 bytes by task 1 on cpu 6: cpa_inc_4k_install arch/x86/mm/pat/set_memory.c:131 [inline] __change_page_attr+0x10ea/0x1840 arch/x86/mm/pat/set_memory.c:1514 __change_page_attr_set_clr+0xce/0x490 arch/x86/mm/pat/set_memory.c:1636 __set_pages_p+0xc4/0xf0 arch/x86/mm/pat/set_memory.c:2129 __kernel_map_pages+0x2e/0xc8 arch/x86/mm/pat/set_memory.c:2176 kernel_map_pages include/linux/mm.h:2719 [inline] Both accesses are due to the same "cpa_4k_install++" in cpa_inc_4k_install. A data race here could be potentially undesirable: depending on compiler optimizations or how x86 executes a non-LOCK'd increment, it may lose increments, corrupt the counter, etc. Since this counter only seems to be used for printing some stats, this data race itself is unlikely to cause harm to the system though. Thus, mark this intentional data race using the data_race() macro. Suggested-by: Marco Elver Signed-off-by: Qian Cai Acked-by: Borislav Petkov Signed-off-by: Paul E. 
McKenney --- arch/x86/mm/pat/set_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 77e04304a2a7..d1b2a889f035 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -135,7 +135,7 @@ static inline void cpa_inc_2m_checked(void) static inline void cpa_inc_4k_install(void) { - cpa_4k_install++; + data_race(cpa_4k_install++); } static inline void cpa_inc_lp_sameprot(int level) From c93773c1a3fedf6c3f6fa12833e2b74a9897c3e3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 12 Feb 2020 13:29:15 -0800 Subject: [PATCH 188/502] rculist: Add ASSERT_EXCLUSIVE_ACCESS() to __list_splice_init_rcu() After the sync() in __list_splice_init_rcu(), there should be no readers traversing the old list. This commit therefore enlists the help of KCSAN to verify this condition via a pair of calls to ASSERT_EXCLUSIVE_ACCESS(). Signed-off-by: Paul E. McKenney Cc: Marco Elver --- include/linux/rculist.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/rculist.h b/include/linux/rculist.h index df587d181844..2ebd112f86f7 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -248,6 +248,8 @@ static inline void __list_splice_init_rcu(struct list_head *list, */ sync(); + ASSERT_EXCLUSIVE_ACCESS(*first); + ASSERT_EXCLUSIVE_ACCESS(*last); /* * Readers are finished with the source list, so perform splice. From 1fe84fd4a4027a17d511a832f89ab14107650ba4 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 5 May 2020 20:28:21 +0200 Subject: [PATCH 189/502] kcsan: Add test suite This adds KCSAN test focusing on behaviour of the integrated runtime. Tests various race scenarios, and verifies the reports generated to console. Makes use of KUnit for test organization, and the Torture framework for test thread control. Signed-off-by: Marco Elver Signed-off-by: Paul E. 
McKenney --- kernel/kcsan/Makefile | 3 + kernel/kcsan/kcsan-test.c | 1084 +++++++++++++++++++++++++++++++++++++ lib/Kconfig.kcsan | 23 +- 3 files changed, 1109 insertions(+), 1 deletion(-) create mode 100644 kernel/kcsan/kcsan-test.c diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index d4999b38d1be..14533cf24bc3 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -12,3 +12,6 @@ CFLAGS_core.o := $(call cc-option,-fno-conserve-stack,) \ obj-y := core.o debugfs.o report.o obj-$(CONFIG_KCSAN_SELFTEST) += test.o + +CFLAGS_kcsan-test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer +obj-$(CONFIG_KCSAN_TEST) += kcsan-test.o diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan-test.c new file mode 100644 index 000000000000..a8c11506dd2a --- /dev/null +++ b/kernel/kcsan/kcsan-test.c @@ -0,0 +1,1084 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KCSAN test with various race scenarious to test runtime behaviour. Since the + * interface with which KCSAN's reports are obtained is via the console, this is + * the output we should verify. For each test case checks the presence (or + * absence) of generated reports. Relies on 'console' tracepoint to capture + * reports as they appear in the kernel log. + * + * Makes use of KUnit for test organization, and the Torture framework for test + * thread control. + * + * Copyright (C) 2020, Google LLC. + * Author: Marco Elver + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Points to current test-case memory access "kernels". */ +static void (*access_kernels[2])(void); + +static struct task_struct **threads; /* Lists of threads. */ +static unsigned long end_time; /* End time of test. */ + +/* Report as observed from console. */ +static struct { + spinlock_t lock; + int nlines; + char lines[3][512]; +} observed = { + .lock = __SPIN_LOCK_UNLOCKED(observed.lock), +}; + +/* Setup test checking loop. 
*/ +static __no_kcsan_or_inline void +begin_test_checks(void (*func1)(void), void (*func2)(void)) +{ + kcsan_disable_current(); + + /* + * Require at least as long as KCSAN_REPORT_ONCE_IN_MS, to ensure at + * least one race is reported. + */ + end_time = jiffies + msecs_to_jiffies(CONFIG_KCSAN_REPORT_ONCE_IN_MS + 500); + + /* Signal start; release potential initialization of shared data. */ + smp_store_release(&access_kernels[0], func1); + smp_store_release(&access_kernels[1], func2); +} + +/* End test checking loop. */ +static __no_kcsan_or_inline bool +end_test_checks(bool stop) +{ + if (!stop && time_before(jiffies, end_time)) { + /* Continue checking */ + might_sleep(); + return false; + } + + kcsan_enable_current(); + return true; +} + +/* + * Probe for console output: checks if a race was reported, and obtains observed + * lines of interest. + */ +__no_kcsan +static void probe_console(void *ignore, const char *buf, size_t len) +{ + unsigned long flags; + int nlines; + + /* + * Note that KCSAN reports under a global lock, so we do not risk the + * possibility of having multiple reports interleaved. If that were the + * case, we'd expect tests to fail. + */ + + spin_lock_irqsave(&observed.lock, flags); + nlines = observed.nlines; + + if (strnstr(buf, "BUG: KCSAN: ", len) && strnstr(buf, "test_", len)) { + /* + * KCSAN report and related to the test. + * + * The provided @buf is not NUL-terminated; copy no more than + * @len bytes and let strscpy() add the missing NUL-terminator. + */ + strscpy(observed.lines[0], buf, min(len + 1, sizeof(observed.lines[0]))); + nlines = 1; + } else if ((nlines == 1 || nlines == 2) && strnstr(buf, "bytes by", len)) { + strscpy(observed.lines[nlines++], buf, min(len + 1, sizeof(observed.lines[0]))); + + if (strnstr(buf, "race at unknown origin", len)) { + if (WARN_ON(nlines != 2)) + goto out; + + /* No second line of interest. 
*/ + strcpy(observed.lines[nlines++], ""); + } + } + +out: + WRITE_ONCE(observed.nlines, nlines); /* Publish new nlines. */ + spin_unlock_irqrestore(&observed.lock, flags); +} + +/* Check if a report related to the test exists. */ +__no_kcsan +static bool report_available(void) +{ + return READ_ONCE(observed.nlines) == ARRAY_SIZE(observed.lines); +} + +/* Report information we expect in a report. */ +struct expect_report { + /* Access information of both accesses. */ + struct { + void *fn; /* Function pointer to expected function of top frame. */ + void *addr; /* Address of access; unchecked if NULL. */ + size_t size; /* Size of access; unchecked if @addr is NULL. */ + int type; /* Access type, see KCSAN_ACCESS definitions. */ + } access[2]; +}; + +/* Check observed report matches information in @r. */ +__no_kcsan +static bool report_matches(const struct expect_report *r) +{ + const bool is_assert = (r->access[0].type | r->access[1].type) & KCSAN_ACCESS_ASSERT; + bool ret = false; + unsigned long flags; + typeof(observed.lines) expect; + const char *end; + char *cur; + int i; + + /* Doubled-checked locking. */ + if (!report_available()) + return false; + + /* Generate expected report contents. */ + + /* Title */ + cur = expect[0]; + end = &expect[0][sizeof(expect[0]) - 1]; + cur += scnprintf(cur, end - cur, "BUG: KCSAN: %s in ", + is_assert ? "assert: race" : "data-race"); + if (r->access[1].fn) { + char tmp[2][64]; + int cmp; + + /* Expect lexographically sorted function names in title. */ + scnprintf(tmp[0], sizeof(tmp[0]), "%pS", r->access[0].fn); + scnprintf(tmp[1], sizeof(tmp[1]), "%pS", r->access[1].fn); + cmp = strcmp(tmp[0], tmp[1]); + cur += scnprintf(cur, end - cur, "%ps / %ps", + cmp < 0 ? r->access[0].fn : r->access[1].fn, + cmp < 0 ? r->access[1].fn : r->access[0].fn); + } else { + scnprintf(cur, end - cur, "%pS", r->access[0].fn); + /* The exact offset won't match, remove it. 
*/ + cur = strchr(expect[0], '+'); + if (cur) + *cur = '\0'; + } + + /* Access 1 */ + cur = expect[1]; + end = &expect[1][sizeof(expect[1]) - 1]; + if (!r->access[1].fn) + cur += scnprintf(cur, end - cur, "race at unknown origin, with "); + + /* Access 1 & 2 */ + for (i = 0; i < 2; ++i) { + const char *const access_type = + (r->access[i].type & KCSAN_ACCESS_ASSERT) ? + ((r->access[i].type & KCSAN_ACCESS_WRITE) ? + "assert no accesses" : + "assert no writes") : + ((r->access[i].type & KCSAN_ACCESS_WRITE) ? + "write" : + "read"); + const char *const access_type_aux = + (r->access[i].type & KCSAN_ACCESS_ATOMIC) ? + " (marked)" : + ((r->access[i].type & KCSAN_ACCESS_SCOPED) ? + " (scoped)" : + ""); + + if (i == 1) { + /* Access 2 */ + cur = expect[2]; + end = &expect[2][sizeof(expect[2]) - 1]; + + if (!r->access[1].fn) { + /* Dummy string if no second access is available. */ + strcpy(cur, ""); + break; + } + } + + cur += scnprintf(cur, end - cur, "%s%s to ", access_type, + access_type_aux); + + if (r->access[i].addr) /* Address is optional. */ + cur += scnprintf(cur, end - cur, "0x%px of %zu bytes", + r->access[i].addr, r->access[i].size); + } + + spin_lock_irqsave(&observed.lock, flags); + if (!report_available()) + goto out; /* A new report is being captured. */ + + /* Finally match expected output to what we actually observed. */ + ret = strstr(observed.lines[0], expect[0]) && + /* Access info may appear in any order. */ + ((strstr(observed.lines[1], expect[1]) && + strstr(observed.lines[2], expect[2])) || + (strstr(observed.lines[1], expect[2]) && + strstr(observed.lines[2], expect[1]))); +out: + spin_unlock_irqrestore(&observed.lock, flags); + return ret; +} + +/* ===== Test kernels ===== */ + +static long test_sink; +static long test_var; +/* @test_array should be large enough to fall into multiple watchpoint slots. 
*/ +static long test_array[3 * PAGE_SIZE / sizeof(long)]; +static struct { + long val[8]; +} test_struct; +static DEFINE_SEQLOCK(test_seqlock); + +/* + * Helper to avoid compiler optimizing out reads, and to generate source values + * for writes. + */ +__no_kcsan +static noinline void sink_value(long v) { WRITE_ONCE(test_sink, v); } + +static noinline void test_kernel_read(void) { sink_value(test_var); } + +static noinline void test_kernel_write(void) +{ + test_var = READ_ONCE_NOCHECK(test_sink) + 1; +} + +static noinline void test_kernel_write_nochange(void) { test_var = 42; } + +/* Suffixed by value-change exception filter. */ +static noinline void test_kernel_write_nochange_rcu(void) { test_var = 42; } + +static noinline void test_kernel_read_atomic(void) +{ + sink_value(READ_ONCE(test_var)); +} + +static noinline void test_kernel_write_atomic(void) +{ + WRITE_ONCE(test_var, READ_ONCE_NOCHECK(test_sink) + 1); +} + +__no_kcsan +static noinline void test_kernel_write_uninstrumented(void) { test_var++; } + +static noinline void test_kernel_data_race(void) { data_race(test_var++); } + +static noinline void test_kernel_assert_writer(void) +{ + ASSERT_EXCLUSIVE_WRITER(test_var); +} + +static noinline void test_kernel_assert_access(void) +{ + ASSERT_EXCLUSIVE_ACCESS(test_var); +} + +#define TEST_CHANGE_BITS 0xff00ff00 + +static noinline void test_kernel_change_bits(void) +{ + if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { + /* + * Avoid race of unknown origin for this test, just pretend they + * are atomic. 
+ */ + kcsan_nestable_atomic_begin(); + test_var ^= TEST_CHANGE_BITS; + kcsan_nestable_atomic_end(); + } else + WRITE_ONCE(test_var, READ_ONCE(test_var) ^ TEST_CHANGE_BITS); +} + +static noinline void test_kernel_assert_bits_change(void) +{ + ASSERT_EXCLUSIVE_BITS(test_var, TEST_CHANGE_BITS); +} + +static noinline void test_kernel_assert_bits_nochange(void) +{ + ASSERT_EXCLUSIVE_BITS(test_var, ~TEST_CHANGE_BITS); +} + +/* To check that scoped assertions do trigger anywhere in scope. */ +static noinline void test_enter_scope(void) +{ + int x = 0; + + /* Unrelated accesses to scoped assert. */ + READ_ONCE(test_sink); + kcsan_check_read(&x, sizeof(x)); +} + +static noinline void test_kernel_assert_writer_scoped(void) +{ + ASSERT_EXCLUSIVE_WRITER_SCOPED(test_var); + test_enter_scope(); +} + +static noinline void test_kernel_assert_access_scoped(void) +{ + ASSERT_EXCLUSIVE_ACCESS_SCOPED(test_var); + test_enter_scope(); +} + +static noinline void test_kernel_rmw_array(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(test_array); ++i) + test_array[i]++; +} + +static noinline void test_kernel_write_struct(void) +{ + kcsan_check_write(&test_struct, sizeof(test_struct)); + kcsan_disable_current(); + test_struct.val[3]++; /* induce value change */ + kcsan_enable_current(); +} + +static noinline void test_kernel_write_struct_part(void) +{ + test_struct.val[3] = 42; +} + +static noinline void test_kernel_read_struct_zero_size(void) +{ + kcsan_check_read(&test_struct.val[3], 0); +} + +static noinline void test_kernel_seqlock_reader(void) +{ + unsigned int seq; + + do { + seq = read_seqbegin(&test_seqlock); + sink_value(test_var); + } while (read_seqretry(&test_seqlock, seq)); +} + +static noinline void test_kernel_seqlock_writer(void) +{ + unsigned long flags; + + write_seqlock_irqsave(&test_seqlock, flags); + test_var++; + write_sequnlock_irqrestore(&test_seqlock, flags); +} + +/* ===== Test cases ===== */ + +/* Simple test with normal data race. 
*/ +__no_kcsan +static void test_basic(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + static const struct expect_report never = { + .access = { + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect = false; + bool match_never = false; + + begin_test_checks(test_kernel_write, test_kernel_read); + do { + match_expect |= report_matches(&expect); + match_never = report_matches(&never); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_TRUE(test, match_expect); + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* + * Stress KCSAN with lots of concurrent races on different addresses until + * timeout. + */ +__no_kcsan +static void test_concurrent_races(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + /* NULL will match any address. */ + { test_kernel_rmw_array, NULL, 0, KCSAN_ACCESS_WRITE }, + { test_kernel_rmw_array, NULL, 0, 0 }, + }, + }; + static const struct expect_report never = { + .access = { + { test_kernel_rmw_array, NULL, 0, 0 }, + { test_kernel_rmw_array, NULL, 0, 0 }, + }, + }; + bool match_expect = false; + bool match_never = false; + + begin_test_checks(test_kernel_rmw_array, test_kernel_rmw_array); + do { + match_expect |= report_matches(&expect); + match_never |= report_matches(&never); + } while (!end_test_checks(false)); + KUNIT_EXPECT_TRUE(test, match_expect); /* Sanity check matches exist. */ + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* Test the KCSAN_REPORT_VALUE_CHANGE_ONLY option. 
*/ +__no_kcsan +static void test_novalue_change(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_nochange, test_kernel_read); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY)) + KUNIT_EXPECT_FALSE(test, match_expect); + else + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* + * Test that the rules where the KCSAN_REPORT_VALUE_CHANGE_ONLY option should + * never apply work. + */ +__no_kcsan +static void test_novalue_change_exception(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_nochange_rcu, test_kernel_read); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* Test that data races of unknown origin are reported. */ +__no_kcsan +static void test_unknown_origin(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + { NULL }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_uninstrumented, test_kernel_read); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN)) + KUNIT_EXPECT_TRUE(test, match_expect); + else + KUNIT_EXPECT_FALSE(test, match_expect); +} + +/* Test KCSAN_ASSUME_PLAIN_WRITES_ATOMIC if it is selected. 
*/ +__no_kcsan +static void test_write_write_assume_atomic(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write, test_kernel_write); + do { + sink_value(READ_ONCE(test_var)); /* induce value-change */ + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC)) + KUNIT_EXPECT_FALSE(test, match_expect); + else + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* + * Test that data races with writes larger than word-size are always reported, + * even if KCSAN_ASSUME_PLAIN_WRITES_ATOMIC is selected. + */ +__no_kcsan +static void test_write_write_struct(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_struct, test_kernel_write_struct); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* + * Test that data races where only one write is larger than word-size are always + * reported, even if KCSAN_ASSUME_PLAIN_WRITES_ATOMIC is selected. 
+ */ +__no_kcsan +static void test_write_write_struct_part(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + { test_kernel_write_struct_part, &test_struct.val[3], sizeof(test_struct.val[3]), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_struct, test_kernel_write_struct_part); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* Test that races with atomic accesses never result in reports. */ +__no_kcsan +static void test_read_atomic_write_atomic(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_read_atomic, test_kernel_write_atomic); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* Test that a race with an atomic and plain access result in reports. */ +__no_kcsan +static void test_read_plain_atomic_write(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + { test_kernel_write_atomic, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC }, + }, + }; + bool match_expect = false; + + if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) + return; + + begin_test_checks(test_kernel_read, test_kernel_write_atomic); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* Zero-sized accesses should never cause data race reports. 
*/ +__no_kcsan +static void test_zero_size_access(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + }, + }; + const struct expect_report never = { + .access = { + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + { test_kernel_read_struct_zero_size, &test_struct.val[3], 0, 0 }, + }, + }; + bool match_expect = false; + bool match_never = false; + + begin_test_checks(test_kernel_write_struct, test_kernel_read_struct_zero_size); + do { + match_expect |= report_matches(&expect); + match_never = report_matches(&never); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_TRUE(test, match_expect); /* Sanity check. */ + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* Test the data_race() macro. */ +__no_kcsan +static void test_data_race(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_data_race, test_kernel_data_race); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +__no_kcsan +static void test_assert_exclusive_writer(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, + { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_assert_writer, test_kernel_write_nochange); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +__no_kcsan +static void test_assert_exclusive_access(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_assert_access, &test_var, sizeof(test_var), 
KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_assert_access, test_kernel_read); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +__no_kcsan +static void test_assert_exclusive_access_writer(struct kunit *test) +{ + const struct expect_report expect_access_writer = { + .access = { + { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, + { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, + }, + }; + const struct expect_report expect_access_access = { + .access = { + { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, + { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, + }, + }; + const struct expect_report never = { + .access = { + { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, + { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, + }, + }; + bool match_expect_access_writer = false; + bool match_expect_access_access = false; + bool match_never = false; + + begin_test_checks(test_kernel_assert_access, test_kernel_assert_writer); + do { + match_expect_access_writer |= report_matches(&expect_access_writer); + match_expect_access_access |= report_matches(&expect_access_access); + match_never |= report_matches(&never); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_TRUE(test, match_expect_access_writer); + KUNIT_EXPECT_TRUE(test, match_expect_access_access); + KUNIT_EXPECT_FALSE(test, match_never); +} + +__no_kcsan +static void test_assert_exclusive_bits_change(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_assert_bits_change, &test_var, sizeof(test_var), 
KCSAN_ACCESS_ASSERT }, + { test_kernel_change_bits, &test_var, sizeof(test_var), + KCSAN_ACCESS_WRITE | (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) ? 0 : KCSAN_ACCESS_ATOMIC) }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_assert_bits_change, test_kernel_change_bits); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +__no_kcsan +static void test_assert_exclusive_bits_nochange(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_assert_bits_nochange, test_kernel_change_bits); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +__no_kcsan +static void test_assert_exclusive_writer_scoped(struct kunit *test) +{ + const struct expect_report expect_start = { + .access = { + { test_kernel_assert_writer_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED }, + { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + }, + }; + const struct expect_report expect_anywhere = { + .access = { + { test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED }, + { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect_start = false; + bool match_expect_anywhere = false; + + begin_test_checks(test_kernel_assert_writer_scoped, test_kernel_write_nochange); + do { + match_expect_start |= report_matches(&expect_start); + match_expect_anywhere |= report_matches(&expect_anywhere); + } while (!end_test_checks(match_expect_start && match_expect_anywhere)); + KUNIT_EXPECT_TRUE(test, match_expect_start); + KUNIT_EXPECT_TRUE(test, match_expect_anywhere); +} + +__no_kcsan +static void test_assert_exclusive_access_scoped(struct kunit *test) +{ + const struct expect_report expect_start1 = { + .access = { + { 
test_kernel_assert_access_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + const struct expect_report expect_start2 = { + .access = { expect_start1.access[0], expect_start1.access[0] }, + }; + const struct expect_report expect_inscope = { + .access = { + { test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect_start = false; + bool match_expect_inscope = false; + + begin_test_checks(test_kernel_assert_access_scoped, test_kernel_read); + end_time += msecs_to_jiffies(1000); /* This test requires a bit more time. */ + do { + match_expect_start |= report_matches(&expect_start1) || report_matches(&expect_start2); + match_expect_inscope |= report_matches(&expect_inscope); + } while (!end_test_checks(match_expect_start && match_expect_inscope)); + KUNIT_EXPECT_TRUE(test, match_expect_start); + KUNIT_EXPECT_TRUE(test, match_expect_inscope); +} + +/* Test that racing accesses in seqlock critical sections are not reported. */ +__no_kcsan +static void test_seqlock_noreport(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_seqlock_reader, test_kernel_seqlock_writer); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* + * Each test case is run with different numbers of threads. Until KUnit supports + * passing arguments for each test case, we encode #threads in the test case + * name (read by get_num_threads()). [The '-' was chosen as a stylistic + * preference to separate test name and #threads.] + * + * The thread counts are chosen to cover potentially interesting boundaries and + * corner cases (range 2-5), and then stress the system with larger counts. 
+ */ +#define KCSAN_KUNIT_CASE(test_name) \ + { .run_case = test_name, .name = #test_name "-02" }, \ + { .run_case = test_name, .name = #test_name "-03" }, \ + { .run_case = test_name, .name = #test_name "-04" }, \ + { .run_case = test_name, .name = #test_name "-05" }, \ + { .run_case = test_name, .name = #test_name "-08" }, \ + { .run_case = test_name, .name = #test_name "-16" } + +static struct kunit_case kcsan_test_cases[] = { + KCSAN_KUNIT_CASE(test_basic), + KCSAN_KUNIT_CASE(test_concurrent_races), + KCSAN_KUNIT_CASE(test_novalue_change), + KCSAN_KUNIT_CASE(test_novalue_change_exception), + KCSAN_KUNIT_CASE(test_unknown_origin), + KCSAN_KUNIT_CASE(test_write_write_assume_atomic), + KCSAN_KUNIT_CASE(test_write_write_struct), + KCSAN_KUNIT_CASE(test_write_write_struct_part), + KCSAN_KUNIT_CASE(test_read_atomic_write_atomic), + KCSAN_KUNIT_CASE(test_read_plain_atomic_write), + KCSAN_KUNIT_CASE(test_zero_size_access), + KCSAN_KUNIT_CASE(test_data_race), + KCSAN_KUNIT_CASE(test_assert_exclusive_writer), + KCSAN_KUNIT_CASE(test_assert_exclusive_access), + KCSAN_KUNIT_CASE(test_assert_exclusive_access_writer), + KCSAN_KUNIT_CASE(test_assert_exclusive_bits_change), + KCSAN_KUNIT_CASE(test_assert_exclusive_bits_nochange), + KCSAN_KUNIT_CASE(test_assert_exclusive_writer_scoped), + KCSAN_KUNIT_CASE(test_assert_exclusive_access_scoped), + KCSAN_KUNIT_CASE(test_seqlock_noreport), + {}, +}; + +/* ===== End test cases ===== */ + +/* Get number of threads encoded in test name. */ +static bool __no_kcsan +get_num_threads(const char *test, int *nthreads) +{ + int len = strlen(test); + + if (WARN_ON(len < 3)) + return false; + + *nthreads = test[len - 1] - '0'; + *nthreads += (test[len - 2] - '0') * 10; + + if (WARN_ON(*nthreads < 0)) + return false; + + return true; +} + +/* Concurrent accesses from interrupts. 
*/ +__no_kcsan +static void access_thread_timer(struct timer_list *timer) +{ + static atomic_t cnt = ATOMIC_INIT(0); + unsigned int idx; + void (*func)(void); + + idx = (unsigned int)atomic_inc_return(&cnt) % ARRAY_SIZE(access_kernels); + /* Acquire potential initialization. */ + func = smp_load_acquire(&access_kernels[idx]); + if (func) + func(); +} + +/* The main loop for each thread. */ +__no_kcsan +static int access_thread(void *arg) +{ + struct timer_list timer; + unsigned int cnt = 0; + unsigned int idx; + void (*func)(void); + + timer_setup_on_stack(&timer, access_thread_timer, 0); + do { + might_sleep(); + + if (!timer_pending(&timer)) + mod_timer(&timer, jiffies + 1); + else { + /* Iterate through all kernels. */ + idx = cnt++ % ARRAY_SIZE(access_kernels); + /* Acquire potential initialization. */ + func = smp_load_acquire(&access_kernels[idx]); + if (func) + func(); + } + } while (!torture_must_stop()); + del_timer_sync(&timer); + destroy_timer_on_stack(&timer); + + torture_kthread_stopping("access_thread"); + return 0; +} + +__no_kcsan +static int test_init(struct kunit *test) +{ + unsigned long flags; + int nthreads; + int i; + + spin_lock_irqsave(&observed.lock, flags); + for (i = 0; i < ARRAY_SIZE(observed.lines); ++i) + observed.lines[i][0] = '\0'; + observed.nlines = 0; + spin_unlock_irqrestore(&observed.lock, flags); + + if (!torture_init_begin((char *)test->name, 1)) + return -EBUSY; + + if (!get_num_threads(test->name, &nthreads)) + goto err; + + if (WARN_ON(threads)) + goto err; + + for (i = 0; i < ARRAY_SIZE(access_kernels); ++i) { + if (WARN_ON(access_kernels[i])) + goto err; + } + + if (!IS_ENABLED(CONFIG_PREEMPT) || !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) { + /* + * Without any preemption, keep 2 CPUs free for other tasks, one + * of which is the main test case function checking for + * completion or failure. + */ + const int min_unused_cpus = IS_ENABLED(CONFIG_PREEMPT_NONE) ? 
2 : 0; + const int min_required_cpus = 2 + min_unused_cpus; + + if (num_online_cpus() < min_required_cpus) { + pr_err("%s: too few online CPUs (%u < %d) for test", + test->name, num_online_cpus(), min_required_cpus); + goto err; + } else if (nthreads > num_online_cpus() - min_unused_cpus) { + nthreads = num_online_cpus() - min_unused_cpus; + pr_warn("%s: limiting number of threads to %d\n", + test->name, nthreads); + } + } + + if (nthreads) { + threads = kcalloc(nthreads + 1, sizeof(struct task_struct *), + GFP_KERNEL); + if (WARN_ON(!threads)) + goto err; + + threads[nthreads] = NULL; + for (i = 0; i < nthreads; ++i) { + if (torture_create_kthread(access_thread, NULL, + threads[i])) + goto err; + } + } + + torture_init_end(); + + return 0; + +err: + kfree(threads); + threads = NULL; + torture_init_end(); + return -EINVAL; +} + +__no_kcsan +static void test_exit(struct kunit *test) +{ + struct task_struct **stop_thread; + int i; + + if (torture_cleanup_begin()) + return; + + for (i = 0; i < ARRAY_SIZE(access_kernels); ++i) + WRITE_ONCE(access_kernels[i], NULL); + + if (threads) { + for (stop_thread = threads; *stop_thread; stop_thread++) + torture_stop_kthread(reader_thread, *stop_thread); + + kfree(threads); + threads = NULL; + } + + torture_cleanup_end(); +} + +static struct kunit_suite kcsan_test_suite = { + .name = "kcsan-test", + .test_cases = kcsan_test_cases, + .init = test_init, + .exit = test_exit, +}; +static struct kunit_suite *kcsan_test_suites[] = { &kcsan_test_suite, NULL }; + +__no_kcsan +static void register_tracepoints(struct tracepoint *tp, void *ignore) +{ + check_trace_callback_type_console(probe_console); + if (!strcmp(tp->name, "console")) + WARN_ON(tracepoint_probe_register(tp, probe_console, NULL)); +} + +__no_kcsan +static void unregister_tracepoints(struct tracepoint *tp, void *ignore) +{ + if (!strcmp(tp->name, "console")) + tracepoint_probe_unregister(tp, probe_console, NULL); +} + +/* + * We only want to do tracepoints setup and 
teardown once, therefore we have to + * customize the init and exit functions and cannot rely on kunit_test_suite(). + */ +static int __init kcsan_test_init(void) +{ + /* + * Because we want to be able to build the test as a module, we need to + * iterate through all known tracepoints, since the static registration + * won't work here. + */ + for_each_kernel_tracepoint(register_tracepoints, NULL); + return __kunit_test_suites_init(kcsan_test_suites); +} + +static void kcsan_test_exit(void) +{ + __kunit_test_suites_exit(kcsan_test_suites); + for_each_kernel_tracepoint(unregister_tracepoints, NULL); + tracepoint_synchronize_unregister(); +} + +late_initcall(kcsan_test_init); +module_exit(kcsan_test_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Marco Elver "); diff --git a/lib/Kconfig.kcsan b/lib/Kconfig.kcsan index 5ee88e5119c2..3f3b5bca7a8f 100644 --- a/lib/Kconfig.kcsan +++ b/lib/Kconfig.kcsan @@ -59,7 +59,28 @@ config KCSAN_SELFTEST bool "Perform short selftests on boot" default y help - Run KCSAN selftests on boot. On test failure, causes the kernel to panic. + Run KCSAN selftests on boot. On test failure, causes the kernel to + panic. Recommended to be enabled, ensuring critical functionality + works as intended. + +config KCSAN_TEST + tristate "KCSAN test for integrated runtime behaviour" + depends on TRACEPOINTS && KUNIT + select TORTURE_TEST + help + KCSAN test focusing on behaviour of the integrated runtime. Tests + various race scenarios, and verifies the reports generated to + console. Makes use of KUnit for test organization, and the Torture + framework for test thread control. + + Each test case may run at least up to KCSAN_REPORT_ONCE_IN_MS + milliseconds. Test run duration may be optimized by building the + kernel and KCSAN test with KCSAN_REPORT_ONCE_IN_MS set to a lower + than default value. 
+ + Say Y here if you want the test to be built into the kernel and run + during boot; say M if you want the test to build as a module; say N + if you are unsure. config KCSAN_EARLY_ENABLE bool "Early enable during boot" From 33190b675ce2eacbeb4e75168c05b41110b506ec Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 11 Feb 2020 08:54:15 -0500 Subject: [PATCH 190/502] locking/osq_lock: Annotate a data race in osq_lock The prev->next pointer can be accessed concurrently as noticed by KCSAN: write (marked) to 0xffff9d3370dbbe40 of 8 bytes by task 3294 on cpu 107: osq_lock+0x25f/0x350 osq_wait_next at kernel/locking/osq_lock.c:79 (inlined by) osq_lock at kernel/locking/osq_lock.c:185 rwsem_optimistic_spin read to 0xffff9d3370dbbe40 of 8 bytes by task 3398 on cpu 100: osq_lock+0x196/0x350 osq_lock at kernel/locking/osq_lock.c:157 rwsem_optimistic_spin The write only stores NULL to prev->next, and the read tests whether prev->next equals this_cpu_ptr(&osq_node). Therefore, even if the value is shattered, the code still works correctly. Thus, mark it as an intentional data race using the data_race() macro. Signed-off-by: Qian Cai Signed-off-by: Paul E. McKenney --- kernel/locking/osq_lock.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index 1f7734949ac8..1de006ed3aa8 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -154,7 +154,11 @@ bool osq_lock(struct optimistic_spin_queue *lock) */ for (;;) { - if (prev->next == node && + /* + * cpu_relax() below implies a compiler barrier which would + * prevent this comparison being optimized away.
+ */ + if (data_race(prev->next) == node && cmpxchg(&prev->next, node, NULL) == node) break; From 2888557f68db334a3839dcc262264a4c436f576b Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 2 Jun 2020 16:36:33 +0200 Subject: [PATCH 191/502] kcsan: Prefer '__no_kcsan inline' in test Instead of __no_kcsan_or_inline, prefer '__no_kcsan inline' in test -- this is in case we decide to remove __no_kcsan_or_inline. Suggested-by: Peter Zijlstra Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/kcsan-test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan-test.c index a8c11506dd2a..3af420ad6ee7 100644 --- a/kernel/kcsan/kcsan-test.c +++ b/kernel/kcsan/kcsan-test.c @@ -43,7 +43,7 @@ static struct { }; /* Setup test checking loop. */ -static __no_kcsan_or_inline void +static __no_kcsan inline void begin_test_checks(void (*func1)(void), void (*func2)(void)) { kcsan_disable_current(); @@ -60,7 +60,7 @@ begin_test_checks(void (*func1)(void), void (*func2)(void)) } /* End test checking loop. */ -static __no_kcsan_or_inline bool +static __no_kcsan inline bool end_test_checks(bool stop) { if (!stop && time_before(jiffies, end_time)) { From 9dd979bae4cf76558ff816abe83283308fb1ae8c Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 16 Jun 2020 14:36:22 +0200 Subject: [PATCH 192/502] kcsan: Silence -Wmissing-prototypes warning with W=1 The functions here should not be forward declared for explicit use elsewhere in the kernel, as they should only be emitted by the compiler due to sanitizer instrumentation. Add forward declarations a line above their definition to shut up warnings in W=1 builds. Link: https://lkml.kernel.org/r/202006060103.jSCpnV1g%lkp@intel.com Reported-by: kernel test robot Signed-off-by: Marco Elver Signed-off-by: Paul E. 
McKenney --- kernel/kcsan/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 15f67949d11e..1866bafda4fd 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -754,6 +754,7 @@ EXPORT_SYMBOL(__kcsan_check_access); */ #define DEFINE_TSAN_READ_WRITE(size) \ + void __tsan_read##size(void *ptr); \ void __tsan_read##size(void *ptr) \ { \ check_access(ptr, size, 0); \ @@ -762,6 +763,7 @@ EXPORT_SYMBOL(__kcsan_check_access); void __tsan_unaligned_read##size(void *ptr) \ __alias(__tsan_read##size); \ EXPORT_SYMBOL(__tsan_unaligned_read##size); \ + void __tsan_write##size(void *ptr); \ void __tsan_write##size(void *ptr) \ { \ check_access(ptr, size, KCSAN_ACCESS_WRITE); \ @@ -777,12 +779,14 @@ DEFINE_TSAN_READ_WRITE(4); DEFINE_TSAN_READ_WRITE(8); DEFINE_TSAN_READ_WRITE(16); +void __tsan_read_range(void *ptr, size_t size); void __tsan_read_range(void *ptr, size_t size) { check_access(ptr, size, 0); } EXPORT_SYMBOL(__tsan_read_range); +void __tsan_write_range(void *ptr, size_t size); void __tsan_write_range(void *ptr, size_t size) { check_access(ptr, size, KCSAN_ACCESS_WRITE); @@ -799,6 +803,7 @@ EXPORT_SYMBOL(__tsan_write_range); * the size-check of compiletime_assert_rwonce_type(). */ #define DEFINE_TSAN_VOLATILE_READ_WRITE(size) \ + void __tsan_volatile_read##size(void *ptr); \ void __tsan_volatile_read##size(void *ptr) \ { \ const bool is_atomic = size <= sizeof(long long) && \ @@ -811,6 +816,7 @@ EXPORT_SYMBOL(__tsan_write_range); void __tsan_unaligned_volatile_read##size(void *ptr) \ __alias(__tsan_volatile_read##size); \ EXPORT_SYMBOL(__tsan_unaligned_volatile_read##size); \ + void __tsan_volatile_write##size(void *ptr); \ void __tsan_volatile_write##size(void *ptr) \ { \ const bool is_atomic = size <= sizeof(long long) && \ @@ -836,14 +842,17 @@ DEFINE_TSAN_VOLATILE_READ_WRITE(16); * The below are not required by KCSAN, but can still be emitted by the * compiler. 
*/ +void __tsan_func_entry(void *call_pc); void __tsan_func_entry(void *call_pc) { } EXPORT_SYMBOL(__tsan_func_entry); +void __tsan_func_exit(void); void __tsan_func_exit(void) { } EXPORT_SYMBOL(__tsan_func_exit); +void __tsan_init(void); void __tsan_init(void) { } From acfa087ccf2d2eff46186477f53e4c3ffbdb033d Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 16 Jun 2020 14:36:23 +0200 Subject: [PATCH 193/502] kcsan: Rename test.c to selftest.c Rename 'test.c' to 'selftest.c' to better reflect its purpose (Kconfig variable and code inside already match this). This is to avoid confusion with the test suite module in 'kcsan-test.c'. No functional change. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/Makefile | 2 +- kernel/kcsan/{test.c => selftest.c} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename kernel/kcsan/{test.c => selftest.c} (100%) diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index 14533cf24bc3..092ce58d2e56 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -11,7 +11,7 @@ CFLAGS_core.o := $(call cc-option,-fno-conserve-stack,) \ $(call cc-option,-fno-stack-protector,) obj-y := core.o debugfs.o report.o -obj-$(CONFIG_KCSAN_SELFTEST) += test.o +obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o CFLAGS_kcsan-test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer obj-$(CONFIG_KCSAN_TEST) += kcsan-test.o diff --git a/kernel/kcsan/test.c b/kernel/kcsan/selftest.c similarity index 100% rename from kernel/kcsan/test.c rename to kernel/kcsan/selftest.c From 7e766560e6e2c1cf2782f00e63c31564e4c9f0fe Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 16 Jun 2020 14:36:24 +0200 Subject: [PATCH 194/502] kcsan: Remove existing special atomic rules Remove existing special atomic rules from kcsan_is_atomic_special() because they are no longer needed. Since we rely on the compiler emitting instrumentation distinguishing volatile accesses, the rules have become redundant. 
Let's keep kcsan_is_atomic_special() around, so that we have an obvious place to add special rules should the need arise in future. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/atomic.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h index be9e625227f3..75fe701f4127 100644 --- a/kernel/kcsan/atomic.h +++ b/kernel/kcsan/atomic.h @@ -3,8 +3,7 @@ #ifndef _KERNEL_KCSAN_ATOMIC_H #define _KERNEL_KCSAN_ATOMIC_H -#include <linux/jiffies.h> -#include <linux/sched.h> +#include <linux/types.h> /* * Special rules for certain memory where concurrent conflicting accesses are @@ -13,8 +12,7 @@ */ static bool kcsan_is_atomic_special(const volatile void *ptr) { - /* volatile globals that have been observed in data races. */ - return ptr == &jiffies || ptr == &current->state; + return false; } #endif /* _KERNEL_KCSAN_ATOMIC_H */ From 56b031f0abf55254d47a329010574733fa9a27b8 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 16 Jun 2020 14:36:25 +0200 Subject: [PATCH 195/502] kcsan: Add jiffies test to test suite Add a test to ensure that neither KCSAN nor the compiler gets confused about accesses to jiffies on different architectures. Signed-off-by: Marco Elver Signed-off-by: Paul E.
McKenney --- kernel/kcsan/kcsan-test.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan-test.c index 3af420ad6ee7..fed6fcb5768c 100644 --- a/kernel/kcsan/kcsan-test.c +++ b/kernel/kcsan/kcsan-test.c @@ -366,6 +366,11 @@ static noinline void test_kernel_read_struct_zero_size(void) kcsan_check_read(&test_struct.val[3], 0); } +static noinline void test_kernel_jiffies_reader(void) +{ + sink_value((long)jiffies); +} + static noinline void test_kernel_seqlock_reader(void) { unsigned int seq; @@ -817,6 +822,23 @@ static void test_assert_exclusive_access_scoped(struct kunit *test) KUNIT_EXPECT_TRUE(test, match_expect_inscope); } +/* + * jiffies is special (declared to be volatile) and its accesses are typically + * not marked; this test ensures that the compiler nor KCSAN gets confused about + * jiffies's declaration on different architectures. + */ +__no_kcsan +static void test_jiffies_noreport(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_jiffies_reader, test_kernel_jiffies_reader); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + /* Test that racing accesses in seqlock critical sections are not reported. */ __no_kcsan static void test_seqlock_noreport(struct kunit *test) @@ -867,6 +889,7 @@ static struct kunit_case kcsan_test_cases[] = { KCSAN_KUNIT_CASE(test_assert_exclusive_bits_nochange), KCSAN_KUNIT_CASE(test_assert_exclusive_writer_scoped), KCSAN_KUNIT_CASE(test_assert_exclusive_access_scoped), + KCSAN_KUNIT_CASE(test_jiffies_noreport), KCSAN_KUNIT_CASE(test_seqlock_noreport), {}, }; From e68dcd8eac63cf14745df0dc872ea479df8ed4b9 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 18 Jun 2020 11:31:16 +0200 Subject: [PATCH 196/502] kcsan: Re-add GCC as a supported compiler GCC version 11 recently implemented all requirements to correctly support KCSAN: 1. 
Correct no_sanitize-attribute inlining behaviour: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=4089df8ef4a63126b0774c39b6638845244c20d2 2. --param=tsan-distinguish-volatile https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=ab2789ec507a94f1a75a6534bca51c7b39037ce0 3. --param=tsan-instrument-func-entry-exit https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=06712fc68dc9843d9af7c7ac10047f49d305ad76 Therefore, we can re-enable GCC for KCSAN, and document the new compiler requirements. Signed-off-by: Marco Elver Cc: Martin Liska Signed-off-by: Paul E. McKenney --- Documentation/dev-tools/kcsan.rst | 3 ++- lib/Kconfig.kcsan | 3 ++- scripts/Makefile.kcsan | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Documentation/dev-tools/kcsan.rst b/Documentation/dev-tools/kcsan.rst index b38379f06194..be7a0b0e1f28 100644 --- a/Documentation/dev-tools/kcsan.rst +++ b/Documentation/dev-tools/kcsan.rst @@ -8,7 +8,8 @@ approach to detect races. KCSAN's primary purpose is to detect `data races`_. Usage ----- -KCSAN requires Clang version 11 or later. +KCSAN is supported by both GCC and Clang. With GCC we require version 11 or +later, and with Clang also require version 11 or later. To enable KCSAN configure the kernel with:: diff --git a/lib/Kconfig.kcsan b/lib/Kconfig.kcsan index 3f3b5bca7a8f..3d282d51849b 100644 --- a/lib/Kconfig.kcsan +++ b/lib/Kconfig.kcsan @@ -4,7 +4,8 @@ config HAVE_ARCH_KCSAN bool config HAVE_KCSAN_COMPILER - def_bool CC_IS_CLANG && $(cc-option,-fsanitize=thread -mllvm -tsan-distinguish-volatile=1) + def_bool (CC_IS_CLANG && $(cc-option,-fsanitize=thread -mllvm -tsan-distinguish-volatile=1)) || \ + (CC_IS_GCC && $(cc-option,-fsanitize=thread --param tsan-distinguish-volatile=1)) help For the list of compilers that support KCSAN, please see <file:Documentation/dev-tools/kcsan.rst>.
diff --git a/scripts/Makefile.kcsan b/scripts/Makefile.kcsan index bd4da1af5953..dd66206f4578 100644 --- a/scripts/Makefile.kcsan +++ b/scripts/Makefile.kcsan @@ -6,7 +6,7 @@ ifdef CONFIG_KCSAN ifdef CONFIG_CC_IS_CLANG cc-param = -mllvm -$(1) else -cc-param = --param -$(1) +cc-param = --param $(1) endif # Keep most options here optional, to allow enabling more compilers if absence From 2839a232071f588d334543fb86f5689b43353842 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 18 Jun 2020 11:31:17 +0200 Subject: [PATCH 197/502] kcsan: Simplify compiler flags Simplify the set of compiler flags for the runtime by removing cc-option from -fno-stack-protector, because all supported compilers support it. This saves us one compiler invocation during build. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index 092ce58d2e56..fea064afc4f7 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -7,8 +7,8 @@ CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE) -CFLAGS_core.o := $(call cc-option,-fno-conserve-stack,) \ - $(call cc-option,-fno-stack-protector,) +CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \ + -fno-stack-protector obj-y := core.o debugfs.o report.o obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o From 61d56d7aa5eca3b909bce51ba8125b0fa44d7e17 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 18 Jun 2020 11:31:18 +0200 Subject: [PATCH 198/502] kcsan: Disable branch tracing in core runtime Disable branch tracing in core KCSAN runtime if branches are being traced (TRACE_BRANCH_PROFILING). This is to avoid its performance impact, but also to avoid recursion in case KCSAN is enabled for the branch tracing runtime.
The latter had already been a problem for KASAN: https://lore.kernel.org/lkml/CANpmjNOeXmD5E3O50Z3MjkiuCYaYOPyi+1rq=GZvEKwBvLR0Ug@mail.gmail.com/ Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index fea064afc4f7..65ca5539c470 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -8,7 +8,7 @@ CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE) CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \ - -fno-stack-protector + -fno-stack-protector -DDISABLE_BRANCH_PROFILING obj-y := core.o debugfs.o report.o obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o From 38908de90a8c24c949505958f1d09812bb3b64aa Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 22 Jan 2020 13:38:57 -0800 Subject: [PATCH 199/502] tools/memory-model: Add recent references This commit updates the list of LKMM-related publications in Documentation/references.txt. Signed-off-by: Paul E. McKenney Acked-by: Andrea Parri --- .../memory-model/Documentation/references.txt | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/tools/memory-model/Documentation/references.txt b/tools/memory-model/Documentation/references.txt index b177f3e4a614..ecbbaa5396d4 100644 --- a/tools/memory-model/Documentation/references.txt +++ b/tools/memory-model/Documentation/references.txt @@ -73,6 +73,18 @@ o Christopher Pulte, Shaked Flur, Will Deacon, Jon French, Linux-kernel memory model ========================= +o Jade Alglave, Will Deacon, Boqun Feng, David Howells, Daniel + Lustig, Luc Maranget, Paul E. McKenney, Andrea Parri, Nicholas + Piggin, Alan Stern, Akira Yokosawa, and Peter Zijlstra. + 2019. "Calibrating your fear of big bad optimizing compilers" + Linux Weekly News. 
https://lwn.net/Articles/799218/ + +o Jade Alglave, Will Deacon, Boqun Feng, David Howells, Daniel + Lustig, Luc Maranget, Paul E. McKenney, Andrea Parri, Nicholas + Piggin, Alan Stern, Akira Yokosawa, and Peter Zijlstra. + 2019. "Who's afraid of a big bad optimizing compiler?" + Linux Weekly News. https://lwn.net/Articles/793253/ + o Jade Alglave, Luc Maranget, Paul E. McKenney, Andrea Parri, and Alan Stern. 2018. "Frightening small children and disconcerting grown-ups: Concurrency in the Linux kernel". In Proceedings of @@ -88,6 +100,11 @@ o Jade Alglave, Luc Maranget, Paul E. McKenney, Andrea Parri, and Alan Stern. 2017. "A formal kernel memory-ordering model (part 2)" Linux Weekly News. https://lwn.net/Articles/720550/ +o Jade Alglave, Luc Maranget, Paul E. McKenney, Andrea Parri, and + Alan Stern. 2017-2019. "A Formal Model of Linux-Kernel Memory + Ordering" (backup material for the LWN articles) + https://mirrors.edge.kernel.org/pub/linux/kernel/people/paulmck/LWNLinuxMM/ + Memory-model tooling ==================== @@ -110,5 +127,5 @@ Memory-model comparisons ======================== o Paul E. McKenney, Ulrich Weigand, Andrea Parri, and Boqun - Feng. 2016. "Linux-Kernel Memory Model". (6 June 2016). - http://open-std.org/JTC1/SC22/WG21/docs/papers/2016/p0124r2.html. + Feng. 2018. "Linux-Kernel Memory Model". (27 September 2018). + http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0124r6.html. From c1b14609013a6b4c4b2d73583bde645540ebd9b7 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 2 Mar 2020 18:21:01 +0100 Subject: [PATCH 200/502] tools/memory-model: Fix "conflict" definition The definition of "conflict" should not include the type of access nor whether the accesses are concurrent or not, which this patch addresses. The definition of "data race" remains unchanged. 
The definition of "conflict" as we know it and is cited by various papers on memory consistency models appeared in [1]: "Two accesses to the same variable conflict if at least one is a write; two operations conflict if they execute conflicting accesses." The LKMM as well as the C11 memory model are adaptations of data-race-free, which are based on the work in [2]. Necessarily, we need both conflicting data operations (plain) and synchronization operations (marked). For example, C11's definition is based on [3], which defines a "data race" as: "Two memory operations conflict if they access the same memory location, and at least one of them is a store, atomic store, or atomic read-modify-write operation. In a sequentially consistent execution, two memory operations from different threads form a type 1 data race if they conflict, at least one of them is a data operation, and they are adjacent in Co-developed-by: Alan Stern Signed-off-by: Alan Stern Acked-by: Andrea Parri Signed-off-by: Paul E. McKenney --- .../Documentation/explanation.txt | 83 ++++++++++--------- 1 file changed, 45 insertions(+), 38 deletions(-) diff --git a/tools/memory-model/Documentation/explanation.txt b/tools/memory-model/Documentation/explanation.txt index e91a2eb19592..993f800659c6 100644 --- a/tools/memory-model/Documentation/explanation.txt +++ b/tools/memory-model/Documentation/explanation.txt @@ -1987,28 +1987,36 @@ outcome undefined. In technical terms, the compiler is allowed to assume that when the program executes, there will not be any data races. A "data race" -occurs when two conflicting memory accesses execute concurrently; -two memory accesses "conflict" if: +occurs when there are two memory accesses such that: - they access the same location, +1. they access the same location, - they occur on different CPUs (or in different threads on the - same CPU), +2. at least one of them is a store, - at least one of them is a plain access, +3. 
at least one of them is plain, - and at least one of them is a store. +4. they occur on different CPUs (or in different threads on the + same CPU), and -The LKMM tries to determine whether a program contains two conflicting -accesses which may execute concurrently; if it does then the LKMM says -there is a potential data race and makes no predictions about the -program's outcome. +5. they execute concurrently. -Determining whether two accesses conflict is easy; you can see that -all the concepts involved in the definition above are already part of -the memory model. The hard part is telling whether they may execute -concurrently. The LKMM takes a conservative attitude, assuming that -accesses may be concurrent unless it can prove they cannot. +In the literature, two accesses are said to "conflict" if they satisfy +1 and 2 above. We'll go a little farther and say that two accesses +are "race candidates" if they satisfy 1 - 4. Thus, whether or not two +race candidates actually do race in a given execution depends on +whether they are concurrent. + +The LKMM tries to determine whether a program contains race candidates +which may execute concurrently; if it does then the LKMM says there is +a potential data race and makes no predictions about the program's +outcome. + +Determining whether two accesses are race candidates is easy; you can +see that all the concepts involved in the definition above are already +part of the memory model. The hard part is telling whether they may +execute concurrently. The LKMM takes a conservative attitude, +assuming that accesses may be concurrent unless it can prove they +are not. If two memory accesses aren't concurrent then one must execute before the other. Therefore the LKMM decides two accesses aren't concurrent @@ -2171,8 +2179,8 @@ again, now using plain accesses for buf: } This program does not contain a data race. 
Although the U and V -accesses conflict, the LKMM can prove they are not concurrent as -follows: +accesses are race candidates, the LKMM can prove they are not +concurrent as follows: The smp_wmb() fence in P0 is both a compiler barrier and a cumul-fence. It guarantees that no matter what hash of @@ -2326,12 +2334,11 @@ could now perform the load of x before the load of ptr (there might be a control dependency but no address dependency at the machine level). Finally, it turns out there is a situation in which a plain write does -not need to be w-post-bounded: when it is separated from the -conflicting access by a fence. At first glance this may seem -impossible. After all, to be conflicting the second access has to be -on a different CPU from the first, and fences don't link events on -different CPUs. Well, normal fences don't -- but rcu-fence can! -Here's an example: +not need to be w-post-bounded: when it is separated from the other +race-candidate access by a fence. At first glance this may seem +impossible. After all, to be race candidates the two accesses must +be on different CPUs, and fences don't link events on different CPUs. +Well, normal fences don't -- but rcu-fence can! Here's an example: int x, y; @@ -2367,7 +2374,7 @@ concurrent and there is no race, even though P1's plain store to y isn't w-post-bounded by any marked accesses. Putting all this material together yields the following picture. For -two conflicting stores W and W', where W ->co W', the LKMM says the +race-candidate stores W and W', where W ->co W', the LKMM says the stores don't race if W can be linked to W' by a w-post-bounded ; vis ; w-pre-bounded @@ -2380,8 +2387,8 @@ sequence, and if W' is plain then they also have to be linked by a w-post-bounded ; vis ; r-pre-bounded -sequence. For a conflicting load R and store W, the LKMM says the two -accesses don't race if R can be linked to W by an +sequence. 
For race-candidate load R and store W, the LKMM says the +two accesses don't race if R can be linked to W by an r-post-bounded ; xb* ; w-pre-bounded @@ -2413,20 +2420,20 @@ is, the rules governing the memory subsystem's choice of a store to satisfy a load request and its determination of where a store will fall in the coherence order): - If R and W conflict and it is possible to link R to W by one - of the xb* sequences listed above, then W ->rfe R is not - allowed (i.e., a load cannot read from a store that it + If R and W are race candidates and it is possible to link R to + W by one of the xb* sequences listed above, then W ->rfe R is + not allowed (i.e., a load cannot read from a store that it executes before, even if one or both is plain). - If W and R conflict and it is possible to link W to R by one - of the vis sequences listed above, then R ->fre W is not - allowed (i.e., if a store is visible to a load then the load - must read from that store or one coherence-after it). + If W and R are race candidates and it is possible to link W to + R by one of the vis sequences listed above, then R ->fre W is + not allowed (i.e., if a store is visible to a load then the + load must read from that store or one coherence-after it). - If W and W' conflict and it is possible to link W to W' by one - of the vis sequences listed above, then W' ->co W is not - allowed (i.e., if one store is visible to a second then the - second must come after the first in the coherence order). + If W and W' are race candidates and it is possible to link W + to W' by one of the vis sequences listed above, then W' ->co W + is not allowed (i.e., if one store is visible to a second then + the second must come after the first in the coherence order). This is the extent to which the LKMM deals with plain accesses. 
Perhaps it could say more (for example, plain accesses might From be4a37973cb078fc64d541f396b7d4d80e45fbe2 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 22 Mar 2020 21:57:33 -0400 Subject: [PATCH 201/502] Documentation: LKMM: Add litmus test for RCU GP guarantee where updater frees object This adds an example for the important RCU grace period guarantee, which shows an RCU reader can never span a grace period. Acked-by: Andrea Parri Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- .../litmus-tests/rcu/RCU+sync+free.litmus | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 Documentation/litmus-tests/rcu/RCU+sync+free.litmus diff --git a/Documentation/litmus-tests/rcu/RCU+sync+free.litmus b/Documentation/litmus-tests/rcu/RCU+sync+free.litmus new file mode 100644 index 000000000000..4ee67e12f513 --- /dev/null +++ b/Documentation/litmus-tests/rcu/RCU+sync+free.litmus @@ -0,0 +1,42 @@ +C RCU+sync+free + +(* + * Result: Never + * + * This litmus test demonstrates that an RCU reader can never see a write that + * follows a grace period, if it did not see writes that precede that grace + * period. + * + * This is a typical pattern of RCU usage, where the write before the grace + * period assigns a pointer, and the writes following the grace period destroy + * the object that the pointer used to point to. + * + * This is one implication of the RCU grace-period guarantee, which says (among + * other things) that an RCU read-side critical section cannot span a grace period. 
+ *) + +{ +int x = 1; +int *y = &x; +int z = 1; +} + +P0(int *x, int *z, int **y) +{ + int *r0; + int r1; + + rcu_read_lock(); + r0 = rcu_dereference(*y); + r1 = READ_ONCE(*r0); + rcu_read_unlock(); +} + +P1(int *x, int *z, int **y) +{ + rcu_assign_pointer(*y, z); + synchronize_rcu(); + WRITE_ONCE(*x, 0); +} + +exists (0:r0=x /\ 0:r1=0) From a591890c4e91f37ce858a3090b16e0eef2511575 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 22 Mar 2020 21:57:34 -0400 Subject: [PATCH 202/502] Documentation: LKMM: Add litmus test for RCU GP guarantee where reader stores This adds an example for the important RCU grace period guarantee, which shows an RCU reader can never span a grace period. Acked-by: Andrea Parri Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- Documentation/litmus-tests/README | 11 ++++++ .../litmus-tests/rcu/RCU+sync+read.litmus | 37 +++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 Documentation/litmus-tests/README create mode 100644 Documentation/litmus-tests/rcu/RCU+sync+read.litmus diff --git a/Documentation/litmus-tests/README b/Documentation/litmus-tests/README new file mode 100644 index 000000000000..c4307ea9f996 --- /dev/null +++ b/Documentation/litmus-tests/README @@ -0,0 +1,11 @@ +============ +LITMUS TESTS +============ + +RCU (/rcu directory) +-------------------- + +RCU+sync+read.litmus +RCU+sync+free.litmus + Both the above litmus tests demonstrate the RCU grace period guarantee + that an RCU read-side critical section can never span a grace period. 
diff --git a/Documentation/litmus-tests/rcu/RCU+sync+read.litmus b/Documentation/litmus-tests/rcu/RCU+sync+read.litmus new file mode 100644 index 000000000000..f34176720231 --- /dev/null +++ b/Documentation/litmus-tests/rcu/RCU+sync+read.litmus @@ -0,0 +1,37 @@ +C RCU+sync+read + +(* + * Result: Never + * + * This litmus test demonstrates that after a grace period, an RCU updater always + * sees all stores done in prior RCU read-side critical sections. Such + * read-side critical sections would have ended before the grace period ended. + * + * This is one implication of the RCU grace-period guarantee, which says (among + * other things) that an RCU read-side critical section cannot span a grace period. + *) + +{ +int x = 0; +int y = 0; +} + +P0(int *x, int *y) +{ + rcu_read_lock(); + WRITE_ONCE(*x, 1); + WRITE_ONCE(*y, 1); + rcu_read_unlock(); +} + +P1(int *x, int *y) +{ + int r0; + int r1; + + r0 = READ_ONCE(*x); + synchronize_rcu(); + r1 = READ_ONCE(*y); +} + +exists (1:r0=1 /\ 1:r1=0) From 7f871338ff939952c4e04a83ae395ff9d57040c2 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 22 Mar 2020 21:57:35 -0400 Subject: [PATCH 203/502] MAINTAINERS: Update maintainers for new Documentation/litmus-tests This commit adds Joel Fernandes as official LKMM reviewer. Acked-by: Boqun Feng Acked-by: Andrea Parri Signed-off-by: Joel Fernandes (Google) [ paulmck: Apply Joe Perches alphabetization feedback. ] Signed-off-by: Paul E. McKenney --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 496fd4eafb68..b2578efb6c0e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9960,6 +9960,7 @@ M: Luc Maranget M: "Paul E. 
McKenney" R: Akira Yokosawa R: Daniel Lustig +R: Joel Fernandes L: linux-kernel@vger.kernel.org L: linux-arch@vger.kernel.org S: Supported @@ -9968,6 +9969,7 @@ F: Documentation/atomic_bitops.txt F: Documentation/atomic_t.txt F: Documentation/core-api/atomic_ops.rst F: Documentation/core-api/refcount-vs-atomic.rst +F: Documentation/litmus-tests/ F: Documentation/memory-barriers.txt F: tools/memory-model/ From 4a9cc65f7a715ba1f4f58529f7bf6f1548d8701f Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 26 Mar 2020 10:40:19 +0800 Subject: [PATCH 204/502] tools/memory-model: Add an exception for limitations on _unless() family According to Luc, atomic_add_unless() is directly provided by herd7, therefore it can be used in litmus tests. So change the limitation section in README to unlimit the use of atomic_add_unless(). Cc: Luc Maranget Acked-by: Andrea Parri Reviewed-by: Joel Fernandes (Google) Signed-off-by: Boqun Feng Signed-off-by: Paul E. McKenney --- tools/memory-model/README | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/memory-model/README b/tools/memory-model/README index fc07b52f2028..b9c562e92981 100644 --- a/tools/memory-model/README +++ b/tools/memory-model/README @@ -207,11 +207,15 @@ The Linux-kernel memory model (LKMM) has the following limitations: case as a store release. b. The "unless" RMW operations are not currently modeled: - atomic_long_add_unless(), atomic_add_unless(), - atomic_inc_unless_negative(), and - atomic_dec_unless_positive(). These can be emulated + atomic_long_add_unless(), atomic_inc_unless_negative(), + and atomic_dec_unless_positive(). These can be emulated in litmus tests, for example, by using atomic_cmpxchg(). + One exception of this limitation is atomic_add_unless(), + which is provided directly by herd7 (so no corresponding + definition in linux-kernel.def). atomic_add_unless() is + modeled by herd7 therefore it can be used in litmus tests. + c. The call_rcu() function is not modeled. 
It can be emulated in litmus tests by adding another process that invokes synchronize_rcu() and the body of the callback From efff6150209694a78c8af8c2a7557af682086220 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 26 Mar 2020 10:40:20 +0800 Subject: [PATCH 205/502] Documentation/litmus-tests: Introduce atomic directory Although we have atomic_t.txt and its friends to describe the semantics of atomic APIs and lib/atomic64_test.c for build testing and testing in UP mode, the tests for our atomic APIs in real SMP mode are still missing. Since now we have the LKMM tool in kernel and litmus tests can be used to generate kernel modules for testing purposes with "klitmus" (a tool from the LKMM toolset), it makes sense to put a few typical litmus tests into kernel so that 1) they are the examples to describe the conceptual model of the semantics of atomic APIs, and 2) they can be used to generate kernel test modules for anyone who is interested in testing the atomic APIs implementation (in most cases, the one who implements the APIs for a new arch) Therefore, introduce the atomic directory for this purpose. The directory is maintained by the LKMM group to make sure the litmus tests are always aligned with our memory model. Acked-by: Alan Stern Acked-by: Andrea Parri Reviewed-by: Joel Fernandes (Google) Signed-off-by: Boqun Feng Signed-off-by: Paul E. McKenney --- Documentation/litmus-tests/atomic/README | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 Documentation/litmus-tests/atomic/README diff --git a/Documentation/litmus-tests/atomic/README b/Documentation/litmus-tests/atomic/README new file mode 100644 index 000000000000..ae61201a4271 --- /dev/null +++ b/Documentation/litmus-tests/atomic/README @@ -0,0 +1,4 @@ +This directory contains litmus tests that are typical to describe the semantics +of our atomic APIs.
For more information about how to "run" a litmus test or +how to generate a kernel test module based on a litmus test, please see +tools/memory-model/README. From 4dcd4d36ddb1fa7fa7257ffe9e711608119b9785 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 26 Mar 2020 10:40:21 +0800 Subject: [PATCH 206/502] Documentation/litmus-tests/atomic: Add a test for atomic_set() We already use a litmus test in atomic_t.txt to describe the behavior of an atomic_set() with an atomic RMW, so add it into atomic-tests directory to make it easily accessible for anyone who cares about the semantics of our atomic APIs. Besides, currently the litmus test "atomic-set" in atomic_t.txt has a few things to be improved: 1) The CPU/Processor numbers "P1,P2" are not only inconsistent with the rest of the document, which uses "CPU0" and "CPU1", but also unacceptable to the herd tool, which requires processors to start at "P0". 2) The initialization block uses an "atomic_set()", which is OK, but it's better to use ATOMIC_INIT() to make clear this is an initialization. 3) The return value of atomic_add_unless() is discarded implicitly, which is OK for C language, but it will be helpful to the herd tool if we use a void cast to make the discard explicit. 4) The name and the paragraph describing the test need to be more accurate and aligned with our wording in LKMM. Therefore fix these in both atomic_t.txt and the newly added litmus test. Acked-by: Andrea Parri Acked-by: Alan Stern Signed-off-by: Boqun Feng Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E.
McKenney --- Documentation/atomic_t.txt | 18 +++++++------- ...c-RMW-ops-are-atomic-WRT-atomic_set.litmus | 24 +++++++++++++++++++ Documentation/litmus-tests/atomic/README | 7 ++++++ 3 files changed, 40 insertions(+), 9 deletions(-) create mode 100644 Documentation/litmus-tests/atomic/Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus diff --git a/Documentation/atomic_t.txt b/Documentation/atomic_t.txt index 0ab747e0d5ac..67d1d99f8589 100644 --- a/Documentation/atomic_t.txt +++ b/Documentation/atomic_t.txt @@ -85,21 +85,21 @@ smp_store_release() respectively. Therefore, if you find yourself only using the Non-RMW operations of atomic_t, you do not in fact need atomic_t at all and are doing it wrong. -A subtle detail of atomic_set{}() is that it should be observable to the RMW -ops. That is: +A note for the implementation of atomic_set{}() is that it must not break the +atomicity of the RMW ops. That is: - C atomic-set + C Atomic-RMW-ops-are-atomic-WRT-atomic_set { - atomic_set(v, 1); + atomic_t v = ATOMIC_INIT(1); + } + + P0(atomic_t *v) + { + (void)atomic_add_unless(v, 1, 0); } P1(atomic_t *v) - { - atomic_add_unless(v, 1, 0); - } - - P2(atomic_t *v) { atomic_set(v, 0); } diff --git a/Documentation/litmus-tests/atomic/Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus b/Documentation/litmus-tests/atomic/Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus new file mode 100644 index 000000000000..49385314d911 --- /dev/null +++ b/Documentation/litmus-tests/atomic/Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus @@ -0,0 +1,24 @@ +C Atomic-RMW-ops-are-atomic-WRT-atomic_set + +(* + * Result: Never + * + * Test that atomic_set() cannot break the atomicity of atomic RMWs. 
+ *) + +{ + atomic_t v = ATOMIC_INIT(1); +} + +P0(atomic_t *v) +{ + (void)atomic_add_unless(v, 1, 0); +} + +P1(atomic_t *v) +{ + atomic_set(v, 0); +} + +exists +(v=2) diff --git a/Documentation/litmus-tests/atomic/README b/Documentation/litmus-tests/atomic/README index ae61201a4271..a1b72410b539 100644 --- a/Documentation/litmus-tests/atomic/README +++ b/Documentation/litmus-tests/atomic/README @@ -2,3 +2,10 @@ This directory contains litmus tests that are typical to describe the semantics of our atomic APIs. For more information about how to "run" a litmus test or how to generate a kernel test module based on a litmus test, please see tools/memory-model/README. + +============ +LITMUS TESTS +============ + +Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus + Test that atomic_set() cannot break the atomicity of atomic RMWs. From e30d02355536e9678ab8a4dfcd6e90a86479b10f Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 26 Mar 2020 10:40:22 +0800 Subject: [PATCH 207/502] Documentation/litmus-tests/atomic: Add a test for smp_mb__after_atomic() We already use a litmus test in atomic_t.txt to describe atomic RMW + smp_mb__after_atomic() is stronger than acquire (both the read and the write parts are ordered). So make it a litmus test in atomic-tests directory, so that people can access the litmus easily. Additionally, change the processor numbers "P1, P2" to "P0, P1" in atomic_t.txt for the consistency with the processor numbers in the litmus test, which herd can handle. Acked-by: Alan Stern Acked-by: Andrea Parri Signed-off-by: Boqun Feng Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- Documentation/atomic_t.txt | 10 +++--- ...ter_atomic-is-stronger-than-acquire.litmus | 32 +++++++++++++++++++ Documentation/litmus-tests/atomic/README | 5 +++ 3 files changed, 42 insertions(+), 5 deletions(-) create mode 100644 Documentation/litmus-tests/atomic/Atomic-RMW+mb__after_atomic-is-stronger-than-acquire.litmus diff --git a/Documentation/atomic_t.txt b/Documentation/atomic_t.txt index 67d1d99f8589..0f1fdedf36bb 100644 --- a/Documentation/atomic_t.txt +++ b/Documentation/atomic_t.txt @@ -233,19 +233,19 @@ as well. Similarly, something like: is an ACQUIRE pattern (though very much not typical), but again the barrier is strictly stronger than ACQUIRE. As illustrated: - C strong-acquire + C Atomic-RMW+mb__after_atomic-is-stronger-than-acquire { } - P1(int *x, atomic_t *y) + P0(int *x, atomic_t *y) { r0 = READ_ONCE(*x); smp_rmb(); r1 = atomic_read(y); } - P2(int *x, atomic_t *y) + P1(int *x, atomic_t *y) { atomic_inc(y); smp_mb__after_atomic(); @@ -253,14 +253,14 @@ strictly stronger than ACQUIRE. As illustrated: } exists - (r0=1 /\ r1=0) + (0:r0=1 /\ 0:r1=0) This should not happen; but a hypothetical atomic_inc_acquire() -- (void)atomic_fetch_inc_acquire() for instance -- would allow the outcome, because it would not order the W part of the RMW against the following WRITE_ONCE. 
Thus: - P1 P2 + P0 P1 t = LL.acq *y (0) t++; diff --git a/Documentation/litmus-tests/atomic/Atomic-RMW+mb__after_atomic-is-stronger-than-acquire.litmus b/Documentation/litmus-tests/atomic/Atomic-RMW+mb__after_atomic-is-stronger-than-acquire.litmus new file mode 100644 index 000000000000..9a8e31a44b28 --- /dev/null +++ b/Documentation/litmus-tests/atomic/Atomic-RMW+mb__after_atomic-is-stronger-than-acquire.litmus @@ -0,0 +1,32 @@ +C Atomic-RMW+mb__after_atomic-is-stronger-than-acquire + +(* + * Result: Never + * + * Test that an atomic RMW followed by a smp_mb__after_atomic() is + * stronger than a normal acquire: both the read and write parts of + * the RMW are ordered before the subsequential memory accesses. + *) + +{ +} + +P0(int *x, atomic_t *y) +{ + int r0; + int r1; + + r0 = READ_ONCE(*x); + smp_rmb(); + r1 = atomic_read(y); +} + +P1(int *x, atomic_t *y) +{ + atomic_inc(y); + smp_mb__after_atomic(); + WRITE_ONCE(*x, 1); +} + +exists +(0:r0=1 /\ 0:r1=0) diff --git a/Documentation/litmus-tests/atomic/README b/Documentation/litmus-tests/atomic/README index a1b72410b539..714cf93816ea 100644 --- a/Documentation/litmus-tests/atomic/README +++ b/Documentation/litmus-tests/atomic/README @@ -7,5 +7,10 @@ tools/memory-model/README. LITMUS TESTS ============ +Atomic-RMW+mb__after_atomic-is-stronger-than-acquire + Test that an atomic RMW followed by a smp_mb__after_atomic() is + stronger than a normal acquire: both the read and write parts of + the RMW are ordered before the subsequential memory accesses. + Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus Test that atomic_set() cannot break the atomicity of atomic RMWs. From 9725dd55512772422e195cf0cfbca1eda6778358 Mon Sep 17 00:00:00 2001 From: Akira Yokosawa Date: Sun, 10 May 2020 13:37:14 +0900 Subject: [PATCH 208/502] tools/memory-model: Fix reference to litmus test in recipes.txt The name of litmus test doesn't match the one described below. Fix the name of litmus test. 
Acked-by: Andrea Parri Acked-by: Joel Fernandes (Google) Signed-off-by: Akira Yokosawa Signed-off-by: Paul E. McKenney --- tools/memory-model/Documentation/recipes.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/memory-model/Documentation/recipes.txt b/tools/memory-model/Documentation/recipes.txt index 7fe8d7aa3029..63c4adfed884 100644 --- a/tools/memory-model/Documentation/recipes.txt +++ b/tools/memory-model/Documentation/recipes.txt @@ -126,7 +126,7 @@ However, it is not necessarily the case that accesses ordered by locking will be seen as ordered by CPUs not holding that lock. Consider this example: - /* See Z6.0+pooncerelease+poacquirerelease+fencembonceonce.litmus. */ + /* See Z6.0+pooncelock+pooncelock+pombonce.litmus. */ void CPU0(void) { spin_lock(&mylock); From cdaac9d6d23d7a7f9edbb568191d05f2b660fff0 Mon Sep 17 00:00:00 2001 From: Akira Yokosawa Date: Sun, 10 May 2020 15:12:57 +0900 Subject: [PATCH 209/502] Documentation/litmus-tests: Merge atomic's README into top-level one Where Documentation/litmus-tests/README lists RCU litmus tests, Documentation/litmus-tests/atomic/README lists atomic litmus tests. For symmetry, merge the latter into former, with some context adjustment in the introduction. Acked-by: Andrea Parri Acked-by: Joel Fernandes (Google) Acked-by: Boqun Feng Signed-off-by: Akira Yokosawa Signed-off-by: Paul E. McKenney --- Documentation/litmus-tests/README | 19 +++++++++++++++++++ Documentation/litmus-tests/atomic/README | 16 ---------------- 2 files changed, 19 insertions(+), 16 deletions(-) delete mode 100644 Documentation/litmus-tests/atomic/README diff --git a/Documentation/litmus-tests/README b/Documentation/litmus-tests/README index c4307ea9f996..ac0b270b456c 100644 --- a/Documentation/litmus-tests/README +++ b/Documentation/litmus-tests/README @@ -2,6 +2,25 @@ LITMUS TESTS ============ +Each subdirectory contains litmus tests that are typical to describe the +semantics of respective kernel APIs. 
+For more information about how to "run" a litmus test or how to generate +a kernel test module based on a litmus test, please see +tools/memory-model/README. + + +atomic (/atomic derectory) +-------------------------- + +Atomic-RMW+mb__after_atomic-is-stronger-than-acquire.litmus + Test that an atomic RMW followed by a smp_mb__after_atomic() is + stronger than a normal acquire: both the read and write parts of + the RMW are ordered before the subsequential memory accesses. + +Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus + Test that atomic_set() cannot break the atomicity of atomic RMWs. + + RCU (/rcu directory) -------------------- diff --git a/Documentation/litmus-tests/atomic/README b/Documentation/litmus-tests/atomic/README deleted file mode 100644 index 714cf93816ea..000000000000 --- a/Documentation/litmus-tests/atomic/README +++ /dev/null @@ -1,16 +0,0 @@ -This directory contains litmus tests that are typical to describe the semantics -of our atomic APIs. For more information about how to "run" a litmus test or -how to generate a kernel test module based on a litmus test, please see -tools/memory-model/README. - -============ -LITMUS TESTS -============ - -Atomic-RMW+mb__after_atomic-is-stronger-than-acquire - Test that an atomic RMW followed by a smp_mb__after_atomic() is - stronger than a normal acquire: both the read and write parts of - the RMW are ordered before the subsequential memory accesses. - -Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus - Test that atomic_set() cannot break the atomicity of atomic RMWs. From c425fb5f8d2c8d22e7baad6dc077703c2b329d2d Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 11 May 2020 22:06:46 -0400 Subject: [PATCH 210/502] Documentation/litmus-tests: Cite an RCU litmus test This commit cites a pertinent RCU-related litmus test. Co-developed-by: Joel Fernandes (Google) Co-developed-by: Akira Yokosawa [Alan: grammar nit] [ paulmck: Update commit log and title per Akira feedback. 
] Suggested-by: Alan Stern Signed-off-by: Joel Fernandes (Google) Signed-off-by: Akira Yokosawa Signed-off-by: Paul E. McKenney --- Documentation/litmus-tests/README | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/litmus-tests/README b/Documentation/litmus-tests/README index ac0b270b456c..b79e640214b9 100644 --- a/Documentation/litmus-tests/README +++ b/Documentation/litmus-tests/README @@ -24,6 +24,10 @@ Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus RCU (/rcu directory) -------------------- +MP+onceassign+derefonce.litmus (under tools/memory-model/litmus-tests/) + Demonstrates the use of rcu_assign_pointer() and rcu_dereference() to + ensure that an RCU reader will not see pre-initialization garbage. + RCU+sync+read.litmus RCU+sync+free.litmus Both the above litmus tests demonstrate the RCU grace period guarantee From d075a78a5ab19389d5600923d6ad5391d7cd1be8 Mon Sep 17 00:00:00 2001 From: Akira Yokosawa Date: Sun, 31 May 2020 20:04:32 +0900 Subject: [PATCH 211/502] tools/memory-model/README: Expand dependency of klitmus7 klitmus7 is independent of the memory model but depends on the build-target kernel release. It occasionally lost compatibility due to kernel API changes [1, 2, 3]. It was remedied in a backwards-compatible manner respectively [4, 5, 6]. Reflect this fact in README. [1]: b899a850431e ("compiler.h: Remove ACCESS_ONCE()") [2]: 0bb95f80a38f ("Makefile: Globally enable VLA warning") [3]: d56c0d45f0e2 ("proc: decouple proc from VFS with "struct proc_ops"") [4]: https://github.com/herd/herdtools7/commit/e87d7f9287d1 ("klitmus: Use WRITE_ONCE and READ_ONCE in place of deprecated ACCESS_ONCE") [5]: https://github.com/herd/herdtools7/commit/a0cbb10d02be ("klitmus: Avoid variable length array") [6]: https://github.com/herd/herdtools7/commit/46b9412d3a58 ("klitmus: Linux kernel v5.6.x compat") NOTE: [5] was ahead of herdtools7 7.53, which did not make an official release. 
Code generated by klitmus7 without [5] can still be built targeting Linux 4.20--5.5 if you don't care about VLA warnings. Acked-by: Andrea Parri Signed-off-by: Akira Yokosawa Signed-off-by: Paul E. McKenney --- tools/memory-model/README | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/tools/memory-model/README b/tools/memory-model/README index b9c562e92981..90af203c3cf1 100644 --- a/tools/memory-model/README +++ b/tools/memory-model/README @@ -28,8 +28,34 @@ downloaded separately: See "herdtools7/INSTALL.md" for installation instructions. Note that although these tools usually provide backwards compatibility, -this is not absolutely guaranteed. Therefore, if a later version does -not work, please try using the exact version called out above. +this is not absolutely guaranteed. + +For example, a future version of herd7 might not work with the model +in this release. A compatible model will likely be made available in +a later release of Linux kernel. + +If you absolutely need to run the model in this particular release, +please try using the exact version called out above. + +klitmus7 is independent of the model provided here. It has its own +dependency on a target kernel release where converted code is built +and executed. Any change in kernel APIs essential to klitmus7 will +necessitate an upgrade of klitmus7. + +If you find any compatibility issues in klitmus7, please inform the +memory model maintainers.
+ +klitmus7 Compatibility Table +---------------------------- + + ============ ========== + target Linux herdtools7 + ------------ ---------- + -- 4.18 7.48 -- + 4.15 -- 4.19 7.49 -- + 4.20 -- 5.5 7.54 -- + 5.6 -- HEAD + ============ ========== ================== From 2bfa5c62debe43e3779e03bfc66b75ab72098db1 Mon Sep 17 00:00:00 2001 From: Akira Yokosawa Date: Wed, 24 Jun 2020 06:56:43 +0900 Subject: [PATCH 212/502] tools/memory-model/README: Mention herdtools7 7.56 in compatibility table herdtools7 7.56 is going to be released in the week of 22 Jun 2020. This commit therefore adds the exact version in the compatibility table. Acked-by: Andrea Parri Signed-off-by: Akira Yokosawa Signed-off-by: Paul E. McKenney --- tools/memory-model/README | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/memory-model/README b/tools/memory-model/README index 90af203c3cf1..ecb7385376bf 100644 --- a/tools/memory-model/README +++ b/tools/memory-model/README @@ -54,7 +54,7 @@ klitmus7 Compatibility Table -- 4.18 7.48 -- 4.15 -- 4.19 7.49 -- 4.20 -- 5.5 7.54 -- - 5.6 -- HEAD + 5.6 -- 7.56 -- ============ ========== From 5ef0a07a7928539d46fdb163acfad28c6d877a89 Mon Sep 17 00:00:00 2001 From: Akira Yokosawa Date: Wed, 24 Jun 2020 06:59:26 +0900 Subject: [PATCH 213/502] Documentation/litmus-tests: Add note on herd7 7.56 in atomic litmus test herdtools 7.56 has enhanced herd7's C parser so that the "(void)expr" construct in Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus is accepted. This is independent of LKMM's cat model, so mention the required version in the header of the litmus test and its entry in README. CC: Boqun Feng Reported-by: Andrea Parri Acked-by: Andrea Parri Signed-off-by: Akira Yokosawa Signed-off-by: Paul E. 
McKenney --- Documentation/litmus-tests/README | 1 + .../atomic/Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus | 1 + 2 files changed, 2 insertions(+) diff --git a/Documentation/litmus-tests/README b/Documentation/litmus-tests/README index b79e640214b9..7f5c6c3ed6c3 100644 --- a/Documentation/litmus-tests/README +++ b/Documentation/litmus-tests/README @@ -19,6 +19,7 @@ Atomic-RMW+mb__after_atomic-is-stronger-than-acquire.litmus Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus Test that atomic_set() cannot break the atomicity of atomic RMWs. + NOTE: Require herd7 7.56 or later which supports "(void)expr". RCU (/rcu directory) diff --git a/Documentation/litmus-tests/atomic/Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus b/Documentation/litmus-tests/atomic/Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus index 49385314d911..ffd4d3e79c4a 100644 --- a/Documentation/litmus-tests/atomic/Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus +++ b/Documentation/litmus-tests/atomic/Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus @@ -4,6 +4,7 @@ C Atomic-RMW-ops-are-atomic-WRT-atomic_set * Result: Never * * Test that atomic_set() cannot break the atomicity of atomic RMWs. + * NOTE: This requires herd7 7.56 or later which supports "(void)expr". *) { From 7c86ffeeed303187f266ed17bd87a9b375955709 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Jun 2020 13:12:59 +0300 Subject: [PATCH 214/502] io_uring: deduplicate freeing linked timeouts Linked timeout cancellation code is repeated in io_req_link_next() and io_fail_links(), and they differ in details even though they shouldn't. Based on the fact that there is at most one armed linked timeout in a link, and it immediately follows the head, extract a function that will check for it and defuse it.
Justification: - DRY and cleaner - better inlining for io_req_link_next() (just 1 call site now) - isolates linked_timeouts from common path - reduces time under spinlock for failed links - actually less code Signed-off-by: Pavel Begunkov [axboe: fold in locking fix for io_fail_links()] Signed-off-by: Jens Axboe --- fs/io_uring.c | 107 +++++++++++++++++++++++++++----------------------- 1 file changed, 58 insertions(+), 49 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 92c7e2a96912..a0aea78162a6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1552,48 +1552,57 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) return false; } -static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) +static void io_kill_linked_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *link; bool wake_ev = false; + unsigned long flags = 0; /* false positive warning */ + + if (!(req->flags & REQ_F_COMP_LOCKED)) + spin_lock_irqsave(&ctx->completion_lock, flags); + + if (list_empty(&req->link_list)) + goto out; + link = list_first_entry(&req->link_list, struct io_kiocb, link_list); + if (link->opcode != IORING_OP_LINK_TIMEOUT) + goto out; + + list_del_init(&link->link_list); + wake_ev = io_link_cancel_timeout(link); + req->flags &= ~REQ_F_LINK_TIMEOUT; +out: + if (!(req->flags & REQ_F_COMP_LOCKED)) + spin_unlock_irqrestore(&ctx->completion_lock, flags); + if (wake_ev) + io_cqring_ev_posted(ctx); +} + +static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) +{ + struct io_kiocb *nxt; /* * The list should never be empty when we are called here. But could * potentially happen if the chain is messed up, check to be on the * safe side. 
*/ - while (!list_empty(&req->link_list)) { - struct io_kiocb *nxt = list_first_entry(&req->link_list, - struct io_kiocb, link_list); + if (unlikely(list_empty(&req->link_list))) + return; - if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) && - (nxt->flags & REQ_F_TIMEOUT))) { - list_del_init(&nxt->link_list); - wake_ev |= io_link_cancel_timeout(nxt); - req->flags &= ~REQ_F_LINK_TIMEOUT; - continue; - } - - list_del_init(&req->link_list); - if (!list_empty(&nxt->link_list)) - nxt->flags |= REQ_F_LINK_HEAD; - *nxtptr = nxt; - break; - } - - if (wake_ev) - io_cqring_ev_posted(ctx); + nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list); + list_del_init(&req->link_list); + if (!list_empty(&nxt->link_list)) + nxt->flags |= REQ_F_LINK_HEAD; + *nxtptr = nxt; } /* * Called if REQ_F_LINK_HEAD is set, and we fail the head request */ -static void io_fail_links(struct io_kiocb *req) +static void __io_fail_links(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - spin_lock_irqsave(&ctx->completion_lock, flags); while (!list_empty(&req->link_list)) { struct io_kiocb *link = list_first_entry(&req->link_list, @@ -1602,18 +1611,29 @@ static void io_fail_links(struct io_kiocb *req) list_del_init(&link->link_list); trace_io_uring_fail_link(req, link); - if ((req->flags & REQ_F_LINK_TIMEOUT) && - link->opcode == IORING_OP_LINK_TIMEOUT) { - io_link_cancel_timeout(link); - } else { - io_cqring_fill_event(link, -ECANCELED); - __io_double_put_req(link); - } + io_cqring_fill_event(link, -ECANCELED); + __io_double_put_req(link); req->flags &= ~REQ_F_LINK_TIMEOUT; } io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + io_cqring_ev_posted(ctx); +} + +static void io_fail_links(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + if (!(req->flags & REQ_F_COMP_LOCKED)) { + unsigned long flags; + + spin_lock_irqsave(&ctx->completion_lock, flags); + __io_fail_links(req); + 
spin_unlock_irqrestore(&ctx->completion_lock, flags); + } else { + __io_fail_links(req); + } + io_cqring_ev_posted(ctx); } @@ -1623,30 +1643,19 @@ static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) return; req->flags &= ~REQ_F_LINK_HEAD; + if (req->flags & REQ_F_LINK_TIMEOUT) + io_kill_linked_timeout(req); + /* * If LINK is set, we have dependent requests in this chain. If we * didn't fail this request, queue the first one up, moving any other * dependencies to the next request. In case of failure, fail the rest * of the chain. */ - if (req->flags & REQ_F_FAIL_LINK) { + if (req->flags & REQ_F_FAIL_LINK) io_fail_links(req); - } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) == - REQ_F_LINK_TIMEOUT) { - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - /* - * If this is a timeout link, we could be racing with the - * timeout timer. Grab the completion lock for this case to - * protect against that. - */ - spin_lock_irqsave(&ctx->completion_lock, flags); + else io_req_link_next(req, nxt); - spin_unlock_irqrestore(&ctx->completion_lock, flags); - } else { - io_req_link_next(req, nxt); - } } static void __io_req_task_cancel(struct io_kiocb *req, int error) From 9b5f7bd93272689ec8dc2cfd40a812265c23414e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Jun 2020 13:13:00 +0300 Subject: [PATCH 215/502] io_uring: replace find_next() out param with ret Generally, it's better to return a value directly than having out parameter. It's cleaner and saves from some kinds of ugly bugs. May also be faster. Return next request from io_req_find_next() and friends directly instead of passing out parameter. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a0aea78162a6..0234dc2c9625 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1578,7 +1578,7 @@ out: io_cqring_ev_posted(ctx); } -static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) +static struct io_kiocb *io_req_link_next(struct io_kiocb *req) { struct io_kiocb *nxt; @@ -1588,13 +1588,13 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) * safe side. */ if (unlikely(list_empty(&req->link_list))) - return; + return NULL; nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list); list_del_init(&req->link_list); if (!list_empty(&nxt->link_list)) nxt->flags |= REQ_F_LINK_HEAD; - *nxtptr = nxt; + return nxt; } /* @@ -1637,10 +1637,10 @@ static void io_fail_links(struct io_kiocb *req) io_cqring_ev_posted(ctx); } -static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) +static struct io_kiocb *io_req_find_next(struct io_kiocb *req) { if (likely(!(req->flags & REQ_F_LINK_HEAD))) - return; + return NULL; req->flags &= ~REQ_F_LINK_HEAD; if (req->flags & REQ_F_LINK_TIMEOUT) @@ -1652,10 +1652,10 @@ static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) * dependencies to the next request. In case of failure, fail the rest * of the chain. 
*/ - if (req->flags & REQ_F_FAIL_LINK) - io_fail_links(req); - else - io_req_link_next(req, nxt); + if (likely(!(req->flags & REQ_F_FAIL_LINK))) + return io_req_link_next(req); + io_fail_links(req); + return NULL; } static void __io_req_task_cancel(struct io_kiocb *req, int error) @@ -1718,9 +1718,8 @@ static void io_req_task_queue(struct io_kiocb *req) static void io_queue_next(struct io_kiocb *req) { - struct io_kiocb *nxt = NULL; + struct io_kiocb *nxt = io_req_find_next(req); - io_req_find_next(req, &nxt); if (nxt) io_req_task_queue(nxt); } @@ -1770,13 +1769,15 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) * Drop reference to request, return next in chain (if there is one) if this * was the last reference to this request. */ -__attribute__((nonnull)) -static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr) +static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) { + struct io_kiocb *nxt = NULL; + if (refcount_dec_and_test(&req->refs)) { - io_req_find_next(req, nxtptr); + nxt = io_req_find_next(req); __io_free_req(req); } + return nxt; } static void io_put_req(struct io_kiocb *req) @@ -1797,7 +1798,7 @@ static struct io_wq_work *io_steal_work(struct io_kiocb *req) if (refcount_read(&req->refs) != 1) return NULL; - io_req_find_next(req, &nxt); + nxt = io_req_find_next(req); if (!nxt) return NULL; @@ -4488,7 +4489,7 @@ static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt) hash_del(&req->hash_node); io_poll_complete(req, req->result, 0); req->flags |= REQ_F_COMP_LOCKED; - io_put_req_find_next(req, nxt); + *nxt = io_put_req_find_next(req); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); @@ -5938,9 +5939,8 @@ punt: } err: - nxt = NULL; /* drop submission reference */ - io_put_req_find_next(req, &nxt); + nxt = io_put_req_find_next(req); if (linked_timeout) { if (!ret) From a1a4661691c5f1a3af4c04f56ad68e2d1dbee3af Mon Sep 17 00:00:00 2001 From: Pavel Begunkov 
Date: Mon, 29 Jun 2020 13:13:01 +0300 Subject: [PATCH 216/502] io_uring: kill REQ_F_TIMEOUT Now REQ_F_TIMEOUT is set but never used, kill it Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0234dc2c9625..e9c8f52daf8f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -531,7 +531,6 @@ enum { REQ_F_CUR_POS_BIT, REQ_F_NOWAIT_BIT, REQ_F_LINK_TIMEOUT_BIT, - REQ_F_TIMEOUT_BIT, REQ_F_ISREG_BIT, REQ_F_TIMEOUT_NOSEQ_BIT, REQ_F_COMP_LOCKED_BIT, @@ -574,8 +573,6 @@ enum { REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), /* has linked timeout */ REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), - /* timeout request */ - REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT), /* regular file */ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), /* no timeout sequence */ @@ -5063,7 +5060,6 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, data = &req->io->timeout; data->req = req; - req->flags |= REQ_F_TIMEOUT; if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) return -EFAULT; From 8eb7e2d00763367f345ef0b2a2eb4f8001ae40ce Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Jun 2020 13:13:02 +0300 Subject: [PATCH 217/502] io_uring: kill REQ_F_TIMEOUT_NOSEQ There are too many useless flags, kill REQ_F_TIMEOUT_NOSEQ, which can be easily inferred from req.timeout itself.
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e9c8f52daf8f..8495c17b53d6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -532,7 +532,6 @@ enum { REQ_F_NOWAIT_BIT, REQ_F_LINK_TIMEOUT_BIT, REQ_F_ISREG_BIT, - REQ_F_TIMEOUT_NOSEQ_BIT, REQ_F_COMP_LOCKED_BIT, REQ_F_NEED_CLEANUP_BIT, REQ_F_OVERFLOW_BIT, @@ -575,8 +574,6 @@ enum { REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), /* regular file */ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), - /* no timeout sequence */ - REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT), /* completion under lock */ REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT), /* needs cleanup */ @@ -1010,6 +1007,11 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref) complete(&ctx->ref_comp); } +static inline bool io_is_timeout_noseq(struct io_kiocb *req) +{ + return !req->timeout.off; +} + static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; @@ -1222,7 +1224,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) struct io_kiocb *req = list_first_entry(&ctx->timeout_list, struct io_kiocb, list); - if (req->flags & REQ_F_TIMEOUT_NOSEQ) + if (io_is_timeout_noseq(req)) break; if (req->timeout.target_seq != ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts)) @@ -5087,8 +5089,7 @@ static int io_timeout(struct io_kiocb *req) * timeout event to be satisfied. If it isn't set, then this is * a pure timeout request, sequence isn't used. 
*/ - if (!off) { - req->flags |= REQ_F_TIMEOUT_NOSEQ; + if (io_is_timeout_noseq(req)) { entry = ctx->timeout_list.prev; goto add; } @@ -5103,7 +5104,7 @@ static int io_timeout(struct io_kiocb *req) list_for_each_prev(entry, &ctx->timeout_list) { struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list); - if (nxt->flags & REQ_F_TIMEOUT_NOSEQ) + if (io_is_timeout_noseq(nxt)) continue; /* nxt.seq is behind @tail, otherwise would've been completed */ if (off >= nxt->timeout.target_seq - tail) From ecfc51777487da4da530710e0b13de4c8cb4a6d2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Jun 2020 13:13:03 +0300 Subject: [PATCH 218/502] io_uring: fix potential use after free on fallback request free After __io_free_req() puts a ctx ref, it should be assumed that the ctx may already be gone. However, it can be accessed when putting the fallback req. Free the req first and then put the ctx. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8495c17b53d6..b54e358e6b31 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1526,12 +1526,15 @@ static void io_dismantle_req(struct io_kiocb *req) static void __io_free_req(struct io_kiocb *req) { + struct io_ring_ctx *ctx; + io_dismantle_req(req); - percpu_ref_put(&req->ctx->refs); + ctx = req->ctx; if (likely(!io_is_fallback_req(req))) kmem_cache_free(req_cachep, req); else - clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req); + clear_bit_unlock(0, (unsigned long *) &ctx->fallback_req); + percpu_ref_put(&ctx->refs); } static bool io_link_cancel_timeout(struct io_kiocb *req) From 351fd53595a3ceb88756a005e3b864f7c8cb86e4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Jun 2020 19:18:40 +0300 Subject: [PATCH 219/502] io_uring: don't pass def into io_req_work_grab_env Remove struct io_op_def *def parameter from io_req_work_grab_env(), it's trivially deducible from 
req->opcode and fast. The API is cleaner this way, and also helps the compiler to understand that it's a real constant and could be register-cached. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b54e358e6b31..2b7666e81c13 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1101,9 +1101,10 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } -static inline void io_req_work_grab_env(struct io_kiocb *req, - const struct io_op_def *def) +static inline void io_req_work_grab_env(struct io_kiocb *req) { + const struct io_op_def *def = &io_op_defs[req->opcode]; + if (!req->work.mm && def->needs_mm) { mmgrab(current->mm); req->work.mm = current->mm; @@ -1161,7 +1162,7 @@ static inline void io_prep_async_work(struct io_kiocb *req, } io_req_init_async(req); - io_req_work_grab_env(req, def); + io_req_work_grab_env(req); *link = io_prep_linked_timeout(req); } @@ -5255,7 +5256,7 @@ static int io_req_defer_prep(struct io_kiocb *req, if (for_async || (req->flags & REQ_F_WORK_INITIALIZED)) { io_req_init_async(req); - io_req_work_grab_env(req, &io_op_defs[req->opcode]); + io_req_work_grab_env(req); } switch (req->opcode) { From edcdfcc149a8d0c11d4dd2b23b5338af22e31a5f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Jun 2020 19:18:41 +0300 Subject: [PATCH 220/502] io_uring: do init work in grab_env() Place io_req_init_async() in io_req_work_grab_env() so it won't be forgotten.
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2b7666e81c13..3b2f6fd8f58f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1105,6 +1105,8 @@ static inline void io_req_work_grab_env(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; + io_req_init_async(req); + if (!req->work.mm && def->needs_mm) { mmgrab(current->mm); req->work.mm = current->mm; @@ -1161,9 +1163,7 @@ static inline void io_prep_async_work(struct io_kiocb *req, req->work.flags |= IO_WQ_WORK_UNBOUND; } - io_req_init_async(req); io_req_work_grab_env(req); - *link = io_prep_linked_timeout(req); } @@ -5254,10 +5254,8 @@ static int io_req_defer_prep(struct io_kiocb *req, return ret; } - if (for_async || (req->flags & REQ_F_WORK_INITIALIZED)) { - io_req_init_async(req); + if (for_async || (req->flags & REQ_F_WORK_INITIALIZED)) io_req_work_grab_env(req); - } switch (req->opcode) { case IORING_OP_NOP: From debb85f496c9cc70663eac31d3ad9153839c844c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Jun 2020 19:18:42 +0300 Subject: [PATCH 221/502] io_uring: factor out grab_env() from defer_prep() Remove io_req_work_grab_env() call from io_req_defer_prep(), just call it when necessary.
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3b2f6fd8f58f..caf908382cdb 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5240,7 +5240,7 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock, } static int io_req_defer_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe, bool for_async) + const struct io_uring_sqe *sqe) { ssize_t ret = 0; @@ -5254,9 +5254,6 @@ static int io_req_defer_prep(struct io_kiocb *req, return ret; } - if (for_async || (req->flags & REQ_F_WORK_INITIALIZED)) - io_req_work_grab_env(req); - switch (req->opcode) { case IORING_OP_NOP: break; @@ -5369,9 +5366,10 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!req->io) { if (io_alloc_async_ctx(req)) return -EAGAIN; - ret = io_req_defer_prep(req, sqe, true); + ret = io_req_defer_prep(req, sqe); if (ret < 0) return ret; + io_req_work_grab_env(req); } spin_lock_irq(&ctx->completion_lock); @@ -5983,9 +5981,10 @@ fail_req: ret = -EAGAIN; if (io_alloc_async_ctx(req)) goto fail_req; - ret = io_req_defer_prep(req, sqe, true); + ret = io_req_defer_prep(req, sqe); if (unlikely(ret < 0)) goto fail_req; + io_req_work_grab_env(req); } /* @@ -6039,7 +6038,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (io_alloc_async_ctx(req)) return -EAGAIN; - ret = io_req_defer_prep(req, sqe, false); + ret = io_req_defer_prep(req, sqe); if (ret) { /* fail even hard links since we don't submit */ head->flags |= REQ_F_FAIL_LINK; @@ -6066,7 +6065,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (io_alloc_async_ctx(req)) return -EAGAIN; - ret = io_req_defer_prep(req, sqe, false); + ret = io_req_defer_prep(req, sqe); if (ret) req->flags |= REQ_F_FAIL_LINK; *link = req; From cbdcb4357c000861b77369c34e110fa893d23607 Mon Sep 17 00:00:00 2001 From: 
Pavel Begunkov Date: Mon, 29 Jun 2020 19:18:43 +0300 Subject: [PATCH 222/502] io_uring: do grab_env() just before punting Currently io_steal_work() is disabled, and every linked request should go through task_work for initialisation. Do io_req_work_grab_env() just before io-wq punting and for the whole link, so any request reachable by io_steal_work() is prepared. This is also interesting for another reason -- it localises io_req_work_grab_env() into one place just before io-wq punting, helping to to better manage req->work lifetime and add some neat cleanup/optimisations later. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 53 ++++++++++++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index caf908382cdb..9bc4339057ef 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1101,7 +1101,7 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } -static inline void io_req_work_grab_env(struct io_kiocb *req) +static void io_req_work_grab_env(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -1150,8 +1150,7 @@ static inline void io_req_work_drop_env(struct io_kiocb *req) } } -static inline void io_prep_async_work(struct io_kiocb *req, - struct io_kiocb **link) +static void io_prep_async_work(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -1164,15 +1163,22 @@ static inline void io_prep_async_work(struct io_kiocb *req, } io_req_work_grab_env(req); - *link = io_prep_linked_timeout(req); } -static inline void io_queue_async_work(struct io_kiocb *req) +static void io_prep_async_link(struct io_kiocb *req) +{ + struct io_kiocb *cur; + + io_prep_async_work(req); + if (req->flags & REQ_F_LINK_HEAD) + list_for_each_entry(cur, &req->link_list, link_list) + io_prep_async_work(cur); +} + +static void __io_queue_async_work(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb 
*link; - - io_prep_async_work(req, &link); + struct io_kiocb *link = io_prep_linked_timeout(req); trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, &req->work, req->flags); @@ -1182,6 +1188,13 @@ static inline void io_queue_async_work(struct io_kiocb *req) io_queue_linked_timeout(link); } +static void io_queue_async_work(struct io_kiocb *req) +{ + /* init ->work of the whole link before punting */ + io_prep_async_link(req); + __io_queue_async_work(req); +} + static void io_kill_timeout(struct io_kiocb *req) { int ret; @@ -1215,7 +1228,8 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx) if (req_need_defer(req)) break; list_del_init(&req->list); - io_queue_async_work(req); + /* punt-init is done before queueing for defer */ + __io_queue_async_work(req); } while (!list_empty(&ctx->defer_list)); } @@ -1791,7 +1805,7 @@ static void io_put_req(struct io_kiocb *req) static struct io_wq_work *io_steal_work(struct io_kiocb *req) { - struct io_kiocb *nxt = NULL; + struct io_kiocb *timeout, *nxt = NULL; /* * A ref is owned by io-wq in which context we're. 
So, if that's the @@ -1805,18 +1819,10 @@ static struct io_wq_work *io_steal_work(struct io_kiocb *req) if (!nxt) return NULL; - if ((nxt->flags & REQ_F_ISREG) && io_op_defs[nxt->opcode].hash_reg_file) - io_wq_hash_work(&nxt->work, file_inode(nxt->file)); - - io_req_task_queue(nxt); - /* - * If we're going to return actual work, here should be timeout prep: - * - * link = io_prep_linked_timeout(nxt); - * if (link) - * nxt->flags |= REQ_F_QUEUE_TIMEOUT; - */ - return NULL; + timeout = io_prep_linked_timeout(nxt); + if (timeout) + nxt->flags |= REQ_F_QUEUE_TIMEOUT; + return &nxt->work; } /* @@ -5369,8 +5375,8 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) ret = io_req_defer_prep(req, sqe); if (ret < 0) return ret; - io_req_work_grab_env(req); } + io_prep_async_link(req); spin_lock_irq(&ctx->completion_lock); if (!req_need_defer(req) && list_empty(&ctx->defer_list)) { @@ -5984,7 +5990,6 @@ fail_req: ret = io_req_defer_prep(req, sqe); if (unlikely(ret < 0)) goto fail_req; - io_req_work_grab_env(req); } /* From ab0b6451db2a8ed630b89ef3826b8ea994149444 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 30 Jun 2020 08:43:15 -0600 Subject: [PATCH 223/502] io_uring: clean up io_kill_linked_timeout() locking Avoid jumping through hoops to silence unused variable warnings, and also fix sparse rightfully complaining about the locking context: fs/io_uring.c:1593:39: warning: context imbalance in 'io_kill_linked_timeout' - unexpected unlock Provide the functional helper as __io_kill_linked_timeout(), and have separate the locking from it. 
Signed-off-by: Jens Axboe --- fs/io_uring.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 9bc4339057ef..3c12221f549e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1569,28 +1569,38 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) return false; } -static void io_kill_linked_timeout(struct io_kiocb *req) +static bool __io_kill_linked_timeout(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *link; - bool wake_ev = false; - unsigned long flags = 0; /* false positive warning */ - - if (!(req->flags & REQ_F_COMP_LOCKED)) - spin_lock_irqsave(&ctx->completion_lock, flags); + bool wake_ev; if (list_empty(&req->link_list)) - goto out; + return false; link = list_first_entry(&req->link_list, struct io_kiocb, link_list); if (link->opcode != IORING_OP_LINK_TIMEOUT) - goto out; + return false; list_del_init(&link->link_list); wake_ev = io_link_cancel_timeout(link); req->flags &= ~REQ_F_LINK_TIMEOUT; -out: - if (!(req->flags & REQ_F_COMP_LOCKED)) + return wake_ev; +} + +static void io_kill_linked_timeout(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + bool wake_ev; + + if (!(req->flags & REQ_F_COMP_LOCKED)) { + unsigned long flags; + + spin_lock_irqsave(&ctx->completion_lock, flags); + wake_ev = __io_kill_linked_timeout(req); spin_unlock_irqrestore(&ctx->completion_lock, flags); + } else { + wake_ev = __io_kill_linked_timeout(req); + } + if (wake_ev) io_cqring_ev_posted(ctx); } From cf2f54255d0342cfbd273cbb964ad6bc7674f587 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 30 Jun 2020 15:20:40 +0300 Subject: [PATCH 224/502] io_uring: don't fail iopoll requeue without ->mm Actually, io_iopoll_queue() may have NULL ->mm, that's if SQ thread didn't grabbed mm before doing iopoll. Don't fail reqs there, as after recent changes it won't be punted directly but rather through task_work. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3c12221f549e..43419f5bef8c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1902,9 +1902,7 @@ static void io_iopoll_queue(struct list_head *again) do { req = list_first_entry(again, struct io_kiocb, list); list_del(&req->list); - - /* should have ->mm unless io_uring is dying, kill reqs then */ - if (unlikely(!current->mm) || !io_rw_reissue(req, -EAGAIN)) + if (!io_rw_reissue(req, -EAGAIN)) io_complete_rw_common(&req->rw.kiocb, -EAGAIN, NULL); } while (!list_empty(again)); } From ea1164e574e9af0a15ab730ead0861a4c7724142 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 30 Jun 2020 15:20:41 +0300 Subject: [PATCH 225/502] io_uring: fix NULL mm in io_poll_task_func() io_poll_task_func() hand-coded link submission forgetting to set TASK_RUNNING, acquire mm, etc. Call existing helper for that, i.e. __io_req_task_submit(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 43419f5bef8c..2c17c2613205 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4518,13 +4518,8 @@ static void io_poll_task_func(struct callback_head *cb) struct io_kiocb *nxt = NULL; io_poll_task_handler(req, &nxt); - if (nxt) { - struct io_ring_ctx *ctx = nxt->ctx; - - mutex_lock(&ctx->uring_lock); - __io_queue_sqe(nxt, NULL, NULL); - mutex_unlock(&ctx->uring_lock); - } + if (nxt) + __io_req_task_submit(nxt); } static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, From 0be0b0e33b0bfd08264b108512e44b3907fe987b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 30 Jun 2020 15:20:42 +0300 Subject: [PATCH 226/502] io_uring: simplify io_async_task_func() Greatly simplify io_async_task_func() removing duplicated functionality of __io_req_task_submit(). 
This do one extra spin lock/unlock for cancelled poll case, but that shouldn't happen often. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 29 +++++------------------------ 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2c17c2613205..82b35948ac5b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4608,7 +4608,6 @@ static void io_async_task_func(struct callback_head *cb) struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); struct async_poll *apoll = req->apoll; struct io_ring_ctx *ctx = req->ctx; - bool canceled = false; trace_io_uring_task_run(req->ctx, req->opcode, req->user_data); @@ -4618,15 +4617,8 @@ static void io_async_task_func(struct callback_head *cb) } /* If req is still hashed, it cannot have been canceled. Don't check. */ - if (hash_hashed(&req->hash_node)) { + if (hash_hashed(&req->hash_node)) hash_del(&req->hash_node); - } else { - canceled = READ_ONCE(apoll->poll.canceled); - if (canceled) { - io_cqring_fill_event(req, -ECANCELED); - io_commit_cqring(ctx); - } - } spin_unlock_irq(&ctx->completion_lock); @@ -4635,21 +4627,10 @@ static void io_async_task_func(struct callback_head *cb) memcpy(&req->work, &apoll->work, sizeof(req->work)); kfree(apoll); - if (!canceled) { - __set_current_state(TASK_RUNNING); - if (io_sq_thread_acquire_mm(ctx, req)) { - io_cqring_add_event(req, -EFAULT, 0); - goto end_req; - } - mutex_lock(&ctx->uring_lock); - __io_queue_sqe(req, NULL, NULL); - mutex_unlock(&ctx->uring_lock); - } else { - io_cqring_ev_posted(ctx); -end_req: - req_set_fail_links(req); - io_double_put_req(req); - } + if (!READ_ONCE(apoll->poll.canceled)) + __io_req_task_submit(req); + else + __io_req_task_cancel(req, -ECANCELED); } static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync, From 3fa5e0f331280237af918ab2e7a160f5a68d3e7d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 30 Jun 2020 15:20:43 +0300 Subject: [PATCH 227/502] 
io_uring: optimise io_req_find_next() fast check gcc 9.2.0 compiles io_req_find_next() as a separate function leaving the first REQ_F_LINK_HEAD fast check not inlined. Help it by splitting out the check from the function. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 82b35948ac5b..9a43847c6823 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1664,12 +1664,9 @@ static void io_fail_links(struct io_kiocb *req) io_cqring_ev_posted(ctx); } -static struct io_kiocb *io_req_find_next(struct io_kiocb *req) +static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) { - if (likely(!(req->flags & REQ_F_LINK_HEAD))) - return NULL; req->flags &= ~REQ_F_LINK_HEAD; - if (req->flags & REQ_F_LINK_TIMEOUT) io_kill_linked_timeout(req); @@ -1685,6 +1682,13 @@ static struct io_kiocb *io_req_find_next(struct io_kiocb *req) return NULL; } +static struct io_kiocb *io_req_find_next(struct io_kiocb *req) +{ + if (likely(!(req->flags & REQ_F_LINK_HEAD))) + return NULL; + return __io_req_find_next(req); +} + static void __io_req_task_cancel(struct io_kiocb *req, int error) { struct io_ring_ctx *ctx = req->ctx; From 8eb06d7e8dd853d70668617dda57de4f6cebe651 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 30 Jun 2020 15:20:39 +0300 Subject: [PATCH 228/502] io_uring: fix missing ->mm on exit There is a fancy bug, where exiting user task may not have ->mm, that makes task_works to try to do kthread_use_mm(ctx->sqo_mm). Don't do that if sqo_mm is NULL. [ 290.460558] WARNING: CPU: 6 PID: 150933 at kernel/kthread.c:1238 kthread_use_mm+0xf3/0x110 [ 290.460579] CPU: 6 PID: 150933 Comm: read-write2 Tainted: G I E 5.8.0-rc2-00066-g9b21720607cf #531 [ 290.460580] RIP: 0010:kthread_use_mm+0xf3/0x110 ... 
[ 290.460584] Call Trace: [ 290.460584] __io_sq_thread_acquire_mm.isra.0.part.0+0x25/0x30 [ 290.460584] __io_req_task_submit+0x64/0x80 [ 290.460584] io_req_task_submit+0x15/0x20 [ 290.460585] task_work_run+0x67/0xa0 [ 290.460585] do_exit+0x35d/0xb70 [ 290.460585] do_group_exit+0x43/0xa0 [ 290.460585] get_signal+0x140/0x900 [ 290.460586] do_signal+0x37/0x780 [ 290.460586] __prepare_exit_to_usermode+0x126/0x1c0 [ 290.460586] __syscall_return_slowpath+0x3b/0x1c0 [ 290.460587] do_syscall_64+0x5f/0xa0 [ 290.460587] entry_SYSCALL_64_after_hwframe+0x44/0xa9 following with faults. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 9a43847c6823..cfad2acd4d86 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -958,7 +958,7 @@ static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) { if (!current->mm) { - if (unlikely(!mmget_not_zero(ctx->sqo_mm))) + if (unlikely(!ctx->sqo_mm || !mmget_not_zero(ctx->sqo_mm))) return -EFAULT; kthread_use_mm(ctx->sqo_mm); } @@ -7216,10 +7216,10 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, { int ret; - mmgrab(current->mm); - ctx->sqo_mm = current->mm; - if (ctx->flags & IORING_SETUP_SQPOLL) { + mmgrab(current->mm); + ctx->sqo_mm = current->mm; + ret = -EPERM; if (!capable(CAP_SYS_ADMIN)) goto err; @@ -7263,8 +7263,10 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, return 0; err: io_finish_async(ctx); - mmdrop(ctx->sqo_mm); - ctx->sqo_mm = NULL; + if (ctx->sqo_mm) { + mmdrop(ctx->sqo_mm); + ctx->sqo_mm = NULL; + } return ret; } From fb37409a01b011a664347702f44dbf13fa7c7486 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 10 Jun 2020 09:45:20 +0300 Subject: [PATCH 229/502] arch: remove unicore32 port The unicore32 port do not seem maintained for a long time now, there is no upstream toolchain that can create 
unicore32 binaries and all the links to prebuilt toolchains for unicore32 are dead. Even compilers that were available are not supported by the kernel anymore. Guenter Roeck says: I have stopped building unicore32 images since v4.19 since there is no available compiler that is still supported by the kernel. I am surprised that support for it has not been removed from the kernel. Remove unicore32 port. Signed-off-by: Mike Rapoport Acked-by: Arnd Bergmann Acked-by: Guenter Roeck --- .../features/core/cBPF-JIT/arch-support.txt | 1 - .../features/core/eBPF-JIT/arch-support.txt | 1 - .../core/generic-idle-thread/arch-support.txt | 1 - .../core/jump-labels/arch-support.txt | 1 - .../features/core/tracehook/arch-support.txt | 1 - .../features/debug/KASAN/arch-support.txt | 1 - .../debug/debug-vm-pgtable/arch-support.txt | 1 - .../debug/gcov-profile-all/arch-support.txt | 1 - .../features/debug/kgdb/arch-support.txt | 1 - .../debug/kprobes-on-ftrace/arch-support.txt | 1 - .../features/debug/kprobes/arch-support.txt | 1 - .../debug/kretprobes/arch-support.txt | 1 - .../features/debug/optprobes/arch-support.txt | 1 - .../debug/stackprotector/arch-support.txt | 1 - .../features/debug/uprobes/arch-support.txt | 1 - .../debug/user-ret-profiler/arch-support.txt | 1 - .../io/dma-contiguous/arch-support.txt | 1 - .../locking/cmpxchg-local/arch-support.txt | 1 - .../features/locking/lockdep/arch-support.txt | 1 - .../locking/queued-rwlocks/arch-support.txt | 1 - .../locking/queued-spinlocks/arch-support.txt | 1 - .../perf/kprobes-event/arch-support.txt | 1 - .../features/perf/perf-regs/arch-support.txt | 1 - .../perf/perf-stackdump/arch-support.txt | 1 - .../membarrier-sync-core/arch-support.txt | 1 - .../sched/numa-balancing/arch-support.txt | 1 - .../seccomp/seccomp-filter/arch-support.txt | 1 - .../time/arch-tick-broadcast/arch-support.txt | 1 - .../time/clockevents/arch-support.txt | 1 - .../time/context-tracking/arch-support.txt | 1 - .../time/irq-time-acct/arch-support.txt | 
1 - .../time/modern-timekeeping/arch-support.txt | 1 - .../time/virt-cpuacct/arch-support.txt | 1 - .../features/vm/ELF-ASLR/arch-support.txt | 1 - .../features/vm/PG_uncached/arch-support.txt | 1 - .../features/vm/THP/arch-support.txt | 1 - .../features/vm/TLB/arch-support.txt | 1 - .../features/vm/huge-vmap/arch-support.txt | 1 - .../features/vm/ioremap_prot/arch-support.txt | 1 - .../features/vm/pte_special/arch-support.txt | 1 - MAINTAINERS | 7 - arch/unicore32/.gitignore | 22 - arch/unicore32/Kconfig | 200 ----- arch/unicore32/Kconfig.debug | 29 - arch/unicore32/Makefile | 59 -- arch/unicore32/boot/Makefile | 39 - arch/unicore32/boot/compressed/Makefile | 64 -- arch/unicore32/boot/compressed/head.S | 201 ----- arch/unicore32/boot/compressed/misc.c | 123 --- arch/unicore32/boot/compressed/piggy.S.in | 6 - arch/unicore32/boot/compressed/vmlinux.lds.S | 58 -- arch/unicore32/configs/defconfig | 214 ----- arch/unicore32/include/asm/Kbuild | 7 - arch/unicore32/include/asm/assembler.h | 128 --- arch/unicore32/include/asm/barrier.h | 16 - arch/unicore32/include/asm/bitops.h | 46 - arch/unicore32/include/asm/bug.h | 20 - arch/unicore32/include/asm/cache.h | 24 - arch/unicore32/include/asm/cacheflush.h | 186 ---- arch/unicore32/include/asm/checksum.h | 38 - arch/unicore32/include/asm/cmpxchg.h | 58 -- arch/unicore32/include/asm/cpu-single.h | 42 - arch/unicore32/include/asm/cputype.h | 30 - arch/unicore32/include/asm/delay.h | 49 -- arch/unicore32/include/asm/dma.h | 20 - arch/unicore32/include/asm/elf.h | 90 -- arch/unicore32/include/asm/fpstate.h | 23 - arch/unicore32/include/asm/fpu-ucf64.h | 50 -- arch/unicore32/include/asm/gpio.h | 101 --- arch/unicore32/include/asm/hwcap.h | 29 - arch/unicore32/include/asm/hwdef-copro.h | 45 - arch/unicore32/include/asm/io.h | 69 -- arch/unicore32/include/asm/irq.h | 102 --- arch/unicore32/include/asm/irqflags.h | 50 -- arch/unicore32/include/asm/linkage.h | 19 - arch/unicore32/include/asm/memblock.h | 43 - 
arch/unicore32/include/asm/memory.h | 102 --- arch/unicore32/include/asm/mmu.h | 14 - arch/unicore32/include/asm/mmu_context.h | 98 --- arch/unicore32/include/asm/page.h | 74 -- arch/unicore32/include/asm/pci.h | 20 - arch/unicore32/include/asm/pgalloc.h | 87 -- arch/unicore32/include/asm/pgtable-hwdef.h | 51 -- arch/unicore32/include/asm/pgtable.h | 267 ------ arch/unicore32/include/asm/processor.h | 74 -- arch/unicore32/include/asm/ptrace.h | 58 -- arch/unicore32/include/asm/stacktrace.h | 28 - arch/unicore32/include/asm/string.h | 35 - arch/unicore32/include/asm/suspend.h | 26 - arch/unicore32/include/asm/switch_to.h | 27 - arch/unicore32/include/asm/syscall.h | 12 - arch/unicore32/include/asm/thread_info.h | 133 --- arch/unicore32/include/asm/timex.h | 31 - arch/unicore32/include/asm/tlb.h | 24 - arch/unicore32/include/asm/tlbflush.h | 192 ----- arch/unicore32/include/asm/traps.h | 18 - arch/unicore32/include/asm/uaccess.h | 38 - arch/unicore32/include/asm/vmalloc.h | 4 - arch/unicore32/include/mach/PKUnity.h | 95 --- arch/unicore32/include/mach/bitfield.h | 21 - arch/unicore32/include/mach/dma.h | 45 - arch/unicore32/include/mach/hardware.h | 30 - arch/unicore32/include/mach/map.h | 17 - arch/unicore32/include/mach/memory.h | 54 -- arch/unicore32/include/mach/ocd.h | 33 - arch/unicore32/include/mach/pm.h | 37 - arch/unicore32/include/mach/regs-ac97.h | 33 - arch/unicore32/include/mach/regs-dmac.h | 82 -- arch/unicore32/include/mach/regs-gpio.h | 71 -- arch/unicore32/include/mach/regs-i2c.h | 64 -- arch/unicore32/include/mach/regs-intc.h | 29 - arch/unicore32/include/mach/regs-nand.h | 80 -- arch/unicore32/include/mach/regs-ost.h | 91 -- arch/unicore32/include/mach/regs-pci.h | 95 --- arch/unicore32/include/mach/regs-pm.h | 127 --- arch/unicore32/include/mach/regs-ps2.h | 21 - arch/unicore32/include/mach/regs-resetc.h | 35 - arch/unicore32/include/mach/regs-rtc.h | 38 - arch/unicore32/include/mach/regs-sdc.h | 157 ---- arch/unicore32/include/mach/regs-spi.h | 
99 --- arch/unicore32/include/mach/regs-uart.h | 3 - arch/unicore32/include/mach/regs-umal.h | 230 ----- arch/unicore32/include/mach/regs-unigfx.h | 201 ----- arch/unicore32/include/mach/uncompress.h | 31 - arch/unicore32/include/uapi/asm/Kbuild | 2 - arch/unicore32/include/uapi/asm/byteorder.h | 25 - arch/unicore32/include/uapi/asm/ptrace.h | 91 -- arch/unicore32/include/uapi/asm/sigcontext.h | 30 - arch/unicore32/include/uapi/asm/unistd.h | 21 - arch/unicore32/kernel/Makefile | 31 - arch/unicore32/kernel/asm-offsets.c | 108 --- arch/unicore32/kernel/clock.c | 387 --------- arch/unicore32/kernel/debug-macro.S | 86 -- arch/unicore32/kernel/debug.S | 82 -- arch/unicore32/kernel/dma.c | 179 ---- arch/unicore32/kernel/early_printk.c | 46 - arch/unicore32/kernel/elf.c | 35 - arch/unicore32/kernel/entry.S | 802 ------------------ arch/unicore32/kernel/fpu-ucf64.c | 117 --- arch/unicore32/kernel/gpio.c | 121 --- arch/unicore32/kernel/head.S | 249 ------ arch/unicore32/kernel/hibernate.c | 159 ---- arch/unicore32/kernel/hibernate_asm.S | 114 --- arch/unicore32/kernel/irq.c | 371 -------- arch/unicore32/kernel/ksyms.c | 57 -- arch/unicore32/kernel/ksyms.h | 14 - arch/unicore32/kernel/module.c | 105 --- arch/unicore32/kernel/pci.c | 371 -------- arch/unicore32/kernel/pm.c | 121 --- arch/unicore32/kernel/process.c | 319 ------- arch/unicore32/kernel/ptrace.c | 147 ---- arch/unicore32/kernel/puv3-core.c | 276 ------ arch/unicore32/kernel/puv3-nb0916.c | 147 ---- arch/unicore32/kernel/setup.c | 352 -------- arch/unicore32/kernel/setup.h | 36 - arch/unicore32/kernel/signal.c | 424 --------- arch/unicore32/kernel/sleep.S | 199 ----- arch/unicore32/kernel/stacktrace.c | 127 --- arch/unicore32/kernel/sys.c | 37 - arch/unicore32/kernel/time.c | 128 --- arch/unicore32/kernel/traps.c | 322 ------- arch/unicore32/kernel/vmlinux.lds.S | 59 -- arch/unicore32/lib/Makefile | 28 - arch/unicore32/lib/backtrace.S | 168 ---- arch/unicore32/lib/clear_user.S | 54 -- 
arch/unicore32/lib/copy_from_user.S | 101 --- arch/unicore32/lib/copy_page.S | 36 - arch/unicore32/lib/copy_template.S | 211 ----- arch/unicore32/lib/copy_to_user.S | 93 -- arch/unicore32/lib/delay.S | 48 -- arch/unicore32/lib/findbit.S | 97 --- arch/unicore32/lib/strncpy_from_user.S | 42 - arch/unicore32/lib/strnlen_user.S | 39 - arch/unicore32/mm/Kconfig | 41 - arch/unicore32/mm/Makefile | 14 - arch/unicore32/mm/alignment.c | 524 ------------ arch/unicore32/mm/cache-ucv2.S | 209 ----- arch/unicore32/mm/extable.c | 21 - arch/unicore32/mm/fault.c | 481 ----------- arch/unicore32/mm/flush.c | 94 -- arch/unicore32/mm/init.c | 261 ------ arch/unicore32/mm/ioremap.c | 242 ------ arch/unicore32/mm/mm.h | 31 - arch/unicore32/mm/mmu.c | 513 ----------- arch/unicore32/mm/pgd.c | 102 --- arch/unicore32/mm/proc-macros.S | 142 ---- arch/unicore32/mm/proc-syms.c | 19 - arch/unicore32/mm/proc-ucv2.S | 131 --- arch/unicore32/mm/tlb-ucv2.S | 86 -- kernel/reboot.c | 2 +- 190 files changed, 1 insertion(+), 15705 deletions(-) delete mode 100644 arch/unicore32/.gitignore delete mode 100644 arch/unicore32/Kconfig delete mode 100644 arch/unicore32/Kconfig.debug delete mode 100644 arch/unicore32/Makefile delete mode 100644 arch/unicore32/boot/Makefile delete mode 100644 arch/unicore32/boot/compressed/Makefile delete mode 100644 arch/unicore32/boot/compressed/head.S delete mode 100644 arch/unicore32/boot/compressed/misc.c delete mode 100644 arch/unicore32/boot/compressed/piggy.S.in delete mode 100644 arch/unicore32/boot/compressed/vmlinux.lds.S delete mode 100644 arch/unicore32/configs/defconfig delete mode 100644 arch/unicore32/include/asm/Kbuild delete mode 100644 arch/unicore32/include/asm/assembler.h delete mode 100644 arch/unicore32/include/asm/barrier.h delete mode 100644 arch/unicore32/include/asm/bitops.h delete mode 100644 arch/unicore32/include/asm/bug.h delete mode 100644 arch/unicore32/include/asm/cache.h delete mode 100644 arch/unicore32/include/asm/cacheflush.h delete mode 
100644 arch/unicore32/include/asm/checksum.h delete mode 100644 arch/unicore32/include/asm/cmpxchg.h delete mode 100644 arch/unicore32/include/asm/cpu-single.h delete mode 100644 arch/unicore32/include/asm/cputype.h delete mode 100644 arch/unicore32/include/asm/delay.h delete mode 100644 arch/unicore32/include/asm/dma.h delete mode 100644 arch/unicore32/include/asm/elf.h delete mode 100644 arch/unicore32/include/asm/fpstate.h delete mode 100644 arch/unicore32/include/asm/fpu-ucf64.h delete mode 100644 arch/unicore32/include/asm/gpio.h delete mode 100644 arch/unicore32/include/asm/hwcap.h delete mode 100644 arch/unicore32/include/asm/hwdef-copro.h delete mode 100644 arch/unicore32/include/asm/io.h delete mode 100644 arch/unicore32/include/asm/irq.h delete mode 100644 arch/unicore32/include/asm/irqflags.h delete mode 100644 arch/unicore32/include/asm/linkage.h delete mode 100644 arch/unicore32/include/asm/memblock.h delete mode 100644 arch/unicore32/include/asm/memory.h delete mode 100644 arch/unicore32/include/asm/mmu.h delete mode 100644 arch/unicore32/include/asm/mmu_context.h delete mode 100644 arch/unicore32/include/asm/page.h delete mode 100644 arch/unicore32/include/asm/pci.h delete mode 100644 arch/unicore32/include/asm/pgalloc.h delete mode 100644 arch/unicore32/include/asm/pgtable-hwdef.h delete mode 100644 arch/unicore32/include/asm/pgtable.h delete mode 100644 arch/unicore32/include/asm/processor.h delete mode 100644 arch/unicore32/include/asm/ptrace.h delete mode 100644 arch/unicore32/include/asm/stacktrace.h delete mode 100644 arch/unicore32/include/asm/string.h delete mode 100644 arch/unicore32/include/asm/suspend.h delete mode 100644 arch/unicore32/include/asm/switch_to.h delete mode 100644 arch/unicore32/include/asm/syscall.h delete mode 100644 arch/unicore32/include/asm/thread_info.h delete mode 100644 arch/unicore32/include/asm/timex.h delete mode 100644 arch/unicore32/include/asm/tlb.h delete mode 100644 arch/unicore32/include/asm/tlbflush.h 
delete mode 100644 arch/unicore32/include/asm/traps.h delete mode 100644 arch/unicore32/include/asm/uaccess.h delete mode 100644 arch/unicore32/include/asm/vmalloc.h delete mode 100644 arch/unicore32/include/mach/PKUnity.h delete mode 100644 arch/unicore32/include/mach/bitfield.h delete mode 100644 arch/unicore32/include/mach/dma.h delete mode 100644 arch/unicore32/include/mach/hardware.h delete mode 100644 arch/unicore32/include/mach/map.h delete mode 100644 arch/unicore32/include/mach/memory.h delete mode 100644 arch/unicore32/include/mach/ocd.h delete mode 100644 arch/unicore32/include/mach/pm.h delete mode 100644 arch/unicore32/include/mach/regs-ac97.h delete mode 100644 arch/unicore32/include/mach/regs-dmac.h delete mode 100644 arch/unicore32/include/mach/regs-gpio.h delete mode 100644 arch/unicore32/include/mach/regs-i2c.h delete mode 100644 arch/unicore32/include/mach/regs-intc.h delete mode 100644 arch/unicore32/include/mach/regs-nand.h delete mode 100644 arch/unicore32/include/mach/regs-ost.h delete mode 100644 arch/unicore32/include/mach/regs-pci.h delete mode 100644 arch/unicore32/include/mach/regs-pm.h delete mode 100644 arch/unicore32/include/mach/regs-ps2.h delete mode 100644 arch/unicore32/include/mach/regs-resetc.h delete mode 100644 arch/unicore32/include/mach/regs-rtc.h delete mode 100644 arch/unicore32/include/mach/regs-sdc.h delete mode 100644 arch/unicore32/include/mach/regs-spi.h delete mode 100644 arch/unicore32/include/mach/regs-uart.h delete mode 100644 arch/unicore32/include/mach/regs-umal.h delete mode 100644 arch/unicore32/include/mach/regs-unigfx.h delete mode 100644 arch/unicore32/include/mach/uncompress.h delete mode 100644 arch/unicore32/include/uapi/asm/Kbuild delete mode 100644 arch/unicore32/include/uapi/asm/byteorder.h delete mode 100644 arch/unicore32/include/uapi/asm/ptrace.h delete mode 100644 arch/unicore32/include/uapi/asm/sigcontext.h delete mode 100644 arch/unicore32/include/uapi/asm/unistd.h delete mode 100644 
arch/unicore32/kernel/Makefile delete mode 100644 arch/unicore32/kernel/asm-offsets.c delete mode 100644 arch/unicore32/kernel/clock.c delete mode 100644 arch/unicore32/kernel/debug-macro.S delete mode 100644 arch/unicore32/kernel/debug.S delete mode 100644 arch/unicore32/kernel/dma.c delete mode 100644 arch/unicore32/kernel/early_printk.c delete mode 100644 arch/unicore32/kernel/elf.c delete mode 100644 arch/unicore32/kernel/entry.S delete mode 100644 arch/unicore32/kernel/fpu-ucf64.c delete mode 100644 arch/unicore32/kernel/gpio.c delete mode 100644 arch/unicore32/kernel/head.S delete mode 100644 arch/unicore32/kernel/hibernate.c delete mode 100644 arch/unicore32/kernel/hibernate_asm.S delete mode 100644 arch/unicore32/kernel/irq.c delete mode 100644 arch/unicore32/kernel/ksyms.c delete mode 100644 arch/unicore32/kernel/ksyms.h delete mode 100644 arch/unicore32/kernel/module.c delete mode 100644 arch/unicore32/kernel/pci.c delete mode 100644 arch/unicore32/kernel/pm.c delete mode 100644 arch/unicore32/kernel/process.c delete mode 100644 arch/unicore32/kernel/ptrace.c delete mode 100644 arch/unicore32/kernel/puv3-core.c delete mode 100644 arch/unicore32/kernel/puv3-nb0916.c delete mode 100644 arch/unicore32/kernel/setup.c delete mode 100644 arch/unicore32/kernel/setup.h delete mode 100644 arch/unicore32/kernel/signal.c delete mode 100644 arch/unicore32/kernel/sleep.S delete mode 100644 arch/unicore32/kernel/stacktrace.c delete mode 100644 arch/unicore32/kernel/sys.c delete mode 100644 arch/unicore32/kernel/time.c delete mode 100644 arch/unicore32/kernel/traps.c delete mode 100644 arch/unicore32/kernel/vmlinux.lds.S delete mode 100644 arch/unicore32/lib/Makefile delete mode 100644 arch/unicore32/lib/backtrace.S delete mode 100644 arch/unicore32/lib/clear_user.S delete mode 100644 arch/unicore32/lib/copy_from_user.S delete mode 100644 arch/unicore32/lib/copy_page.S delete mode 100644 arch/unicore32/lib/copy_template.S delete mode 100644 
arch/unicore32/lib/copy_to_user.S delete mode 100644 arch/unicore32/lib/delay.S delete mode 100644 arch/unicore32/lib/findbit.S delete mode 100644 arch/unicore32/lib/strncpy_from_user.S delete mode 100644 arch/unicore32/lib/strnlen_user.S delete mode 100644 arch/unicore32/mm/Kconfig delete mode 100644 arch/unicore32/mm/Makefile delete mode 100644 arch/unicore32/mm/alignment.c delete mode 100644 arch/unicore32/mm/cache-ucv2.S delete mode 100644 arch/unicore32/mm/extable.c delete mode 100644 arch/unicore32/mm/fault.c delete mode 100644 arch/unicore32/mm/flush.c delete mode 100644 arch/unicore32/mm/init.c delete mode 100644 arch/unicore32/mm/ioremap.c delete mode 100644 arch/unicore32/mm/mm.h delete mode 100644 arch/unicore32/mm/mmu.c delete mode 100644 arch/unicore32/mm/pgd.c delete mode 100644 arch/unicore32/mm/proc-macros.S delete mode 100644 arch/unicore32/mm/proc-syms.c delete mode 100644 arch/unicore32/mm/proc-ucv2.S delete mode 100644 arch/unicore32/mm/tlb-ucv2.S diff --git a/Documentation/features/core/cBPF-JIT/arch-support.txt b/Documentation/features/core/cBPF-JIT/arch-support.txt index 8620c38d4db0..399935616813 100644 --- a/Documentation/features/core/cBPF-JIT/arch-support.txt +++ b/Documentation/features/core/cBPF-JIT/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | TODO | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/core/eBPF-JIT/arch-support.txt b/Documentation/features/core/eBPF-JIT/arch-support.txt index 9ed964f65224..79409bfe0263 100644 --- a/Documentation/features/core/eBPF-JIT/arch-support.txt +++ b/Documentation/features/core/eBPF-JIT/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/core/generic-idle-thread/arch-support.txt b/Documentation/features/core/generic-idle-thread/arch-support.txt index 
365df2c2ff0b..9ea60e416efd 100644 --- a/Documentation/features/core/generic-idle-thread/arch-support.txt +++ b/Documentation/features/core/generic-idle-thread/arch-support.txt @@ -28,7 +28,6 @@ | sh: | ok | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | ok | ----------------------- diff --git a/Documentation/features/core/jump-labels/arch-support.txt b/Documentation/features/core/jump-labels/arch-support.txt index 632a1c7aefa2..f8ec5c13cde4 100644 --- a/Documentation/features/core/jump-labels/arch-support.txt +++ b/Documentation/features/core/jump-labels/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | ok | ----------------------- diff --git a/Documentation/features/core/tracehook/arch-support.txt b/Documentation/features/core/tracehook/arch-support.txt index 964667052eda..cd3510e2eedb 100644 --- a/Documentation/features/core/tracehook/arch-support.txt +++ b/Documentation/features/core/tracehook/arch-support.txt @@ -28,7 +28,6 @@ | sh: | ok | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | ok | ----------------------- diff --git a/Documentation/features/debug/KASAN/arch-support.txt b/Documentation/features/debug/KASAN/arch-support.txt index 6ff38548923e..c3fe9b266e7b 100644 --- a/Documentation/features/debug/KASAN/arch-support.txt +++ b/Documentation/features/debug/KASAN/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | ok | ----------------------- diff --git a/Documentation/features/debug/debug-vm-pgtable/arch-support.txt b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt index c527d05c0459..ca6bacb1e99e 100644 --- a/Documentation/features/debug/debug-vm-pgtable/arch-support.txt +++ b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | TODO | | um: | TODO | - | 
unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/debug/gcov-profile-all/arch-support.txt b/Documentation/features/debug/gcov-profile-all/arch-support.txt index 210256f6a4cf..7563a494ddb8 100644 --- a/Documentation/features/debug/gcov-profile-all/arch-support.txt +++ b/Documentation/features/debug/gcov-profile-all/arch-support.txt @@ -28,7 +28,6 @@ | sh: | ok | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/debug/kgdb/arch-support.txt b/Documentation/features/debug/kgdb/arch-support.txt index 38c40cfa0578..4b0a1d0d6ba4 100644 --- a/Documentation/features/debug/kgdb/arch-support.txt +++ b/Documentation/features/debug/kgdb/arch-support.txt @@ -28,7 +28,6 @@ | sh: | ok | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt b/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt index 97cd7aa74905..6225cfe0c5bf 100644 --- a/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt +++ b/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/debug/kprobes/arch-support.txt b/Documentation/features/debug/kprobes/arch-support.txt index 8b316c6e03d4..371f0ac488f5 100644 --- a/Documentation/features/debug/kprobes/arch-support.txt +++ b/Documentation/features/debug/kprobes/arch-support.txt @@ -28,7 +28,6 @@ | sh: | ok | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/debug/kretprobes/arch-support.txt b/Documentation/features/debug/kretprobes/arch-support.txt index 
b805aada395e..38e95251deed 100644 --- a/Documentation/features/debug/kretprobes/arch-support.txt +++ b/Documentation/features/debug/kretprobes/arch-support.txt @@ -28,7 +28,6 @@ | sh: | ok | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/debug/optprobes/arch-support.txt b/Documentation/features/debug/optprobes/arch-support.txt index fb297a88f62c..7f4a20e6a12b 100644 --- a/Documentation/features/debug/optprobes/arch-support.txt +++ b/Documentation/features/debug/optprobes/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/debug/stackprotector/arch-support.txt b/Documentation/features/debug/stackprotector/arch-support.txt index 12410f606edc..3db4763aa3f5 100644 --- a/Documentation/features/debug/stackprotector/arch-support.txt +++ b/Documentation/features/debug/stackprotector/arch-support.txt @@ -28,7 +28,6 @@ | sh: | ok | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | ok | ----------------------- diff --git a/Documentation/features/debug/uprobes/arch-support.txt b/Documentation/features/debug/uprobes/arch-support.txt index be8acbb95b54..43cac6ee0c68 100644 --- a/Documentation/features/debug/uprobes/arch-support.txt +++ b/Documentation/features/debug/uprobes/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/debug/user-ret-profiler/arch-support.txt b/Documentation/features/debug/user-ret-profiler/arch-support.txt index 6bfa36b0e017..d636ed0e679f 100644 --- a/Documentation/features/debug/user-ret-profiler/arch-support.txt +++ b/Documentation/features/debug/user-ret-profiler/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | TODO | | 
um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/io/dma-contiguous/arch-support.txt b/Documentation/features/io/dma-contiguous/arch-support.txt index 895c3b0f6492..dfc93d074e3d 100644 --- a/Documentation/features/io/dma-contiguous/arch-support.txt +++ b/Documentation/features/io/dma-contiguous/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | ok | ----------------------- diff --git a/Documentation/features/locking/cmpxchg-local/arch-support.txt b/Documentation/features/locking/cmpxchg-local/arch-support.txt index 242ff5a6586e..1815c7fed06d 100644 --- a/Documentation/features/locking/cmpxchg-local/arch-support.txt +++ b/Documentation/features/locking/cmpxchg-local/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/locking/lockdep/arch-support.txt b/Documentation/features/locking/lockdep/arch-support.txt index 98cb9d85c55d..4f844ecd0680 100644 --- a/Documentation/features/locking/lockdep/arch-support.txt +++ b/Documentation/features/locking/lockdep/arch-support.txt @@ -28,7 +28,6 @@ | sh: | ok | | sparc: | ok | | um: | ok | - | unicore32: | ok | | x86: | ok | | xtensa: | ok | ----------------------- diff --git a/Documentation/features/locking/queued-rwlocks/arch-support.txt b/Documentation/features/locking/queued-rwlocks/arch-support.txt index ee922746a64c..5c6bcfcf8e1f 100644 --- a/Documentation/features/locking/queued-rwlocks/arch-support.txt +++ b/Documentation/features/locking/queued-rwlocks/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | ok | ----------------------- diff --git a/Documentation/features/locking/queued-spinlocks/arch-support.txt 
b/Documentation/features/locking/queued-spinlocks/arch-support.txt index c52116c1a049..b55e420a34ea 100644 --- a/Documentation/features/locking/queued-spinlocks/arch-support.txt +++ b/Documentation/features/locking/queued-spinlocks/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | ok | ----------------------- diff --git a/Documentation/features/perf/kprobes-event/arch-support.txt b/Documentation/features/perf/kprobes-event/arch-support.txt index 518f352fc727..04c17c2106a4 100644 --- a/Documentation/features/perf/kprobes-event/arch-support.txt +++ b/Documentation/features/perf/kprobes-event/arch-support.txt @@ -28,7 +28,6 @@ | sh: | ok | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/perf/perf-regs/arch-support.txt b/Documentation/features/perf/perf-regs/arch-support.txt index c22cd6f8aa5e..e7450fbb8253 100644 --- a/Documentation/features/perf/perf-regs/arch-support.txt +++ b/Documentation/features/perf/perf-regs/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/perf/perf-stackdump/arch-support.txt b/Documentation/features/perf/perf-stackdump/arch-support.txt index 527fe4d0b074..98e79d128d9b 100644 --- a/Documentation/features/perf/perf-stackdump/arch-support.txt +++ b/Documentation/features/perf/perf-stackdump/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/sched/membarrier-sync-core/arch-support.txt b/Documentation/features/sched/membarrier-sync-core/arch-support.txt index 8a521a622966..68658a6f8c5b 100644 --- a/Documentation/features/sched/membarrier-sync-core/arch-support.txt +++ 
b/Documentation/features/sched/membarrier-sync-core/arch-support.txt @@ -51,7 +51,6 @@ | sh: | TODO | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/sched/numa-balancing/arch-support.txt b/Documentation/features/sched/numa-balancing/arch-support.txt index 350823692f28..964457ad26c1 100644 --- a/Documentation/features/sched/numa-balancing/arch-support.txt +++ b/Documentation/features/sched/numa-balancing/arch-support.txt @@ -28,7 +28,6 @@ | sh: | .. | | sparc: | TODO | | um: | .. | - | unicore32: | .. | | x86: | ok | | xtensa: | .. | ----------------------- diff --git a/Documentation/features/seccomp/seccomp-filter/arch-support.txt b/Documentation/features/seccomp/seccomp-filter/arch-support.txt index c7b837f735b1..f54ddfc06a12 100644 --- a/Documentation/features/seccomp/seccomp-filter/arch-support.txt +++ b/Documentation/features/seccomp/seccomp-filter/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | TODO | | um: | ok | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/time/arch-tick-broadcast/arch-support.txt b/Documentation/features/time/arch-tick-broadcast/arch-support.txt index 593536f7925b..4d11cbb3c09b 100644 --- a/Documentation/features/time/arch-tick-broadcast/arch-support.txt +++ b/Documentation/features/time/arch-tick-broadcast/arch-support.txt @@ -28,7 +28,6 @@ | sh: | ok | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | TODO | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/time/clockevents/arch-support.txt b/Documentation/features/time/clockevents/arch-support.txt index 7a27157da408..8287b6aa522e 100644 --- a/Documentation/features/time/clockevents/arch-support.txt +++ b/Documentation/features/time/clockevents/arch-support.txt @@ -28,7 +28,6 @@ | sh: | ok | | sparc: | ok | | um: | ok | - | unicore32: | ok | | x86: | ok | | 
xtensa: | ok | ----------------------- diff --git a/Documentation/features/time/context-tracking/arch-support.txt b/Documentation/features/time/context-tracking/arch-support.txt index 048bfb6d3872..a71f3a945285 100644 --- a/Documentation/features/time/context-tracking/arch-support.txt +++ b/Documentation/features/time/context-tracking/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/time/irq-time-acct/arch-support.txt b/Documentation/features/time/irq-time-acct/arch-support.txt index a14bbad8e948..d9082b91f10e 100644 --- a/Documentation/features/time/irq-time-acct/arch-support.txt +++ b/Documentation/features/time/irq-time-acct/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | .. | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | ok | ----------------------- diff --git a/Documentation/features/time/modern-timekeeping/arch-support.txt b/Documentation/features/time/modern-timekeeping/arch-support.txt index 1d46da165b75..a84c3b9d9a94 100644 --- a/Documentation/features/time/modern-timekeeping/arch-support.txt +++ b/Documentation/features/time/modern-timekeeping/arch-support.txt @@ -28,7 +28,6 @@ | sh: | ok | | sparc: | ok | | um: | ok | - | unicore32: | ok | | x86: | ok | | xtensa: | ok | ----------------------- diff --git a/Documentation/features/time/virt-cpuacct/arch-support.txt b/Documentation/features/time/virt-cpuacct/arch-support.txt index fb0d0cab9cab..56b372da6b01 100644 --- a/Documentation/features/time/virt-cpuacct/arch-support.txt +++ b/Documentation/features/time/virt-cpuacct/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/vm/ELF-ASLR/arch-support.txt b/Documentation/features/vm/ELF-ASLR/arch-support.txt index 
adc25878d217..eccda0732474 100644 --- a/Documentation/features/vm/ELF-ASLR/arch-support.txt +++ b/Documentation/features/vm/ELF-ASLR/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/vm/PG_uncached/arch-support.txt b/Documentation/features/vm/PG_uncached/arch-support.txt index f05588f9e4b4..c74e3f8040e1 100644 --- a/Documentation/features/vm/PG_uncached/arch-support.txt +++ b/Documentation/features/vm/PG_uncached/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/vm/THP/arch-support.txt b/Documentation/features/vm/THP/arch-support.txt index cdfe8925f881..1c0b95f2b40d 100644 --- a/Documentation/features/vm/THP/arch-support.txt +++ b/Documentation/features/vm/THP/arch-support.txt @@ -28,7 +28,6 @@ | sh: | .. | | sparc: | ok | | um: | .. | - | unicore32: | .. | | x86: | ok | | xtensa: | .. | ----------------------- diff --git a/Documentation/features/vm/TLB/arch-support.txt b/Documentation/features/vm/TLB/arch-support.txt index 2bdd3b6cee3c..30f75a79ce01 100644 --- a/Documentation/features/vm/TLB/arch-support.txt +++ b/Documentation/features/vm/TLB/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | TODO | | um: | .. | - | unicore32: | .. 
| | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/vm/huge-vmap/arch-support.txt b/Documentation/features/vm/huge-vmap/arch-support.txt index 8525f1981f19..c5ff3a427722 100644 --- a/Documentation/features/vm/huge-vmap/arch-support.txt +++ b/Documentation/features/vm/huge-vmap/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/vm/ioremap_prot/arch-support.txt b/Documentation/features/vm/ioremap_prot/arch-support.txt index 3a6b87de6a19..1cb7406cd858 100644 --- a/Documentation/features/vm/ioremap_prot/arch-support.txt +++ b/Documentation/features/vm/ioremap_prot/arch-support.txt @@ -28,7 +28,6 @@ | sh: | ok | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/vm/pte_special/arch-support.txt b/Documentation/features/vm/pte_special/arch-support.txt index 2e017387e228..13d0e1e17001 100644 --- a/Documentation/features/vm/pte_special/arch-support.txt +++ b/Documentation/features/vm/pte_special/arch-support.txt @@ -28,7 +28,6 @@ | sh: | ok | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/MAINTAINERS b/MAINTAINERS index 496fd4eafb68..1de95aa44bbb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17532,13 +17532,6 @@ L: linux-fsdevel@vger.kernel.org S: Supported F: fs/unicode/ -UNICORE32 ARCHITECTURE -M: Guan Xuetao -S: Maintained -W: http://mprc.pku.edu.cn/~guanxuetao/linux -T: git git://github.com/gxt/linux.git -F: arch/unicore32/ - UNIFDEF M: Tony Finch S: Maintained diff --git a/arch/unicore32/.gitignore b/arch/unicore32/.gitignore deleted file mode 100644 index e82f3fb57ba0..000000000000 --- a/arch/unicore32/.gitignore +++ /dev/null @@ -1,22 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -# Generated 
include files -# -include/generated -# -# Generated ld script file -# -kernel/vmlinux.lds -# -# Generated images in boot -# -boot/Image -boot/zImage -boot/uImage -# -# Generated files in boot/compressed -# -boot/compressed/piggy.S -boot/compressed/piggy.gzip -boot/compressed/vmlinux -boot/compressed/vmlinux.lds diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig deleted file mode 100644 index 11ba1839d198..000000000000 --- a/arch/unicore32/Kconfig +++ /dev/null @@ -1,200 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -config UNICORE32 - def_bool y - select ARCH_32BIT_OFF_T - select ARCH_HAS_DEVMEM_IS_ALLOWED - select ARCH_HAS_KEEPINITRD - select ARCH_MIGHT_HAVE_PC_PARPORT - select ARCH_MIGHT_HAVE_PC_SERIO - select HAVE_KERNEL_GZIP - select HAVE_KERNEL_BZIP2 - select GENERIC_ATOMIC64 - select HAVE_KERNEL_LZO - select HAVE_KERNEL_LZMA - select HAVE_PCI - select VIRT_TO_BUS - select ARCH_HAVE_CUSTOM_GPIO_H - select GENERIC_FIND_FIRST_BIT - select GENERIC_IRQ_PROBE - select GENERIC_IRQ_SHOW - select ARCH_WANT_FRAME_POINTERS - select GENERIC_IOMAP - select MODULES_USE_ELF_REL - select NEED_DMA_MAP_STATE - select MMU_GATHER_NO_RANGE if MMU - help - UniCore-32 is 32-bit Instruction Set Architecture, - including a series of low-power-consumption RISC chip - designs licensed by PKUnity Ltd. - Please see web page at . 
- -config GENERIC_CSUM - def_bool y - -config NO_IOPORT_MAP - bool - -config STACKTRACE_SUPPORT - def_bool y - -config LOCKDEP_SUPPORT - def_bool y - -config ARCH_HAS_ILOG2_U32 - bool - -config ARCH_HAS_ILOG2_U64 - bool - -config GENERIC_HWEIGHT - def_bool y - -config GENERIC_CALIBRATE_DELAY - def_bool y - -config ARCH_MAY_HAVE_PC_FDC - bool - -config ZONE_DMA - def_bool y - -menu "System Type" - -config MMU - def_bool y - -config ARCH_FPGA - bool - -config ARCH_PUV3 - def_bool y - select CPU_UCV2 - select GENERIC_CLOCKEVENTS - select HAVE_LEGACY_CLK - select GPIOLIB - -# CONFIGs for ARCH_PUV3 - -if ARCH_PUV3 - -choice - prompt "Board Selection" - default PUV3_DB0913 - -config PUV3_FPGA_DLX200 - select ARCH_FPGA - bool "FPGA board" - -config PUV3_DB0913 - bool "DEBUG board (0913)" - -config PUV3_NB0916 - bool "NetBook board (0916)" - select PWM - select PWM_PUV3 - -config PUV3_SMW0919 - bool "Security Mini-Workstation board (0919)" - -endchoice - -config PUV3_PM - def_bool y if !ARCH_FPGA - -endif - -source "arch/unicore32/mm/Kconfig" - -comment "Floating point support" - -config UNICORE_FPU_F64 - def_bool y if !ARCH_FPGA - -endmenu - -menu "Kernel Features" - -source "kernel/Kconfig.hz" - -config LEDS - def_bool y - depends on GPIOLIB - -config ALIGNMENT_TRAP - def_bool y - help - Unicore processors can not fetch/store information which is not - naturally aligned on the bus, i.e., a 4 byte fetch must start at an - address divisible by 4. On 32-bit Unicore processors, these non-aligned - fetch/store instructions will be emulated in software if you say - here, which has a severe performance impact. This is necessary for - correct operation of some network protocols. With an IP-only - configuration it is safe to say N, otherwise say Y. 
- -endmenu - -menu "Boot options" - -config CMDLINE - string "Default kernel command string" - default "" - -config CMDLINE_FORCE - bool "Always use the default kernel command string" - depends on CMDLINE != "" - help - Always use the default kernel command string, even if the boot - loader passes other arguments to the kernel. - This is useful if you cannot or don't want to change the - command-line options your boot loader passes to the kernel. - - If unsure, say N. - -endmenu - -menu "Power management options" - -source "kernel/power/Kconfig" - -source "drivers/cpufreq/Kconfig" - -config ARCH_SUSPEND_POSSIBLE - def_bool y if !ARCH_FPGA - -config ARCH_HIBERNATION_POSSIBLE - def_bool y if !ARCH_FPGA - -endmenu - -if ARCH_PUV3 - -config PUV3_GPIO - bool - depends on !ARCH_FPGA - select GPIO_SYSFS - default y - -if PUV3_NB0916 - -menu "PKUnity NetBook-0916 Features" - -config I2C_BATTERY_BQ27200 - tristate "I2C Battery BQ27200 Support" - select I2C_PUV3 - select POWER_SUPPLY - select BATTERY_BQ27XXX - -config I2C_EEPROM_AT24 - tristate "I2C EEPROMs AT24 support" - select I2C_PUV3 - select EEPROM_AT24 - -config LCD_BACKLIGHT - tristate "LCD Backlight support" - select BACKLIGHT_PWM - -endmenu - -endif - -endif diff --git a/arch/unicore32/Kconfig.debug b/arch/unicore32/Kconfig.debug deleted file mode 100644 index ca0ff97657ef..000000000000 --- a/arch/unicore32/Kconfig.debug +++ /dev/null @@ -1,29 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 - -config EARLY_PRINTK - def_bool DEBUG_OCD - help - Write kernel log output directly into the ocd or to a serial port. - - This is useful for kernel debugging when your machine crashes very - early before the console code is initialized. For normal operation - it is not recommended because it looks ugly and doesn't cooperate - with klogd/syslogd or the X server. You should normally N here, - unless you want to debug such a crash. - -# These options are only for real kernel hackers who want to get their hands dirty. 
-config DEBUG_LL - bool "Kernel low-level debugging functions" - depends on DEBUG_KERNEL - help - Say Y here to include definitions of printascii, printch, printhex - in the kernel. This is helpful if you are debugging code that - executes before the console is initialized. - -config DEBUG_OCD - bool "Kernel low-level debugging via On-Chip-Debugger" - depends on DEBUG_LL - default y - help - Say Y here if you want the debug print routines to direct their - output to the UniCore On-Chip-Debugger channel using CP #1. diff --git a/arch/unicore32/Makefile b/arch/unicore32/Makefile deleted file mode 100644 index 390819947c37..000000000000 --- a/arch/unicore32/Makefile +++ /dev/null @@ -1,59 +0,0 @@ -# -# arch/unicore32/Makefile -# -# This file is included by the global makefile so that you can add your own -# architecture-specific flags and dependencies. -# -# This file is subject to the terms and conditions of the GNU General Public -# License. See the file "COPYING" in the main directory of this archive -# for more details. 
-# -# Copyright (C) 2002~2010 by Guan Xue-tao -# -ifneq ($(SUBARCH),$(ARCH)) - ifeq ($(CROSS_COMPILE),) - CROSS_COMPILE := $(call cc-cross-prefix, unicore32-linux-) - endif -endif - -LDFLAGS_vmlinux := -p --no-undefined -X - -OBJCOPYFLAGS := -O binary -R .note -R .note.gnu.build-id -R .comment -S - -# Never generate .eh_frame -KBUILD_CFLAGS += $(call cc-option,-fno-dwarf2-cfi-asm) - -# Never use hard float in kernel -KBUILD_CFLAGS += -msoft-float - -ifeq ($(CONFIG_FRAME_POINTER),y) -KBUILD_CFLAGS += -mno-sched-prolog -endif - -CHECKFLAGS += -D__unicore32__ - -head-y := arch/unicore32/kernel/head.o - -core-y += arch/unicore32/kernel/ -core-y += arch/unicore32/mm/ - -libs-y += arch/unicore32/lib/ - -boot := arch/unicore32/boot - -# Default target when executing plain make -KBUILD_IMAGE := $(boot)/zImage - -all: zImage - -zImage Image uImage: vmlinux - $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ - -archclean: - $(Q)$(MAKE) $(clean)=$(boot) - -define archhelp - echo '* zImage - Compressed kernel image (arch/$(ARCH)/boot/zImage)' - echo ' Image - Uncompressed kernel image (arch/$(ARCH)/boot/Image)' - echo ' uImage - U-Boot wrapped zImage' -endef diff --git a/arch/unicore32/boot/Makefile b/arch/unicore32/boot/Makefile deleted file mode 100644 index 828855007b29..000000000000 --- a/arch/unicore32/boot/Makefile +++ /dev/null @@ -1,39 +0,0 @@ -# -# arch/unicore32/boot/Makefile -# -# This file is included by the global makefile so that you can add your own -# architecture-specific flags and dependencies. -# -# This file is subject to the terms and conditions of the GNU General Public -# License. See the file "COPYING" in the main directory of this archive -# for more details. 
-# -# Copyright (C) 2001~2010 GUAN Xue-tao -# - -targets := Image zImage uImage - -$(obj)/Image: vmlinux FORCE - $(call if_changed,objcopy) - @echo ' Kernel: $@ is ready' - -$(obj)/compressed/vmlinux: $(obj)/Image FORCE - $(Q)$(MAKE) $(build)=$(obj)/compressed $@ - -$(obj)/zImage: $(obj)/compressed/vmlinux FORCE - $(call if_changed,objcopy) - @echo ' Kernel: $@ is ready' - -UIMAGE_ARCH = unicore -UIMAGE_LOADADDR = 0x0 - -$(obj)/uImage: $(obj)/zImage FORCE - $(call if_changed,uimage) - @echo ' Image $@ is ready' - -PHONY += initrd -initrd: - @test "$(INITRD)" != "" || \ - (echo You must specify INITRD; exit -1) - -subdir- := compressed diff --git a/arch/unicore32/boot/compressed/Makefile b/arch/unicore32/boot/compressed/Makefile deleted file mode 100644 index 150fafc32fb0..000000000000 --- a/arch/unicore32/boot/compressed/Makefile +++ /dev/null @@ -1,64 +0,0 @@ -# -# linux/arch/unicore32/boot/compressed/Makefile -# -# create a compressed vmlinuz image from the original vmlinux -# -# This file is subject to the terms and conditions of the GNU General Public -# License. See the file "COPYING" in the main directory of this archive -# for more details. 
-# -# Copyright (C) 2001~2010 GUAN Xue-tao -# - -ccflags-y := -fpic -fno-builtin -asflags-y := -Wa,-march=all - -OBJS := misc.o - -# font.c and font.o -CFLAGS_font.o := -Dstatic= -$(obj)/font.c: $(srctree)/lib/fonts/font_8x8.c - $(call cmd,shipped) - -# piggy.S and piggy.o -suffix_$(CONFIG_KERNEL_GZIP) := gzip -suffix_$(CONFIG_KERNEL_BZIP2) := bz2 -suffix_$(CONFIG_KERNEL_LZO) := lzo -suffix_$(CONFIG_KERNEL_LZMA) := lzma - -$(obj)/piggy.$(suffix_y): $(obj)/../Image FORCE - $(call if_changed,$(suffix_y)) - -SEDFLAGS_piggy = s/DECOMP_SUFFIX/$(suffix_y)/ -$(obj)/piggy.S: $(obj)/piggy.S.in - @sed "$(SEDFLAGS_piggy)" < $< > $@ - -$(obj)/piggy.o: $(obj)/piggy.$(suffix_y) $(obj)/piggy.S FORCE - -targets := vmlinux vmlinux.lds font.o font.c head.o misc.o \ - piggy.$(suffix_y) piggy.o piggy.S \ - -# Make sure files are removed during clean -extra-y += piggy.gzip piggy.bz2 piggy.lzo piggy.lzma - -# ? -LDFLAGS_vmlinux += -p -# Report unresolved symbol references -LDFLAGS_vmlinux += --no-undefined -# Delete all temporary local symbols -LDFLAGS_vmlinux += -X -# Next argument is a linker script -LDFLAGS_vmlinux += -T - -# For uidivmod -$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head.o $(obj)/piggy.o \ - $(obj)/misc.o FORCE - $(call if_changed,ld) - -# We now have a PIC decompressor implementation. Decompressors running -# from RAM should not define ZTEXTADDR. 
Decompressors running directly -# from ROM or Flash must define ZTEXTADDR (preferably via the config) -ZTEXTADDR := 0x03000000 -ZBSSADDR := ALIGN(4) - -CPPFLAGS_vmlinux.lds = -DTEXT_START="$(ZTEXTADDR)" -DBSS_START="$(ZBSSADDR)" diff --git a/arch/unicore32/boot/compressed/head.S b/arch/unicore32/boot/compressed/head.S deleted file mode 100644 index 5f72662cd294..000000000000 --- a/arch/unicore32/boot/compressed/head.S +++ /dev/null @@ -1,201 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/boot/compressed/head.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include - -#define csub cmpsub -#define cand cmpand -#define nop8 nop; nop; nop; nop; nop; nop; nop; nop - - .section ".start", #alloc, #execinstr - .text -start: - .type start,#function - - /* Initialize ASR, PRIV mode and INTR off */ - mov r0, #0xD3 - mov.a asr, r0 - - adr r0, LC0 - ldm (r1, r2, r3, r5, r6, r7, r8), [r0]+ - ldw sp, [r0+], #28 - sub.a r0, r0, r1 @ calculate the delta offset - - /* - * if delta is zero, we are running at the address - * we were linked at. - */ - beq not_relocated - - /* - * We're running at a different address. We need to fix - * up various pointers: - * r5 - zImage base address (_start) - * r7 - GOT start - * r8 - GOT end - */ - add r5, r5, r0 - add r7, r7, r0 - add r8, r8, r0 - - /* - * we need to fix up pointers into the BSS region. - * r2 - BSS start - * r3 - BSS end - * sp - stack pointer - */ - add r2, r2, r0 - add r3, r3, r0 - add sp, sp, r0 - - /* - * Relocate all entries in the GOT table. - * This fixes up the C references. - * r7 - GOT start - * r8 - GOT end - */ -1001: ldw r1, [r7+], #0 - add r1, r1, r0 - stw.w r1, [r7]+, #4 - csub.a r7, r8 - bub 1001b - -not_relocated: - /* - * Clear BSS region. - * r2 - BSS start - * r3 - BSS end - */ - mov r0, #0 -1002: stw.w r0, [r2]+, #4 - csub.a r2, r3 - bub 1002b - - /* - * Turn on the cache. 
- */ - mov r0, #0 - movc p0.c5, r0, #28 @ cache invalidate all - nop8 - movc p0.c6, r0, #6 @ tlb invalidate all - nop8 - - mov r0, #0x1c @ en icache and wb dcache - movc p0.c1, r0, #0 - nop8 - - /* - * Set up some pointers, for starting decompressing. - */ - - mov r1, sp @ malloc space above stack - add r2, sp, #0x10000 @ 64k max - - /* - * Check to see if we will overwrite ourselves. - * r4 = final kernel address - * r5 = start of this image - * r6 = size of decompressed image - * r2 = end of malloc space (and therefore this image) - * We basically want: - * r4 >= r2 -> OK - * r4 + image length <= r5 -> OK - */ - ldw r4, =KERNEL_IMAGE_START - csub.a r4, r2 - bea wont_overwrite - add r0, r4, r6 - csub.a r0, r5 - beb wont_overwrite - - /* - * If overwrite, just print error message - */ - b __error_overwrite - - /* - * We're not in danger of overwriting ourselves. - * Do this the simple way. - */ -wont_overwrite: - /* - * decompress_kernel: - * r0: output_start - * r1: free_mem_ptr_p - * r2: free_mem_ptr_end_p - */ - mov r0, r4 - b.l decompress_kernel @ C functions - - /* - * Clean and flush the cache to maintain consistency. - */ - mov r0, #0 - movc p0.c5, r0, #14 @ flush dcache - nop8 - movc p0.c5, r0, #20 @ icache invalidate all - nop8 - - /* - * Turn off the Cache and MMU. - */ - mov r0, #0 @ disable i/d cache and MMU - movc p0.c1, r0, #0 - nop8 - - mov r0, #0 @ must be zero - ldw r4, =KERNEL_IMAGE_START - mov pc, r4 @ call kernel - - - .align 2 - .type LC0, #object -LC0: .word LC0 @ r1 - .word __bss_start @ r2 - .word _end @ r3 - .word _start @ r5 - .word _image_size @ r6 - .word _got_start @ r7 - .word _got_end @ r8 - .word decompress_stack_end @ sp - .size LC0, . 
- LC0 - -print_string: -#ifdef CONFIG_DEBUG_OCD -2001: ldb.w r1, [r0]+, #1 - csub.a r1, #0 - bne 2002f - mov pc, lr -2002: - movc r2, p1.c0, #0 - cand.a r2, #2 - bne 2002b - movc p1.c1, r1, #1 - csub.a r1, #'\n' - cmoveq r1, #'\r' - beq 2002b - b 2001b -#else - mov pc, lr -#endif - -__error_overwrite: - adr r0, str_error - b.l print_string -2001: nop8 - b 2001b -str_error: .asciz "\nError: Kernel address OVERWRITE\n" - .align - - .ltorg - - .align 4 - .section ".stack", "aw", %nobits -decompress_stack: .space 4096 -decompress_stack_end: diff --git a/arch/unicore32/boot/compressed/misc.c b/arch/unicore32/boot/compressed/misc.c deleted file mode 100644 index 450d3355de20..000000000000 --- a/arch/unicore32/boot/compressed/misc.c +++ /dev/null @@ -1,123 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/boot/compressed/misc.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#include -#include - -/* - * gzip delarations - */ -unsigned char *output_data; -unsigned long output_ptr; - -unsigned int free_mem_ptr; -unsigned int free_mem_end_ptr; - -#define STATIC static -#define STATIC_RW_DATA /* non-static please */ - -/* - * arch-dependent implementations - */ -#ifndef ARCH_HAVE_DECOMP_ERROR -#define arch_decomp_error(x) -#endif - -#ifndef ARCH_HAVE_DECOMP_SETUP -#define arch_decomp_setup() -#endif - -#ifndef ARCH_HAVE_DECOMP_PUTS -#define arch_decomp_puts(p) -#endif - -void *memcpy(void *dest, const void *src, size_t n) -{ - int i = 0; - unsigned char *d = (unsigned char *)dest, *s = (unsigned char *)src; - - for (i = n >> 3; i > 0; i--) { - *d++ = *s++; - *d++ = *s++; - *d++ = *s++; - *d++ = *s++; - *d++ = *s++; - *d++ = *s++; - *d++ = *s++; - *d++ = *s++; - } - - if (n & 1 << 2) { - *d++ = *s++; - *d++ = *s++; - *d++ = *s++; - *d++ = *s++; - } - - if (n & 1 << 1) { - *d++ = *s++; - *d++ = *s++; - } - - if (n & 1) - *d++ = *s++; - - return dest; -} - -void error(char *x) -{ - 
arch_decomp_puts("\n\n"); - arch_decomp_puts(x); - arch_decomp_puts("\n\n -- System halted"); - - arch_decomp_error(x); - - for (;;) - ; /* Halt */ -} - -/* Heap size should be adjusted for different decompress method */ -#ifdef CONFIG_KERNEL_GZIP -#include "../../../../lib/decompress_inflate.c" -#endif - -#ifdef CONFIG_KERNEL_BZIP2 -#include "../../../../lib/decompress_bunzip2.c" -#endif - -#ifdef CONFIG_KERNEL_LZO -#include "../../../../lib/decompress_unlzo.c" -#endif - -#ifdef CONFIG_KERNEL_LZMA -#include "../../../../lib/decompress_unlzma.c" -#endif - -unsigned long decompress_kernel(unsigned long output_start, - unsigned long free_mem_ptr_p, - unsigned long free_mem_ptr_end_p) -{ - unsigned char *tmp; - - output_data = (unsigned char *)output_start; - free_mem_ptr = free_mem_ptr_p; - free_mem_end_ptr = free_mem_ptr_end_p; - - arch_decomp_setup(); - - tmp = (unsigned char *) (((unsigned long)input_data_end) - 4); - output_ptr = get_unaligned_le32(tmp); - - arch_decomp_puts("Uncompressing Linux..."); - __decompress(input_data, input_data_end - input_data, NULL, NULL, - output_data, 0, NULL, error); - arch_decomp_puts(" done, booting the kernel.\n"); - return output_ptr; -} diff --git a/arch/unicore32/boot/compressed/piggy.S.in b/arch/unicore32/boot/compressed/piggy.S.in deleted file mode 100644 index b79704d58026..000000000000 --- a/arch/unicore32/boot/compressed/piggy.S.in +++ /dev/null @@ -1,6 +0,0 @@ - .section .piggydata,#alloc - .globl input_data -input_data: - .incbin "arch/unicore32/boot/compressed/piggy.DECOMP_SUFFIX" - .globl input_data_end -input_data_end: diff --git a/arch/unicore32/boot/compressed/vmlinux.lds.S b/arch/unicore32/boot/compressed/vmlinux.lds.S deleted file mode 100644 index edda4ddfa357..000000000000 --- a/arch/unicore32/boot/compressed/vmlinux.lds.S +++ /dev/null @@ -1,58 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore/boot/compressed/vmlinux.lds.in - * - * Code specific to PKUnity SoC and UniCore ISA - 
* - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -OUTPUT_ARCH(unicore32) -ENTRY(_start) -SECTIONS -{ - /DISCARD/ : { - /* - * Discard any r/w data - this produces a link error if we have any, - * which is required for PIC decompression. Local data generates - * GOTOFF relocations, which prevents it being relocated independently - * of the text/got segments. - */ - *(.data) - } - - . = TEXT_START; - _text = .; - - .text : { - _start = .; - *(.start) - *(.text) - *(.text.*) - *(.fixup) - *(.gnu.warning) - *(.rodata) - *(.rodata.*) - *(.piggydata) - . = ALIGN(4); - } - - _etext = .; - - /* Assume size of decompressed image is 4x the compressed image */ - _image_size = (_etext - _text) * 4; - - _got_start = .; - .got : { *(.got) } - _got_end = .; - .got.plt : { *(.got.plt) } - _edata = .; - - . = BSS_START; - __bss_start = .; - .bss : { *(.bss) } - _end = .; - - .stack : { *(.stack) } - .comment 0 : { *(.comment) } -} - diff --git a/arch/unicore32/configs/defconfig b/arch/unicore32/configs/defconfig deleted file mode 100644 index 360cc9abcdb0..000000000000 --- a/arch/unicore32/configs/defconfig +++ /dev/null @@ -1,214 +0,0 @@ -### General setup -CONFIG_EXPERIMENTAL=y -CONFIG_LOCALVERSION="-unicore32" -CONFIG_SWAP=y -CONFIG_SYSVIPC=y -CONFIG_POSIX_MQUEUE=y -CONFIG_HOTPLUG=y -# Initial RAM filesystem and RAM disk (initramfs/initrd) support -#CONFIG_BLK_DEV_INITRD=y -#CONFIG_INITRAMFS_SOURCE="arch/unicore/ramfs/ramfs_config" - -### Enable loadable module support -CONFIG_MODULES=n -CONFIG_MODULE_UNLOAD=y - -### System Type -CONFIG_ARCH_PUV3=y -# Board Selection -CONFIG_PUV3_NB0916=y -# Processor Features -CONFIG_CPU_DCACHE_LINE_DISABLE=y -CONFIG_CPU_TLB_SINGLE_ENTRY_DISABLE=n - -### Bus support -CONFIG_PCI=y -CONFIG_PCI_LEGACY=n - -### Boot options -# for debug, adding: earlyprintk=ocd,keep initcall_debug -# others support: test_suspend=mem root=/dev/sda -# hibernate support: resume=/dev/sda3 -CONFIG_CMDLINE="earlyprintk=ocd,keep ignore_loglevel" -# TODO: mem=512M 
video=unifb:1024x600-16@75 -# for nfs: root=/dev/nfs rw nfsroot=192.168.10.88:/home/udb/nfs/,rsize=1024,wsize=1024 -# ip=192.168.10.83:192.168.10.88:192.168.10.1:255.255.255.0::eth0:off -CONFIG_CMDLINE_FORCE=y - -### Power management options -CONFIG_PM=y -CONFIG_HIBERNATION=y -CONFIG_PM_STD_PARTITION="/dev/sda3" -CONFIG_CPU_FREQ=n -CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y - -### Networking support -CONFIG_NET=y -# Networking options -CONFIG_PACKET=m -CONFIG_UNIX=m -# TCP/IP networking -CONFIG_INET=y -CONFIG_IP_MULTICAST=y -CONFIG_IP_PNP=y -CONFIG_IPV6=n -# Wireless -CONFIG_WIRELESS=y -CONFIG_WIRELESS_EXT=y -CONFIG_MAC80211=m - -### PKUnity SoC Features -CONFIG_USB_WLAN_HED_AQ3=n -CONFIG_USB_CMMB_INNOFIDEI=n -CONFIG_I2C_BATTERY_BQ27200=n -CONFIG_I2C_EEPROM_AT24=n -CONFIG_LCD_BACKLIGHT=n - -CONFIG_PUV3_UMAL=y -CONFIG_PUV3_MUSB=n -CONFIG_PUV3_AC97=n -CONFIG_PUV3_NAND=n -CONFIG_PUV3_MMC=n -CONFIG_PUV3_UART=n - -### Device Drivers -# Memory Technology Device (MTD) support -CONFIG_MTD=m -CONFIG_MTD_UBI=m -CONFIG_MTD_PARTITIONS=y -CONFIG_MTD_CHAR=m -CONFIG_MTD_BLKDEVS=m -# RAM/ROM/Flash chip drivers -CONFIG_MTD_CFI=m -CONFIG_MTD_JEDECPROBE=m -CONFIG_MTD_CFI_AMDSTD=m -# Mapping drivers for chip access -CONFIG_MTD_PHYSMAP=m - -# Block devices -CONFIG_BLK_DEV_LOOP=m - -# SCSI device support -CONFIG_SCSI=y -CONFIG_BLK_DEV_SD=y -CONFIG_BLK_DEV_SR=m -CONFIG_CHR_DEV_SG=m - -# Serial ATA (prod) and Parallel ATA (experimental) drivers -CONFIG_ATA=y -CONFIG_SATA_VIA=y - -# Network device support -CONFIG_NETDEVICES=y -CONFIG_NET_ETHERNET=y -CONFIG_NETDEV_1000=y -# Wireless LAN -CONFIG_WLAN_80211=n -CONFIG_RT2X00=n -CONFIG_RT73USB=n - -# Input device support -CONFIG_INPUT_EVDEV=m -# Keyboards -CONFIG_KEYBOARD_GPIO=m - -# I2C support -CONFIG_I2C=y -CONFIG_I2C_PUV3=y - -# Hardware Monitoring support -#CONFIG_SENSORS_LM75=m -# Generic Thermal sysfs driver -#CONFIG_THERMAL=y -#CONFIG_THERMAL_HWMON=y - -# Multimedia support -CONFIG_MEDIA_SUPPORT=n -CONFIG_VIDEO_DEV=n 
-CONFIG_USB_VIDEO_CLASS=n - -# Graphics support -CONFIG_FB=y -CONFIG_FB_PUV3_UNIGFX=y -# Console display driver support -CONFIG_VGA_CONSOLE=n -CONFIG_FRAMEBUFFER_CONSOLE=y -CONFIG_FONTS=y -CONFIG_FONT_8x8=y -CONFIG_FONT_8x16=y -# Bootup logo -CONFIG_LOGO=n - -# Sound card support -CONFIG_SOUND=m -# Advanced Linux Sound Architecture -CONFIG_SND=m -CONFIG_SND_MIXER_OSS=m -CONFIG_SND_PCM_OSS=m - -# USB support -CONFIG_USB_ARCH_HAS_HCD=n -CONFIG_USB=n -CONFIG_USB_PRINTER=n -CONFIG_USB_STORAGE=n -# Inventra Highspeed Dual Role Controller -CONFIG_USB_MUSB_HDRC=n - -# LED Support -CONFIG_NEW_LEDS=y -CONFIG_LEDS_CLASS=y -CONFIG_LEDS_GPIO=y -# LED Triggers -CONFIG_LEDS_TRIGGERS=y -CONFIG_LEDS_TRIGGER_TIMER=y -CONFIG_LEDS_TRIGGER_DISK=y -CONFIG_LEDS_TRIGGER_HEARTBEAT=y - -# Real Time Clock -CONFIG_RTC_LIB=y -CONFIG_RTC_CLASS=y -CONFIG_RTC_DRV_PUV3=y - -### File systems -CONFIG_EXT2_FS=m -CONFIG_EXT3_FS=y -CONFIG_EXT4_FS=y -CONFIG_FUSE_FS=m -# CD-ROM/DVD Filesystems -CONFIG_ISO9660_FS=m -CONFIG_JOLIET=y -CONFIG_UDF_FS=m -# DOS/FAT/NT Filesystems -CONFIG_VFAT_FS=m -# Pseudo filesystems -CONFIG_PROC_FS=y -CONFIG_SYSFS=y -CONFIG_TMPFS=y -# Miscellaneous filesystems -CONFIG_MISC_FILESYSTEMS=y -CONFIG_JFFS2_FS=m -CONFIG_UBIFS_FS=m -# Network File Systems -CONFIG_NETWORK_FILESYSTEMS=y -CONFIG_NFS_FS=y -CONFIG_NFS_V3=y -CONFIG_ROOT_NFS=y -# Partition Types -CONFIG_PARTITION_ADVANCED=y -CONFIG_MSDOS_PARTITION=y -# Native language support -CONFIG_NLS=y -CONFIG_NLS_CODEPAGE_437=m -CONFIG_NLS_CODEPAGE_936=m -CONFIG_NLS_ISO8859_1=m -CONFIG_NLS_UTF8=m - -### Kernel hacking -CONFIG_FRAME_WARN=8096 -CONFIG_MAGIC_SYSRQ=y -CONFIG_DEBUG_KERNEL=y -CONFIG_PROVE_LOCKING=n -CONFIG_DEBUG_BUGVERBOSE=y -CONFIG_FRAME_POINTER=y -CONFIG_DEBUG_LL=y - diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild deleted file mode 100644 index 55026e8240d8..000000000000 --- a/arch/unicore32/include/asm/Kbuild +++ /dev/null @@ -1,7 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -generic-y 
+= extable.h -generic-y += kvm_para.h -generic-y += mcs_spinlock.h -generic-y += parport.h -generic-y += syscalls.h -generic-y += user.h diff --git a/arch/unicore32/include/asm/assembler.h b/arch/unicore32/include/asm/assembler.h deleted file mode 100644 index 3de843d92850..000000000000 --- a/arch/unicore32/include/asm/assembler.h +++ /dev/null @@ -1,128 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/assembler.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * Do not include any C declarations in this file - it is included by - * assembler source. - */ -#ifndef __ASSEMBLY__ -#error "Only include this from assembly code" -#endif - -#include - -/* - * Little Endian independent macros for shifting bytes within registers. - */ -#define pull >> -#define push << -#define get_byte_0 << #0 -#define get_byte_1 >> #8 -#define get_byte_2 >> #16 -#define get_byte_3 >> #24 -#define put_byte_0 << #0 -#define put_byte_1 << #8 -#define put_byte_2 << #16 -#define put_byte_3 << #24 - -#define cadd cmpadd -#define cand cmpand -#define csub cmpsub -#define cxor cmpxor - -/* - * Enable and disable interrupts - */ - .macro disable_irq, temp - mov \temp, asr - andn \temp, \temp, #0xFF - or \temp, \temp, #PSR_I_BIT | PRIV_MODE - mov.a asr, \temp - .endm - - .macro enable_irq, temp - mov \temp, asr - andn \temp, \temp, #0xFF - or \temp, \temp, #PRIV_MODE - mov.a asr, \temp - .endm - -#define USER(x...) 
\ -9999: x; \ - .pushsection __ex_table, "a"; \ - .align 3; \ - .long 9999b, 9001f; \ - .popsection - - .macro notcond, cond, nexti = .+8 - .ifc \cond, eq - bne \nexti - .else; .ifc \cond, ne - beq \nexti - .else; .ifc \cond, ea - bub \nexti - .else; .ifc \cond, ub - bea \nexti - .else; .ifc \cond, fs - bns \nexti - .else; .ifc \cond, ns - bfs \nexti - .else; .ifc \cond, fv - bnv \nexti - .else; .ifc \cond, nv - bfv \nexti - .else; .ifc \cond, ua - beb \nexti - .else; .ifc \cond, eb - bua \nexti - .else; .ifc \cond, eg - bsl \nexti - .else; .ifc \cond, sl - beg \nexti - .else; .ifc \cond, sg - bel \nexti - .else; .ifc \cond, el - bsg \nexti - .else; .ifnc \cond, al - .error "Unknown cond in notcond macro argument" - .endif; .endif; .endif; .endif; .endif; .endif; .endif - .endif; .endif; .endif; .endif; .endif; .endif; .endif - .endif - .endm - - .macro usracc, instr, reg, ptr, inc, cond, rept, abort - .rept \rept - notcond \cond, .+8 -9999 : - .if \inc == 1 - \instr\()b.u \reg, [\ptr], #\inc - .elseif \inc == 4 - \instr\()w.u \reg, [\ptr], #\inc - .else - .error "Unsupported inc macro argument" - .endif - - .pushsection __ex_table, "a" - .align 3 - .long 9999b, \abort - .popsection - .endr - .endm - - .macro strusr, reg, ptr, inc, cond = al, rept = 1, abort = 9001f - usracc st, \reg, \ptr, \inc, \cond, \rept, \abort - .endm - - .macro ldrusr, reg, ptr, inc, cond = al, rept = 1, abort = 9001f - usracc ld, \reg, \ptr, \inc, \cond, \rept, \abort - .endm - - .macro nop8 - .rept 8 - nop - .endr - .endm diff --git a/arch/unicore32/include/asm/barrier.h b/arch/unicore32/include/asm/barrier.h deleted file mode 100644 index efb81de87507..000000000000 --- a/arch/unicore32/include/asm/barrier.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Memory barrier implementations for PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2012 GUAN Xue-tao - */ -#ifndef __UNICORE_BARRIER_H__ -#define __UNICORE_BARRIER_H__ - -#define isb() __asm__ 
__volatile__ ("" : : : "memory") -#define dsb() __asm__ __volatile__ ("" : : : "memory") -#define dmb() __asm__ __volatile__ ("" : : : "memory") - -#include - -#endif /* __UNICORE_BARRIER_H__ */ diff --git a/arch/unicore32/include/asm/bitops.h b/arch/unicore32/include/asm/bitops.h deleted file mode 100644 index deeb2163f35e..000000000000 --- a/arch/unicore32/include/asm/bitops.h +++ /dev/null @@ -1,46 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/bitops.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#ifndef __UNICORE_BITOPS_H__ -#define __UNICORE_BITOPS_H__ - -#define _ASM_GENERIC_BITOPS_FLS_H_ -#define _ASM_GENERIC_BITOPS___FLS_H_ -#define _ASM_GENERIC_BITOPS_FFS_H_ -#define _ASM_GENERIC_BITOPS___FFS_H_ -/* - * On UNICORE, those functions can be implemented around - * the cntlz instruction for much better code efficiency. - */ - -static inline int fls(unsigned int x) -{ - int ret; - - asm("cntlz\t%0, %1" : "=r" (ret) : "r" (x) : "cc"); - ret = 32 - ret; - - return ret; -} - -#define __fls(x) (fls(x) - 1) -#define ffs(x) ({ unsigned long __t = (x); fls(__t & -__t); }) -#define __ffs(x) (ffs(x) - 1) - -#include - -/* following definitions: to avoid using codes in lib/find_*.c */ -#define find_next_bit find_next_bit -#define find_next_zero_bit find_next_zero_bit -#define find_first_bit find_first_bit -#define find_first_zero_bit find_first_zero_bit - -#include - -#endif /* __UNICORE_BITOPS_H__ */ diff --git a/arch/unicore32/include/asm/bug.h b/arch/unicore32/include/asm/bug.h deleted file mode 100644 index 99acea84a865..000000000000 --- a/arch/unicore32/include/asm/bug.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Bug handling for PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2012 GUAN Xue-tao - */ -#ifndef __UNICORE_BUG_H__ -#define __UNICORE_BUG_H__ - -#include - -struct pt_regs; -struct siginfo; - -extern 
void die(const char *msg, struct pt_regs *regs, int err); -extern void uc32_notify_die(const char *str, struct pt_regs *regs, - int sig, int code, void __user *addr, - unsigned long err, unsigned long trap); - -#endif /* __UNICORE_BUG_H__ */ diff --git a/arch/unicore32/include/asm/cache.h b/arch/unicore32/include/asm/cache.h deleted file mode 100644 index 44ecd1f300fe..000000000000 --- a/arch/unicore32/include/asm/cache.h +++ /dev/null @@ -1,24 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/cache.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_CACHE_H__ -#define __UNICORE_CACHE_H__ - -#define L1_CACHE_SHIFT (5) -#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) - -/* - * Memory returned by kmalloc() may be used for DMA, so we must make - * sure that all such allocations are cache aligned. Otherwise, - * unrelated code may cause parts of the buffer to be read into the - * cache before the transfer is done, causing old data to be seen by - * the CPU. - */ -#define ARCH_DMA_MINALIGN L1_CACHE_BYTES - -#endif diff --git a/arch/unicore32/include/asm/cacheflush.h b/arch/unicore32/include/asm/cacheflush.h deleted file mode 100644 index ff0be92ebc32..000000000000 --- a/arch/unicore32/include/asm/cacheflush.h +++ /dev/null @@ -1,186 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/cacheflush.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_CACHEFLUSH_H__ -#define __UNICORE_CACHEFLUSH_H__ - -#include - -#include - -#define CACHE_COLOUR(vaddr) ((vaddr & (SHMLBA - 1)) >> PAGE_SHIFT) - -/* - * This flag is used to indicate that the page pointed to by a pte is clean - * and does not require cleaning before returning it to the user. 
- */ -#define PG_dcache_clean PG_arch_1 - -/* - * MM Cache Management - * =================== - * - * The arch/unicore32/mm/cache.S files implement these methods. - * - * Start addresses are inclusive and end addresses are exclusive; - * start addresses should be rounded down, end addresses up. - * - * See Documentation/core-api/cachetlb.rst for more information. - * Please note that the implementation of these, and the required - * effects are cache-type (VIVT/VIPT/PIPT) specific. - * - * flush_icache_all() - * - * Unconditionally clean and invalidate the entire icache. - * Currently only needed for cache-v6.S and cache-v7.S, see - * __flush_icache_all for the generic implementation. - * - * flush_kern_all() - * - * Unconditionally clean and invalidate the entire cache. - * - * flush_user_all() - * - * Clean and invalidate all user space cache entries - * before a change of page tables. - * - * flush_user_range(start, end, flags) - * - * Clean and invalidate a range of cache entries in the - * specified address space before a change of page tables. - * - start - user start address (inclusive, page aligned) - * - end - user end address (exclusive, page aligned) - * - flags - vma->vm_flags field - * - * coherent_kern_range(start, end) - * - * Ensure coherency between the Icache and the Dcache in the - * region described by start, end. If you have non-snooping - * Harvard caches, you need to implement this function. - * - start - virtual start address - * - end - virtual end address - * - * coherent_user_range(start, end) - * - * Ensure coherency between the Icache and the Dcache in the - * region described by start, end. If you have non-snooping - * Harvard caches, you need to implement this function. - * - start - virtual start address - * - end - virtual end address - * - * flush_kern_dcache_area(kaddr, size) - * - * Ensure that the data held in page is written back. 
- * - kaddr - page address - * - size - region size - * - * DMA Cache Coherency - * =================== - * - * dma_flush_range(start, end) - * - * Clean and invalidate the specified virtual address range. - * - start - virtual start address - * - end - virtual end address - */ - -extern void __cpuc_flush_icache_all(void); -extern void __cpuc_flush_kern_all(void); -extern void __cpuc_flush_user_all(void); -extern void __cpuc_flush_user_range(unsigned long, unsigned long, unsigned int); -extern void __cpuc_coherent_kern_range(unsigned long, unsigned long); -extern void __cpuc_coherent_user_range(unsigned long, unsigned long); -extern void __cpuc_flush_dcache_area(void *, size_t); -extern void __cpuc_flush_kern_dcache_area(void *addr, size_t size); - -/* - * Copy user data from/to a page which is mapped into a different - * processes address space. Really, we want to allow our "user - * space" model to handle this. - */ -extern void copy_to_user_page(struct vm_area_struct *, struct page *, - unsigned long, void *, const void *, unsigned long); -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - do { \ - memcpy(dst, src, len); \ - } while (0) - -/* - * Convert calls to our calling convention. - */ -/* Invalidate I-cache */ -static inline void __flush_icache_all(void) -{ - asm("movc p0.c5, %0, #20;\n" - "nop; nop; nop; nop; nop; nop; nop; nop\n" - : - : "r" (0)); -} - -#define flush_cache_all() __cpuc_flush_kern_all() - -extern void flush_cache_mm(struct mm_struct *mm); -extern void flush_cache_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end); -extern void flush_cache_page(struct vm_area_struct *vma, - unsigned long user_addr, unsigned long pfn); - -#define flush_cache_dup_mm(mm) flush_cache_mm(mm) - -/* - * Perform necessary cache operations to ensure that data previously - * stored within this range of addresses can be executed by the CPU. 
- */ -#define flush_icache_range(s, e) __cpuc_coherent_kern_range(s, e) - -/* - * Perform necessary cache operations to ensure that the TLB will - * see data written in the specified area. - */ -#define clean_dcache_area(start, size) cpu_dcache_clean_area(start, size) - -/* - * flush_dcache_page is used when the kernel has written to the page - * cache page at virtual address page->virtual. - * - * If this page isn't mapped (ie, page_mapping == NULL), or it might - * have userspace mappings, then we _must_ always clean + invalidate - * the dcache entries associated with the kernel mapping. - * - * Otherwise we can defer the operation, and clean the cache when we are - * about to change to user space. This is the same method as used on SPARC64. - * See update_mmu_cache for the user space part. - */ -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 -extern void flush_dcache_page(struct page *); - -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) - -/* - * We don't appear to need to do anything here. In fact, if we did, we'd - * duplicate cache flushing elsewhere performed by flush_dcache_page(). - */ -#define flush_icache_page(vma, page) do { } while (0) - -/* - * flush_cache_vmap() is used when creating mappings (eg, via vmap, - * vmalloc, ioremap etc) in kernel space for pages. On non-VIPT - * caches, since the direct-mappings of these pages may contain cached - * data, we need to do a full cache flush to ensure that writebacks - * don't corrupt data placed into these pages via the new mappings. 
- */ -static inline void flush_cache_vmap(unsigned long start, unsigned long end) -{ -} - -static inline void flush_cache_vunmap(unsigned long start, unsigned long end) -{ -} - -#endif diff --git a/arch/unicore32/include/asm/checksum.h b/arch/unicore32/include/asm/checksum.h deleted file mode 100644 index e774ca268c15..000000000000 --- a/arch/unicore32/include/asm/checksum.h +++ /dev/null @@ -1,38 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/checksum.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * IP checksum routines - */ -#ifndef __UNICORE_CHECKSUM_H__ -#define __UNICORE_CHECKSUM_H__ - -/* - * computes the checksum of the TCP/UDP pseudo-header - * returns a 16-bit checksum, already complemented - */ - -static inline __wsum -csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, - __u8 proto, __wsum sum) -{ - __asm__( - "add.a %0, %1, %2\n" - "addc.a %0, %0, %3\n" - "addc.a %0, %0, %4 << #8\n" - "addc.a %0, %0, %5\n" - "addc %0, %0, #0\n" - : "=&r"(sum) - : "r" (sum), "r" (daddr), "r" (saddr), "r" (len), "Ir" (htons(proto)) - : "cc"); - return sum; -} -#define csum_tcpudp_nofold csum_tcpudp_nofold - -#include - -#endif diff --git a/arch/unicore32/include/asm/cmpxchg.h b/arch/unicore32/include/asm/cmpxchg.h deleted file mode 100644 index 87f960a2e4f0..000000000000 --- a/arch/unicore32/include/asm/cmpxchg.h +++ /dev/null @@ -1,58 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Atomics xchg/cmpxchg for PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2012 GUAN Xue-tao - */ -#ifndef __UNICORE_CMPXCHG_H__ -#define __UNICORE_CMPXCHG_H__ - -/* - * Generate a link failure on undefined symbol if the pointer points to a value - * of unsupported size. 
- */ -extern void __xchg_bad_pointer(void); - -static inline unsigned long __xchg(unsigned long x, volatile void *ptr, - int size) -{ - unsigned long ret; - - switch (size) { - case 1: - asm volatile("swapb %0, %1, [%2]" - : "=&r" (ret) - : "r" (x), "r" (ptr) - : "memory", "cc"); - break; - case 4: - asm volatile("swapw %0, %1, [%2]" - : "=&r" (ret) - : "r" (x), "r" (ptr) - : "memory", "cc"); - break; - default: - __xchg_bad_pointer(); - } - - return ret; -} - -#define xchg(ptr, x) \ - ((__typeof__(*(ptr)))__xchg((unsigned long)(x), (ptr), sizeof(*(ptr)))) - -#include - -/* - * cmpxchg_local and cmpxchg64_local are atomic wrt current CPU. Always make - * them available. - */ -#define cmpxchg_local(ptr, o, n) \ - ((__typeof__(*(ptr)))__cmpxchg_local_generic((ptr), \ - (unsigned long)(o), (unsigned long)(n), sizeof(*(ptr)))) -#define cmpxchg64_local(ptr, o, n) \ - __cmpxchg64_local_generic((ptr), (o), (n)) - -#include - -#endif /* __UNICORE_CMPXCHG_H__ */ diff --git a/arch/unicore32/include/asm/cpu-single.h b/arch/unicore32/include/asm/cpu-single.h deleted file mode 100644 index 1b419d697fd1..000000000000 --- a/arch/unicore32/include/asm/cpu-single.h +++ /dev/null @@ -1,42 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/cpu-single.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_CPU_SINGLE_H__ -#define __UNICORE_CPU_SINGLE_H__ - -#include -#include - -#ifdef __KERNEL__ -#ifndef __ASSEMBLY__ - -#define cpu_switch_mm(pgd, mm) cpu_do_switch_mm(virt_to_phys(pgd), mm) - -#define cpu_get_pgd() \ - ({ \ - unsigned long pg; \ - __asm__("movc %0, p0.c2, #0" \ - : "=r" (pg) : : "cc"); \ - pg &= ~0x0fff; \ - (pgd_t *)phys_to_virt(pg); \ - }) - -struct mm_struct; - -/* declare all the functions as extern */ -extern void cpu_proc_fin(void); -extern int cpu_do_idle(void); -extern void cpu_dcache_clean_area(void *, int); -extern void cpu_do_switch_mm(unsigned 
long pgd_phys, struct mm_struct *mm); -extern void cpu_set_pte(pte_t *ptep, pte_t pte); -extern void cpu_reset(unsigned long addr) __attribute__((noreturn)); - -#endif /* __ASSEMBLY__ */ -#endif /* __KERNEL__ */ - -#endif /* __UNICORE_CPU_SINGLE_H__ */ diff --git a/arch/unicore32/include/asm/cputype.h b/arch/unicore32/include/asm/cputype.h deleted file mode 100644 index 08a47e3bdbcc..000000000000 --- a/arch/unicore32/include/asm/cputype.h +++ /dev/null @@ -1,30 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/cputype.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_CPUTYPE_H__ -#define __UNICORE_CPUTYPE_H__ - -#include - -#define CPUID_CPUID 0 -#define CPUID_CACHETYPE 1 - -#define read_cpuid(reg) \ - ({ \ - unsigned int __val; \ - asm("movc %0, p0.c0, #" __stringify(reg) \ - : "=r" (__val) \ - : \ - : "cc"); \ - __val; \ - }) - -#define uc32_cpuid read_cpuid(CPUID_CPUID) -#define uc32_cachetype read_cpuid(CPUID_CACHETYPE) - -#endif diff --git a/arch/unicore32/include/asm/delay.h b/arch/unicore32/include/asm/delay.h deleted file mode 100644 index 934193edfa66..000000000000 --- a/arch/unicore32/include/asm/delay.h +++ /dev/null @@ -1,49 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/delay.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * Delay routines, using a pre-computed "loops_per_second" value. - */ -#ifndef __UNICORE_DELAY_H__ -#define __UNICORE_DELAY_H__ - -#include /* HZ */ - -extern void __delay(int loops); - -/* - * This function intentionally does not exist; if you see references to - * it, it means that you're calling udelay() with an out of range value. - * - * With currently imposed limits, this means that we support a max delay - * of 2000us. 
Further limits: HZ<=1000 and bogomips<=3355 - */ -extern void __bad_udelay(void); - -/* - * division by multiplication: you don't have to worry about - * loss of precision. - * - * Use only for very small delays ( < 1 msec). Should probably use a - * lookup table, really, as the multiplications take much too long with - * short delays. This is a "reasonable" implementation, though (and the - * first constant multiplications gets optimized away if the delay is - * a constant) - */ -extern void __udelay(unsigned long usecs); -extern void __const_udelay(unsigned long); - -#define MAX_UDELAY_MS 2 - -#define udelay(n) \ - (__builtin_constant_p(n) ? \ - ((n) > (MAX_UDELAY_MS * 1000) ? __bad_udelay() : \ - __const_udelay((n) * ((2199023U*HZ)>>11))) : \ - __udelay(n)) - -#endif /* __UNICORE_DELAY_H__ */ - diff --git a/arch/unicore32/include/asm/dma.h b/arch/unicore32/include/asm/dma.h deleted file mode 100644 index 1326310b21e6..000000000000 --- a/arch/unicore32/include/asm/dma.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/dma.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#ifndef __UNICORE_DMA_H__ -#define __UNICORE_DMA_H__ - -#include -#include - -#ifdef CONFIG_PCI -extern int isa_dma_bridge_buggy; -#endif - -#endif /* __UNICORE_DMA_H__ */ diff --git a/arch/unicore32/include/asm/elf.h b/arch/unicore32/include/asm/elf.h deleted file mode 100644 index a464ed5f05d4..000000000000 --- a/arch/unicore32/include/asm/elf.h +++ /dev/null @@ -1,90 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/elf.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#ifndef __UNICORE_ELF_H__ -#define __UNICORE_ELF_H__ - -#include - -/* - * ELF register definitions.. 
- */ -#include -#include - -typedef unsigned long elf_greg_t; -typedef unsigned long elf_freg_t[3]; - -#define ELF_NGREG (sizeof(struct pt_regs) / sizeof(elf_greg_t)) -typedef elf_greg_t elf_gregset_t[ELF_NGREG]; - -typedef struct fp_state elf_fpregset_t; - -#define R_UNICORE_NONE 0 -#define R_UNICORE_PC24 1 -#define R_UNICORE_ABS32 2 -#define R_UNICORE_CALL 28 -#define R_UNICORE_JUMP24 29 - -/* - * These are used to set parameters in the core dumps. - */ -#define ELF_CLASS ELFCLASS32 -#define ELF_DATA ELFDATA2LSB -#define ELF_ARCH EM_UNICORE - -/* - * This yields a string that ld.so will use to load implementation - * specific libraries for optimization. This is more specific in - * intent than poking at uname or /proc/cpuinfo. - * - */ -#define ELF_PLATFORM_SIZE 8 -#define ELF_PLATFORM (elf_platform) - -extern char elf_platform[]; - -struct elf32_hdr; - -/* - * This is used to ensure we don't load something for the wrong architecture. - */ -extern int elf_check_arch(const struct elf32_hdr *); -#define elf_check_arch elf_check_arch - -struct task_struct; -int dump_task_regs(struct task_struct *t, elf_gregset_t *elfregs); -#define ELF_CORE_COPY_TASK_REGS dump_task_regs - -#define ELF_EXEC_PAGESIZE 4096 - -/* This is the location that an ET_DYN program is loaded if exec'ed. Typical - use of this is to invoke "./ld.so someprog" to test out a new version of - the loader. We need to make sure that it is out of the way of the program - that it will "exec", and that there is sufficient room for the brk. */ - -#define ELF_ET_DYN_BASE (2 * TASK_SIZE / 3) - -/* When the program starts, a1 contains a pointer to a function to be - registered with atexit, as per the SVR4 ABI. A value of 0 means we - have no such handler. 
*/ -#define ELF_PLAT_INIT(_r, load_addr) {(_r)->UCreg_00 = 0; } - -extern void elf_set_personality(const struct elf32_hdr *); -#define SET_PERSONALITY(ex) elf_set_personality(&(ex)) - -struct mm_struct; -extern unsigned long arch_randomize_brk(struct mm_struct *mm); -#define arch_randomize_brk arch_randomize_brk - -extern int vectors_user_mapping(void); -#define arch_setup_additional_pages(bprm, uses_interp) vectors_user_mapping() -#define ARCH_HAS_SETUP_ADDITIONAL_PAGES - -#endif diff --git a/arch/unicore32/include/asm/fpstate.h b/arch/unicore32/include/asm/fpstate.h deleted file mode 100644 index 5811293e7a7e..000000000000 --- a/arch/unicore32/include/asm/fpstate.h +++ /dev/null @@ -1,23 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/fpstate.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#ifndef __UNICORE_FPSTATE_H__ -#define __UNICORE_FPSTATE_H__ - -#ifndef __ASSEMBLY__ - -#define FP_REGS_NUMBER 33 - -struct fp_state { - unsigned int regs[FP_REGS_NUMBER]; -} __attribute__((aligned(8))); - -#endif - -#endif diff --git a/arch/unicore32/include/asm/fpu-ucf64.h b/arch/unicore32/include/asm/fpu-ucf64.h deleted file mode 100644 index 7a0c8a9e05d4..000000000000 --- a/arch/unicore32/include/asm/fpu-ucf64.h +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/fpu-ucf64.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Maintained by GUAN Xue-tao - * Copyright (C) 2001-2010 Guan Xuetao - */ -#define FPSCR s31 - -/* FPSCR bits */ -#define FPSCR_DEFAULT_NAN (1<<25) - -#define FPSCR_CMPINSTR_BIT (1<<31) - -#define FPSCR_CON (1<<29) -#define FPSCR_TRAP (1<<27) - -/* RND mode */ -#define FPSCR_ROUND_NEAREST (0<<0) -#define FPSCR_ROUND_PLUSINF (2<<0) -#define FPSCR_ROUND_MINUSINF (3<<0) -#define FPSCR_ROUND_TOZERO (1<<0) -#define FPSCR_RMODE_BIT (0) -#define FPSCR_RMODE_MASK (7 << FPSCR_RMODE_BIT) 
- -/* trap enable */ -#define FPSCR_IOE (1<<16) -#define FPSCR_OFE (1<<14) -#define FPSCR_UFE (1<<13) -#define FPSCR_IXE (1<<12) -#define FPSCR_HIE (1<<11) -#define FPSCR_NDE (1<<10) /* non denomal */ - -/* flags */ -#define FPSCR_IDC (1<<24) -#define FPSCR_HIC (1<<23) -#define FPSCR_IXC (1<<22) -#define FPSCR_OFC (1<<21) -#define FPSCR_UFC (1<<20) -#define FPSCR_IOC (1<<19) - -/* stick bits */ -#define FPSCR_IOS (1<<9) -#define FPSCR_OFS (1<<7) -#define FPSCR_UFS (1<<6) -#define FPSCR_IXS (1<<5) -#define FPSCR_HIS (1<<4) -#define FPSCR_NDS (1<<3) /*non denomal */ diff --git a/arch/unicore32/include/asm/gpio.h b/arch/unicore32/include/asm/gpio.h deleted file mode 100644 index dfad04ca0a65..000000000000 --- a/arch/unicore32/include/asm/gpio.h +++ /dev/null @@ -1,101 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/gpio.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#ifndef __UNICORE_GPIO_H__ -#define __UNICORE_GPIO_H__ - -#include -#include -#include -#include - -#define GPI_OTP_INT 0 -#define GPI_PCI_INTA 1 -#define GPI_PCI_INTB 2 -#define GPI_PCI_INTC 3 -#define GPI_PCI_INTD 4 -#define GPI_BAT_DET 5 -#define GPI_SD_CD 6 -#define GPI_SOFF_REQ 7 -#define GPI_SD_WP 8 -#define GPI_LCD_CASE_OFF 9 -#define GPO_WIFI_EN 10 -#define GPO_HDD_LED 11 -#define GPO_VGA_EN 12 -#define GPO_LCD_EN 13 -#define GPO_LED_DATA 14 -#define GPO_LED_CLK 15 -#define GPO_CAM_PWR_EN 16 -#define GPO_LCD_VCC_EN 17 -#define GPO_SOFT_OFF 18 -#define GPO_BT_EN 19 -#define GPO_FAN_ON 20 -#define GPO_SPKR 21 -#define GPO_SET_V1 23 -#define GPO_SET_V2 24 -#define GPO_CPU_HEALTH 25 -#define GPO_LAN_SEL 26 - -#ifdef CONFIG_PUV3_NB0916 -#define GPI_BTN_TOUCH 14 -#define GPIO_IN 0x000043ff /* 1 for input */ -#define GPIO_OUT 0x0fffbc00 /* 1 for output */ -#endif /* CONFIG_PUV3_NB0916 */ - -#ifdef CONFIG_PUV3_SMW0919 -#define GPIO_IN 0x000003ff /* 1 for input */ -#define GPIO_OUT 0x0ffffc00 /* 1 for 
output */ -#endif /* CONFIG_PUV3_SMW0919 */ - -#ifdef CONFIG_PUV3_DB0913 -#define GPIO_IN 0x000001df /* 1 for input */ -#define GPIO_OUT 0x03fee800 /* 1 for output */ -#endif /* CONFIG_PUV3_DB0913 */ - -#define GPIO_DIR (~((GPIO_IN) | 0xf0000000)) - /* 0 input, 1 output */ - -static inline int gpio_get_value(unsigned gpio) -{ - if (__builtin_constant_p(gpio) && (gpio <= GPIO_MAX)) - return readl(GPIO_GPLR) & GPIO_GPIO(gpio); - else - return __gpio_get_value(gpio); -} - -static inline void gpio_set_value(unsigned gpio, int value) -{ - if (__builtin_constant_p(gpio) && (gpio <= GPIO_MAX)) - if (value) - writel(GPIO_GPIO(gpio), GPIO_GPSR); - else - writel(GPIO_GPIO(gpio), GPIO_GPCR); - else - __gpio_set_value(gpio, value); -} - -#define gpio_cansleep __gpio_cansleep - -static inline unsigned gpio_to_irq(unsigned gpio) -{ - if ((gpio < IRQ_GPIOHIGH) && (FIELD(1, 1, gpio) & readl(GPIO_GPIR))) - return IRQ_GPIOLOW0 + gpio; - else - return IRQ_GPIO0 + gpio; -} - -static inline unsigned irq_to_gpio(unsigned irq) -{ - if (irq < IRQ_GPIOHIGH) - return irq - IRQ_GPIOLOW0; - else - return irq - IRQ_GPIO0; -} - -#endif /* __UNICORE_GPIO_H__ */ diff --git a/arch/unicore32/include/asm/hwcap.h b/arch/unicore32/include/asm/hwcap.h deleted file mode 100644 index 2e15ffbe8391..000000000000 --- a/arch/unicore32/include/asm/hwcap.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/hwcap.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_HWCAP_H__ -#define __UNICORE_HWCAP_H__ - -/* - * HWCAP flags - */ -#define HWCAP_MSP 1 -#define HWCAP_UNICORE16 2 -#define HWCAP_CMOV 4 -#define HWCAP_UNICORE_F64 8 -#define HWCAP_TLS 0x80 - -#if defined(__KERNEL__) && !defined(__ASSEMBLY__) -/* - * This yields a mask that user programs can use to figure out what - * instruction set this cpu supports. 
- */ -#define ELF_HWCAP (HWCAP_CMOV | HWCAP_UNICORE_F64) -#endif - -#endif diff --git a/arch/unicore32/include/asm/hwdef-copro.h b/arch/unicore32/include/asm/hwdef-copro.h deleted file mode 100644 index 2db8cf864e43..000000000000 --- a/arch/unicore32/include/asm/hwdef-copro.h +++ /dev/null @@ -1,45 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Co-processor register definitions for PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2012 GUAN Xue-tao - */ -#ifndef __UNICORE_HWDEF_COPRO_H__ -#define __UNICORE_HWDEF_COPRO_H__ - -/* - * Control Register bits (CP#0 CR1) - */ -#define CR_M (1 << 0) /* MMU enable */ -#define CR_A (1 << 1) /* Alignment abort enable */ -#define CR_D (1 << 2) /* Dcache enable */ -#define CR_I (1 << 3) /* Icache enable */ -#define CR_B (1 << 4) /* Dcache write mechanism: write back */ -#define CR_T (1 << 5) /* Burst enable */ -#define CR_V (1 << 13) /* Vectors relocated to 0xffff0000 */ - -#ifndef __ASSEMBLY__ - -#define vectors_high() (cr_alignment & CR_V) - -extern unsigned long cr_no_alignment; /* defined in entry.S */ -extern unsigned long cr_alignment; /* defined in entry.S */ - -static inline unsigned int get_cr(void) -{ - unsigned int val; - asm("movc %0, p0.c1, #0" : "=r" (val) : : "cc"); - return val; -} - -static inline void set_cr(unsigned int val) -{ - asm volatile("movc p0.c1, %0, #0" : : "r" (val) : "cc"); - isb(); -} - -extern void adjust_cr(unsigned long mask, unsigned long set); - -#endif /* __ASSEMBLY__ */ - -#endif /* __UNICORE_HWDEF_COPRO_H__ */ diff --git a/arch/unicore32/include/asm/io.h b/arch/unicore32/include/asm/io.h deleted file mode 100644 index bd4e7c332f85..000000000000 --- a/arch/unicore32/include/asm/io.h +++ /dev/null @@ -1,69 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/io.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_IO_H__ -#define __UNICORE_IO_H__ - -#ifdef 
__KERNEL__ - -#include -#include - -#define PCI_IOBASE PKUNITY_PCILIO_BASE -#include - -/* - * __uc32_ioremap takes CPU physical address. - */ -extern void __iomem *__uc32_ioremap(unsigned long, size_t); -extern void __uc32_iounmap(volatile void __iomem *addr); - -/* - * ioremap and friends. - * - * ioremap takes a PCI memory address, as specified in - * Documentation/driver-api/io-mapping.rst. - * - */ -#define ioremap(cookie, size) __uc32_ioremap(cookie, size) -#define iounmap(cookie) __uc32_iounmap(cookie) - -#define readb_relaxed readb -#define readw_relaxed readw -#define readl_relaxed readl - -#define HAVE_ARCH_PIO_SIZE -#define PIO_OFFSET (unsigned int)(PCI_IOBASE) -#define PIO_MASK (unsigned int)(IO_SPACE_LIMIT) -#define PIO_RESERVED (PIO_OFFSET + PIO_MASK + 1) - -#ifdef CONFIG_STRICT_DEVMEM - -#include -#include - -/* - * devmem_is_allowed() checks to see if /dev/mem access to a certain - * address is valid. The argument is a physical page number. - * We mimic x86 here by disallowing access to system RAM as well as - * device-exclusive MMIO regions. This effectively disable read()/write() - * on /dev/mem. 
- */ -static inline int devmem_is_allowed(unsigned long pfn) -{ - if (iomem_is_exclusive(pfn << PAGE_SHIFT)) - return 0; - if (!page_is_ram(pfn)) - return 1; - return 0; -} - -#endif /* CONFIG_STRICT_DEVMEM */ - -#endif /* __KERNEL__ */ -#endif /* __UNICORE_IO_H__ */ diff --git a/arch/unicore32/include/asm/irq.h b/arch/unicore32/include/asm/irq.h deleted file mode 100644 index 3f7f07c0338c..000000000000 --- a/arch/unicore32/include/asm/irq.h +++ /dev/null @@ -1,102 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/irq.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_IRQ_H__ -#define __UNICORE_IRQ_H__ - -#include - -#define IRQ_GPIOLOW0 0x00 -#define IRQ_GPIOLOW1 0x01 -#define IRQ_GPIOLOW2 0x02 -#define IRQ_GPIOLOW3 0x03 -#define IRQ_GPIOLOW4 0x04 -#define IRQ_GPIOLOW5 0x05 -#define IRQ_GPIOLOW6 0x06 -#define IRQ_GPIOLOW7 0x07 -#define IRQ_GPIOHIGH 0x08 -#define IRQ_USB 0x09 -#define IRQ_SDC 0x0a -#define IRQ_AC97 0x0b -#define IRQ_SATA 0x0c -#define IRQ_MME 0x0d -#define IRQ_PCI_BRIDGE 0x0e -#define IRQ_DDR 0x0f -#define IRQ_SPI 0x10 -#define IRQ_UNIGFX 0x11 -#define IRQ_I2C 0x11 -#define IRQ_UART1 0x12 -#define IRQ_UART0 0x13 -#define IRQ_UMAL 0x14 -#define IRQ_NAND 0x15 -#define IRQ_PS2_KBD 0x16 -#define IRQ_PS2_AUX 0x17 -#define IRQ_DMA 0x18 -#define IRQ_DMAERR 0x19 -#define IRQ_TIMER0 0x1a -#define IRQ_TIMER1 0x1b -#define IRQ_TIMER2 0x1c -#define IRQ_TIMER3 0x1d -#define IRQ_RTC 0x1e -#define IRQ_RTCAlarm 0x1f - -#define IRQ_GPIO0 0x20 -#define IRQ_GPIO1 0x21 -#define IRQ_GPIO2 0x22 -#define IRQ_GPIO3 0x23 -#define IRQ_GPIO4 0x24 -#define IRQ_GPIO5 0x25 -#define IRQ_GPIO6 0x26 -#define IRQ_GPIO7 0x27 -#define IRQ_GPIO8 0x28 -#define IRQ_GPIO9 0x29 -#define IRQ_GPIO10 0x2a -#define IRQ_GPIO11 0x2b -#define IRQ_GPIO12 0x2c -#define IRQ_GPIO13 0x2d -#define IRQ_GPIO14 0x2e -#define IRQ_GPIO15 0x2f -#define IRQ_GPIO16 0x30 -#define IRQ_GPIO17 0x31 
-#define IRQ_GPIO18 0x32 -#define IRQ_GPIO19 0x33 -#define IRQ_GPIO20 0x34 -#define IRQ_GPIO21 0x35 -#define IRQ_GPIO22 0x36 -#define IRQ_GPIO23 0x37 -#define IRQ_GPIO24 0x38 -#define IRQ_GPIO25 0x39 -#define IRQ_GPIO26 0x3a -#define IRQ_GPIO27 0x3b - -#ifdef CONFIG_ARCH_FPGA -#define IRQ_PCIINTA IRQ_GPIOLOW2 -#define IRQ_PCIINTB IRQ_GPIOLOW1 -#define IRQ_PCIINTC IRQ_GPIOLOW0 -#define IRQ_PCIINTD IRQ_GPIOLOW6 -#endif - -#if defined(CONFIG_PUV3_DB0913) || defined(CONFIG_PUV3_NB0916) \ - || defined(CONFIG_PUV3_SMW0919) -#define IRQ_PCIINTA IRQ_GPIOLOW1 -#define IRQ_PCIINTB IRQ_GPIOLOW2 -#define IRQ_PCIINTC IRQ_GPIOLOW3 -#define IRQ_PCIINTD IRQ_GPIOLOW4 -#endif - -#define IRQ_SD_CD IRQ_GPIO6 /* falling or rising trigger */ - -#ifndef __ASSEMBLY__ -struct pt_regs; - -extern void asm_do_IRQ(unsigned int, struct pt_regs *); - -#endif - -#endif - diff --git a/arch/unicore32/include/asm/irqflags.h b/arch/unicore32/include/asm/irqflags.h deleted file mode 100644 index f64c82e3eae6..000000000000 --- a/arch/unicore32/include/asm/irqflags.h +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/irqflags.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_IRQFLAGS_H__ -#define __UNICORE_IRQFLAGS_H__ - -#ifdef __KERNEL__ - -#include - -#define ARCH_IRQ_DISABLED (PRIV_MODE | PSR_I_BIT) -#define ARCH_IRQ_ENABLED (PRIV_MODE) - -/* - * Save the current interrupt enable state. 
- */ -static inline unsigned long arch_local_save_flags(void) -{ - unsigned long temp; - - asm volatile("mov %0, asr" : "=r" (temp) : : "memory", "cc"); - - return temp & PSR_c; -} - -/* - * restore saved IRQ state - */ -static inline void arch_local_irq_restore(unsigned long flags) -{ - unsigned long temp; - - asm volatile( - "mov %0, asr\n" - "mov.a asr, %1\n" - "mov.f asr, %0" - : "=&r" (temp) - : "r" (flags) - : "memory", "cc"); -} - -#include - -#endif -#endif diff --git a/arch/unicore32/include/asm/linkage.h b/arch/unicore32/include/asm/linkage.h deleted file mode 100644 index 8e341ba7bc4a..000000000000 --- a/arch/unicore32/include/asm/linkage.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/linkage.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_LINKAGE_H__ -#define __UNICORE_LINKAGE_H__ - -#define __ALIGN .align 0 -#define __ALIGN_STR ".align 0" - -#define ENDPROC(name) \ - .type name, %function; \ - END(name) - -#endif diff --git a/arch/unicore32/include/asm/memblock.h b/arch/unicore32/include/asm/memblock.h deleted file mode 100644 index eb56a6ddce83..000000000000 --- a/arch/unicore32/include/asm/memblock.h +++ /dev/null @@ -1,43 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/memblock.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#ifndef __UNICORE_MEMBLOCK_H__ -#define __UNICORE_MEMBLOCK_H__ - -/* - * Memory map description - */ -# define NR_BANKS 8 - -struct membank { - unsigned long start; - unsigned long size; - unsigned int highmem; -}; - -struct meminfo { - int nr_banks; - struct membank bank[NR_BANKS]; -}; - -extern struct meminfo meminfo; - -#define for_each_bank(iter, mi) \ - for (iter = 0; iter < (mi)->nr_banks; iter++) - -#define bank_pfn_start(bank) __phys_to_pfn((bank)->start) -#define 
bank_pfn_end(bank) __phys_to_pfn((bank)->start + (bank)->size) -#define bank_pfn_size(bank) ((bank)->size >> PAGE_SHIFT) -#define bank_phys_start(bank) ((bank)->start) -#define bank_phys_end(bank) ((bank)->start + (bank)->size) -#define bank_phys_size(bank) ((bank)->size) - -extern void uc32_memblock_init(struct meminfo *); - -#endif diff --git a/arch/unicore32/include/asm/memory.h b/arch/unicore32/include/asm/memory.h deleted file mode 100644 index 66285178dd9b..000000000000 --- a/arch/unicore32/include/asm/memory.h +++ /dev/null @@ -1,102 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/memory.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * Note: this file should not be included by non-asm/.h files - */ -#ifndef __UNICORE_MEMORY_H__ -#define __UNICORE_MEMORY_H__ - -#include -#include -#include -#include - -/* - * PAGE_OFFSET - the virtual address of the start of the kernel image - * TASK_SIZE - the maximum size of a user space task. - * TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area - */ -#define PAGE_OFFSET UL(0xC0000000) -#define TASK_SIZE (PAGE_OFFSET - UL(0x41000000)) -#define TASK_UNMAPPED_BASE (PAGE_OFFSET / 3) - -/* - * The module space lives between the addresses given by TASK_SIZE - * and PAGE_OFFSET - it must be within 32MB of the kernel text. - */ -#define MODULES_VADDR (PAGE_OFFSET - 16*1024*1024) -#if TASK_SIZE > MODULES_VADDR -#error Top of user space clashes with start of module space -#endif - -#define MODULES_END (PAGE_OFFSET) - -/* - * Allow 16MB-aligned ioremap pages - */ -#define IOREMAP_MAX_ORDER 24 - -/* - * Physical vs virtual RAM address space conversion. These are - * private definitions which should NOT be used outside memory.h - * files. Use virt_to_phys/phys_to_virt/__pa/__va instead. 
- */ -#ifndef __virt_to_phys -#define __virt_to_phys(x) ((x) - PAGE_OFFSET + PHYS_OFFSET) -#define __phys_to_virt(x) ((x) - PHYS_OFFSET + PAGE_OFFSET) -#endif - -/* - * Convert a page to/from a physical address - */ -#define page_to_phys(page) (__pfn_to_phys(page_to_pfn(page))) -#define phys_to_page(phys) (pfn_to_page(__phys_to_pfn(phys))) - -#ifndef __ASSEMBLY__ - -#ifndef arch_adjust_zones -#define arch_adjust_zones(max_zone_pfn) do { } while (0) -#endif - -/* - * PFNs are used to describe any physical page; this means - * PFN 0 == physical address 0. - * - * This is the PFN of the first RAM page in the kernel - * direct-mapped view. We assume this is the first page - * of RAM in the mem_map as well. - */ -#define PHYS_PFN_OFFSET (PHYS_OFFSET >> PAGE_SHIFT) - -/* - * Drivers should NOT use these either. - */ -#define __pa(x) __virt_to_phys((unsigned long)(x)) -#define __va(x) ((void *)__phys_to_virt((unsigned long)(x))) -#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) - -/* - * Conversion between a struct page and a physical address. 
- * - * page_to_pfn(page) convert a struct page * to a PFN number - * pfn_to_page(pfn) convert a _valid_ PFN number to struct page * - * - * virt_to_page(k) convert a _valid_ virtual address to struct page * - * virt_addr_valid(k) indicates whether a virtual address is valid - */ -#define ARCH_PFN_OFFSET PHYS_PFN_OFFSET - -#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) -#define virt_addr_valid(kaddr) ((unsigned long)(kaddr) >= PAGE_OFFSET && \ - (unsigned long)(kaddr) < (unsigned long)high_memory) - -#endif - -#include - -#endif diff --git a/arch/unicore32/include/asm/mmu.h b/arch/unicore32/include/asm/mmu.h deleted file mode 100644 index 8ad4e7eae17b..000000000000 --- a/arch/unicore32/include/asm/mmu.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/mmu.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_MMU_H__ -#define __UNICORE_MMU_H__ - -typedef unsigned long mm_context_t; - -#endif diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h deleted file mode 100644 index 388c0c811c68..000000000000 --- a/arch/unicore32/include/asm/mmu_context.h +++ /dev/null @@ -1,98 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/mmu_context.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_MMU_CONTEXT_H__ -#define __UNICORE_MMU_CONTEXT_H__ - -#include -#include -#include -#include -#include - -#include -#include - -#define init_new_context(tsk, mm) 0 - -#define destroy_context(mm) do { } while (0) - -/* - * This is called when "tsk" is about to enter lazy TLB mode. 
- * - * mm: describes the currently active mm context - * tsk: task which is entering lazy tlb - * cpu: cpu number which is entering lazy tlb - * - * tsk->mm will be NULL - */ -static inline void -enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) -{ -} - -/* - * This is the actual mm switch as far as the scheduler - * is concerned. No registers are touched. We avoid - * calling the CPU specific function when the mm hasn't - * actually changed. - */ -static inline void -switch_mm(struct mm_struct *prev, struct mm_struct *next, - struct task_struct *tsk) -{ - unsigned int cpu = smp_processor_id(); - - if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next)) || prev != next) - cpu_switch_mm(next->pgd, next); -} - -#define deactivate_mm(tsk, mm) do { } while (0) -#define activate_mm(prev, next) switch_mm(prev, next, NULL) - -/* - * We are inserting a "fake" vma for the user-accessible vector page so - * gdb and friends can get to it through ptrace and /proc//mem. - * But we also want to remove it before the generic code gets to see it - * during process exit or the unmapping of it would cause total havoc. 
- * (the macro is used as remove_vma() is static to mm/mmap.c) - */ -#define arch_exit_mmap(mm) \ -do { \ - struct vm_area_struct *high_vma = find_vma(mm, 0xffff0000); \ - if (high_vma) { \ - BUG_ON(high_vma->vm_next); /* it should be last */ \ - if (high_vma->vm_prev) \ - high_vma->vm_prev->vm_next = NULL; \ - else \ - mm->mmap = NULL; \ - rb_erase(&high_vma->vm_rb, &mm->mm_rb); \ - vmacache_invalidate(mm); \ - mm->map_count--; \ - remove_vma(high_vma); \ - } \ -} while (0) - -static inline int arch_dup_mmap(struct mm_struct *oldmm, - struct mm_struct *mm) -{ - return 0; -} - -static inline void arch_unmap(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ -} - -static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, - bool write, bool execute, bool foreign) -{ - /* by default, allow everything */ - return true; -} -#endif diff --git a/arch/unicore32/include/asm/page.h b/arch/unicore32/include/asm/page.h deleted file mode 100644 index 96d6bdf180bd..000000000000 --- a/arch/unicore32/include/asm/page.h +++ /dev/null @@ -1,74 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/page.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_PAGE_H__ -#define __UNICORE_PAGE_H__ - -/* PAGE_SHIFT determines the page size */ -#define PAGE_SHIFT 12 -#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) -#define PAGE_MASK (~(PAGE_SIZE-1)) - -#ifndef __ASSEMBLY__ - -struct page; -struct vm_area_struct; - -#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE) -extern void copy_page(void *to, const void *from); - -#define clear_user_page(page, vaddr, pg) clear_page(page) -#define copy_user_page(to, from, vaddr, pg) copy_page(to, from) - -#undef STRICT_MM_TYPECHECKS - -#ifdef STRICT_MM_TYPECHECKS -/* - * These are used to make use of C type-checking.. 
- */ -typedef struct { unsigned long pte; } pte_t; -typedef struct { unsigned long pgd; } pgd_t; -typedef struct { unsigned long pgprot; } pgprot_t; - -#define pte_val(x) ((x).pte) -#define pgd_val(x) ((x).pgd) -#define pgprot_val(x) ((x).pgprot) - -#define __pte(x) ((pte_t) { (x) }) -#define __pgd(x) ((pgd_t) { (x) }) -#define __pgprot(x) ((pgprot_t) { (x) }) - -#else -/* - * .. while these make it easier on the compiler - */ -typedef unsigned long pte_t; -typedef unsigned long pgd_t; -typedef unsigned long pgprot_t; - -#define pte_val(x) (x) -#define pgd_val(x) (x) -#define pgprot_val(x) (x) - -#define __pte(x) (x) -#define __pgd(x) (x) -#define __pgprot(x) (x) - -#endif /* STRICT_MM_TYPECHECKS */ - -typedef struct page *pgtable_t; - -extern int pfn_valid(unsigned long); - -#include - -#endif /* !__ASSEMBLY__ */ - -#include - -#endif diff --git a/arch/unicore32/include/asm/pci.h b/arch/unicore32/include/asm/pci.h deleted file mode 100644 index 3efa8ee1afce..000000000000 --- a/arch/unicore32/include/asm/pci.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/pci.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_PCI_H__ -#define __UNICORE_PCI_H__ - -#ifdef __KERNEL__ -#include -#include /* for PCIBIOS_MIN_* */ - -#define HAVE_PCI_MMAP -#define ARCH_GENERIC_PCI_MMAP_RESOURCE - -#endif /* __KERNEL__ */ -#endif diff --git a/arch/unicore32/include/asm/pgalloc.h b/arch/unicore32/include/asm/pgalloc.h deleted file mode 100644 index ba1c9a79993b..000000000000 --- a/arch/unicore32/include/asm/pgalloc.h +++ /dev/null @@ -1,87 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/pgalloc.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_PGALLOC_H__ -#define __UNICORE_PGALLOC_H__ - -#include -#include -#include -#include - 
-#define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL -#define __HAVE_ARCH_PTE_ALLOC_ONE -#include - -#define _PAGE_USER_TABLE (PMD_TYPE_TABLE | PMD_PRESENT) -#define _PAGE_KERNEL_TABLE (PMD_TYPE_TABLE | PMD_PRESENT) - -extern pgd_t *get_pgd_slow(struct mm_struct *mm); -extern void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd); - -#define pgd_alloc(mm) get_pgd_slow(mm) -#define pgd_free(mm, pgd) free_pgd_slow(mm, pgd) - -/* - * Allocate one PTE table. - */ -static inline pte_t * -pte_alloc_one_kernel(struct mm_struct *mm) -{ - pte_t *pte = __pte_alloc_one_kernel(mm); - - if (pte) - clean_dcache_area(pte, PTRS_PER_PTE * sizeof(pte_t)); - - return pte; -} - -static inline pgtable_t -pte_alloc_one(struct mm_struct *mm) -{ - struct page *pte; - - pte = __pte_alloc_one(mm, GFP_PGTABLE_USER); - if (!pte) - return NULL; - if (!PageHighMem(pte)) - clean_pte_table(page_address(pte)); - return pte; -} - -static inline void __pmd_populate(pmd_t *pmdp, unsigned long pmdval) -{ - set_pmd(pmdp, __pmd(pmdval)); - flush_pmd_entry(pmdp); -} - -/* - * Populate the pmdp entry with a pointer to the pte. This pmd is part - * of the mm address space. 
- */ -static inline void -pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep) -{ - unsigned long pte_ptr = (unsigned long)ptep; - - /* - * The pmd must be loaded with the physical - * address of the PTE table - */ - __pmd_populate(pmdp, __pa(pte_ptr) | _PAGE_KERNEL_TABLE); -} - -static inline void -pmd_populate(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep) -{ - __pmd_populate(pmdp, - page_to_pfn(ptep) << PAGE_SHIFT | _PAGE_USER_TABLE); -} -#define pmd_pgtable(pmd) pmd_page(pmd) - -#endif diff --git a/arch/unicore32/include/asm/pgtable-hwdef.h b/arch/unicore32/include/asm/pgtable-hwdef.h deleted file mode 100644 index f28b58c61db9..000000000000 --- a/arch/unicore32/include/asm/pgtable-hwdef.h +++ /dev/null @@ -1,51 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/pgtable-hwdef.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_PGTABLE_HWDEF_H__ -#define __UNICORE_PGTABLE_HWDEF_H__ - -/* - * Hardware page table definitions. 
- * - * + Level 1 descriptor (PMD) - * - common - */ -#define PMD_TYPE_MASK (3 << 0) -#define PMD_TYPE_TABLE (0 << 0) -/*#define PMD_TYPE_LARGE (1 << 0) */ -#define PMD_TYPE_INVALID (2 << 0) -#define PMD_TYPE_SECT (3 << 0) - -#define PMD_PRESENT (1 << 2) -#define PMD_YOUNG (1 << 3) - -/*#define PMD_SECT_DIRTY (1 << 4) */ -#define PMD_SECT_CACHEABLE (1 << 5) -#define PMD_SECT_EXEC (1 << 6) -#define PMD_SECT_WRITE (1 << 7) -#define PMD_SECT_READ (1 << 8) - -/* - * + Level 2 descriptor (PTE) - * - common - */ -#define PTE_TYPE_MASK (3 << 0) -#define PTE_TYPE_SMALL (0 << 0) -#define PTE_TYPE_MIDDLE (1 << 0) -#define PTE_TYPE_LARGE (2 << 0) -#define PTE_TYPE_INVALID (3 << 0) - -#define PTE_PRESENT (1 << 2) -#define PTE_YOUNG (1 << 3) -#define PTE_DIRTY (1 << 4) -#define PTE_CACHEABLE (1 << 5) -#define PTE_EXEC (1 << 6) -#define PTE_WRITE (1 << 7) -#define PTE_READ (1 << 8) - -#endif diff --git a/arch/unicore32/include/asm/pgtable.h b/arch/unicore32/include/asm/pgtable.h deleted file mode 100644 index 97f564c8ecba..000000000000 --- a/arch/unicore32/include/asm/pgtable.h +++ /dev/null @@ -1,267 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/pgtable.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_PGTABLE_H__ -#define __UNICORE_PGTABLE_H__ - -#include -#include - -#include -#include - -/* - * Just any arbitrary offset to the start of the vmalloc VM area: the - * current 8MB value just means that there will be a 8MB "hole" after the - * physical memory until the kernel virtual memory starts. That means that - * any out-of-bounds memory accesses will hopefully be caught. - * The vmalloc() routines leaves a hole of 4kB between each vmalloced - * area for the same reason. ;) - * - * Note that platforms may override VMALLOC_START, but they must provide - * VMALLOC_END. 
VMALLOC_END defines the (exclusive) limit of this space, - * which may not overlap IO space. - */ -#ifndef VMALLOC_START -#define VMALLOC_OFFSET SZ_8M -#define VMALLOC_START (((unsigned long)high_memory + VMALLOC_OFFSET) \ - & ~(VMALLOC_OFFSET-1)) -#define VMALLOC_END (0xff000000UL) -#endif - -#define PTRS_PER_PTE 1024 -#define PTRS_PER_PGD 1024 - -/* - * PGDIR_SHIFT determines what a third-level page table entry can map - */ -#define PGDIR_SHIFT 22 - -#ifndef __ASSEMBLY__ -extern void __pte_error(const char *file, int line, unsigned long val); -extern void __pgd_error(const char *file, int line, unsigned long val); - -#define pte_ERROR(pte) __pte_error(__FILE__, __LINE__, pte_val(pte)) -#define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd)) -#endif /* !__ASSEMBLY__ */ - -#define PGDIR_SIZE (1UL << PGDIR_SHIFT) -#define PGDIR_MASK (~(PGDIR_SIZE-1)) - -/* - * This is the lowest virtual address we can permit any user space - * mapping to be mapped at. This is particularly important for - * non-high vector CPUs. - */ -#define FIRST_USER_ADDRESS PAGE_SIZE - -#define FIRST_USER_PGD_NR 1 -#define USER_PTRS_PER_PGD ((TASK_SIZE/PGDIR_SIZE) - FIRST_USER_PGD_NR) - -/* - * section address mask and size definitions. - */ -#define SECTION_SHIFT 22 -#define SECTION_SIZE (1UL << SECTION_SHIFT) -#define SECTION_MASK (~(SECTION_SIZE-1)) - -#ifndef __ASSEMBLY__ - -/* - * The pgprot_* and protection_map entries will be fixed up in runtime - * to include the cachable bits based on memory policy, as well as any - * architecture dependent bits. 
- */ -#define _PTE_DEFAULT (PTE_PRESENT | PTE_YOUNG | PTE_CACHEABLE) - -extern pgprot_t pgprot_user; -extern pgprot_t pgprot_kernel; - -#define PAGE_NONE pgprot_user -#define PAGE_SHARED __pgprot(pgprot_val(pgprot_user | PTE_READ \ - | PTE_WRITE)) -#define PAGE_SHARED_EXEC __pgprot(pgprot_val(pgprot_user | PTE_READ \ - | PTE_WRITE \ - | PTE_EXEC)) -#define PAGE_COPY __pgprot(pgprot_val(pgprot_user | PTE_READ) -#define PAGE_COPY_EXEC __pgprot(pgprot_val(pgprot_user | PTE_READ \ - | PTE_EXEC)) -#define PAGE_READONLY __pgprot(pgprot_val(pgprot_user | PTE_READ)) -#define PAGE_READONLY_EXEC __pgprot(pgprot_val(pgprot_user | PTE_READ \ - | PTE_EXEC)) -#define PAGE_KERNEL pgprot_kernel -#define PAGE_KERNEL_EXEC __pgprot(pgprot_val(pgprot_kernel | PTE_EXEC)) - -#define __PAGE_NONE __pgprot(_PTE_DEFAULT) -#define __PAGE_SHARED __pgprot(_PTE_DEFAULT | PTE_READ \ - | PTE_WRITE) -#define __PAGE_SHARED_EXEC __pgprot(_PTE_DEFAULT | PTE_READ \ - | PTE_WRITE \ - | PTE_EXEC) -#define __PAGE_COPY __pgprot(_PTE_DEFAULT | PTE_READ) -#define __PAGE_COPY_EXEC __pgprot(_PTE_DEFAULT | PTE_READ \ - | PTE_EXEC) -#define __PAGE_READONLY __pgprot(_PTE_DEFAULT | PTE_READ) -#define __PAGE_READONLY_EXEC __pgprot(_PTE_DEFAULT | PTE_READ \ - | PTE_EXEC) - -#endif /* __ASSEMBLY__ */ - -/* - * The table below defines the page protection levels that we insert into our - * Linux page table version. These get translated into the best that the - * architecture can perform. 
Note that on UniCore hardware: - * 1) We cannot do execute protection - * 2) If we could do execute protection, then read is implied - * 3) write implies read permissions - */ -#define __P000 __PAGE_NONE -#define __P001 __PAGE_READONLY -#define __P010 __PAGE_COPY -#define __P011 __PAGE_COPY -#define __P100 __PAGE_READONLY_EXEC -#define __P101 __PAGE_READONLY_EXEC -#define __P110 __PAGE_COPY_EXEC -#define __P111 __PAGE_COPY_EXEC - -#define __S000 __PAGE_NONE -#define __S001 __PAGE_READONLY -#define __S010 __PAGE_SHARED -#define __S011 __PAGE_SHARED -#define __S100 __PAGE_READONLY_EXEC -#define __S101 __PAGE_READONLY_EXEC -#define __S110 __PAGE_SHARED_EXEC -#define __S111 __PAGE_SHARED_EXEC - -#ifndef __ASSEMBLY__ -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. - */ -extern struct page *empty_zero_page; -#define ZERO_PAGE(vaddr) (empty_zero_page) - -#define pte_pfn(pte) (pte_val(pte) >> PAGE_SHIFT) -#define pfn_pte(pfn, prot) (__pte(((pfn) << PAGE_SHIFT) \ - | pgprot_val(prot))) - -#define pte_none(pte) (!pte_val(pte)) -#define pte_clear(mm, addr, ptep) set_pte(ptep, __pte(0)) -#define pte_page(pte) (pfn_to_page(pte_pfn(pte))) - -#define set_pte(ptep, pte) cpu_set_pte(ptep, pte) - -#define set_pte_at(mm, addr, ptep, pteval) \ - do { \ - set_pte(ptep, pteval); \ - } while (0) - -/* - * The following only work if pte_present() is true. - * Undefined behaviour if not.. 
- */ -#define pte_present(pte) (pte_val(pte) & PTE_PRESENT) -#define pte_write(pte) (pte_val(pte) & PTE_WRITE) -#define pte_dirty(pte) (pte_val(pte) & PTE_DIRTY) -#define pte_young(pte) (pte_val(pte) & PTE_YOUNG) -#define pte_exec(pte) (pte_val(pte) & PTE_EXEC) - -#define PTE_BIT_FUNC(fn, op) \ -static inline pte_t pte_##fn(pte_t pte) { pte_val(pte) op; return pte; } - -PTE_BIT_FUNC(wrprotect, &= ~PTE_WRITE); -PTE_BIT_FUNC(mkwrite, |= PTE_WRITE); -PTE_BIT_FUNC(mkclean, &= ~PTE_DIRTY); -PTE_BIT_FUNC(mkdirty, |= PTE_DIRTY); -PTE_BIT_FUNC(mkold, &= ~PTE_YOUNG); -PTE_BIT_FUNC(mkyoung, |= PTE_YOUNG); - -/* - * Mark the prot value as uncacheable. - */ -#define pgprot_noncached(prot) \ - __pgprot(pgprot_val(prot) & ~PTE_CACHEABLE) -#define pgprot_writecombine(prot) \ - __pgprot(pgprot_val(prot) & ~PTE_CACHEABLE) - -#define pmd_none(pmd) (!pmd_val(pmd)) -#define pmd_present(pmd) (pmd_val(pmd) & PMD_PRESENT) -#define pmd_bad(pmd) (((pmd_val(pmd) & \ - (PMD_PRESENT | PMD_TYPE_MASK)) \ - != (PMD_PRESENT | PMD_TYPE_TABLE))) - -#define set_pmd(pmdpd, pmdval) \ - do { \ - *(pmdpd) = pmdval; \ - } while (0) - -#define pmd_clear(pmdp) \ - do { \ - set_pmd(pmdp, __pmd(0));\ - clean_pmd_entry(pmdp); \ - } while (0) - -#define pmd_page_vaddr(pmd) ((pte_t *)__va(pmd_val(pmd) & PAGE_MASK)) -#define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd))) - -/* - * Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. - */ -#define mk_pte(page, prot) pfn_pte(page_to_pfn(page), prot) - -static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) -{ - const unsigned long mask = PTE_EXEC | PTE_WRITE | PTE_READ; - pte_val(pte) = (pte_val(pte) & ~mask) | (pgprot_val(newprot) & mask); - return pte; -} - -extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; - -/* - * Encode and decode a swap entry. 
Swap entries are stored in the Linux - * page tables as follows: - * - * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 - * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 - * <--------------- offset --------------> <--- type --> 0 0 0 0 0 - * - * This gives us up to 127 swap files and 32GB per swap file. Note that - * the offset field is always non-zero. - */ -#define __SWP_TYPE_SHIFT 5 -#define __SWP_TYPE_BITS 7 -#define __SWP_TYPE_MASK ((1 << __SWP_TYPE_BITS) - 1) -#define __SWP_OFFSET_SHIFT (__SWP_TYPE_BITS + __SWP_TYPE_SHIFT) - -#define __swp_type(x) (((x).val >> __SWP_TYPE_SHIFT) \ - & __SWP_TYPE_MASK) -#define __swp_offset(x) ((x).val >> __SWP_OFFSET_SHIFT) -#define __swp_entry(type, offset) ((swp_entry_t) { \ - ((type) << __SWP_TYPE_SHIFT) | \ - ((offset) << __SWP_OFFSET_SHIFT) }) - -#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) -#define __swp_entry_to_pte(swp) ((pte_t) { (swp).val }) - -/* - * It is an error for the kernel to have more swap files than we can - * encode in the PTEs. This ensures that we know when MAX_SWAPFILES - * is increased beyond what we presently support. 
- */ -#define MAX_SWAPFILES_CHECK() \ - BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > __SWP_TYPE_BITS) - -/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */ -/* FIXME: this is not correct */ -#define kern_addr_valid(addr) (1) - -#endif /* !__ASSEMBLY__ */ - -#endif /* __UNICORE_PGTABLE_H__ */ diff --git a/arch/unicore32/include/asm/processor.h b/arch/unicore32/include/asm/processor.h deleted file mode 100644 index 6f01620da3d1..000000000000 --- a/arch/unicore32/include/asm/processor.h +++ /dev/null @@ -1,74 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/processor.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#ifndef __UNICORE_PROCESSOR_H__ -#define __UNICORE_PROCESSOR_H__ - -#ifdef __KERNEL__ - -#include -#include - -#ifdef __KERNEL__ -#define STACK_TOP TASK_SIZE -#define STACK_TOP_MAX TASK_SIZE -#endif - -struct debug_entry { - u32 address; - u32 insn; -}; - -struct debug_info { - int nsaved; - struct debug_entry bp[2]; -}; - -struct thread_struct { - /* fault info */ - unsigned long address; - unsigned long trap_no; - unsigned long error_code; - /* debugging */ - struct debug_info debug; -}; - -#define INIT_THREAD { } - -#define start_thread(regs, pc, sp) \ -({ \ - unsigned long *stack = (unsigned long *)sp; \ - memset(regs->uregs, 0, sizeof(regs->uregs)); \ - regs->UCreg_asr = USER_MODE; \ - regs->UCreg_pc = pc & ~1; /* pc */ \ - regs->UCreg_sp = sp; /* sp */ \ - regs->UCreg_02 = stack[2]; /* r2 (envp) */ \ - regs->UCreg_01 = stack[1]; /* r1 (argv) */ \ - regs->UCreg_00 = stack[0]; /* r0 (argc) */ \ -}) - -/* Forward declaration, a strange C thing */ -struct task_struct; - -/* Free all resources held by a thread. 
*/ -extern void release_thread(struct task_struct *); - -unsigned long get_wchan(struct task_struct *p); - -#define cpu_relax() barrier() - -#define task_pt_regs(p) \ - ((struct pt_regs *)(THREAD_START_SP + task_stack_page(p)) - 1) - -#define KSTK_EIP(tsk) (task_pt_regs(tsk)->UCreg_pc) -#define KSTK_ESP(tsk) (task_pt_regs(tsk)->UCreg_sp) - -#endif - -#endif /* __UNICORE_PROCESSOR_H__ */ diff --git a/arch/unicore32/include/asm/ptrace.h b/arch/unicore32/include/asm/ptrace.h deleted file mode 100644 index bb4cbc42c321..000000000000 --- a/arch/unicore32/include/asm/ptrace.h +++ /dev/null @@ -1,58 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/ptrace.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_PTRACE_H__ -#define __UNICORE_PTRACE_H__ - -#include - -#ifndef __ASSEMBLY__ - -#define user_mode(regs) \ - (processor_mode(regs) == USER_MODE) - -#define processor_mode(regs) \ - ((regs)->UCreg_asr & MODE_MASK) - -#define interrupts_enabled(regs) \ - (!((regs)->UCreg_asr & PSR_I_BIT)) - -#define fast_interrupts_enabled(regs) \ - (!((regs)->UCreg_asr & PSR_R_BIT)) - -/* Are the current registers suitable for user mode? - * (used to maintain security in signal handlers) - */ -static inline int valid_user_regs(struct pt_regs *regs) -{ - unsigned long mode = regs->UCreg_asr & MODE_MASK; - - /* - * Always clear the R (REAL) bits - */ - regs->UCreg_asr &= ~(PSR_R_BIT); - - if ((regs->UCreg_asr & PSR_I_BIT) == 0) { - if (mode == USER_MODE) - return 1; - } - - /* - * Force ASR to something logical... 
- */ - regs->UCreg_asr &= PSR_f | USER_MODE; - - return 0; -} - -#define instruction_pointer(regs) ((regs)->UCreg_pc) -#define user_stack_pointer(regs) ((regs)->UCreg_sp) -#define profile_pc(regs) instruction_pointer(regs) - -#endif /* __ASSEMBLY__ */ -#endif diff --git a/arch/unicore32/include/asm/stacktrace.h b/arch/unicore32/include/asm/stacktrace.h deleted file mode 100644 index 3e59f9d2faed..000000000000 --- a/arch/unicore32/include/asm/stacktrace.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/stacktrace.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#ifndef __UNICORE_STACKTRACE_H__ -#define __UNICORE_STACKTRACE_H__ - -struct stackframe { - unsigned long fp; - unsigned long sp; - unsigned long lr; - unsigned long pc; -}; - -#ifdef CONFIG_FRAME_POINTER -extern int unwind_frame(struct stackframe *frame); -#else -#define unwind_frame(f) (-EINVAL) -#endif -extern void walk_stackframe(struct stackframe *frame, - int (*fn)(struct stackframe *, void *), void *data); - -#endif /* __UNICORE_STACKTRACE_H__ */ diff --git a/arch/unicore32/include/asm/string.h b/arch/unicore32/include/asm/string.h deleted file mode 100644 index 1649b0e4271b..000000000000 --- a/arch/unicore32/include/asm/string.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/string.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_STRING_H__ -#define __UNICORE_STRING_H__ - -/* - * We don't do inline string functions, since the - * optimised inline asm versions are not small. 
- */ - -#define __HAVE_ARCH_STRRCHR -extern char *strrchr(const char *s, int c); - -#define __HAVE_ARCH_STRCHR -extern char *strchr(const char *s, int c); - -#define __HAVE_ARCH_MEMCPY -extern void *memcpy(void *, const void *, __kernel_size_t); - -#define __HAVE_ARCH_MEMMOVE -extern void *memmove(void *, const void *, __kernel_size_t); - -#define __HAVE_ARCH_MEMCHR -extern void *memchr(const void *, int, __kernel_size_t); - -#define __HAVE_ARCH_MEMSET -extern void *memset(void *, int, __kernel_size_t); - -#endif diff --git a/arch/unicore32/include/asm/suspend.h b/arch/unicore32/include/asm/suspend.h deleted file mode 100644 index 72bd89c44d10..000000000000 --- a/arch/unicore32/include/asm/suspend.h +++ /dev/null @@ -1,26 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/suspend.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#ifndef __UNICORE_SUSPEND_H__ -#define __UNICORE_SUSPEND_H__ - -#ifndef __ASSEMBLY__ - -#include - -struct swsusp_arch_regs { - struct cpu_context_save cpu_context; /* cpu context */ -#ifdef CONFIG_UNICORE_FPU_F64 - struct fp_state fpstate __attribute__((aligned(8))); -#endif -}; -#endif - -#endif /* __UNICORE_SUSPEND_H__ */ - diff --git a/arch/unicore32/include/asm/switch_to.h b/arch/unicore32/include/asm/switch_to.h deleted file mode 100644 index 12e534b3bfa5..000000000000 --- a/arch/unicore32/include/asm/switch_to.h +++ /dev/null @@ -1,27 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Task switching for PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2012 GUAN Xue-tao - */ -#ifndef __UNICORE_SWITCH_TO_H__ -#define __UNICORE_SWITCH_TO_H__ - -struct task_struct; -struct thread_info; - -/* - * switch_to(prev, next) should switch from task `prev' to `next' - * `prev' will never be the same as `next'. schedule() itself - * contains the memory barrier to tell GCC not to cache `current'. 
- */ -extern struct task_struct *__switch_to(struct task_struct *, - struct thread_info *, struct thread_info *); - -#define switch_to(prev, next, last) \ - do { \ - last = __switch_to(prev, task_thread_info(prev), \ - task_thread_info(next)); \ - } while (0) - -#endif /* __UNICORE_SWITCH_TO_H__ */ diff --git a/arch/unicore32/include/asm/syscall.h b/arch/unicore32/include/asm/syscall.h deleted file mode 100644 index 607961797fff..000000000000 --- a/arch/unicore32/include/asm/syscall.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_UNICORE_SYSCALL_H -#define _ASM_UNICORE_SYSCALL_H - -#include - -static inline int syscall_get_arch(struct task_struct *task) -{ - return AUDIT_ARCH_UNICORE; -} - -#endif /* _ASM_UNICORE_SYSCALL_H */ diff --git a/arch/unicore32/include/asm/thread_info.h b/arch/unicore32/include/asm/thread_info.h deleted file mode 100644 index d8a6d6b7a403..000000000000 --- a/arch/unicore32/include/asm/thread_info.h +++ /dev/null @@ -1,133 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/thread_info.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_THREAD_INFO_H__ -#define __UNICORE_THREAD_INFO_H__ - -#ifdef __KERNEL__ - -#include -#include - -#define THREAD_SIZE_ORDER 1 -#define THREAD_SIZE 8192 -#define THREAD_START_SP (THREAD_SIZE - 8) - -#ifndef __ASSEMBLY__ - -struct task_struct; - -#include - -typedef struct { - unsigned long seg; -} mm_segment_t; - -struct cpu_context_save { - __u32 r4; - __u32 r5; - __u32 r6; - __u32 r7; - __u32 r8; - __u32 r9; - __u32 r10; - __u32 r11; - __u32 r12; - __u32 r13; - __u32 r14; - __u32 r15; - __u32 r16; - __u32 r17; - __u32 r18; - __u32 r19; - __u32 r20; - __u32 r21; - __u32 r22; - __u32 r23; - __u32 r24; - __u32 r25; - __u32 r26; - __u32 fp; - __u32 sp; - __u32 pc; -}; - -/* - * low level task data that entry.S needs immediate access to. 
- * __switch_to() assumes cpu_context follows immediately after cpu_domain. - */ -struct thread_info { - unsigned long flags; /* low level flags */ - int preempt_count; /* 0 => preemptable */ - /* <0 => bug */ - mm_segment_t addr_limit; /* address limit */ - struct task_struct *task; /* main task structure */ - __u32 cpu; /* cpu */ - struct cpu_context_save cpu_context; /* cpu context */ - __u32 syscall; /* syscall number */ - __u8 used_cp[16]; /* thread used copro */ -#ifdef CONFIG_UNICORE_FPU_F64 - struct fp_state fpstate __attribute__((aligned(8))); -#endif -}; - -#define INIT_THREAD_INFO(tsk) \ -{ \ - .task = &tsk, \ - .flags = 0, \ - .preempt_count = INIT_PREEMPT_COUNT, \ - .addr_limit = KERNEL_DS, \ -} - -/* - * how to get the thread information struct from C - */ -static inline struct thread_info *current_thread_info(void) __attribute_const__; - -static inline struct thread_info *current_thread_info(void) -{ - register unsigned long sp asm ("sp"); - return (struct thread_info *)(sp & ~(THREAD_SIZE - 1)); -} - -#define thread_saved_pc(tsk) \ - ((unsigned long)(task_thread_info(tsk)->cpu_context.pc)) -#define thread_saved_sp(tsk) \ - ((unsigned long)(task_thread_info(tsk)->cpu_context.sp)) -#define thread_saved_fp(tsk) \ - ((unsigned long)(task_thread_info(tsk)->cpu_context.fp)) - -#endif - -/* - * thread information flags: - * TIF_SYSCALL_TRACE - syscall trace active - * TIF_SIGPENDING - signal pending - * TIF_NEED_RESCHED - rescheduling necessary - * TIF_NOTIFY_RESUME - callback before returning to user - */ -#define TIF_SIGPENDING 0 -#define TIF_NEED_RESCHED 1 -#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ -#define TIF_SYSCALL_TRACE 8 -#define TIF_MEMDIE 18 -#define TIF_RESTORE_SIGMASK 20 - -#define _TIF_SIGPENDING (1 << TIF_SIGPENDING) -#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) -#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) -#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) - -/* - * Change these and you break ASM code 
in entry-common.S - */ -#define _TIF_WORK_MASK \ - (_TIF_SIGPENDING | _TIF_NEED_RESCHED | _TIF_NOTIFY_RESUME) - -#endif /* __KERNEL__ */ -#endif /* __UNICORE_THREAD_INFO_H__ */ diff --git a/arch/unicore32/include/asm/timex.h b/arch/unicore32/include/asm/timex.h deleted file mode 100644 index d714af3dbce1..000000000000 --- a/arch/unicore32/include/asm/timex.h +++ /dev/null @@ -1,31 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/timex.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#ifndef __UNICORE_TIMEX_H__ -#define __UNICORE_TIMEX_H__ - -#ifdef CONFIG_ARCH_FPGA - -/* in FPGA, APB clock is 33M, and OST clock is 32K, */ -/* so, 1M is selected for timer interrupt correctly */ -#define CLOCK_TICK_RATE (32*1024) - -#endif - -#if defined(CONFIG_PUV3_DB0913) \ - || defined(CONFIG_PUV3_NB0916) \ - || defined(CONFIG_PUV3_SMW0919) - -#define CLOCK_TICK_RATE (14318000) - -#endif - -#include - -#endif diff --git a/arch/unicore32/include/asm/tlb.h b/arch/unicore32/include/asm/tlb.h deleted file mode 100644 index 4663d8cc80ef..000000000000 --- a/arch/unicore32/include/asm/tlb.h +++ /dev/null @@ -1,24 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/tlb.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_TLB_H__ -#define __UNICORE_TLB_H__ - -/* - * unicore32 lacks an efficient flush_tlb_range(), use flush_tlb_mm(). 
- */ - -#define __pte_free_tlb(tlb, pte, addr) \ - do { \ - pgtable_pte_page_dtor(pte); \ - tlb_remove_page((tlb), (pte)); \ - } while (0) - -#include - -#endif diff --git a/arch/unicore32/include/asm/tlbflush.h b/arch/unicore32/include/asm/tlbflush.h deleted file mode 100644 index 1cf18ef55515..000000000000 --- a/arch/unicore32/include/asm/tlbflush.h +++ /dev/null @@ -1,192 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/tlbflush.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_TLBFLUSH_H__ -#define __UNICORE_TLBFLUSH_H__ - -#ifndef __ASSEMBLY__ - -#include - -extern void __cpu_flush_user_tlb_range(unsigned long, unsigned long, - struct vm_area_struct *); -extern void __cpu_flush_kern_tlb_range(unsigned long, unsigned long); - -/* - * TLB Management - * ============== - * - * The arch/unicore/mm/tlb-*.S files implement these methods. - * - * The TLB specific code is expected to perform whatever tests it - * needs to determine if it should invalidate the TLB for each - * call. Start addresses are inclusive and end addresses are - * exclusive; it is safe to round these addresses down. - * - * flush_tlb_all() - * - * Invalidate the entire TLB. - * - * flush_tlb_mm(mm) - * - * Invalidate all TLB entries in a particular address - * space. - * - mm - mm_struct describing address space - * - * flush_tlb_range(mm,start,end) - * - * Invalidate a range of TLB entries in the specified - * address space. - * - mm - mm_struct describing address space - * - start - start address (may not be aligned) - * - end - end address (exclusive, may not be aligned) - * - * flush_tlb_page(vaddr,vma) - * - * Invalidate the specified page in the specified address range. - * - vaddr - virtual address (may not be aligned) - * - vma - vma_struct describing address range - * - * flush_kern_tlb_page(kaddr) - * - * Invalidate the TLB entry for the specified page. 
The address - * will be in the kernels virtual memory space. Current uses - * only require the D-TLB to be invalidated. - * - kaddr - Kernel virtual memory address - */ - -static inline void local_flush_tlb_all(void) -{ - const int zero = 0; - - /* TLB invalidate all */ - asm("movc p0.c6, %0, #6; nop; nop; nop; nop; nop; nop; nop; nop" - : : "r" (zero) : "cc"); -} - -static inline void local_flush_tlb_mm(struct mm_struct *mm) -{ - const int zero = 0; - - if (cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) { - /* TLB invalidate all */ - asm("movc p0.c6, %0, #6; nop; nop; nop; nop; nop; nop; nop; nop" - : : "r" (zero) : "cc"); - } - put_cpu(); -} - -static inline void -local_flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr) -{ - if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) { -#ifndef CONFIG_CPU_TLB_SINGLE_ENTRY_DISABLE - /* iTLB invalidate page */ - asm("movc p0.c6, %0, #5; nop; nop; nop; nop; nop; nop; nop; nop" - : : "r" (uaddr & PAGE_MASK) : "cc"); - /* dTLB invalidate page */ - asm("movc p0.c6, %0, #3; nop; nop; nop; nop; nop; nop; nop; nop" - : : "r" (uaddr & PAGE_MASK) : "cc"); -#else - /* TLB invalidate all */ - asm("movc p0.c6, %0, #6; nop; nop; nop; nop; nop; nop; nop; nop" - : : "r" (uaddr & PAGE_MASK) : "cc"); -#endif - } -} - -static inline void local_flush_tlb_kernel_page(unsigned long kaddr) -{ -#ifndef CONFIG_CPU_TLB_SINGLE_ENTRY_DISABLE - /* iTLB invalidate page */ - asm("movc p0.c6, %0, #5; nop; nop; nop; nop; nop; nop; nop; nop" - : : "r" (kaddr & PAGE_MASK) : "cc"); - /* dTLB invalidate page */ - asm("movc p0.c6, %0, #3; nop; nop; nop; nop; nop; nop; nop; nop" - : : "r" (kaddr & PAGE_MASK) : "cc"); -#else - /* TLB invalidate all */ - asm("movc p0.c6, %0, #6; nop; nop; nop; nop; nop; nop; nop; nop" - : : "r" (kaddr & PAGE_MASK) : "cc"); -#endif -} - -/* - * flush_pmd_entry - * - * Flush a PMD entry (word aligned, or double-word aligned) to - * RAM if the TLB for the CPU we are running on requires this. 
- * This is typically used when we are creating PMD entries. - * - * clean_pmd_entry - * - * Clean (but don't drain the write buffer) if the CPU requires - * these operations. This is typically used when we are removing - * PMD entries. - */ -static inline void flush_pmd_entry(pmd_t *pmd) -{ -#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE - /* flush dcache line, see dcacheline_flush in proc-macros.S */ - asm("mov r1, %0 << #20\n" - "ldw r2, =_stext\n" - "add r2, r2, r1 >> #20\n" - "ldw r1, [r2+], #0x0000\n" - "ldw r1, [r2+], #0x1000\n" - "ldw r1, [r2+], #0x2000\n" - "ldw r1, [r2+], #0x3000\n" - : : "r" (pmd) : "r1", "r2"); -#else - /* flush dcache all */ - asm("movc p0.c5, %0, #14; nop; nop; nop; nop; nop; nop; nop; nop" - : : "r" (pmd) : "cc"); -#endif -} - -static inline void clean_pmd_entry(pmd_t *pmd) -{ -#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE - /* clean dcache line */ - asm("movc p0.c5, %0, #11; nop; nop; nop; nop; nop; nop; nop; nop" - : : "r" (__pa(pmd) & ~(L1_CACHE_BYTES - 1)) : "cc"); -#else - /* clean dcache all */ - asm("movc p0.c5, %0, #10; nop; nop; nop; nop; nop; nop; nop; nop" - : : "r" (pmd) : "cc"); -#endif -} - -/* - * Convert calls to our calling convention. - */ -#define local_flush_tlb_range(vma, start, end) \ - __cpu_flush_user_tlb_range(start, end, vma) -#define local_flush_tlb_kernel_range(s, e) \ - __cpu_flush_kern_tlb_range(s, e) - -#define flush_tlb_all local_flush_tlb_all -#define flush_tlb_mm local_flush_tlb_mm -#define flush_tlb_page local_flush_tlb_page -#define flush_tlb_kernel_page local_flush_tlb_kernel_page -#define flush_tlb_range local_flush_tlb_range -#define flush_tlb_kernel_range local_flush_tlb_kernel_range - -/* - * if PG_dcache_clean is not set for the page, we need to ensure that any - * cache entries for the kernels virtual memory range are written - * back to the page. 
- */ -extern void update_mmu_cache(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep); - -extern void do_bad_area(unsigned long addr, unsigned int fsr, - struct pt_regs *regs); - -#endif - -#endif diff --git a/arch/unicore32/include/asm/traps.h b/arch/unicore32/include/asm/traps.h deleted file mode 100644 index ad1508a9a903..000000000000 --- a/arch/unicore32/include/asm/traps.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/traps.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_TRAP_H__ -#define __UNICORE_TRAP_H__ - -extern void __init early_trap_init(void); -extern void dump_backtrace_entry(unsigned long where, - unsigned long from, unsigned long frame); - -extern void do_DataAbort(unsigned long addr, unsigned int fsr, - struct pt_regs *regs); -#endif diff --git a/arch/unicore32/include/asm/uaccess.h b/arch/unicore32/include/asm/uaccess.h deleted file mode 100644 index 33c24f430511..000000000000 --- a/arch/unicore32/include/asm/uaccess.h +++ /dev/null @@ -1,38 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/uaccess.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_UACCESS_H__ -#define __UNICORE_UACCESS_H__ - -#include - -#define __strncpy_from_user __strncpy_from_user -#define __strnlen_user __strnlen_user -#define __clear_user __clear_user - -#define __kernel_ok (uaccess_kernel()) -#define __user_ok(addr, size) (((size) <= TASK_SIZE) \ - && ((addr) <= TASK_SIZE - (size))) -#define __access_ok(addr, size) (__kernel_ok || __user_ok((addr), (size))) - -extern unsigned long __must_check -raw_copy_from_user(void *to, const void __user *from, unsigned long n); -extern unsigned long __must_check -raw_copy_to_user(void __user *to, const void *from, unsigned long n); -extern unsigned long 
__must_check -__clear_user(void __user *addr, unsigned long n); -extern unsigned long __must_check -__strncpy_from_user(char *to, const char __user *from, unsigned long count); -extern unsigned long -__strnlen_user(const char __user *s, long n); -#define INLINE_COPY_FROM_USER -#define INLINE_COPY_TO_USER - -#include - -#endif /* __UNICORE_UACCESS_H__ */ diff --git a/arch/unicore32/include/asm/vmalloc.h b/arch/unicore32/include/asm/vmalloc.h deleted file mode 100644 index 054435818a14..000000000000 --- a/arch/unicore32/include/asm/vmalloc.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef _ASM_UNICORE32_VMALLOC_H -#define _ASM_UNICORE32_VMALLOC_H - -#endif /* _ASM_UNICORE32_VMALLOC_H */ diff --git a/arch/unicore32/include/mach/PKUnity.h b/arch/unicore32/include/mach/PKUnity.h deleted file mode 100644 index 78f77517c1c7..000000000000 --- a/arch/unicore32/include/mach/PKUnity.h +++ /dev/null @@ -1,95 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/mach/PKUnity.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -/* Be sure that virtual mapping is defined right */ -#ifndef __MACH_PUV3_HARDWARE_H__ -#error You must include hardware.h not PKUnity.h -#endif - -#include - -/* - * Memory Definitions - */ -#define PKUNITY_SDRAM_BASE 0x00000000 /* 0x00000000 - 0x7FFFFFFF 2GB */ -#define PKUNITY_MMIO_BASE 0x80000000 /* 0x80000000 - 0xFFFFFFFF 2GB */ - -/* - * PKUNITY System Bus Addresses (PCI): 0x80000000 - 0xBFFFFFFF (1GB) - * 0x80000000 - 0x8000000B 12B PCI Configuration regs - * 0x80010000 - 0x80010250 592B PCI Bridge Base - * 0x80030000 - 0x8003FFFF 64KB PCI Legacy IO - * 0x90000000 - 0x97FFFFFF 128MB PCI AHB-PCI MEM-mapping - * 0x98000000 - 0x9FFFFFFF 128MB PCI PCI-AHB MEM-mapping - */ -#define PKUNITY_PCI_BASE io_p2v(0x80000000) /* 0x80000000 - 0xBFFFFFFF 1GB */ -#include - -#define PKUNITY_PCICFG_BASE (PKUNITY_PCI_BASE + 0x0) -#define PKUNITY_PCIBRI_BASE (PKUNITY_PCI_BASE + 
0x00010000) -#define PKUNITY_PCILIO_BASE (PKUNITY_PCI_BASE + 0x00030000) -#define PKUNITY_PCIMEM_BASE (PKUNITY_PCI_BASE + 0x10000000) -#define PKUNITY_PCIAHB_BASE (PKUNITY_PCI_BASE + 0x18000000) - -/* - * PKUNITY System Bus Addresses (AHB): 0xC0000000 - 0xEDFFFFFF (640MB) - */ -#define PKUNITY_AHB_BASE io_p2v(0xC0000000) - -/* AHB-0 is DDR2 SDRAM */ -/* AHB-1 is PCI Space */ -#define PKUNITY_ARBITER_BASE (PKUNITY_AHB_BASE + 0x000000) /* AHB-2 */ -#define PKUNITY_DDR2CTRL_BASE (PKUNITY_AHB_BASE + 0x100000) /* AHB-3 */ -#define PKUNITY_DMAC_BASE (PKUNITY_AHB_BASE + 0x200000) /* AHB-4 */ -#include -#define PKUNITY_UMAL_BASE (PKUNITY_AHB_BASE + 0x300000) /* AHB-5 */ -#include -#define PKUNITY_USB_BASE (PKUNITY_AHB_BASE + 0x400000) /* AHB-6 */ -#define PKUNITY_SATA_BASE (PKUNITY_AHB_BASE + 0x500000) /* AHB-7 */ -#define PKUNITY_SMC_BASE (PKUNITY_AHB_BASE + 0x600000) /* AHB-8 */ -/* AHB-9 is for APB bridge */ -#define PKUNITY_MME_BASE (PKUNITY_AHB_BASE + 0x700000) /* AHB-10 */ -#define PKUNITY_UNIGFX_BASE (PKUNITY_AHB_BASE + 0x800000) /* AHB-11 */ -#include -#define PKUNITY_NAND_BASE (PKUNITY_AHB_BASE + 0x900000) /* AHB-12 */ -#include -#define PKUNITY_H264D_BASE (PKUNITY_AHB_BASE + 0xA00000) /* AHB-13 */ -#define PKUNITY_H264E_BASE (PKUNITY_AHB_BASE + 0xB00000) /* AHB-14 */ - -/* - * PKUNITY Peripheral Bus Addresses (APB): 0xEE000000 - 0xEFFFFFFF (128MB) - */ -#define PKUNITY_APB_BASE io_p2v(0xEE000000) - -#define PKUNITY_UART0_BASE (PKUNITY_APB_BASE + 0x000000) /* APB-0 */ -#define PKUNITY_UART1_BASE (PKUNITY_APB_BASE + 0x100000) /* APB-1 */ -#include -#define PKUNITY_I2C_BASE (PKUNITY_APB_BASE + 0x200000) /* APB-2 */ -#include -#define PKUNITY_SPI_BASE (PKUNITY_APB_BASE + 0x300000) /* APB-3 */ -#include -#define PKUNITY_AC97_BASE (PKUNITY_APB_BASE + 0x400000) /* APB-4 */ -#include -#define PKUNITY_GPIO_BASE (PKUNITY_APB_BASE + 0x500000) /* APB-5 */ -#include -#define PKUNITY_INTC_BASE (PKUNITY_APB_BASE + 0x600000) /* APB-6 */ -#include -#define PKUNITY_RTC_BASE 
(PKUNITY_APB_BASE + 0x700000) /* APB-7 */ -#include -#define PKUNITY_OST_BASE (PKUNITY_APB_BASE + 0x800000) /* APB-8 */ -#include -#define PKUNITY_RESETC_BASE (PKUNITY_APB_BASE + 0x900000) /* APB-9 */ -#include -#define PKUNITY_PM_BASE (PKUNITY_APB_BASE + 0xA00000) /* APB-10 */ -#include -#define PKUNITY_PS2_BASE (PKUNITY_APB_BASE + 0xB00000) /* APB-11 */ -#include -#define PKUNITY_SDC_BASE (PKUNITY_APB_BASE + 0xC00000) /* APB-12 */ -#include - diff --git a/arch/unicore32/include/mach/bitfield.h b/arch/unicore32/include/mach/bitfield.h deleted file mode 100644 index 766b7f01f1cd..000000000000 --- a/arch/unicore32/include/mach/bitfield.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/mach/bitfield.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __MACH_PUV3_BITFIELD_H__ -#define __MACH_PUV3_BITFIELD_H__ - -#ifndef __ASSEMBLY__ -#define UData(Data) ((unsigned long) (Data)) -#else -#define UData(Data) (Data) -#endif - -#define FIELD(val, vmask, vshift) (((val) & ((UData(1) << (vmask)) - 1)) << (vshift)) -#define FMASK(vmask, vshift) (((UData(1) << (vmask)) - 1) << (vshift)) - -#endif /* __MACH_PUV3_BITFIELD_H__ */ diff --git a/arch/unicore32/include/mach/dma.h b/arch/unicore32/include/mach/dma.h deleted file mode 100644 index 271001cd13c4..000000000000 --- a/arch/unicore32/include/mach/dma.h +++ /dev/null @@ -1,45 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/mach/dma.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __MACH_PUV3_DMA_H__ -#define __MACH_PUV3_DMA_H__ - -/* - * The PKUnity has six internal DMA channels. 
- */ -#define MAX_DMA_CHANNELS 6 - -typedef enum { - DMA_PRIO_HIGH = 0, - DMA_PRIO_MEDIUM = 1, - DMA_PRIO_LOW = 2 -} puv3_dma_prio; - -/* - * DMA registration - */ - -extern int puv3_request_dma(char *name, - puv3_dma_prio prio, - void (*irq_handler)(int, void *), - void (*err_handler)(int, void *), - void *data); - -extern void puv3_free_dma(int dma_ch); - -static inline void puv3_stop_dma(int ch) -{ - writel(readl(DMAC_CONFIG(ch)) & ~DMAC_CONFIG_EN, DMAC_CONFIG(ch)); -} - -static inline void puv3_resume_dma(int ch) -{ - writel(readl(DMAC_CONFIG(ch)) | DMAC_CONFIG_EN, DMAC_CONFIG(ch)); -} - -#endif /* __MACH_PUV3_DMA_H__ */ diff --git a/arch/unicore32/include/mach/hardware.h b/arch/unicore32/include/mach/hardware.h deleted file mode 100644 index 2d7571cbd1d0..000000000000 --- a/arch/unicore32/include/mach/hardware.h +++ /dev/null @@ -1,30 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/mach/hardware.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * This file contains the hardware definitions for PKUnity architecture - */ - -#ifndef __MACH_PUV3_HARDWARE_H__ -#define __MACH_PUV3_HARDWARE_H__ - -#include - -#ifndef __ASSEMBLY__ -#define io_p2v(x) (void __iomem *)((x) - PKUNITY_MMIO_BASE) -#define io_v2p(x) (phys_addr_t)((x) + PKUNITY_MMIO_BASE) -#else -#define io_p2v(x) ((x) - PKUNITY_MMIO_BASE) -#define io_v2p(x) ((x) + PKUNITY_MMIO_BASE) -#endif - -#define PCIBIOS_MIN_IO 0x4000 /* should lower than 64KB */ -#define PCIBIOS_MIN_MEM io_v2p(PKUNITY_PCIMEM_BASE) - -#define pcibios_assign_all_busses() 1 - -#endif /* __MACH_PUV3_HARDWARE_H__ */ diff --git a/arch/unicore32/include/mach/map.h b/arch/unicore32/include/mach/map.h deleted file mode 100644 index 7a83eeeb1287..000000000000 --- a/arch/unicore32/include/mach/map.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/mach/map.h - * - * Code specific to 
PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * Page table mapping constructs and function prototypes - */ -#define MT_DEVICE 0 -#define MT_DEVICE_CACHED 2 -#define MT_KUSER 7 -#define MT_HIGH_VECTORS 8 -#define MT_MEMORY 9 -#define MT_ROM 10 - diff --git a/arch/unicore32/include/mach/memory.h b/arch/unicore32/include/mach/memory.h deleted file mode 100644 index b4e6035cb9a3..000000000000 --- a/arch/unicore32/include/mach/memory.h +++ /dev/null @@ -1,54 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/mach/memory.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __MACH_PUV3_MEMORY_H__ -#define __MACH_PUV3_MEMORY_H__ - -#include - -/* Physical DRAM offset. */ -#define PHYS_OFFSET UL(0x00000000) -/* The base address of exception vectors. */ -#define VECTORS_BASE UL(0xffff0000) -/* The base address of kuser area. */ -#define KUSER_BASE UL(0x80000000) - -#ifdef __ASSEMBLY__ -/* The byte offset of the kernel image in RAM from the start of RAM. */ -#define KERNEL_IMAGE_START 0x00408000 -#endif - -#if !defined(__ASSEMBLY__) && defined(CONFIG_PCI) - -void puv3_pci_adjust_zones(unsigned long *max_zone_pfn); - -#define arch_adjust_zones(max_zone_pfn) \ - puv3_pci_adjust_zones(max_zone_pfn) - -#endif - -/* - * PCI controller in PKUnity-3 masks highest 5-bit for upstream channel, - * so we must limit the DMA allocation within 128M physical memory for - * supporting PCI devices. 
- */ -#define PCI_DMA_THRESHOLD (PHYS_OFFSET + SZ_128M - 1) - -#define is_pcibus_device(dev) (dev && \ - (strncmp(dev->bus->name, "pci", 3) == 0)) - -#define __virt_to_pcibus(x) (__virt_to_phys((x) + PKUNITY_PCIAHB_BASE)) -#define __pcibus_to_virt(x) (__phys_to_virt(x) - PKUNITY_PCIAHB_BASE) - -/* kuser area */ -#define KUSER_VECPAGE_BASE (KUSER_BASE + UL(0x3fff0000)) -/* kuser_vecpage (0xbfff0000) is ro, and vectors page (0xffff0000) is rw */ -#define kuser_vecpage_to_vectors(x) ((x) - (KUSER_VECPAGE_BASE) \ - + (VECTORS_BASE)) - -#endif diff --git a/arch/unicore32/include/mach/ocd.h b/arch/unicore32/include/mach/ocd.h deleted file mode 100644 index 2a814929e389..000000000000 --- a/arch/unicore32/include/mach/ocd.h +++ /dev/null @@ -1,33 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/mach/ocd.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#ifndef __MACH_PUV3_OCD_H__ -#define __MACH_PUV3_OCD_H__ - -#if defined(CONFIG_DEBUG_OCD) -static inline void ocd_putc(unsigned int c) -{ - int status, i = 0x2000000; - - do { - if (--i < 0) - return; - - asm volatile ("movc %0, p1.c0, #0" : "=r" (status)); - } while (status & 2); - - asm("movc p1.c1, %0, #1" : : "r" (c)); -} - -#define putc(ch) ocd_putc(ch) -#else -#define putc(ch) -#endif - -#endif diff --git a/arch/unicore32/include/mach/pm.h b/arch/unicore32/include/mach/pm.h deleted file mode 100644 index cb40b8490a57..000000000000 --- a/arch/unicore32/include/mach/pm.h +++ /dev/null @@ -1,37 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore/include/mach/pm.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __PUV3_PM_H__ -#define __PUV3_PM_H__ - -#include - -struct puv3_cpu_pm_fns { - int save_count; - void (*save)(unsigned long *); - void (*restore)(unsigned long *); - int (*valid)(suspend_state_t state); - void 
(*enter)(suspend_state_t state); - int (*prepare)(void); - void (*finish)(void); -}; - -extern struct puv3_cpu_pm_fns *puv3_cpu_pm_fns; - -/* sleep.S */ -extern void puv3_cpu_suspend(unsigned int); - -extern void puv3_cpu_resume(void); - -extern int puv3_pm_enter(suspend_state_t state); - -/* Defined in hibernate_asm.S */ -extern int restore_image(pgd_t *resume_pg_dir, struct pbe *restore_pblist); - -extern struct pbe *restore_pblist; -#endif diff --git a/arch/unicore32/include/mach/regs-ac97.h b/arch/unicore32/include/mach/regs-ac97.h deleted file mode 100644 index 85c601898d02..000000000000 --- a/arch/unicore32/include/mach/regs-ac97.h +++ /dev/null @@ -1,33 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUnity AC97 Registers - */ - -#define PKUNITY_AC97_CONR (PKUNITY_AC97_BASE + 0x0000) -#define PKUNITY_AC97_OCR (PKUNITY_AC97_BASE + 0x0004) -#define PKUNITY_AC97_ICR (PKUNITY_AC97_BASE + 0x0008) -#define PKUNITY_AC97_CRAC (PKUNITY_AC97_BASE + 0x000C) -#define PKUNITY_AC97_INTR (PKUNITY_AC97_BASE + 0x0010) -#define PKUNITY_AC97_INTRSTAT (PKUNITY_AC97_BASE + 0x0014) -#define PKUNITY_AC97_INTRCLEAR (PKUNITY_AC97_BASE + 0x0018) -#define PKUNITY_AC97_ENABLE (PKUNITY_AC97_BASE + 0x001C) -#define PKUNITY_AC97_OUT_FIFO (PKUNITY_AC97_BASE + 0x0020) -#define PKUNITY_AC97_IN_FIFO (PKUNITY_AC97_BASE + 0x0030) - -#define AC97_CODEC_REG(v) FIELD((v), 7, 16) -#define AC97_CODEC_VAL(v) FIELD((v), 16, 0) -#define AC97_CODEC_WRITECOMPLETE FIELD(1, 1, 2) - -/* - * VAR PLAY SAMPLE RATE - */ -#define AC97_CMD_VPSAMPLE (FIELD(3, 2, 16) | FIELD(3, 2, 0)) - -/* - * FIX CAPTURE SAMPLE RATE - */ -#define AC97_CMD_FCSAMPLE FIELD(7, 3, 0) - -#define AC97_CMD_RESET FIELD(1, 1, 0) -#define AC97_CMD_ENABLE FIELD(1, 1, 0) -#define AC97_CMD_DISABLE FIELD(0, 1, 0) diff --git a/arch/unicore32/include/mach/regs-dmac.h b/arch/unicore32/include/mach/regs-dmac.h deleted file mode 100644 index bbdc52d06a98..000000000000 --- a/arch/unicore32/include/mach/regs-dmac.h +++ /dev/null @@ -1,82 +0,0 
@@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUnity Direct Memory Access Controller (DMAC) - */ - -/* - * Interrupt Status Reg DMAC_ISR. - */ -#define DMAC_ISR (PKUNITY_DMAC_BASE + 0x0020) -/* - * Interrupt Transfer Complete Status Reg DMAC_ITCSR. - */ -#define DMAC_ITCSR (PKUNITY_DMAC_BASE + 0x0050) -/* - * Interrupt Transfer Complete Clear Reg DMAC_ITCCR. - */ -#define DMAC_ITCCR (PKUNITY_DMAC_BASE + 0x0060) -/* - * Interrupt Error Status Reg DMAC_IESR. - */ -#define DMAC_IESR (PKUNITY_DMAC_BASE + 0x0080) -/* - * Interrupt Error Clear Reg DMAC_IECR. - */ -#define DMAC_IECR (PKUNITY_DMAC_BASE + 0x0090) -/* - * Enable Channels Reg DMAC_ENCH. - */ -#define DMAC_ENCH (PKUNITY_DMAC_BASE + 0x00B0) - -/* - * DMA control reg. Space [byte] - */ -#define DMASp 0x00000100 - -/* - * Source Addr DMAC_SRCADDR(ch). - */ -#define DMAC_SRCADDR(ch) (PKUNITY_DMAC_BASE + (ch)*DMASp + 0x00) -/* - * Destination Addr DMAC_DESTADDR(ch). - */ -#define DMAC_DESTADDR(ch) (PKUNITY_DMAC_BASE + (ch)*DMASp + 0x04) -/* - * Control Reg DMAC_CONTROL(ch). - */ -#define DMAC_CONTROL(ch) (PKUNITY_DMAC_BASE + (ch)*DMASp + 0x0C) -/* - * Configuration Reg DMAC_CONFIG(ch). 
- */ -#define DMAC_CONFIG(ch) (PKUNITY_DMAC_BASE + (ch)*DMASp + 0x10) - -#define DMAC_IR_MASK FMASK(6, 0) -/* - * select channel (ch) - */ -#define DMAC_CHANNEL(ch) FIELD(1, 1, (ch)) - -#define DMAC_CONTROL_SIZE_BYTE(v) (FIELD((v), 12, 14) | \ - FIELD(0, 3, 9) | FIELD(0, 3, 6)) -#define DMAC_CONTROL_SIZE_HWORD(v) (FIELD((v) >> 1, 12, 14) | \ - FIELD(1, 3, 9) | FIELD(1, 3, 6)) -#define DMAC_CONTROL_SIZE_WORD(v) (FIELD((v) >> 2, 12, 14) | \ - FIELD(2, 3, 9) | FIELD(2, 3, 6)) -#define DMAC_CONTROL_DI FIELD(1, 1, 13) -#define DMAC_CONTROL_SI FIELD(1, 1, 12) -#define DMAC_CONTROL_BURST_1BYTE (FIELD(0, 3, 3) | FIELD(0, 3, 0)) -#define DMAC_CONTROL_BURST_4BYTE (FIELD(3, 3, 3) | FIELD(3, 3, 0)) -#define DMAC_CONTROL_BURST_8BYTE (FIELD(5, 3, 3) | FIELD(5, 3, 0)) -#define DMAC_CONTROL_BURST_16BYTE (FIELD(7, 3, 3) | FIELD(7, 3, 0)) - -#define DMAC_CONFIG_UART0_WR (FIELD(2, 4, 11) | FIELD(1, 2, 1)) -#define DMAC_CONFIG_UART0_RD (FIELD(2, 4, 7) | FIELD(2, 2, 1)) -#define DMAC_CONFIG_UART1_WR (FIELD(3, 4, 11) | FIELD(1, 2, 1)) -#define DMAC_CONFIG_UART1RD (FIELD(3, 4, 7) | FIELD(2, 2, 1)) -#define DMAC_CONFIG_AC97WR (FIELD(4, 4, 11) | FIELD(1, 2, 1)) -#define DMAC_CONFIG_AC97RD (FIELD(4, 4, 7) | FIELD(2, 2, 1)) -#define DMAC_CONFIG_MMCWR (FIELD(7, 4, 11) | FIELD(1, 2, 1)) -#define DMAC_CONFIG_MMCRD (FIELD(7, 4, 7) | FIELD(2, 2, 1)) -#define DMAC_CONFIG_MASKITC FIELD(1, 1, 4) -#define DMAC_CONFIG_MASKIE FIELD(1, 1, 3) -#define DMAC_CONFIG_EN FIELD(1, 1, 0) diff --git a/arch/unicore32/include/mach/regs-gpio.h b/arch/unicore32/include/mach/regs-gpio.h deleted file mode 100644 index 5fc701ee33e3..000000000000 --- a/arch/unicore32/include/mach/regs-gpio.h +++ /dev/null @@ -1,71 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUnity General-Purpose Input/Output (GPIO) Registers - */ - -/* - * Voltage Status Reg GPIO_GPLR. - */ -#define GPIO_GPLR (PKUNITY_GPIO_BASE + 0x0000) -/* - * Pin Direction Reg GPIO_GPDR. 
- */ -#define GPIO_GPDR (PKUNITY_GPIO_BASE + 0x0004) -/* - * Output Pin Set Reg GPIO_GPSR. - */ -#define GPIO_GPSR (PKUNITY_GPIO_BASE + 0x0008) -/* - * Output Pin Clear Reg GPIO_GPCR. - */ -#define GPIO_GPCR (PKUNITY_GPIO_BASE + 0x000C) -/* - * Raise Edge Detect Reg GPIO_GRER. - */ -#define GPIO_GRER (PKUNITY_GPIO_BASE + 0x0010) -/* - * Fall Edge Detect Reg GPIO_GFER. - */ -#define GPIO_GFER (PKUNITY_GPIO_BASE + 0x0014) -/* - * Edge Status Reg GPIO_GEDR. - */ -#define GPIO_GEDR (PKUNITY_GPIO_BASE + 0x0018) -/* - * Special Voltage Detect Reg GPIO_GPIR. - */ -#define GPIO_GPIR (PKUNITY_GPIO_BASE + 0x0020) - -#define GPIO_MIN (0) -#define GPIO_MAX (27) - -#define GPIO_GPIO(Nb) (0x00000001 << (Nb)) /* GPIO [0..27] */ -#define GPIO_GPIO0 GPIO_GPIO(0) /* GPIO [0] */ -#define GPIO_GPIO1 GPIO_GPIO(1) /* GPIO [1] */ -#define GPIO_GPIO2 GPIO_GPIO(2) /* GPIO [2] */ -#define GPIO_GPIO3 GPIO_GPIO(3) /* GPIO [3] */ -#define GPIO_GPIO4 GPIO_GPIO(4) /* GPIO [4] */ -#define GPIO_GPIO5 GPIO_GPIO(5) /* GPIO [5] */ -#define GPIO_GPIO6 GPIO_GPIO(6) /* GPIO [6] */ -#define GPIO_GPIO7 GPIO_GPIO(7) /* GPIO [7] */ -#define GPIO_GPIO8 GPIO_GPIO(8) /* GPIO [8] */ -#define GPIO_GPIO9 GPIO_GPIO(9) /* GPIO [9] */ -#define GPIO_GPIO10 GPIO_GPIO(10) /* GPIO [10] */ -#define GPIO_GPIO11 GPIO_GPIO(11) /* GPIO [11] */ -#define GPIO_GPIO12 GPIO_GPIO(12) /* GPIO [12] */ -#define GPIO_GPIO13 GPIO_GPIO(13) /* GPIO [13] */ -#define GPIO_GPIO14 GPIO_GPIO(14) /* GPIO [14] */ -#define GPIO_GPIO15 GPIO_GPIO(15) /* GPIO [15] */ -#define GPIO_GPIO16 GPIO_GPIO(16) /* GPIO [16] */ -#define GPIO_GPIO17 GPIO_GPIO(17) /* GPIO [17] */ -#define GPIO_GPIO18 GPIO_GPIO(18) /* GPIO [18] */ -#define GPIO_GPIO19 GPIO_GPIO(19) /* GPIO [19] */ -#define GPIO_GPIO20 GPIO_GPIO(20) /* GPIO [20] */ -#define GPIO_GPIO21 GPIO_GPIO(21) /* GPIO [21] */ -#define GPIO_GPIO22 GPIO_GPIO(22) /* GPIO [22] */ -#define GPIO_GPIO23 GPIO_GPIO(23) /* GPIO [23] */ -#define GPIO_GPIO24 GPIO_GPIO(24) /* GPIO [24] */ -#define GPIO_GPIO25 
GPIO_GPIO(25) /* GPIO [25] */ -#define GPIO_GPIO26 GPIO_GPIO(26) /* GPIO [26] */ -#define GPIO_GPIO27 GPIO_GPIO(27) /* GPIO [27] */ - diff --git a/arch/unicore32/include/mach/regs-i2c.h b/arch/unicore32/include/mach/regs-i2c.h deleted file mode 100644 index b41aa7c92430..000000000000 --- a/arch/unicore32/include/mach/regs-i2c.h +++ /dev/null @@ -1,64 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUnity Inter-integrated Circuit (I2C) Registers - */ - -/* - * Control Reg I2C_CON. - */ -#define I2C_CON (PKUNITY_I2C_BASE + 0x0000) -/* - * Target Address Reg I2C_TAR. - */ -#define I2C_TAR (PKUNITY_I2C_BASE + 0x0004) -/* - * Data buffer and command Reg I2C_DATACMD. - */ -#define I2C_DATACMD (PKUNITY_I2C_BASE + 0x0010) -/* - * Enable Reg I2C_ENABLE. - */ -#define I2C_ENABLE (PKUNITY_I2C_BASE + 0x006C) -/* - * Status Reg I2C_STATUS. - */ -#define I2C_STATUS (PKUNITY_I2C_BASE + 0x0070) -/* - * Tx FIFO Length Reg I2C_TXFLR. - */ -#define I2C_TXFLR (PKUNITY_I2C_BASE + 0x0074) -/* - * Rx FIFO Length Reg I2C_RXFLR. - */ -#define I2C_RXFLR (PKUNITY_I2C_BASE + 0x0078) -/* - * Enable Status Reg I2C_ENSTATUS. 
- */ -#define I2C_ENSTATUS (PKUNITY_I2C_BASE + 0x009C) - -#define I2C_CON_MASTER FIELD(1, 1, 0) -#define I2C_CON_SPEED_STD FIELD(1, 2, 1) -#define I2C_CON_SPEED_FAST FIELD(2, 2, 1) -#define I2C_CON_RESTART FIELD(1, 1, 5) -#define I2C_CON_SLAVEDISABLE FIELD(1, 1, 6) - -#define I2C_DATACMD_READ FIELD(1, 1, 8) -#define I2C_DATACMD_WRITE FIELD(0, 1, 8) -#define I2C_DATACMD_DAT_MASK FMASK(8, 0) -#define I2C_DATACMD_DAT(v) FIELD((v), 8, 0) - -#define I2C_ENABLE_ENABLE FIELD(1, 1, 0) -#define I2C_ENABLE_DISABLE FIELD(0, 1, 0) - -#define I2C_STATUS_RFF FIELD(1, 1, 4) -#define I2C_STATUS_RFNE FIELD(1, 1, 3) -#define I2C_STATUS_TFE FIELD(1, 1, 2) -#define I2C_STATUS_TFNF FIELD(1, 1, 1) -#define I2C_STATUS_ACTIVITY FIELD(1, 1, 0) - -#define I2C_ENSTATUS_ENABLE FIELD(1, 1, 0) - -#define I2C_TAR_THERMAL 0x4f -#define I2C_TAR_SPD 0x50 -#define I2C_TAR_PWIC 0x55 -#define I2C_TAR_EEPROM 0x57 diff --git a/arch/unicore32/include/mach/regs-intc.h b/arch/unicore32/include/mach/regs-intc.h deleted file mode 100644 index 4eb1b5b571bb..000000000000 --- a/arch/unicore32/include/mach/regs-intc.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUNITY Interrupt Controller (INTC) Registers - */ -/* - * INTC Level Reg INTC_ICLR. - */ -#define INTC_ICLR (PKUNITY_INTC_BASE + 0x0000) -/* - * INTC Mask Reg INTC_ICMR. - */ -#define INTC_ICMR (PKUNITY_INTC_BASE + 0x0004) -/* - * INTC Pending Reg INTC_ICPR. - */ -#define INTC_ICPR (PKUNITY_INTC_BASE + 0x0008) -/* - * INTC IRQ Pending Reg INTC_ICIP. - */ -#define INTC_ICIP (PKUNITY_INTC_BASE + 0x000C) -/* - * INTC REAL Pending Reg INTC_ICFP. - */ -#define INTC_ICFP (PKUNITY_INTC_BASE + 0x0010) -/* - * INTC Control Reg INTC_ICCR. 
- */ -#define INTC_ICCR (PKUNITY_INTC_BASE + 0x0014) - diff --git a/arch/unicore32/include/mach/regs-nand.h b/arch/unicore32/include/mach/regs-nand.h deleted file mode 100644 index 7f29939251ef..000000000000 --- a/arch/unicore32/include/mach/regs-nand.h +++ /dev/null @@ -1,80 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUnity NAND Controller Registers - */ -/* - * ID Reg. 0 NAND_IDR0 - */ -#define NAND_IDR0 (PKUNITY_NAND_BASE + 0x0000) -/* - * ID Reg. 1 NAND_IDR1 - */ -#define NAND_IDR1 (PKUNITY_NAND_BASE + 0x0004) -/* - * ID Reg. 2 NAND_IDR2 - */ -#define NAND_IDR2 (PKUNITY_NAND_BASE + 0x0008) -/* - * ID Reg. 3 NAND_IDR3 - */ -#define NAND_IDR3 (PKUNITY_NAND_BASE + 0x000C) -/* - * Page Address Reg 0 NAND_PAR0 - */ -#define NAND_PAR0 (PKUNITY_NAND_BASE + 0x0010) -/* - * Page Address Reg 1 NAND_PAR1 - */ -#define NAND_PAR1 (PKUNITY_NAND_BASE + 0x0014) -/* - * Page Address Reg 2 NAND_PAR2 - */ -#define NAND_PAR2 (PKUNITY_NAND_BASE + 0x0018) -/* - * ECC Enable Reg NAND_ECCEN - */ -#define NAND_ECCEN (PKUNITY_NAND_BASE + 0x001C) -/* - * Buffer Reg NAND_BUF - */ -#define NAND_BUF (PKUNITY_NAND_BASE + 0x0020) -/* - * ECC Status Reg NAND_ECCSR - */ -#define NAND_ECCSR (PKUNITY_NAND_BASE + 0x0024) -/* - * Command Reg NAND_CMD - */ -#define NAND_CMD (PKUNITY_NAND_BASE + 0x0028) -/* - * DMA Configure Reg NAND_DMACR - */ -#define NAND_DMACR (PKUNITY_NAND_BASE + 0x002C) -/* - * Interrupt Reg NAND_IR - */ -#define NAND_IR (PKUNITY_NAND_BASE + 0x0030) -/* - * Interrupt Mask Reg NAND_IMR - */ -#define NAND_IMR (PKUNITY_NAND_BASE + 0x0034) -/* - * Chip Enable Reg NAND_CHIPEN - */ -#define NAND_CHIPEN (PKUNITY_NAND_BASE + 0x0038) -/* - * Address Reg NAND_ADDR - */ -#define NAND_ADDR (PKUNITY_NAND_BASE + 0x003C) - -/* - * Command bits NAND_CMD_CMD_MASK - */ -#define NAND_CMD_CMD_MASK FMASK(4, 4) -#define NAND_CMD_CMD_READPAGE FIELD(0x0, 4, 4) -#define NAND_CMD_CMD_ERASEBLOCK FIELD(0x6, 4, 4) -#define NAND_CMD_CMD_READSTATUS FIELD(0x7, 4, 4) -#define 
NAND_CMD_CMD_WRITEPAGE FIELD(0x8, 4, 4) -#define NAND_CMD_CMD_READID FIELD(0x9, 4, 4) -#define NAND_CMD_CMD_RESET FIELD(0xf, 4, 4) - diff --git a/arch/unicore32/include/mach/regs-ost.h b/arch/unicore32/include/mach/regs-ost.h deleted file mode 100644 index 6c63e7b7569e..000000000000 --- a/arch/unicore32/include/mach/regs-ost.h +++ /dev/null @@ -1,91 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUnity Operating System Timer (OST) Registers - */ -/* - * Match Reg 0 OST_OSMR0 - */ -#define OST_OSMR0 (PKUNITY_OST_BASE + 0x0000) -/* - * Match Reg 1 OST_OSMR1 - */ -#define OST_OSMR1 (PKUNITY_OST_BASE + 0x0004) -/* - * Match Reg 2 OST_OSMR2 - */ -#define OST_OSMR2 (PKUNITY_OST_BASE + 0x0008) -/* - * Match Reg 3 OST_OSMR3 - */ -#define OST_OSMR3 (PKUNITY_OST_BASE + 0x000C) -/* - * Counter Reg OST_OSCR - */ -#define OST_OSCR (PKUNITY_OST_BASE + 0x0010) -/* - * Status Reg OST_OSSR - */ -#define OST_OSSR (PKUNITY_OST_BASE + 0x0014) -/* - * Watchdog Enable Reg OST_OWER - */ -#define OST_OWER (PKUNITY_OST_BASE + 0x0018) -/* - * Interrupt Enable Reg OST_OIER - */ -#define OST_OIER (PKUNITY_OST_BASE + 0x001C) - -/* - * PWM Registers: IO base address: PKUNITY_OST_BASE + 0x80 - * PWCR: Pulse Width Control Reg - * DCCR: Duty Cycle Control Reg - * PCR: Period Control Reg - */ -#define OST_PWM_PWCR (0x00) -#define OST_PWM_DCCR (0x04) -#define OST_PWM_PCR (0x08) - -/* - * Match detected 0 OST_OSSR_M0 - */ -#define OST_OSSR_M0 FIELD(1, 1, 0) -/* - * Match detected 1 OST_OSSR_M1 - */ -#define OST_OSSR_M1 FIELD(1, 1, 1) -/* - * Match detected 2 OST_OSSR_M2 - */ -#define OST_OSSR_M2 FIELD(1, 1, 2) -/* - * Match detected 3 OST_OSSR_M3 - */ -#define OST_OSSR_M3 FIELD(1, 1, 3) - -/* - * Interrupt enable 0 OST_OIER_E0 - */ -#define OST_OIER_E0 FIELD(1, 1, 0) -/* - * Interrupt enable 1 OST_OIER_E1 - */ -#define OST_OIER_E1 FIELD(1, 1, 1) -/* - * Interrupt enable 2 OST_OIER_E2 - */ -#define OST_OIER_E2 FIELD(1, 1, 2) -/* - * Interrupt enable 3 OST_OIER_E3 - */ -#define OST_OIER_E3 
FIELD(1, 1, 3) - -/* - * Watchdog Match Enable OST_OWER_WME - */ -#define OST_OWER_WME FIELD(1, 1, 0) - -/* - * PWM Full Duty Cycle OST_PWMDCCR_FDCYCLE - */ -#define OST_PWMDCCR_FDCYCLE FIELD(1, 1, 10) - diff --git a/arch/unicore32/include/mach/regs-pci.h b/arch/unicore32/include/mach/regs-pci.h deleted file mode 100644 index 25bb307b87c3..000000000000 --- a/arch/unicore32/include/mach/regs-pci.h +++ /dev/null @@ -1,95 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUnity AHB-PCI Bridge Registers - */ - -/* - * AHB/PCI fixed physical address for pci addess configuration - */ -/* - * PCICFG Bridge Base Reg. - */ -#define PCICFG_BRIBASE (PKUNITY_PCICFG_BASE + 0x0000) -/* - * PCICFG Address Reg. - */ -#define PCICFG_ADDR (PKUNITY_PCICFG_BASE + 0x0004) -/* - * PCICFG Address Reg. - */ -#define PCICFG_DATA (PKUNITY_PCICFG_BASE + 0x0008) - -/* - * PCI Bridge configuration space - */ -#define PCIBRI_ID (PKUNITY_PCIBRI_BASE + 0x0000) -#define PCIBRI_CMD (PKUNITY_PCIBRI_BASE + 0x0004) -#define PCIBRI_CLASS (PKUNITY_PCIBRI_BASE + 0x0008) -#define PCIBRI_LTR (PKUNITY_PCIBRI_BASE + 0x000C) -#define PCIBRI_BAR0 (PKUNITY_PCIBRI_BASE + 0x0010) -#define PCIBRI_BAR1 (PKUNITY_PCIBRI_BASE + 0x0014) -#define PCIBRI_BAR2 (PKUNITY_PCIBRI_BASE + 0x0018) -#define PCIBRI_BAR3 (PKUNITY_PCIBRI_BASE + 0x001C) -#define PCIBRI_BAR4 (PKUNITY_PCIBRI_BASE + 0x0020) -#define PCIBRI_BAR5 (PKUNITY_PCIBRI_BASE + 0x0024) - -#define PCIBRI_PCICTL0 (PKUNITY_PCIBRI_BASE + 0x0100) -#define PCIBRI_PCIBAR0 (PKUNITY_PCIBRI_BASE + 0x0104) -#define PCIBRI_PCIAMR0 (PKUNITY_PCIBRI_BASE + 0x0108) -#define PCIBRI_PCITAR0 (PKUNITY_PCIBRI_BASE + 0x010C) -#define PCIBRI_PCICTL1 (PKUNITY_PCIBRI_BASE + 0x0110) -#define PCIBRI_PCIBAR1 (PKUNITY_PCIBRI_BASE + 0x0114) -#define PCIBRI_PCIAMR1 (PKUNITY_PCIBRI_BASE + 0x0118) -#define PCIBRI_PCITAR1 (PKUNITY_PCIBRI_BASE + 0x011C) -#define PCIBRI_PCICTL2 (PKUNITY_PCIBRI_BASE + 0x0120) -#define PCIBRI_PCIBAR2 (PKUNITY_PCIBRI_BASE + 0x0124) -#define PCIBRI_PCIAMR2 
(PKUNITY_PCIBRI_BASE + 0x0128) -#define PCIBRI_PCITAR2 (PKUNITY_PCIBRI_BASE + 0x012C) -#define PCIBRI_PCICTL3 (PKUNITY_PCIBRI_BASE + 0x0130) -#define PCIBRI_PCIBAR3 (PKUNITY_PCIBRI_BASE + 0x0134) -#define PCIBRI_PCIAMR3 (PKUNITY_PCIBRI_BASE + 0x0138) -#define PCIBRI_PCITAR3 (PKUNITY_PCIBRI_BASE + 0x013C) -#define PCIBRI_PCICTL4 (PKUNITY_PCIBRI_BASE + 0x0140) -#define PCIBRI_PCIBAR4 (PKUNITY_PCIBRI_BASE + 0x0144) -#define PCIBRI_PCIAMR4 (PKUNITY_PCIBRI_BASE + 0x0148) -#define PCIBRI_PCITAR4 (PKUNITY_PCIBRI_BASE + 0x014C) -#define PCIBRI_PCICTL5 (PKUNITY_PCIBRI_BASE + 0x0150) -#define PCIBRI_PCIBAR5 (PKUNITY_PCIBRI_BASE + 0x0154) -#define PCIBRI_PCIAMR5 (PKUNITY_PCIBRI_BASE + 0x0158) -#define PCIBRI_PCITAR5 (PKUNITY_PCIBRI_BASE + 0x015C) - -#define PCIBRI_AHBCTL0 (PKUNITY_PCIBRI_BASE + 0x0180) -#define PCIBRI_AHBBAR0 (PKUNITY_PCIBRI_BASE + 0x0184) -#define PCIBRI_AHBAMR0 (PKUNITY_PCIBRI_BASE + 0x0188) -#define PCIBRI_AHBTAR0 (PKUNITY_PCIBRI_BASE + 0x018C) -#define PCIBRI_AHBCTL1 (PKUNITY_PCIBRI_BASE + 0x0190) -#define PCIBRI_AHBBAR1 (PKUNITY_PCIBRI_BASE + 0x0194) -#define PCIBRI_AHBAMR1 (PKUNITY_PCIBRI_BASE + 0x0198) -#define PCIBRI_AHBTAR1 (PKUNITY_PCIBRI_BASE + 0x019C) -#define PCIBRI_AHBCTL2 (PKUNITY_PCIBRI_BASE + 0x01A0) -#define PCIBRI_AHBBAR2 (PKUNITY_PCIBRI_BASE + 0x01A4) -#define PCIBRI_AHBAMR2 (PKUNITY_PCIBRI_BASE + 0x01A8) -#define PCIBRI_AHBTAR2 (PKUNITY_PCIBRI_BASE + 0x01AC) -#define PCIBRI_AHBCTL3 (PKUNITY_PCIBRI_BASE + 0x01B0) -#define PCIBRI_AHBBAR3 (PKUNITY_PCIBRI_BASE + 0x01B4) -#define PCIBRI_AHBAMR3 (PKUNITY_PCIBRI_BASE + 0x01B8) -#define PCIBRI_AHBTAR3 (PKUNITY_PCIBRI_BASE + 0x01BC) -#define PCIBRI_AHBCTL4 (PKUNITY_PCIBRI_BASE + 0x01C0) -#define PCIBRI_AHBBAR4 (PKUNITY_PCIBRI_BASE + 0x01C4) -#define PCIBRI_AHBAMR4 (PKUNITY_PCIBRI_BASE + 0x01C8) -#define PCIBRI_AHBTAR4 (PKUNITY_PCIBRI_BASE + 0x01CC) -#define PCIBRI_AHBCTL5 (PKUNITY_PCIBRI_BASE + 0x01D0) -#define PCIBRI_AHBBAR5 (PKUNITY_PCIBRI_BASE + 0x01D4) -#define PCIBRI_AHBAMR5 
(PKUNITY_PCIBRI_BASE + 0x01D8) -#define PCIBRI_AHBTAR5 (PKUNITY_PCIBRI_BASE + 0x01DC) - -#define PCIBRI_CTLx_AT FIELD(1, 1, 2) -#define PCIBRI_CTLx_PREF FIELD(1, 1, 1) -#define PCIBRI_CTLx_MRL FIELD(1, 1, 0) - -#define PCIBRI_BARx_ADDR FIELD(0xFFFFFFFC, 30, 2) -#define PCIBRI_BARx_IO FIELD(1, 1, 0) -#define PCIBRI_BARx_MEM FIELD(0, 1, 0) - -#define PCIBRI_CMD_IO FIELD(1, 1, 0) -#define PCIBRI_CMD_MEM FIELD(1, 1, 1) diff --git a/arch/unicore32/include/mach/regs-pm.h b/arch/unicore32/include/mach/regs-pm.h deleted file mode 100644 index 777b1ace39b9..000000000000 --- a/arch/unicore32/include/mach/regs-pm.h +++ /dev/null @@ -1,127 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUNITY Power Manager (PM) Registers - */ -/* - * PM Control Reg PM_PMCR - */ -#define PM_PMCR (PKUNITY_PM_BASE + 0x0000) -/* - * PM General Conf. Reg PM_PGCR - */ -#define PM_PGCR (PKUNITY_PM_BASE + 0x0004) -/* - * PM PLL Conf. Reg PM_PPCR - */ -#define PM_PPCR (PKUNITY_PM_BASE + 0x0008) -/* - * PM Wakeup Enable Reg PM_PWER - */ -#define PM_PWER (PKUNITY_PM_BASE + 0x000C) -/* - * PM GPIO Sleep Status Reg PM_PGSR - */ -#define PM_PGSR (PKUNITY_PM_BASE + 0x0010) -/* - * PM Clock Gate Reg PM_PCGR - */ -#define PM_PCGR (PKUNITY_PM_BASE + 0x0014) -/* - * PM SYS PLL Conf. Reg PM_PLLSYSCFG - */ -#define PM_PLLSYSCFG (PKUNITY_PM_BASE + 0x0018) -/* - * PM DDR PLL Conf. Reg PM_PLLDDRCFG - */ -#define PM_PLLDDRCFG (PKUNITY_PM_BASE + 0x001C) -/* - * PM VGA PLL Conf. Reg PM_PLLVGACFG - */ -#define PM_PLLVGACFG (PKUNITY_PM_BASE + 0x0020) -/* - * PM Div Conf. 
Reg PM_DIVCFG - */ -#define PM_DIVCFG (PKUNITY_PM_BASE + 0x0024) -/* - * PM SYS PLL Status Reg PM_PLLSYSSTATUS - */ -#define PM_PLLSYSSTATUS (PKUNITY_PM_BASE + 0x0028) -/* - * PM DDR PLL Status Reg PM_PLLDDRSTATUS - */ -#define PM_PLLDDRSTATUS (PKUNITY_PM_BASE + 0x002C) -/* - * PM VGA PLL Status Reg PM_PLLVGASTATUS - */ -#define PM_PLLVGASTATUS (PKUNITY_PM_BASE + 0x0030) -/* - * PM Div Status Reg PM_DIVSTATUS - */ -#define PM_DIVSTATUS (PKUNITY_PM_BASE + 0x0034) -/* - * PM Software Reset Reg PM_SWRESET - */ -#define PM_SWRESET (PKUNITY_PM_BASE + 0x0038) -/* - * PM DDR2 PAD Start Reg PM_DDR2START - */ -#define PM_DDR2START (PKUNITY_PM_BASE + 0x003C) -/* - * PM DDR2 PAD Status Reg PM_DDR2CAL0 - */ -#define PM_DDR2CAL0 (PKUNITY_PM_BASE + 0x0040) -/* - * PM PLL DFC Done Reg PM_PLLDFCDONE - */ -#define PM_PLLDFCDONE (PKUNITY_PM_BASE + 0x0044) - -#define PM_PMCR_SFB FIELD(1, 1, 0) -#define PM_PMCR_IFB FIELD(1, 1, 1) -#define PM_PMCR_CFBSYS FIELD(1, 1, 2) -#define PM_PMCR_CFBDDR FIELD(1, 1, 3) -#define PM_PMCR_CFBVGA FIELD(1, 1, 4) -#define PM_PMCR_CFBDIVBCLK FIELD(1, 1, 5) - -/* - * GPIO 8~27 wake-up enable PM_PWER_GPIOHIGH - */ -#define PM_PWER_GPIOHIGH FIELD(1, 1, 8) -/* - * RTC alarm wake-up enable PM_PWER_RTC - */ -#define PM_PWER_RTC FIELD(1, 1, 31) - -#define PM_PCGR_BCLK64DDR FIELD(1, 1, 0) -#define PM_PCGR_BCLK64VGA FIELD(1, 1, 1) -#define PM_PCGR_BCLKDDR FIELD(1, 1, 2) -#define PM_PCGR_BCLKPCI FIELD(1, 1, 4) -#define PM_PCGR_BCLKDMAC FIELD(1, 1, 5) -#define PM_PCGR_BCLKUMAL FIELD(1, 1, 6) -#define PM_PCGR_BCLKUSB FIELD(1, 1, 7) -#define PM_PCGR_BCLKMME FIELD(1, 1, 10) -#define PM_PCGR_BCLKNAND FIELD(1, 1, 11) -#define PM_PCGR_BCLKH264E FIELD(1, 1, 12) -#define PM_PCGR_BCLKVGA FIELD(1, 1, 13) -#define PM_PCGR_BCLKH264D FIELD(1, 1, 14) -#define PM_PCGR_VECLK FIELD(1, 1, 15) -#define PM_PCGR_HECLK FIELD(1, 1, 16) -#define PM_PCGR_HDCLK FIELD(1, 1, 17) -#define PM_PCGR_NANDCLK FIELD(1, 1, 18) -#define PM_PCGR_GECLK FIELD(1, 1, 19) -#define PM_PCGR_VGACLK FIELD(1, 1, 
20) -#define PM_PCGR_PCICLK FIELD(1, 1, 21) -#define PM_PCGR_SATACLK FIELD(1, 1, 25) - -/* - * [23:20]PM_DIVCFG_VGACLK(v) - */ -#define PM_DIVCFG_VGACLK_MASK FMASK(4, 20) -#define PM_DIVCFG_VGACLK(v) FIELD((v), 4, 20) - -#define PM_SWRESET_USB FIELD(1, 1, 6) -#define PM_SWRESET_VGADIV FIELD(1, 1, 26) -#define PM_SWRESET_GEDIV FIELD(1, 1, 27) - -#define PM_PLLDFCDONE_SYSDFC FIELD(1, 1, 0) -#define PM_PLLDFCDONE_DDRDFC FIELD(1, 1, 1) -#define PM_PLLDFCDONE_VGADFC FIELD(1, 1, 2) diff --git a/arch/unicore32/include/mach/regs-ps2.h b/arch/unicore32/include/mach/regs-ps2.h deleted file mode 100644 index d539d7482462..000000000000 --- a/arch/unicore32/include/mach/regs-ps2.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUnity PS2 Controller Registers - */ -/* - * the same as I8042_DATA_REG PS2_DATA - */ -#define PS2_DATA (PKUNITY_PS2_BASE + 0x0060) -/* - * the same as I8042_COMMAND_REG PS2_COMMAND - */ -#define PS2_COMMAND (PKUNITY_PS2_BASE + 0x0064) -/* - * the same as I8042_STATUS_REG PS2_STATUS - */ -#define PS2_STATUS (PKUNITY_PS2_BASE + 0x0064) -/* - * counter reg PS2_CNT - */ -#define PS2_CNT (PKUNITY_PS2_BASE + 0x0068) - diff --git a/arch/unicore32/include/mach/regs-resetc.h b/arch/unicore32/include/mach/regs-resetc.h deleted file mode 100644 index 5f2b9d77a9ec..000000000000 --- a/arch/unicore32/include/mach/regs-resetc.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUnity Reset Controller (RC) Registers - */ -/* - * Software Reset Register - */ -#define RESETC_SWRR (PKUNITY_RESETC_BASE + 0x0000) -/* - * Reset Status Register - */ -#define RESETC_RSSR (PKUNITY_RESETC_BASE + 0x0004) - -/* - * Software Reset Bit - */ -#define RESETC_SWRR_SRB FIELD(1, 1, 0) - -/* - * Hardware Reset - */ -#define RESETC_RSSR_HWR FIELD(1, 1, 0) -/* - * Software Reset - */ -#define RESETC_RSSR_SWR FIELD(1, 1, 1) -/* - * Watchdog Reset - */ -#define RESETC_RSSR_WDR FIELD(1, 1, 2) -/* - * Sleep Mode Reset - */ -#define 
RESETC_RSSR_SMR FIELD(1, 1, 3) - diff --git a/arch/unicore32/include/mach/regs-rtc.h b/arch/unicore32/include/mach/regs-rtc.h deleted file mode 100644 index f2f7f47eb65e..000000000000 --- a/arch/unicore32/include/mach/regs-rtc.h +++ /dev/null @@ -1,38 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUnity Real-Time Clock (RTC) control registers - */ -/* - * RTC Alarm Reg RTC_RTAR - */ -#define RTC_RTAR (PKUNITY_RTC_BASE + 0x0000) -/* - * RTC Count Reg RTC_RCNR - */ -#define RTC_RCNR (PKUNITY_RTC_BASE + 0x0004) -/* - * RTC Trim Reg RTC_RTTR - */ -#define RTC_RTTR (PKUNITY_RTC_BASE + 0x0008) -/* - * RTC Status Reg RTC_RTSR - */ -#define RTC_RTSR (PKUNITY_RTC_BASE + 0x0010) - -/* - * ALarm detected RTC_RTSR_AL - */ -#define RTC_RTSR_AL FIELD(1, 1, 0) -/* - * 1 Hz clock detected RTC_RTSR_HZ - */ -#define RTC_RTSR_HZ FIELD(1, 1, 1) -/* - * ALarm interrupt Enable RTC_RTSR_ALE - */ -#define RTC_RTSR_ALE FIELD(1, 1, 2) -/* - * 1 Hz clock interrupt Enable RTC_RTSR_HZE - */ -#define RTC_RTSR_HZE FIELD(1, 1, 3) - diff --git a/arch/unicore32/include/mach/regs-sdc.h b/arch/unicore32/include/mach/regs-sdc.h deleted file mode 100644 index 658bfaf4cb3c..000000000000 --- a/arch/unicore32/include/mach/regs-sdc.h +++ /dev/null @@ -1,157 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUnity Multi-Media Card and Security Digital Card (MMC/SD) Registers - */ -/* - * Clock Control Reg SDC_CCR - */ -#define SDC_CCR (PKUNITY_SDC_BASE + 0x0000) -/* - * Software Reset Reg SDC_SRR - */ -#define SDC_SRR (PKUNITY_SDC_BASE + 0x0004) -/* - * Argument Reg SDC_ARGUMENT - */ -#define SDC_ARGUMENT (PKUNITY_SDC_BASE + 0x0008) -/* - * Command Reg SDC_COMMAND - */ -#define SDC_COMMAND (PKUNITY_SDC_BASE + 0x000C) -/* - * Block Size Reg SDC_BLOCKSIZE - */ -#define SDC_BLOCKSIZE (PKUNITY_SDC_BASE + 0x0010) -/* - * Block Cound Reg SDC_BLOCKCOUNT - */ -#define SDC_BLOCKCOUNT (PKUNITY_SDC_BASE + 0x0014) -/* - * Transfer Mode Reg SDC_TMR - */ -#define SDC_TMR (PKUNITY_SDC_BASE + 0x0018) -/* 
- * Response Reg. 0 SDC_RES0 - */ -#define SDC_RES0 (PKUNITY_SDC_BASE + 0x001C) -/* - * Response Reg. 1 SDC_RES1 - */ -#define SDC_RES1 (PKUNITY_SDC_BASE + 0x0020) -/* - * Response Reg. 2 SDC_RES2 - */ -#define SDC_RES2 (PKUNITY_SDC_BASE + 0x0024) -/* - * Response Reg. 3 SDC_RES3 - */ -#define SDC_RES3 (PKUNITY_SDC_BASE + 0x0028) -/* - * Read Timeout Control Reg SDC_RTCR - */ -#define SDC_RTCR (PKUNITY_SDC_BASE + 0x002C) -/* - * Interrupt Status Reg SDC_ISR - */ -#define SDC_ISR (PKUNITY_SDC_BASE + 0x0030) -/* - * Interrupt Status Mask Reg SDC_ISMR - */ -#define SDC_ISMR (PKUNITY_SDC_BASE + 0x0034) -/* - * RX FIFO SDC_RXFIFO - */ -#define SDC_RXFIFO (PKUNITY_SDC_BASE + 0x0038) -/* - * TX FIFO SDC_TXFIFO - */ -#define SDC_TXFIFO (PKUNITY_SDC_BASE + 0x003C) - -/* - * SD Clock Enable SDC_CCR_CLKEN - */ -#define SDC_CCR_CLKEN FIELD(1, 1, 2) -/* - * [15:8] SDC_CCR_PDIV(v) - */ -#define SDC_CCR_PDIV(v) FIELD((v), 8, 8) - -/* - * Software reset enable SDC_SRR_ENABLE - */ -#define SDC_SRR_ENABLE FIELD(0, 1, 0) -/* - * Software reset disable SDC_SRR_DISABLE - */ -#define SDC_SRR_DISABLE FIELD(1, 1, 0) - -/* - * Response type SDC_COMMAND_RESTYPE_MASK - */ -#define SDC_COMMAND_RESTYPE_MASK FMASK(2, 0) -/* - * No response SDC_COMMAND_RESTYPE_NONE - */ -#define SDC_COMMAND_RESTYPE_NONE FIELD(0, 2, 0) -/* - * 136-bit long response SDC_COMMAND_RESTYPE_LONG - */ -#define SDC_COMMAND_RESTYPE_LONG FIELD(1, 2, 0) -/* - * 48-bit short response SDC_COMMAND_RESTYPE_SHORT - */ -#define SDC_COMMAND_RESTYPE_SHORT FIELD(2, 2, 0) -/* - * 48-bit short and test if busy response SDC_COMMAND_RESTYPE_SHORTBUSY - */ -#define SDC_COMMAND_RESTYPE_SHORTBUSY FIELD(3, 2, 0) -/* - * data ready SDC_COMMAND_DATAREADY - */ -#define SDC_COMMAND_DATAREADY FIELD(1, 1, 2) -#define SDC_COMMAND_CMDEN FIELD(1, 1, 3) -/* - * [10:5] SDC_COMMAND_CMDINDEX(v) - */ -#define SDC_COMMAND_CMDINDEX(v) FIELD((v), 6, 5) - -/* - * [10:0] SDC_BLOCKSIZE_BSMASK(v) - */ -#define SDC_BLOCKSIZE_BSMASK(v) FIELD((v), 11, 0) -/* - * 
[11:0] SDC_BLOCKCOUNT_BCMASK(v) - */ -#define SDC_BLOCKCOUNT_BCMASK(v) FIELD((v), 12, 0) - -/* - * Data Width 1bit SDC_TMR_WTH_1BIT - */ -#define SDC_TMR_WTH_1BIT FIELD(0, 1, 0) -/* - * Data Width 4bit SDC_TMR_WTH_4BIT - */ -#define SDC_TMR_WTH_4BIT FIELD(1, 1, 0) -/* - * Read SDC_TMR_DIR_READ - */ -#define SDC_TMR_DIR_READ FIELD(0, 1, 1) -/* - * Write SDC_TMR_DIR_WRITE - */ -#define SDC_TMR_DIR_WRITE FIELD(1, 1, 1) - -#define SDC_IR_MASK FMASK(13, 0) -#define SDC_IR_RESTIMEOUT FIELD(1, 1, 0) -#define SDC_IR_WRITECRC FIELD(1, 1, 1) -#define SDC_IR_READCRC FIELD(1, 1, 2) -#define SDC_IR_TXFIFOREAD FIELD(1, 1, 3) -#define SDC_IR_RXFIFOWRITE FIELD(1, 1, 4) -#define SDC_IR_READTIMEOUT FIELD(1, 1, 5) -#define SDC_IR_DATACOMPLETE FIELD(1, 1, 6) -#define SDC_IR_CMDCOMPLETE FIELD(1, 1, 7) -#define SDC_IR_RXFIFOFULL FIELD(1, 1, 8) -#define SDC_IR_RXFIFOEMPTY FIELD(1, 1, 9) -#define SDC_IR_TXFIFOFULL FIELD(1, 1, 10) -#define SDC_IR_TXFIFOEMPTY FIELD(1, 1, 11) -#define SDC_IR_ENDCMDWITHRES FIELD(1, 1, 12) diff --git a/arch/unicore32/include/mach/regs-spi.h b/arch/unicore32/include/mach/regs-spi.h deleted file mode 100644 index 3460647a9c2a..000000000000 --- a/arch/unicore32/include/mach/regs-spi.h +++ /dev/null @@ -1,99 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUnity Serial Peripheral Interface (SPI) Registers - */ -/* - * Control reg. 0 SPI_CR0 - */ -#define SPI_CR0 (PKUNITY_SPI_BASE + 0x0000) -/* - * Control reg. 
1 SPI_CR1 - */ -#define SPI_CR1 (PKUNITY_SPI_BASE + 0x0004) -/* - * Enable reg SPI_SSIENR - */ -#define SPI_SSIENR (PKUNITY_SPI_BASE + 0x0008) -/* - * Status reg SPI_SR - */ -#define SPI_SR (PKUNITY_SPI_BASE + 0x0028) -/* - * Interrupt Mask reg SPI_IMR - */ -#define SPI_IMR (PKUNITY_SPI_BASE + 0x002C) -/* - * Interrupt Status reg SPI_ISR - */ -#define SPI_ISR (PKUNITY_SPI_BASE + 0x0030) - -/* - * Enable SPI Controller SPI_SSIENR_EN - */ -#define SPI_SSIENR_EN FIELD(1, 1, 0) - -/* - * SPI Busy SPI_SR_BUSY - */ -#define SPI_SR_BUSY FIELD(1, 1, 0) -/* - * Transmit FIFO Not Full SPI_SR_TFNF - */ -#define SPI_SR_TFNF FIELD(1, 1, 1) -/* - * Transmit FIFO Empty SPI_SR_TFE - */ -#define SPI_SR_TFE FIELD(1, 1, 2) -/* - * Receive FIFO Not Empty SPI_SR_RFNE - */ -#define SPI_SR_RFNE FIELD(1, 1, 3) -/* - * Receive FIFO Full SPI_SR_RFF - */ -#define SPI_SR_RFF FIELD(1, 1, 4) - -/* - * Trans. FIFO Empty Interrupt Status SPI_ISR_TXEIS - */ -#define SPI_ISR_TXEIS FIELD(1, 1, 0) -/* - * Trans. FIFO Overflow Interrupt Status SPI_ISR_TXOIS - */ -#define SPI_ISR_TXOIS FIELD(1, 1, 1) -/* - * Receiv. FIFO Underflow Interrupt Status SPI_ISR_RXUIS - */ -#define SPI_ISR_RXUIS FIELD(1, 1, 2) -/* - * Receiv. FIFO Overflow Interrupt Status SPI_ISR_RXOIS - */ -#define SPI_ISR_RXOIS FIELD(1, 1, 3) -/* - * Receiv. FIFO Full Interrupt Status SPI_ISR_RXFIS - */ -#define SPI_ISR_RXFIS FIELD(1, 1, 4) -#define SPI_ISR_MSTIS FIELD(1, 1, 5) - -/* - * Trans. FIFO Empty Interrupt Mask SPI_IMR_TXEIM - */ -#define SPI_IMR_TXEIM FIELD(1, 1, 0) -/* - * Trans. FIFO Overflow Interrupt Mask SPI_IMR_TXOIM - */ -#define SPI_IMR_TXOIM FIELD(1, 1, 1) -/* - * Receiv. FIFO Underflow Interrupt Mask SPI_IMR_RXUIM - */ -#define SPI_IMR_RXUIM FIELD(1, 1, 2) -/* - * Receiv. FIFO Overflow Interrupt Mask SPI_IMR_RXOIM - */ -#define SPI_IMR_RXOIM FIELD(1, 1, 3) -/* - * Receiv. 
FIFO Full Interrupt Mask SPI_IMR_RXFIM - */ -#define SPI_IMR_RXFIM FIELD(1, 1, 4) -#define SPI_IMR_MSTIM FIELD(1, 1, 5) - diff --git a/arch/unicore32/include/mach/regs-uart.h b/arch/unicore32/include/mach/regs-uart.h deleted file mode 100644 index 9fa6b1938b77..000000000000 --- a/arch/unicore32/include/mach/regs-uart.h +++ /dev/null @@ -1,3 +0,0 @@ -/* - * PKUnity Universal Asynchronous Receiver/Transmitter (UART) Registers - */ diff --git a/arch/unicore32/include/mach/regs-umal.h b/arch/unicore32/include/mach/regs-umal.h deleted file mode 100644 index 7023089c61c6..000000000000 --- a/arch/unicore32/include/mach/regs-umal.h +++ /dev/null @@ -1,230 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUnity Ultra Media Access Layer (UMAL) Ethernet MAC Registers - */ - -/* MAC module of UMAL */ -/* UMAL's MAC module includes G/MII interface, several additional PHY - * interfaces, and MAC control sub-layer, which provides support for control - * frames (e.g. PAUSE frames). - */ -/* - * TX/RX reset and control UMAL_CFG1 - */ -#define UMAL_CFG1 (PKUNITY_UMAL_BASE + 0x0000) -/* - * MAC interface mode control UMAL_CFG2 - */ -#define UMAL_CFG2 (PKUNITY_UMAL_BASE + 0x0004) -/* - * Inter Packet/Frame Gap UMAL_IPGIFG - */ -#define UMAL_IPGIFG (PKUNITY_UMAL_BASE + 0x0008) -/* - * Collision retry or backoff UMAL_HALFDUPLEX - */ -#define UMAL_HALFDUPLEX (PKUNITY_UMAL_BASE + 0x000c) -/* - * Maximum Frame Length UMAL_MAXFRAME - */ -#define UMAL_MAXFRAME (PKUNITY_UMAL_BASE + 0x0010) -/* - * Test Regsiter UMAL_TESTREG - */ -#define UMAL_TESTREG (PKUNITY_UMAL_BASE + 0x001c) -/* - * MII Management Configure UMAL_MIICFG - */ -#define UMAL_MIICFG (PKUNITY_UMAL_BASE + 0x0020) -/* - * MII Management Command UMAL_MIICMD - */ -#define UMAL_MIICMD (PKUNITY_UMAL_BASE + 0x0024) -/* - * MII Management Address UMAL_MIIADDR - */ -#define UMAL_MIIADDR (PKUNITY_UMAL_BASE + 0x0028) -/* - * MII Management Control UMAL_MIICTRL - */ -#define UMAL_MIICTRL (PKUNITY_UMAL_BASE + 0x002c) -/* - * MII 
Management Status UMAL_MIISTATUS - */ -#define UMAL_MIISTATUS (PKUNITY_UMAL_BASE + 0x0030) -/* - * MII Management Indicator UMAL_MIIIDCT - */ -#define UMAL_MIIIDCT (PKUNITY_UMAL_BASE + 0x0034) -/* - * Interface Control UMAL_IFCTRL - */ -#define UMAL_IFCTRL (PKUNITY_UMAL_BASE + 0x0038) -/* - * Interface Status UMAL_IFSTATUS - */ -#define UMAL_IFSTATUS (PKUNITY_UMAL_BASE + 0x003c) -/* - * MAC address (high 4 bytes) UMAL_STADDR1 - */ -#define UMAL_STADDR1 (PKUNITY_UMAL_BASE + 0x0040) -/* - * MAC address (low 2 bytes) UMAL_STADDR2 - */ -#define UMAL_STADDR2 (PKUNITY_UMAL_BASE + 0x0044) - -/* FIFO MODULE OF UMAL */ -/* UMAL's FIFO module provides data queuing for increased system level - * throughput - */ -#define UMAL_FIFOCFG0 (PKUNITY_UMAL_BASE + 0x0048) -#define UMAL_FIFOCFG1 (PKUNITY_UMAL_BASE + 0x004c) -#define UMAL_FIFOCFG2 (PKUNITY_UMAL_BASE + 0x0050) -#define UMAL_FIFOCFG3 (PKUNITY_UMAL_BASE + 0x0054) -#define UMAL_FIFOCFG4 (PKUNITY_UMAL_BASE + 0x0058) -#define UMAL_FIFOCFG5 (PKUNITY_UMAL_BASE + 0x005c) -#define UMAL_FIFORAM0 (PKUNITY_UMAL_BASE + 0x0060) -#define UMAL_FIFORAM1 (PKUNITY_UMAL_BASE + 0x0064) -#define UMAL_FIFORAM2 (PKUNITY_UMAL_BASE + 0x0068) -#define UMAL_FIFORAM3 (PKUNITY_UMAL_BASE + 0x006c) -#define UMAL_FIFORAM4 (PKUNITY_UMAL_BASE + 0x0070) -#define UMAL_FIFORAM5 (PKUNITY_UMAL_BASE + 0x0074) -#define UMAL_FIFORAM6 (PKUNITY_UMAL_BASE + 0x0078) -#define UMAL_FIFORAM7 (PKUNITY_UMAL_BASE + 0x007c) - -/* MAHBE MODULE OF UMAL */ -/* UMAL's MAHBE module interfaces to the host system through 32-bit AHB Master - * and Slave ports.Registers within the M-AHBE provide Control and Status - * information concerning these transfers. 
- */ -/* - * Transmit Control UMAL_DMATxCtrl - */ -#define UMAL_DMATxCtrl (PKUNITY_UMAL_BASE + 0x0180) -/* - * Pointer to TX Descripter UMAL_DMATxDescriptor - */ -#define UMAL_DMATxDescriptor (PKUNITY_UMAL_BASE + 0x0184) -/* - * Status of Tx Packet Transfers UMAL_DMATxStatus - */ -#define UMAL_DMATxStatus (PKUNITY_UMAL_BASE + 0x0188) -/* - * Receive Control UMAL_DMARxCtrl - */ -#define UMAL_DMARxCtrl (PKUNITY_UMAL_BASE + 0x018c) -/* - * Pointer to Rx Descriptor UMAL_DMARxDescriptor - */ -#define UMAL_DMARxDescriptor (PKUNITY_UMAL_BASE + 0x0190) -/* - * Status of Rx Packet Transfers UMAL_DMARxStatus - */ -#define UMAL_DMARxStatus (PKUNITY_UMAL_BASE + 0x0194) -/* - * Interrupt Mask UMAL_DMAIntrMask - */ -#define UMAL_DMAIntrMask (PKUNITY_UMAL_BASE + 0x0198) -/* - * Interrupts, read only UMAL_DMAInterrupt - */ -#define UMAL_DMAInterrupt (PKUNITY_UMAL_BASE + 0x019c) - -/* - * Commands for UMAL_CFG1 register - */ -#define UMAL_CFG1_TXENABLE FIELD(1, 1, 0) -#define UMAL_CFG1_RXENABLE FIELD(1, 1, 2) -#define UMAL_CFG1_TXFLOWCTL FIELD(1, 1, 4) -#define UMAL_CFG1_RXFLOWCTL FIELD(1, 1, 5) -#define UMAL_CFG1_CONFLPBK FIELD(1, 1, 8) -#define UMAL_CFG1_RESET FIELD(1, 1, 31) -#define UMAL_CFG1_CONFFLCTL (MAC_TX_FLOW_CTL | MAC_RX_FLOW_CTL) - -/* - * Commands for UMAL_CFG2 register - */ -#define UMAL_CFG2_FULLDUPLEX FIELD(1, 1, 0) -#define UMAL_CFG2_CRCENABLE FIELD(1, 1, 1) -#define UMAL_CFG2_PADCRC FIELD(1, 1, 2) -#define UMAL_CFG2_LENGTHCHECK FIELD(1, 1, 4) -#define UMAL_CFG2_MODEMASK FMASK(2, 8) -#define UMAL_CFG2_NIBBLEMODE FIELD(1, 2, 8) -#define UMAL_CFG2_BYTEMODE FIELD(2, 2, 8) -#define UMAL_CFG2_PREAMBLENMASK FMASK(4, 12) -#define UMAL_CFG2_DEFPREAMBLEN FIELD(7, 4, 12) -#define UMAL_CFG2_FD100 (UMAL_CFG2_DEFPREAMBLEN | UMAL_CFG2_NIBBLEMODE \ - | UMAL_CFG2_LENGTHCHECK | UMAL_CFG2_PADCRC \ - | UMAL_CFG2_CRCENABLE | UMAL_CFG2_FULLDUPLEX) -#define UMAL_CFG2_FD1000 (UMAL_CFG2_DEFPREAMBLEN | UMAL_CFG2_BYTEMODE \ - | UMAL_CFG2_LENGTHCHECK | UMAL_CFG2_PADCRC \ - | 
UMAL_CFG2_CRCENABLE | UMAL_CFG2_FULLDUPLEX) -#define UMAL_CFG2_HD100 (UMAL_CFG2_DEFPREAMBLEN | UMAL_CFG2_NIBBLEMODE \ - | UMAL_CFG2_LENGTHCHECK | UMAL_CFG2_PADCRC \ - | UMAL_CFG2_CRCENABLE) - -/* - * Command for UMAL_IFCTRL register - */ -#define UMAL_IFCTRL_RESET FIELD(1, 1, 31) - -/* - * Command for UMAL_MIICFG register - */ -#define UMAL_MIICFG_RESET FIELD(1, 1, 31) - -/* - * Command for UMAL_MIICMD register - */ -#define UMAL_MIICMD_READ FIELD(1, 1, 0) - -/* - * Command for UMAL_MIIIDCT register - */ -#define UMAL_MIIIDCT_BUSY FIELD(1, 1, 0) -#define UMAL_MIIIDCT_NOTVALID FIELD(1, 1, 2) - -/* - * Commands for DMATxCtrl regesters - */ -#define UMAL_DMA_Enable FIELD(1, 1, 0) - -/* - * Commands for DMARxCtrl regesters - */ -#define UMAL_DMAIntrMask_ENABLEHALFWORD FIELD(1, 1, 16) - -/* - * Command for DMARxStatus - */ -#define CLR_RX_BUS_ERR FIELD(1, 1, 3) -#define CLR_RX_OVERFLOW FIELD(1, 1, 2) -#define CLR_RX_PKT FIELD(1, 1, 0) - -/* - * Command for DMATxStatus - */ -#define CLR_TX_BUS_ERR FIELD(1, 1, 3) -#define CLR_TX_UNDERRUN FIELD(1, 1, 1) -#define CLR_TX_PKT FIELD(1, 1, 0) - -/* - * Commands for DMAIntrMask and DMAInterrupt register - */ -#define INT_RX_MASK FIELD(0xd, 4, 4) -#define INT_TX_MASK FIELD(0xb, 4, 0) - -#define INT_RX_BUS_ERR FIELD(1, 1, 7) -#define INT_RX_OVERFLOW FIELD(1, 1, 6) -#define INT_RX_PKT FIELD(1, 1, 4) -#define INT_TX_BUS_ERR FIELD(1, 1, 3) -#define INT_TX_UNDERRUN FIELD(1, 1, 1) -#define INT_TX_PKT FIELD(1, 1, 0) - -/* - * MARCOS of UMAL's descriptors - */ -#define UMAL_DESC_PACKETSIZE_EMPTY FIELD(1, 1, 31) -#define UMAL_DESC_PACKETSIZE_NONEMPTY FIELD(0, 1, 31) -#define UMAL_DESC_PACKETSIZE_SIZEMASK FMASK(12, 0) - diff --git a/arch/unicore32/include/mach/regs-unigfx.h b/arch/unicore32/include/mach/regs-unigfx.h deleted file mode 100644 index 553d1157c6b2..000000000000 --- a/arch/unicore32/include/mach/regs-unigfx.h +++ /dev/null @@ -1,201 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUnity UNIGFX Registers - */ - -#define 
UDE_BASE (PKUNITY_UNIGFX_BASE + 0x1400) -#define UGE_BASE (PKUNITY_UNIGFX_BASE + 0x0000) - -/* - * command reg for UNIGFX DE - */ -/* - * control reg UDE_CFG - */ -#define UDE_CFG (UDE_BASE + 0x0000) -/* - * framebuffer start address reg UDE_FSA - */ -#define UDE_FSA (UDE_BASE + 0x0004) -/* - * line size reg UDE_LS - */ -#define UDE_LS (UDE_BASE + 0x0008) -/* - * pitch size reg UDE_PS - */ -#define UDE_PS (UDE_BASE + 0x000C) -/* - * horizontal active time reg UDE_HAT - */ -#define UDE_HAT (UDE_BASE + 0x0010) -/* - * horizontal blank time reg UDE_HBT - */ -#define UDE_HBT (UDE_BASE + 0x0014) -/* - * horizontal sync time reg UDE_HST - */ -#define UDE_HST (UDE_BASE + 0x0018) -/* - * vertival active time reg UDE_VAT - */ -#define UDE_VAT (UDE_BASE + 0x001C) -/* - * vertival blank time reg UDE_VBT - */ -#define UDE_VBT (UDE_BASE + 0x0020) -/* - * vertival sync time reg UDE_VST - */ -#define UDE_VST (UDE_BASE + 0x0024) -/* - * cursor position UDE_CXY - */ -#define UDE_CXY (UDE_BASE + 0x0028) -/* - * cursor front color UDE_CC0 - */ -#define UDE_CC0 (UDE_BASE + 0x002C) -/* - * cursor background color UDE_CC1 - */ -#define UDE_CC1 (UDE_BASE + 0x0030) -/* - * video position UDE_VXY - */ -#define UDE_VXY (UDE_BASE + 0x0034) -/* - * video start address reg UDE_VSA - */ -#define UDE_VSA (UDE_BASE + 0x0040) -/* - * video size reg UDE_VS - */ -#define UDE_VS (UDE_BASE + 0x004C) - -/* - * command reg for UNIGFX GE - */ -/* - * src xy reg UGE_SRCXY - */ -#define UGE_SRCXY (UGE_BASE + 0x0000) -/* - * dst xy reg UGE_DSTXY - */ -#define UGE_DSTXY (UGE_BASE + 0x0004) -/* - * pitch reg UGE_PITCH - */ -#define UGE_PITCH (UGE_BASE + 0x0008) -/* - * src start reg UGE_SRCSTART - */ -#define UGE_SRCSTART (UGE_BASE + 0x000C) -/* - * dst start reg UGE_DSTSTART - */ -#define UGE_DSTSTART (UGE_BASE + 0x0010) -/* - * width height reg UGE_WIDHEIGHT - */ -#define UGE_WIDHEIGHT (UGE_BASE + 0x0014) -/* - * rop alpah reg UGE_ROPALPHA - */ -#define UGE_ROPALPHA (UGE_BASE + 0x0018) -/* - * front color 
UGE_FCOLOR - */ -#define UGE_FCOLOR (UGE_BASE + 0x001C) -/* - * background color UGE_BCOLOR - */ -#define UGE_BCOLOR (UGE_BASE + 0x0020) -/* - * src color key for high value UGE_SCH - */ -#define UGE_SCH (UGE_BASE + 0x0024) -/* - * dst color key for high value UGE_DCH - */ -#define UGE_DCH (UGE_BASE + 0x0028) -/* - * src color key for low value UGE_SCL - */ -#define UGE_SCL (UGE_BASE + 0x002C) -/* - * dst color key for low value UGE_DCL - */ -#define UGE_DCL (UGE_BASE + 0x0030) -/* - * clip 0 reg UGE_CLIP0 - */ -#define UGE_CLIP0 (UGE_BASE + 0x0034) -/* - * clip 1 reg UGE_CLIP1 - */ -#define UGE_CLIP1 (UGE_BASE + 0x0038) -/* - * command reg UGE_COMMAND - */ -#define UGE_COMMAND (UGE_BASE + 0x003C) -/* - * pattern 0 UGE_P0 - */ -#define UGE_P0 (UGE_BASE + 0x0040) -#define UGE_P1 (UGE_BASE + 0x0044) -#define UGE_P2 (UGE_BASE + 0x0048) -#define UGE_P3 (UGE_BASE + 0x004C) -#define UGE_P4 (UGE_BASE + 0x0050) -#define UGE_P5 (UGE_BASE + 0x0054) -#define UGE_P6 (UGE_BASE + 0x0058) -#define UGE_P7 (UGE_BASE + 0x005C) -#define UGE_P8 (UGE_BASE + 0x0060) -#define UGE_P9 (UGE_BASE + 0x0064) -#define UGE_P10 (UGE_BASE + 0x0068) -#define UGE_P11 (UGE_BASE + 0x006C) -#define UGE_P12 (UGE_BASE + 0x0070) -#define UGE_P13 (UGE_BASE + 0x0074) -#define UGE_P14 (UGE_BASE + 0x0078) -#define UGE_P15 (UGE_BASE + 0x007C) -#define UGE_P16 (UGE_BASE + 0x0080) -#define UGE_P17 (UGE_BASE + 0x0084) -#define UGE_P18 (UGE_BASE + 0x0088) -#define UGE_P19 (UGE_BASE + 0x008C) -#define UGE_P20 (UGE_BASE + 0x0090) -#define UGE_P21 (UGE_BASE + 0x0094) -#define UGE_P22 (UGE_BASE + 0x0098) -#define UGE_P23 (UGE_BASE + 0x009C) -#define UGE_P24 (UGE_BASE + 0x00A0) -#define UGE_P25 (UGE_BASE + 0x00A4) -#define UGE_P26 (UGE_BASE + 0x00A8) -#define UGE_P27 (UGE_BASE + 0x00AC) -#define UGE_P28 (UGE_BASE + 0x00B0) -#define UGE_P29 (UGE_BASE + 0x00B4) -#define UGE_P30 (UGE_BASE + 0x00B8) -#define UGE_P31 (UGE_BASE + 0x00BC) - -#define UDE_CFG_DST_MASK FMASK(2, 8) -#define UDE_CFG_DST8 FIELD(0x0, 2, 8) -#define 
UDE_CFG_DST16 FIELD(0x1, 2, 8) -#define UDE_CFG_DST24 FIELD(0x2, 2, 8) -#define UDE_CFG_DST32 FIELD(0x3, 2, 8) - -/* - * GDEN enable UDE_CFG_GDEN_ENABLE - */ -#define UDE_CFG_GDEN_ENABLE FIELD(1, 1, 3) -/* - * VDEN enable UDE_CFG_VDEN_ENABLE - */ -#define UDE_CFG_VDEN_ENABLE FIELD(1, 1, 4) -/* - * CDEN enable UDE_CFG_CDEN_ENABLE - */ -#define UDE_CFG_CDEN_ENABLE FIELD(1, 1, 5) -/* - * TIMEUP enable UDE_CFG_TIMEUP_ENABLE - */ -#define UDE_CFG_TIMEUP_ENABLE FIELD(1, 1, 6) diff --git a/arch/unicore32/include/mach/uncompress.h b/arch/unicore32/include/mach/uncompress.h deleted file mode 100644 index 0c1a56a1913f..000000000000 --- a/arch/unicore32/include/mach/uncompress.h +++ /dev/null @@ -1,31 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/mach/uncompress.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#ifndef __MACH_PUV3_UNCOMPRESS_H__ -#define __MACH_PUV3_UNCOMPRESS_H__ - -#include -#include - -extern char input_data[]; -extern char input_data_end[]; - -static void arch_decomp_puts(const char *ptr) -{ - char c; - - while ((c = *ptr++) != '\0') { - if (c == '\n') - putc('\r'); - putc(c); - } -} -#define ARCH_HAVE_DECOMP_PUTS - -#endif /* __MACH_PUV3_UNCOMPRESS_H__ */ diff --git a/arch/unicore32/include/uapi/asm/Kbuild b/arch/unicore32/include/uapi/asm/Kbuild deleted file mode 100644 index e78470141932..000000000000 --- a/arch/unicore32/include/uapi/asm/Kbuild +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -generic-y += ucontext.h diff --git a/arch/unicore32/include/uapi/asm/byteorder.h b/arch/unicore32/include/uapi/asm/byteorder.h deleted file mode 100644 index 864fe4814cf4..000000000000 --- a/arch/unicore32/include/uapi/asm/byteorder.h +++ /dev/null @@ -1,25 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * linux/arch/unicore32/include/asm/byteorder.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * 
Copyright (C) 2001-2010 GUAN Xue-tao - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * UniCore ONLY support Little Endian mode, the data bus is connected such - * that byte accesses appear as: - * 0 = d0...d7, 1 = d8...d15, 2 = d16...d23, 3 = d24...d31 - * and word accesses (data or instruction) appear as: - * d0...d31 - */ -#ifndef __UNICORE_BYTEORDER_H__ -#define __UNICORE_BYTEORDER_H__ - -#include - -#endif - diff --git a/arch/unicore32/include/uapi/asm/ptrace.h b/arch/unicore32/include/uapi/asm/ptrace.h deleted file mode 100644 index 2820de83e37d..000000000000 --- a/arch/unicore32/include/uapi/asm/ptrace.h +++ /dev/null @@ -1,91 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * linux/arch/unicore32/include/asm/ptrace.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ -#ifndef _UAPI__UNICORE_PTRACE_H__ -#define _UAPI__UNICORE_PTRACE_H__ - -#define PTRACE_GET_THREAD_AREA 22 - -/* - * PSR bits - */ -#define USER_MODE 0x00000010 -#define REAL_MODE 0x00000011 -#define INTR_MODE 0x00000012 -#define PRIV_MODE 0x00000013 -#define ABRT_MODE 0x00000017 -#define EXTN_MODE 0x0000001b -#define SUSR_MODE 0x0000001f -#define MODE_MASK 0x0000001f -#define PSR_R_BIT 0x00000040 -#define PSR_I_BIT 0x00000080 -#define PSR_V_BIT 0x10000000 -#define PSR_C_BIT 0x20000000 -#define PSR_Z_BIT 0x40000000 -#define PSR_S_BIT 0x80000000 - -/* - * Groups of PSR bits - */ -#define PSR_f 0xff000000 /* Flags */ -#define PSR_c 0x000000ff /* Control */ - -#ifndef __ASSEMBLY__ - -/* - * This struct defines the way the registers are stored on the - * stack during a system call. Note that sizeof(struct pt_regs) - * has to be a multiple of 8. - */ -struct pt_regs { - unsigned long uregs[34]; -}; - -#define UCreg_asr uregs[32] -#define UCreg_pc uregs[31] -#define UCreg_lr uregs[30] -#define UCreg_sp uregs[29] -#define UCreg_ip uregs[28] -#define UCreg_fp uregs[27] -#define UCreg_26 uregs[26] -#define UCreg_25 uregs[25] -#define UCreg_24 uregs[24] -#define UCreg_23 uregs[23] -#define UCreg_22 uregs[22] -#define UCreg_21 uregs[21] -#define UCreg_20 uregs[20] -#define UCreg_19 uregs[19] -#define UCreg_18 uregs[18] -#define UCreg_17 uregs[17] -#define UCreg_16 uregs[16] -#define UCreg_15 uregs[15] -#define UCreg_14 uregs[14] -#define UCreg_13 uregs[13] -#define UCreg_12 uregs[12] -#define UCreg_11 uregs[11] -#define UCreg_10 uregs[10] -#define UCreg_09 uregs[9] -#define UCreg_08 uregs[8] -#define UCreg_07 uregs[7] -#define UCreg_06 uregs[6] -#define UCreg_05 uregs[5] -#define UCreg_04 uregs[4] -#define UCreg_03 uregs[3] -#define UCreg_02 uregs[2] -#define UCreg_01 uregs[1] -#define UCreg_00 uregs[0] -#define UCreg_ORIG_00 uregs[33] - - -#endif /* __ASSEMBLY__ */ - -#endif /* _UAPI__UNICORE_PTRACE_H__ */ diff --git a/arch/unicore32/include/uapi/asm/sigcontext.h 
b/arch/unicore32/include/uapi/asm/sigcontext.h deleted file mode 100644 index 79e56f28e4b5..000000000000 --- a/arch/unicore32/include/uapi/asm/sigcontext.h +++ /dev/null @@ -1,30 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * linux/arch/unicore32/include/asm/sigcontext.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#ifndef __UNICORE_SIGCONTEXT_H__ -#define __UNICORE_SIGCONTEXT_H__ - -#include -/* - * Signal context structure - contains all info to do with the state - * before the signal handler was invoked. Note: only add new entries - * to the end of the structure. - */ -struct sigcontext { - unsigned long trap_no; - unsigned long error_code; - unsigned long oldmask; - unsigned long fault_address; - struct pt_regs regs; -}; - -#endif diff --git a/arch/unicore32/include/uapi/asm/unistd.h b/arch/unicore32/include/uapi/asm/unistd.h deleted file mode 100644 index 54a7378a70b1..000000000000 --- a/arch/unicore32/include/uapi/asm/unistd.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * linux/arch/unicore32/include/asm/unistd.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#define __ARCH_WANT_RENAMEAT -#define __ARCH_WANT_SET_GET_RLIMIT -#define __ARCH_WANT_STAT64 -#define __ARCH_WANT_TIME32_SYSCALLS - -/* Use the standard ABI for syscalls. 
*/ -#include -#define __ARCH_WANT_SYS_CLONE diff --git a/arch/unicore32/kernel/Makefile b/arch/unicore32/kernel/Makefile deleted file mode 100644 index 2f79aa56735b..000000000000 --- a/arch/unicore32/kernel/Makefile +++ /dev/null @@ -1,31 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile for the linux kernel. -# - -# Object file lists. -obj-y := dma.o elf.o entry.o process.o ptrace.o -obj-y += setup.o signal.o sys.o stacktrace.o traps.o - -obj-$(CONFIG_MODULES) += ksyms.o module.o -obj-$(CONFIG_EARLY_PRINTK) += early_printk.o - -obj-$(CONFIG_UNICORE_FPU_F64) += fpu-ucf64.o - -# obj-y for architecture PKUnity v3 -obj-$(CONFIG_ARCH_PUV3) += clock.o irq.o time.o - -obj-$(CONFIG_PUV3_GPIO) += gpio.o -obj-$(CONFIG_PUV3_PM) += pm.o sleep.o -obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate_asm.o - -obj-$(CONFIG_PCI) += pci.o - -# obj-y for specific machines -obj-$(CONFIG_ARCH_PUV3) += puv3-core.o -obj-$(CONFIG_PUV3_NB0916) += puv3-nb0916.o - -head-y := head.o -obj-$(CONFIG_DEBUG_LL) += debug.o - -extra-y := $(head-y) vmlinux.lds diff --git a/arch/unicore32/kernel/asm-offsets.c b/arch/unicore32/kernel/asm-offsets.c deleted file mode 100644 index f7d672267549..000000000000 --- a/arch/unicore32/kernel/asm-offsets.c +++ /dev/null @@ -1,108 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/asm-offsets.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * Generate definitions needed by assembly language modules. - * This code generates raw asm output which is post-processed to extract - * and format the required data. - */ -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * GCC 3.0, 3.1: general bad code generation. - * GCC 3.2.0: incorrect function argument offset calculation. 
- * GCC 3.2.x: miscompiles NEW_AUX_ENT in fs/binfmt_elf.c - * (http://gcc.gnu.org/PR8896) and incorrect structure - * initialisation in fs/jffs2/erase.c - */ -#if (__GNUC__ < 4) -#error Your compiler should upgrade to uc4 -#error Known good compilers: 4.2.2 -#endif - -int main(void) -{ - DEFINE(TSK_ACTIVE_MM, offsetof(struct task_struct, active_mm)); - BLANK(); - DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); - DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); - DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); - DEFINE(TI_TASK, offsetof(struct thread_info, task)); - DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); - DEFINE(TI_CPU_SAVE, offsetof(struct thread_info, cpu_context)); - DEFINE(TI_USED_CP, offsetof(struct thread_info, used_cp)); -#ifdef CONFIG_UNICORE_FPU_F64 - DEFINE(TI_FPSTATE, offsetof(struct thread_info, fpstate)); -#endif - BLANK(); - DEFINE(S_R0, offsetof(struct pt_regs, UCreg_00)); - DEFINE(S_R1, offsetof(struct pt_regs, UCreg_01)); - DEFINE(S_R2, offsetof(struct pt_regs, UCreg_02)); - DEFINE(S_R3, offsetof(struct pt_regs, UCreg_03)); - DEFINE(S_R4, offsetof(struct pt_regs, UCreg_04)); - DEFINE(S_R5, offsetof(struct pt_regs, UCreg_05)); - DEFINE(S_R6, offsetof(struct pt_regs, UCreg_06)); - DEFINE(S_R7, offsetof(struct pt_regs, UCreg_07)); - DEFINE(S_R8, offsetof(struct pt_regs, UCreg_08)); - DEFINE(S_R9, offsetof(struct pt_regs, UCreg_09)); - DEFINE(S_R10, offsetof(struct pt_regs, UCreg_10)); - DEFINE(S_R11, offsetof(struct pt_regs, UCreg_11)); - DEFINE(S_R12, offsetof(struct pt_regs, UCreg_12)); - DEFINE(S_R13, offsetof(struct pt_regs, UCreg_13)); - DEFINE(S_R14, offsetof(struct pt_regs, UCreg_14)); - DEFINE(S_R15, offsetof(struct pt_regs, UCreg_15)); - DEFINE(S_R16, offsetof(struct pt_regs, UCreg_16)); - DEFINE(S_R17, offsetof(struct pt_regs, UCreg_17)); - DEFINE(S_R18, offsetof(struct pt_regs, UCreg_18)); - DEFINE(S_R19, offsetof(struct pt_regs, UCreg_19)); - DEFINE(S_R20, offsetof(struct pt_regs, 
UCreg_20)); - DEFINE(S_R21, offsetof(struct pt_regs, UCreg_21)); - DEFINE(S_R22, offsetof(struct pt_regs, UCreg_22)); - DEFINE(S_R23, offsetof(struct pt_regs, UCreg_23)); - DEFINE(S_R24, offsetof(struct pt_regs, UCreg_24)); - DEFINE(S_R25, offsetof(struct pt_regs, UCreg_25)); - DEFINE(S_R26, offsetof(struct pt_regs, UCreg_26)); - DEFINE(S_FP, offsetof(struct pt_regs, UCreg_fp)); - DEFINE(S_IP, offsetof(struct pt_regs, UCreg_ip)); - DEFINE(S_SP, offsetof(struct pt_regs, UCreg_sp)); - DEFINE(S_LR, offsetof(struct pt_regs, UCreg_lr)); - DEFINE(S_PC, offsetof(struct pt_regs, UCreg_pc)); - DEFINE(S_PSR, offsetof(struct pt_regs, UCreg_asr)); - DEFINE(S_OLD_R0, offsetof(struct pt_regs, UCreg_ORIG_00)); - DEFINE(S_FRAME_SIZE, sizeof(struct pt_regs)); - BLANK(); - DEFINE(VMA_VM_MM, offsetof(struct vm_area_struct, vm_mm)); - DEFINE(VMA_VM_FLAGS, offsetof(struct vm_area_struct, vm_flags)); - BLANK(); - DEFINE(VM_EXEC, VM_EXEC); - BLANK(); - DEFINE(PAGE_SZ, PAGE_SIZE); - BLANK(); - DEFINE(SYS_ERROR0, 0x9f0000); - BLANK(); - DEFINE(PBE_ADDRESS, offsetof(struct pbe, address)); - DEFINE(PBE_ORIN_ADDRESS, offsetof(struct pbe, orig_address)); - DEFINE(PBE_NEXT, offsetof(struct pbe, next)); - DEFINE(SWSUSP_CPU, offsetof(struct swsusp_arch_regs, \ - cpu_context)); -#ifdef CONFIG_UNICORE_FPU_F64 - DEFINE(SWSUSP_FPSTATE, offsetof(struct swsusp_arch_regs, \ - fpstate)); -#endif - BLANK(); - DEFINE(DMA_BIDIRECTIONAL, DMA_BIDIRECTIONAL); - DEFINE(DMA_TO_DEVICE, DMA_TO_DEVICE); - DEFINE(DMA_FROM_DEVICE, DMA_FROM_DEVICE); - return 0; -} diff --git a/arch/unicore32/kernel/clock.c b/arch/unicore32/kernel/clock.c deleted file mode 100644 index 41df6be0a3b2..000000000000 --- a/arch/unicore32/kernel/clock.c +++ /dev/null @@ -1,387 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/clock.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Maintained by GUAN Xue-tao - * Copyright (C) 2001-2010 Guan Xuetao - */ -#include -#include -#include -#include 
-#include -#include -#include -#include -#include -#include -#include - -#include - -/* - * Very simple clock implementation - */ -struct clk { - struct list_head node; - unsigned long rate; - const char *name; -}; - -static struct clk clk_ost_clk = { - .name = "OST_CLK", - .rate = CLOCK_TICK_RATE, -}; - -static struct clk clk_mclk_clk = { - .name = "MAIN_CLK", -}; - -static struct clk clk_bclk32_clk = { - .name = "BUS32_CLK", -}; - -static struct clk clk_ddr_clk = { - .name = "DDR_CLK", -}; - -static struct clk clk_vga_clk = { - .name = "VGA_CLK", -}; - -static LIST_HEAD(clocks); -static DEFINE_MUTEX(clocks_mutex); - -struct clk *clk_get(struct device *dev, const char *id) -{ - struct clk *p, *clk = ERR_PTR(-ENOENT); - - mutex_lock(&clocks_mutex); - list_for_each_entry(p, &clocks, node) { - if (strcmp(id, p->name) == 0) { - clk = p; - break; - } - } - mutex_unlock(&clocks_mutex); - - return clk; -} -EXPORT_SYMBOL(clk_get); - -void clk_put(struct clk *clk) -{ -} -EXPORT_SYMBOL(clk_put); - -int clk_enable(struct clk *clk) -{ - return 0; -} -EXPORT_SYMBOL(clk_enable); - -void clk_disable(struct clk *clk) -{ -} -EXPORT_SYMBOL(clk_disable); - -unsigned long clk_get_rate(struct clk *clk) -{ - return clk->rate; -} -EXPORT_SYMBOL(clk_get_rate); - -struct { - unsigned long rate; - unsigned long cfg; - unsigned long div; -} vga_clk_table[] = { - {.rate = 25175000, .cfg = 0x00002001, .div = 0x9}, - {.rate = 31500000, .cfg = 0x00002001, .div = 0x7}, - {.rate = 40000000, .cfg = 0x00003801, .div = 0x9}, - {.rate = 49500000, .cfg = 0x00003801, .div = 0x7}, - {.rate = 65000000, .cfg = 0x00002c01, .div = 0x4}, - {.rate = 78750000, .cfg = 0x00002400, .div = 0x7}, - {.rate = 108000000, .cfg = 0x00002c01, .div = 0x2}, - {.rate = 106500000, .cfg = 0x00003c01, .div = 0x3}, - {.rate = 50650000, .cfg = 0x00106400, .div = 0x9}, - {.rate = 61500000, .cfg = 0x00106400, .div = 0xa}, - {.rate = 85500000, .cfg = 0x00002800, .div = 0x6}, -}; - -struct { - unsigned long mrate; - unsigned long 
prate; -} mclk_clk_table[] = { - {.mrate = 500000000, .prate = 0x00109801}, - {.mrate = 525000000, .prate = 0x00104C00}, - {.mrate = 550000000, .prate = 0x00105000}, - {.mrate = 575000000, .prate = 0x00105400}, - {.mrate = 600000000, .prate = 0x00105800}, - {.mrate = 625000000, .prate = 0x00105C00}, - {.mrate = 650000000, .prate = 0x00106000}, - {.mrate = 675000000, .prate = 0x00106400}, - {.mrate = 700000000, .prate = 0x00106800}, - {.mrate = 725000000, .prate = 0x00106C00}, - {.mrate = 750000000, .prate = 0x00107000}, - {.mrate = 775000000, .prate = 0x00107400}, - {.mrate = 800000000, .prate = 0x00107800}, -}; - -int clk_set_rate(struct clk *clk, unsigned long rate) -{ - if (clk == &clk_vga_clk) { - unsigned long pll_vgacfg, pll_vgadiv; - int ret, i; - - /* lookup vga_clk_table */ - ret = -EINVAL; - for (i = 0; i < ARRAY_SIZE(vga_clk_table); i++) { - if (rate == vga_clk_table[i].rate) { - pll_vgacfg = vga_clk_table[i].cfg; - pll_vgadiv = vga_clk_table[i].div; - ret = 0; - break; - } - } - - if (ret) - return ret; - - if (readl(PM_PLLVGACFG) == pll_vgacfg) - return 0; - - /* set pll vga cfg reg. */ - writel(pll_vgacfg, PM_PLLVGACFG); - - writel(PM_PMCR_CFBVGA, PM_PMCR); - while ((readl(PM_PLLDFCDONE) & PM_PLLDFCDONE_VGADFC) - != PM_PLLDFCDONE_VGADFC) - udelay(100); /* about 1ms */ - - /* set div cfg reg. 
*/ - writel(readl(PM_PCGR) | PM_PCGR_VGACLK, PM_PCGR); - - writel((readl(PM_DIVCFG) & ~PM_DIVCFG_VGACLK_MASK) - | PM_DIVCFG_VGACLK(pll_vgadiv), PM_DIVCFG); - - writel(readl(PM_SWRESET) | PM_SWRESET_VGADIV, PM_SWRESET); - while ((readl(PM_SWRESET) & PM_SWRESET_VGADIV) - == PM_SWRESET_VGADIV) - udelay(100); /* 65536 bclk32, about 320us */ - - writel(readl(PM_PCGR) & ~PM_PCGR_VGACLK, PM_PCGR); - } -#ifdef CONFIG_CPU_FREQ - if (clk == &clk_mclk_clk) { - u32 pll_rate, divstatus = readl(PM_DIVSTATUS); - int ret, i; - - /* lookup mclk_clk_table */ - ret = -EINVAL; - for (i = 0; i < ARRAY_SIZE(mclk_clk_table); i++) { - if (rate == mclk_clk_table[i].mrate) { - pll_rate = mclk_clk_table[i].prate; - clk_mclk_clk.rate = mclk_clk_table[i].mrate; - ret = 0; - break; - } - } - - if (ret) - return ret; - - if (clk_mclk_clk.rate) - clk_bclk32_clk.rate = clk_mclk_clk.rate - / (((divstatus & 0x0000f000) >> 12) + 1); - - /* set pll sys cfg reg. */ - writel(pll_rate, PM_PLLSYSCFG); - - writel(PM_PMCR_CFBSYS, PM_PMCR); - while ((readl(PM_PLLDFCDONE) & PM_PLLDFCDONE_SYSDFC) - != PM_PLLDFCDONE_SYSDFC) - udelay(100); - /* about 1ms */ - } -#endif - return 0; -} -EXPORT_SYMBOL(clk_set_rate); - -int clk_register(struct clk *clk) -{ - mutex_lock(&clocks_mutex); - list_add(&clk->node, &clocks); - mutex_unlock(&clocks_mutex); - printk(KERN_DEFAULT "PKUnity PM: %s %lu.%02luM\n", clk->name, - (clk->rate)/1000000, (clk->rate)/10000 % 100); - return 0; -} -EXPORT_SYMBOL(clk_register); - -void clk_unregister(struct clk *clk) -{ - mutex_lock(&clocks_mutex); - list_del(&clk->node); - mutex_unlock(&clocks_mutex); -} -EXPORT_SYMBOL(clk_unregister); - -struct { - unsigned long prate; - unsigned long rate; -} pllrate_table[] = { - {.prate = 0x00002001, .rate = 250000000}, - {.prate = 0x00104801, .rate = 250000000}, - {.prate = 0x00104C01, .rate = 262500000}, - {.prate = 0x00002401, .rate = 275000000}, - {.prate = 0x00105001, .rate = 275000000}, - {.prate = 0x00105401, .rate = 287500000}, - {.prate = 
0x00002801, .rate = 300000000}, - {.prate = 0x00105801, .rate = 300000000}, - {.prate = 0x00105C01, .rate = 312500000}, - {.prate = 0x00002C01, .rate = 325000000}, - {.prate = 0x00106001, .rate = 325000000}, - {.prate = 0x00106401, .rate = 337500000}, - {.prate = 0x00003001, .rate = 350000000}, - {.prate = 0x00106801, .rate = 350000000}, - {.prate = 0x00106C01, .rate = 362500000}, - {.prate = 0x00003401, .rate = 375000000}, - {.prate = 0x00107001, .rate = 375000000}, - {.prate = 0x00107401, .rate = 387500000}, - {.prate = 0x00003801, .rate = 400000000}, - {.prate = 0x00107801, .rate = 400000000}, - {.prate = 0x00107C01, .rate = 412500000}, - {.prate = 0x00003C01, .rate = 425000000}, - {.prate = 0x00108001, .rate = 425000000}, - {.prate = 0x00108401, .rate = 437500000}, - {.prate = 0x00004001, .rate = 450000000}, - {.prate = 0x00108801, .rate = 450000000}, - {.prate = 0x00108C01, .rate = 462500000}, - {.prate = 0x00004401, .rate = 475000000}, - {.prate = 0x00109001, .rate = 475000000}, - {.prate = 0x00109401, .rate = 487500000}, - {.prate = 0x00004801, .rate = 500000000}, - {.prate = 0x00109801, .rate = 500000000}, - {.prate = 0x00104C00, .rate = 525000000}, - {.prate = 0x00002400, .rate = 550000000}, - {.prate = 0x00105000, .rate = 550000000}, - {.prate = 0x00105400, .rate = 575000000}, - {.prate = 0x00002800, .rate = 600000000}, - {.prate = 0x00105800, .rate = 600000000}, - {.prate = 0x00105C00, .rate = 625000000}, - {.prate = 0x00002C00, .rate = 650000000}, - {.prate = 0x00106000, .rate = 650000000}, - {.prate = 0x00106400, .rate = 675000000}, - {.prate = 0x00003000, .rate = 700000000}, - {.prate = 0x00106800, .rate = 700000000}, - {.prate = 0x00106C00, .rate = 725000000}, - {.prate = 0x00003400, .rate = 750000000}, - {.prate = 0x00107000, .rate = 750000000}, - {.prate = 0x00107400, .rate = 775000000}, - {.prate = 0x00003800, .rate = 800000000}, - {.prate = 0x00107800, .rate = 800000000}, - {.prate = 0x00107C00, .rate = 825000000}, - {.prate = 0x00003C00, .rate = 
850000000}, - {.prate = 0x00108000, .rate = 850000000}, - {.prate = 0x00108400, .rate = 875000000}, - {.prate = 0x00004000, .rate = 900000000}, - {.prate = 0x00108800, .rate = 900000000}, - {.prate = 0x00108C00, .rate = 925000000}, - {.prate = 0x00004400, .rate = 950000000}, - {.prate = 0x00109000, .rate = 950000000}, - {.prate = 0x00109400, .rate = 975000000}, - {.prate = 0x00004800, .rate = 1000000000}, - {.prate = 0x00109800, .rate = 1000000000}, -}; - -struct { - unsigned long prate; - unsigned long drate; -} pddr_table[] = { - {.prate = 0x00100800, .drate = 44236800}, - {.prate = 0x00100C00, .drate = 66355200}, - {.prate = 0x00101000, .drate = 88473600}, - {.prate = 0x00101400, .drate = 110592000}, - {.prate = 0x00101800, .drate = 132710400}, - {.prate = 0x00101C01, .drate = 154828800}, - {.prate = 0x00102001, .drate = 176947200}, - {.prate = 0x00102401, .drate = 199065600}, - {.prate = 0x00102801, .drate = 221184000}, - {.prate = 0x00102C01, .drate = 243302400}, - {.prate = 0x00103001, .drate = 265420800}, - {.prate = 0x00103401, .drate = 287539200}, - {.prate = 0x00103801, .drate = 309657600}, - {.prate = 0x00103C01, .drate = 331776000}, - {.prate = 0x00104001, .drate = 353894400}, -}; - -static int __init clk_init(void) -{ -#ifdef CONFIG_PUV3_PM - u32 pllrate, divstatus = readl(PM_DIVSTATUS); - u32 pcgr_val = readl(PM_PCGR); - int i; - - pcgr_val |= PM_PCGR_BCLKMME | PM_PCGR_BCLKH264E | PM_PCGR_BCLKH264D - | PM_PCGR_HECLK | PM_PCGR_HDCLK; - writel(pcgr_val, PM_PCGR); - - pllrate = readl(PM_PLLSYSSTATUS); - - /* lookup pmclk_table */ - clk_mclk_clk.rate = 0; - for (i = 0; i < ARRAY_SIZE(pllrate_table); i++) { - if (pllrate == pllrate_table[i].prate) { - clk_mclk_clk.rate = pllrate_table[i].rate; - break; - } - } - - if (clk_mclk_clk.rate) - clk_bclk32_clk.rate = clk_mclk_clk.rate / - (((divstatus & 0x0000f000) >> 12) + 1); - - pllrate = readl(PM_PLLDDRSTATUS); - - /* lookup pddr_table */ - clk_ddr_clk.rate = 0; - for (i = 0; i < ARRAY_SIZE(pddr_table); i++) 
{ - if (pllrate == pddr_table[i].prate) { - clk_ddr_clk.rate = pddr_table[i].drate; - break; - } - } - - pllrate = readl(PM_PLLVGASTATUS); - - /* lookup pvga_table */ - clk_vga_clk.rate = 0; - for (i = 0; i < ARRAY_SIZE(pllrate_table); i++) { - if (pllrate == pllrate_table[i].prate) { - clk_vga_clk.rate = pllrate_table[i].rate; - break; - } - } - - if (clk_vga_clk.rate) - clk_vga_clk.rate = clk_vga_clk.rate / - (((divstatus & 0x00f00000) >> 20) + 1); - - clk_register(&clk_vga_clk); -#endif -#ifdef CONFIG_ARCH_FPGA - clk_ddr_clk.rate = 33000000; - clk_mclk_clk.rate = 33000000; - clk_bclk32_clk.rate = 33000000; -#endif - clk_register(&clk_ddr_clk); - clk_register(&clk_mclk_clk); - clk_register(&clk_bclk32_clk); - clk_register(&clk_ost_clk); - return 0; -} -core_initcall(clk_init); diff --git a/arch/unicore32/kernel/debug-macro.S b/arch/unicore32/kernel/debug-macro.S deleted file mode 100644 index 7e2da0de4f71..000000000000 --- a/arch/unicore32/kernel/debug-macro.S +++ /dev/null @@ -1,86 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/kernel/debug-macro.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * Debugging macro include header - */ -#include -#include - - .macro put_word_ocd, rd, rx=r16 -1001: movc \rx, p1.c0, #0 - cand.a \rx, #2 - bne 1001b - movc p1.c1, \rd, #1 - .endm - -#ifdef CONFIG_DEBUG_OCD - /* debug using UniCore On-Chip-Debugger */ - .macro addruart, rx - .endm - - .macro senduart, rd, rx - put_word_ocd \rd, \rx - .endm - - .macro busyuart, rd, rx - .endm - - .macro waituart, rd, rx - .endm -#else -#define UART_CLK_DEFAULT 3686400 * 20 - /* Uartclk = MCLK/ 2, The MCLK on my board is 3686400 * 40 */ -#define BAUD_RATE_DEFAULT 115200 - /* The baud rate of the serial port */ - -#define UART_DIVISOR_DEFAULT (UART_CLK_DEFAULT \ - / (16 * BAUD_RATE_DEFAULT) - 1) - - .macro addruart,rx - mrc p0, #0, \rx, c1, c0 - tst \rx, #1 @ MMU enabled? 
- moveq \rx, #0xee000000 @ physical base address - movne \rx, #0x6e000000 @ virtual address - - @ We probe for the active serial port here - @ However, now we assume UART0 is active: epip4d - @ We assume r1 and r2 can be clobbered. - - movl r2, #UART_DIVISOR_DEFAULT - mov r1, #0x80 - str r1, [\rx, #UART_LCR_OFFSET] - and r1, r2, #0xff00 - mov r1, r1, lsr #8 - str r1, [\rx, #UART_DLH_OFFSET] - and r1, r2, #0xff - str r1, [\rx, #UART_DLL_OFFSET] - mov r1, #0x7 - str r1, [\rx, #UART_FCR_OFFSET] - mov r1, #0x3 - str r1, [\rx, #UART_LCR_OFFSET] - mov r1, #0x0 - str r1, [\rx, #UART_IER_OFFSET] - .endm - - .macro senduart,rd,rx - str \rd, [\rx, #UART_THR_OFFSET] - .endm - - .macro waituart,rd,rx -1001: ldr \rd, [\rx, #UART_LSR_OFFSET] - tst \rd, #UART_LSR_THRE - beq 1001b - .endm - - .macro busyuart,rd,rx -1001: ldr \rd, [\rx, #UART_LSR_OFFSET] - tst \rd, #UART_LSR_TEMT - bne 1001b - .endm -#endif - diff --git a/arch/unicore32/kernel/debug.S b/arch/unicore32/kernel/debug.S deleted file mode 100644 index 13bc8c8550e4..000000000000 --- a/arch/unicore32/kernel/debug.S +++ /dev/null @@ -1,82 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/kernel/debug.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * 32-bit debugging code - */ -#include -#include - - .text - -/* - * Some debugging routines (useful if you've got MM problems and - * printk isn't working). For DEBUGGING ONLY!!! Do not leave - * references to these in a production kernel! 
- */ -#include "debug-macro.S" - -/* - * Useful debugging routines - */ -ENTRY(printhex8) - mov r1, #8 - b printhex -ENDPROC(printhex8) - -ENTRY(printhex4) - mov r1, #4 - b printhex -ENDPROC(printhex4) - -ENTRY(printhex2) - mov r1, #2 -printhex: adr r2, hexbuf - add r3, r2, r1 - mov r1, #0 - stb r1, [r3] -1: and r1, r0, #15 - mov r0, r0 >> #4 - csub.a r1, #10 - beg 2f - add r1, r1, #'0' - 'a' + 10 -2: add r1, r1, #'a' - 10 - stb.w r1, [r3+], #-1 - cxor.a r3, r2 - bne 1b - mov r0, r2 - b printascii -ENDPROC(printhex2) - - .ltorg - -ENTRY(printascii) - addruart r3 - b 2f -1: waituart r2, r3 - senduart r1, r3 - busyuart r2, r3 - cxor.a r1, #'\n' - cmoveq r1, #'\r' - beq 1b -2: cxor.a r0, #0 - beq 3f - ldb.w r1, [r0]+, #1 - cxor.a r1, #0 - bne 1b -3: mov pc, lr -ENDPROC(printascii) - -ENTRY(printch) - addruart r3 - mov r1, r0 - mov r0, #0 - b 1b -ENDPROC(printch) - -hexbuf: .space 16 - diff --git a/arch/unicore32/kernel/dma.c b/arch/unicore32/kernel/dma.c deleted file mode 100644 index 7a0e2d4d6077..000000000000 --- a/arch/unicore32/kernel/dma.c +++ /dev/null @@ -1,179 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/dma.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Maintained by GUAN Xue-tao - * Copyright (C) 2001-2010 Guan Xuetao - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -struct dma_channel { - char *name; - puv3_dma_prio prio; - void (*irq_handler)(int, void *); - void (*err_handler)(int, void *); - void *data; -}; - -static struct dma_channel dma_channels[MAX_DMA_CHANNELS]; - -int puv3_request_dma(char *name, puv3_dma_prio prio, - void (*irq_handler)(int, void *), - void (*err_handler)(int, void *), - void *data) -{ - unsigned long flags; - int i, found = 0; - - /* basic sanity checks */ - if (!name) - return -EINVAL; - - local_irq_save(flags); - - do { - /* try grabbing a DMA channel with the requested priority */ - for (i = 0; i < MAX_DMA_CHANNELS; 
i++) { - if ((dma_channels[i].prio == prio) && - !dma_channels[i].name) { - found = 1; - break; - } - } - /* if requested prio group is full, try a hier priority */ - } while (!found && prio--); - - if (found) { - dma_channels[i].name = name; - dma_channels[i].irq_handler = irq_handler; - dma_channels[i].err_handler = err_handler; - dma_channels[i].data = data; - } else { - printk(KERN_WARNING "No more available DMA channels for %s\n", - name); - i = -ENODEV; - } - - local_irq_restore(flags); - return i; -} -EXPORT_SYMBOL(puv3_request_dma); - -void puv3_free_dma(int dma_ch) -{ - unsigned long flags; - - if (!dma_channels[dma_ch].name) { - printk(KERN_CRIT - "%s: trying to free channel %d which is already freed\n", - __func__, dma_ch); - return; - } - - local_irq_save(flags); - dma_channels[dma_ch].name = NULL; - dma_channels[dma_ch].err_handler = NULL; - local_irq_restore(flags); -} -EXPORT_SYMBOL(puv3_free_dma); - -static irqreturn_t dma_irq_handler(int irq, void *dev_id) -{ - int i, dint; - - dint = readl(DMAC_ITCSR); - for (i = 0; i < MAX_DMA_CHANNELS; i++) { - if (dint & DMAC_CHANNEL(i)) { - struct dma_channel *channel = &dma_channels[i]; - - /* Clear TC interrupt of channel i */ - writel(DMAC_CHANNEL(i), DMAC_ITCCR); - writel(0, DMAC_ITCCR); - - if (channel->name && channel->irq_handler) { - channel->irq_handler(i, channel->data); - } else { - /* - * IRQ for an unregistered DMA channel: - * let's clear the interrupts and disable it. 
- */ - printk(KERN_WARNING "spurious IRQ for" - " DMA channel %d\n", i); - } - } - } - return IRQ_HANDLED; -} - -static irqreturn_t dma_err_handler(int irq, void *dev_id) -{ - int i, dint; - - dint = readl(DMAC_IESR); - for (i = 0; i < MAX_DMA_CHANNELS; i++) { - if (dint & DMAC_CHANNEL(i)) { - struct dma_channel *channel = &dma_channels[i]; - - /* Clear Err interrupt of channel i */ - writel(DMAC_CHANNEL(i), DMAC_IECR); - writel(0, DMAC_IECR); - - if (channel->name && channel->err_handler) { - channel->err_handler(i, channel->data); - } else { - /* - * IRQ for an unregistered DMA channel: - * let's clear the interrupts and disable it. - */ - printk(KERN_WARNING "spurious IRQ for" - " DMA channel %d\n", i); - } - } - } - return IRQ_HANDLED; -} - -int __init puv3_init_dma(void) -{ - int i, ret; - - /* dma channel priorities on v8 processors: - * ch 0 - 1 <--> (0) DMA_PRIO_HIGH - * ch 2 - 3 <--> (1) DMA_PRIO_MEDIUM - * ch 4 - 5 <--> (2) DMA_PRIO_LOW - */ - for (i = 0; i < MAX_DMA_CHANNELS; i++) { - puv3_stop_dma(i); - dma_channels[i].name = NULL; - dma_channels[i].prio = min((i & 0x7) >> 1, DMA_PRIO_LOW); - } - - ret = request_irq(IRQ_DMA, dma_irq_handler, 0, "DMA", NULL); - if (ret) { - printk(KERN_CRIT "Can't register IRQ for DMA\n"); - return ret; - } - - ret = request_irq(IRQ_DMAERR, dma_err_handler, 0, "DMAERR", NULL); - if (ret) { - printk(KERN_CRIT "Can't register IRQ for DMAERR\n"); - free_irq(IRQ_DMA, "DMA"); - return ret; - } - - return 0; -} - -postcore_initcall(puv3_init_dma); diff --git a/arch/unicore32/kernel/early_printk.c b/arch/unicore32/kernel/early_printk.c deleted file mode 100644 index c00b6712b8f7..000000000000 --- a/arch/unicore32/kernel/early_printk.c +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/early_printk.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include -#include - -/* On-Chip-Debugger 
functions */ - -static void early_ocd_write(struct console *con, const char *s, unsigned n) -{ - while (*s && n-- > 0) { - if (*s == '\n') - ocd_putc((int)'\r'); - ocd_putc((int)*s); - s++; - } -} - -static struct console early_ocd_console = { - .name = "earlyocd", - .write = early_ocd_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; - -static int __init setup_early_printk(char *buf) -{ - if (!buf || early_console) - return 0; - - early_console = &early_ocd_console; - if (strstr(buf, "keep")) - early_console->flags &= ~CON_BOOT; - else - early_console->flags |= CON_BOOT; - register_console(early_console); - return 0; -} -early_param("earlyprintk", setup_early_printk); diff --git a/arch/unicore32/kernel/elf.c b/arch/unicore32/kernel/elf.c deleted file mode 100644 index 22adc65a03e9..000000000000 --- a/arch/unicore32/kernel/elf.c +++ /dev/null @@ -1,35 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/elf.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include -#include -#include - -int elf_check_arch(const struct elf32_hdr *x) -{ - /* Make sure it's an UniCore executable */ - if (x->e_machine != EM_UNICORE) - return 0; - - /* Make sure the entry address is reasonable */ - if (x->e_entry & 3) - return 0; - - return 1; -} -EXPORT_SYMBOL(elf_check_arch); - -void elf_set_personality(const struct elf32_hdr *x) -{ - unsigned int personality = PER_LINUX; - - set_personality(personality); -} -EXPORT_SYMBOL(elf_set_personality); diff --git a/arch/unicore32/kernel/entry.S b/arch/unicore32/kernel/entry.S deleted file mode 100644 index b35dc83069cb..000000000000 --- a/arch/unicore32/kernel/entry.S +++ /dev/null @@ -1,802 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/kernel/entry.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * Low-level vector interface routines - */ 
-#include -#include -#include -#include -#include -#include -#include -#include -#include "debug-macro.S" - -@ -@ Most of the stack format comes from struct pt_regs, but with -@ the addition of 8 bytes for storing syscall args 5 and 6. -@ -#define S_OFF 8 - -/* - * The SWI code relies on the fact that R0 is at the bottom of the stack - * (due to slow/fast restore user regs). - */ -#if S_R0 != 0 -#error "Please fix" -#endif - - .macro zero_fp -#ifdef CONFIG_FRAME_POINTER - mov fp, #0 -#endif - .endm - - .macro alignment_trap, rtemp -#ifdef CONFIG_ALIGNMENT_TRAP - ldw \rtemp, .LCcralign - ldw \rtemp, [\rtemp] - movc p0.c1, \rtemp, #0 -#endif - .endm - - .macro load_user_sp_lr, rd, rtemp, offset = 0 - mov \rtemp, asr - xor \rtemp, \rtemp, #(PRIV_MODE ^ SUSR_MODE) - mov.a asr, \rtemp @ switch to the SUSR mode - - ldw sp, [\rd+], #\offset @ load sp_user - ldw lr, [\rd+], #\offset + 4 @ load lr_user - - xor \rtemp, \rtemp, #(PRIV_MODE ^ SUSR_MODE) - mov.a asr, \rtemp @ switch back to the PRIV mode - .endm - - .macro priv_exit, rpsr - mov.a bsr, \rpsr - ldm.w (r0 - r15), [sp]+ - ldm.b (r16 - pc), [sp]+ @ load r0 - pc, asr - .endm - - .macro restore_user_regs, fast = 0, offset = 0 - ldw r1, [sp+], #\offset + S_PSR @ get calling asr - ldw lr, [sp+], #\offset + S_PC @ get pc - mov.a bsr, r1 @ save in bsr_priv - .if \fast - add sp, sp, #\offset + S_R1 @ r0 is syscall return value - ldm.w (r1 - r15), [sp]+ @ get calling r1 - r15 - ldur (r16 - lr), [sp]+ @ get calling r16 - lr - .else - ldm.w (r0 - r15), [sp]+ @ get calling r0 - r15 - ldur (r16 - lr), [sp]+ @ get calling r16 - lr - .endif - nop - add sp, sp, #S_FRAME_SIZE - S_R16 - mov.a pc, lr @ return - @ and move bsr_priv into asr - .endm - - .macro get_thread_info, rd - mov \rd, sp >> #13 - mov \rd, \rd << #13 - .endm - - .macro get_irqnr_and_base, irqnr, irqstat, base, tmp - ldw \base, =(PKUNITY_INTC_BASE) - ldw \irqstat, [\base+], #0xC @ INTC_ICIP - ldw \tmp, [\base+], #0x4 @ INTC_ICMR - and.a \irqstat, \irqstat, \tmp - 
beq 1001f - cntlz \irqnr, \irqstat - rsub \irqnr, \irqnr, #31 -1001: /* EQ will be set if no irqs pending */ - .endm - -#ifdef CONFIG_DEBUG_LL - .macro printreg, reg, temp - adr \temp, 901f - stm (r0-r3), [\temp]+ - stw lr, [\temp+], #0x10 - mov r0, \reg - b.l printhex8 - mov r0, #':' - b.l printch - mov r0, pc - b.l printhex8 - adr r0, 902f - b.l printascii - adr \temp, 901f - ldm (r0-r3), [\temp]+ - ldw lr, [\temp+], #0x10 - b 903f -901: .word 0, 0, 0, 0, 0 @ r0-r3, lr -902: .asciz ": epip4d\n" - .align -903: - .endm -#endif - -/* - * These are the registers used in the syscall handler, and allow us to - * have in theory up to 7 arguments to a function - r0 to r6. - * - * Note that tbl == why is intentional. - * - * We must set at least "tsk" and "why" when calling ret_with_reschedule. - */ -scno .req r21 @ syscall number -tbl .req r22 @ syscall table pointer -why .req r22 @ Linux syscall (!= 0) -tsk .req r23 @ current thread_info - -/* - * Interrupt handling. Preserves r17, r18, r19 - */ - .macro intr_handler -1: get_irqnr_and_base r0, r6, r5, lr - beq 2f - mov r1, sp - @ - @ routine called with r0 = irq number, r1 = struct pt_regs * - @ - adr lr, 1b - b asm_do_IRQ -2: - .endm - -/* - * PRIV mode handlers - */ - .macro priv_entry - sub sp, sp, #(S_FRAME_SIZE - 4) - stm (r1 - r15), [sp]+ - add r5, sp, #S_R15 - stm (r16 - r28), [r5]+ - - ldm (r1 - r3), [r0]+ - add r5, sp, #S_SP - 4 @ here for interlock avoidance - mov r4, #-1 @ "" "" "" "" - add r0, sp, #(S_FRAME_SIZE - 4) - stw.w r1, [sp+], #-4 @ save the "real" r0 copied - @ from the exception stack - - mov r1, lr - - @ - @ We are now ready to fill in the remaining blanks on the stack: - @ - @ r0 - sp_priv - @ r1 - lr_priv - @ r2 - lr_, already fixed up for correct return/restart - @ r3 - bsr_ - @ r4 - orig_r0 (see pt_regs definition in ptrace.h) - @ - stm (r0 - r4), [r5]+ - .endm - -/* - * User mode handlers - * - */ - .macro user_entry - sub sp, sp, #S_FRAME_SIZE - stm (r1 - r15), [sp+] - add r4, sp, #S_R16 - 
stm (r16 - r28), [r4]+ - - ldm (r1 - r3), [r0]+ - add r0, sp, #S_PC @ here for interlock avoidance - mov r4, #-1 @ "" "" "" "" - - stw r1, [sp] @ save the "real" r0 copied - @ from the exception stack - - @ - @ We are now ready to fill in the remaining blanks on the stack: - @ - @ r2 - lr_, already fixed up for correct return/restart - @ r3 - bsr_ - @ r4 - orig_r0 (see pt_regs definition in ptrace.h) - @ - @ Also, separately save sp_user and lr_user - @ - stm (r2 - r4), [r0]+ - stur (sp, lr), [r0-] - - @ - @ Enable the alignment trap while in kernel mode - @ - alignment_trap r0 - - @ - @ Clear FP to mark the first stack frame - @ - zero_fp - .endm - - .text - -@ -@ __invalid - generic code for failed exception -@ (re-entrant version of handlers) -@ -__invalid: - sub sp, sp, #S_FRAME_SIZE - stm (r1 - r15), [sp+] - add r1, sp, #S_R16 - stm (r16 - r28, sp, lr), [r1]+ - - zero_fp - - ldm (r4 - r6), [r0]+ - add r0, sp, #S_PC @ here for interlock avoidance - mov r7, #-1 @ "" "" "" "" - stw r4, [sp] @ save preserved r0 - stm (r5 - r7), [r0]+ @ lr_, - @ asr_, "old_r0" - - mov r0, sp - mov r1, asr - b bad_mode -ENDPROC(__invalid) - - .align 5 -__dabt_priv: - priv_entry - - @ - @ get ready to re-enable interrupts if appropriate - @ - mov r17, asr - cand.a r3, #PSR_I_BIT - bne 1f - andn r17, r17, #PSR_I_BIT -1: - - @ - @ Call the processor-specific abort handler: - @ - @ r2 - aborted context pc - @ r3 - aborted context asr - @ - @ The abort handler must return the aborted address in r0, and - @ the fault status register in r1. 
- @ - movc r1, p0.c3, #0 @ get FSR - movc r0, p0.c4, #0 @ get FAR - - @ - @ set desired INTR state, then call main handler - @ - mov.a asr, r17 - mov r2, sp - b.l do_DataAbort - - @ - @ INTRs off again before pulling preserved data off the stack - @ - disable_irq r0 - - @ - @ restore BSR and restart the instruction - @ - ldw r2, [sp+], #S_PSR - priv_exit r2 @ return from exception -ENDPROC(__dabt_priv) - - .align 5 -__intr_priv: - priv_entry - - intr_handler - - mov r0, #0 @ epip4d - movc p0.c5, r0, #14 - nop; nop; nop; nop; nop; nop; nop; nop - - ldw r4, [sp+], #S_PSR @ irqs are already disabled - - priv_exit r4 @ return from exception -ENDPROC(__intr_priv) - - .ltorg - - .align 5 -__extn_priv: - priv_entry - - mov r0, sp @ struct pt_regs *regs - mov r1, asr - b bad_mode @ not supported -ENDPROC(__extn_priv) - - .align 5 -__pabt_priv: - priv_entry - - @ - @ re-enable interrupts if appropriate - @ - mov r17, asr - cand.a r3, #PSR_I_BIT - bne 1f - andn r17, r17, #PSR_I_BIT -1: - - @ - @ set args, then call main handler - @ - @ r0 - address of faulting instruction - @ r1 - pointer to registers on stack - @ - mov r0, r2 @ pass address of aborted instruction - mov r1, #5 - mov.a asr, r17 - mov r2, sp @ regs - b.l do_PrefetchAbort @ call abort handler - - @ - @ INTRs off again before pulling preserved data off the stack - @ - disable_irq r0 - - @ - @ restore BSR and restart the instruction - @ - ldw r2, [sp+], #S_PSR - priv_exit r2 @ return from exception -ENDPROC(__pabt_priv) - - .align 5 -.LCcralign: - .word cr_alignment - - .align 5 -__dabt_user: - user_entry - -#ifdef CONFIG_UNICORE_FPU_F64 - cff ip, s31 - cand.a ip, #0x08000000 @ FPU execption traps? 
- beq 209f - - ldw ip, [sp+], #S_PC - add ip, ip, #4 - stw ip, [sp+], #S_PC - @ - @ fall through to the emulation code, which returns using r19 if - @ it has emulated the instruction, or the more conventional lr - @ if we are to treat this as a real extended instruction - @ - @ r0 - instruction - @ -1: ldw.u r0, [r2] - adr r19, ret_from_exception - adr lr, 209f - @ - @ fallthrough to call do_uc_f64 - @ -/* - * Check whether the instruction is a co-processor instruction. - * If yes, we need to call the relevant co-processor handler. - * - * Note that we don't do a full check here for the co-processor - * instructions; all instructions with bit 27 set are well - * defined. The only instructions that should fault are the - * co-processor instructions. - * - * Emulators may wish to make use of the following registers: - * r0 = instruction opcode. - * r2 = PC - * r19 = normal "successful" return address - * r20 = this threads thread_info structure. - * lr = unrecognised instruction return address - */ - get_thread_info r20 @ get current thread - and r8, r0, #0x00003c00 @ mask out CP number - mov r7, #1 - stb r7, [r20+], #TI_USED_CP + 2 @ set appropriate used_cp[] - - @ F64 hardware support entry point. - @ r0 = faulted instruction - @ r19 = return address - @ r20 = fp_state - enable_irq r4 - add r20, r20, #TI_FPSTATE @ r20 = workspace - cff r1, s31 @ get fpu FPSCR - andn r2, r1, #0x08000000 - ctf r2, s31 @ clear 27 bit - mov r2, sp @ nothing stacked - regdump is at TOS - mov lr, r19 @ setup for a return to the user code - - @ Now call the C code to package up the bounce to the support code - @ r0 holds the trigger instruction - @ r1 holds the FPSCR value - @ r2 pointer to register dump - b ucf64_exchandler -209: -#endif - @ - @ Call the processor-specific abort handler: - @ - @ r2 - aborted context pc - @ r3 - aborted context asr - @ - @ The abort handler must return the aborted address in r0, and - @ the fault status register in r1. 
- @ - movc r1, p0.c3, #0 @ get FSR - movc r0, p0.c4, #0 @ get FAR - - @ - @ INTRs on, then call the main handler - @ - enable_irq r2 - mov r2, sp - adr lr, ret_from_exception - b do_DataAbort -ENDPROC(__dabt_user) - - .align 5 -__intr_user: - user_entry - - get_thread_info tsk - - intr_handler - - mov why, #0 - b ret_to_user -ENDPROC(__intr_user) - - .ltorg - - .align 5 -__extn_user: - user_entry - - mov r0, sp - mov r1, asr - b bad_mode -ENDPROC(__extn_user) - - .align 5 -__pabt_user: - user_entry - - mov r0, r2 @ pass address of aborted instruction. - mov r1, #5 - enable_irq r1 @ Enable interrupts - mov r2, sp @ regs - b.l do_PrefetchAbort @ call abort handler - /* fall through */ -/* - * This is the return code to user mode for abort handlers - */ -ENTRY(ret_from_exception) - get_thread_info tsk - mov why, #0 - b ret_to_user -ENDPROC(__pabt_user) -ENDPROC(ret_from_exception) - -/* - * Register switch for UniCore V2 processors - * r0 = previous task_struct, r1 = previous thread_info, r2 = next thread_info - * previous and next are guaranteed not to be the same. - */ -ENTRY(__switch_to) - add ip, r1, #TI_CPU_SAVE - stm.w (r4 - r15), [ip]+ - stm.w (r16 - r27, sp, lr), [ip]+ - -#ifdef CONFIG_UNICORE_FPU_F64 - add ip, r1, #TI_FPSTATE - sfm.w (f0 - f7 ), [ip]+ - sfm.w (f8 - f15), [ip]+ - sfm.w (f16 - f23), [ip]+ - sfm.w (f24 - f31), [ip]+ - cff r4, s31 - stw r4, [ip] - - add ip, r2, #TI_FPSTATE - lfm.w (f0 - f7 ), [ip]+ - lfm.w (f8 - f15), [ip]+ - lfm.w (f16 - f23), [ip]+ - lfm.w (f24 - f31), [ip]+ - ldw r4, [ip] - ctf r4, s31 -#endif - add ip, r2, #TI_CPU_SAVE - ldm.w (r4 - r15), [ip]+ - ldm (r16 - r27, sp, pc), [ip]+ @ Load all regs saved previously -ENDPROC(__switch_to) - - .align 5 -/* - * This is the fast syscall return path. We do as little as - * possible here, and this includes saving r0 back into the PRIV - * stack. 
- */ -ret_fast_syscall: - disable_irq r1 @ disable interrupts - ldw r1, [tsk+], #TI_FLAGS - cand.a r1, #_TIF_WORK_MASK - bne fast_work_pending - - @ fast_restore_user_regs - restore_user_regs fast = 1, offset = S_OFF - -/* - * Ok, we need to do extra processing, enter the slow path. - */ -fast_work_pending: - stw.w r0, [sp+], #S_R0+S_OFF @ returned r0 -work_pending: - cand.a r1, #_TIF_NEED_RESCHED - bne work_resched - mov r0, sp @ 'regs' - mov r2, why @ 'syscall' - cand.a r1, #_TIF_SIGPENDING @ delivering a signal? - cmovne why, #0 @ prevent further restarts - b.l do_notify_resume - b ret_slow_syscall @ Check work again - -work_resched: - b.l schedule -/* - * "slow" syscall return path. "why" tells us if this was a real syscall. - */ -ENTRY(ret_to_user) -ret_slow_syscall: - disable_irq r1 @ disable interrupts - get_thread_info tsk @ epip4d, one path error?! - ldw r1, [tsk+], #TI_FLAGS - cand.a r1, #_TIF_WORK_MASK - bne work_pending -no_work_pending: - @ slow_restore_user_regs - restore_user_regs fast = 0, offset = 0 -ENDPROC(ret_to_user) - -/* - * This is how we return from a fork. - */ -ENTRY(ret_from_fork) - b.l schedule_tail - b ret_slow_syscall -ENDPROC(ret_from_fork) - -ENTRY(ret_from_kernel_thread) - b.l schedule_tail - mov r0, r5 - adr lr, ret_slow_syscall - mov pc, r4 -ENDPROC(ret_from_kernel_thread) - -/*============================================================================= - * SWI handler - *----------------------------------------------------------------------------- - */ - .align 5 -ENTRY(vector_swi) - sub sp, sp, #S_FRAME_SIZE - stm (r0 - r15), [sp]+ @ Calling r0 - r15 - add r8, sp, #S_R16 - stm (r16 - r28), [r8]+ @ Calling r16 - r28 - add r8, sp, #S_PC - stur (sp, lr), [r8-] @ Calling sp, lr - mov r8, bsr @ called from non-REAL mode - stw lr, [sp+], #S_PC @ Save calling PC - stw r8, [sp+], #S_PSR @ Save ASR - stw r0, [sp+], #S_OLD_R0 @ Save OLD_R0 - zero_fp - - /* - * Get the system call number. 
- */ - sub ip, lr, #4 - ldw.u scno, [ip] @ get SWI instruction - -#ifdef CONFIG_ALIGNMENT_TRAP - ldw ip, __cr_alignment - ldw ip, [ip] - movc p0.c1, ip, #0 @ update control register -#endif - enable_irq ip - - get_thread_info tsk - ldw tbl, =sys_call_table @ load syscall table pointer - - andn scno, scno, #0xff000000 @ mask off SWI op-code - andn scno, scno, #0x00ff0000 @ mask off SWI op-code - - stm.w (r4, r5), [sp-] @ push fifth and sixth args - ldw ip, [tsk+], #TI_FLAGS @ check for syscall tracing - cand.a ip, #_TIF_SYSCALL_TRACE @ are we tracing syscalls? - bne __sys_trace - - csub.a scno, #__NR_syscalls @ check upper syscall limit - adr lr, ret_fast_syscall @ return address - bea 1f - ldw pc, [tbl+], scno << #2 @ call sys_* routine -1: - add r1, sp, #S_OFF -2: mov why, #0 @ no longer a real syscall - b sys_ni_syscall @ not private func - - /* - * This is the really slow path. We're going to be doing - * context switches, and waiting for our parent to respond. - */ -__sys_trace: - mov r2, scno - add r1, sp, #S_OFF - mov r0, #0 @ trace entry [IP = 0] - b.l syscall_trace - - adr lr, __sys_trace_return @ return address - mov scno, r0 @ syscall number (possibly new) - add r1, sp, #S_R0 + S_OFF @ pointer to regs - csub.a scno, #__NR_syscalls @ check upper syscall limit - bea 2b - ldm (r0 - r3), [r1]+ @ have to reload r0 - r3 - ldw pc, [tbl+], scno << #2 @ call sys_* routine - -__sys_trace_return: - stw.w r0, [sp+], #S_R0 + S_OFF @ save returned r0 - mov r2, scno - mov r1, sp - mov r0, #1 @ trace exit [IP = 1] - b.l syscall_trace - b ret_slow_syscall - - .align 5 -#ifdef CONFIG_ALIGNMENT_TRAP - .type __cr_alignment, #object -__cr_alignment: - .word cr_alignment -#endif - .ltorg - -ENTRY(sys_rt_sigreturn) - add r0, sp, #S_OFF - mov why, #0 @ prevent syscall restart handling - b __sys_rt_sigreturn -ENDPROC(sys_rt_sigreturn) - - __INIT - -/* - * Vector stubs. - * - * This code is copied to 0xffff0200 so we can use branches in the - * vectors, rather than ldr's. 
Note that this code must not - * exceed 0x300 bytes. - * - * Common stub entry macro: - * Enter in INTR mode, bsr = PRIV/USER ASR, lr = PRIV/USER PC - * - * SP points to a minimal amount of processor-private memory, the address - * of which is copied into r0 for the mode specific abort handler. - */ - .macro vector_stub, name, mode - .align 5 - -vector_\name: - @ - @ Save r0, lr_ (parent PC) and bsr_ - @ (parent ASR) - @ - stw r0, [sp] - stw lr, [sp+], #4 @ save r0, lr - mov lr, bsr - stw lr, [sp+], #8 @ save bsr - - @ - @ Prepare for PRIV mode. INTRs remain disabled. - @ - mov r0, asr - xor r0, r0, #(\mode ^ PRIV_MODE) - mov.a bsr, r0 - - @ - @ the branch table must immediately follow this code - @ - and lr, lr, #0x03 - add lr, lr, #1 - mov r0, sp - ldw lr, [pc+], lr << #2 - mov.a pc, lr @ branch to handler in PRIV mode -ENDPROC(vector_\name) - .align 2 - @ handler addresses follow this label - .endm - - .globl __stubs_start -__stubs_start: -/* - * Interrupt dispatcher - */ - vector_stub intr, INTR_MODE - - .long __intr_user @ 0 (USER) - .long __invalid @ 1 - .long __invalid @ 2 - .long __intr_priv @ 3 (PRIV) - -/* - * Data abort dispatcher - * Enter in ABT mode, bsr = USER ASR, lr = USER PC - */ - vector_stub dabt, ABRT_MODE - - .long __dabt_user @ 0 (USER) - .long __invalid @ 1 - .long __invalid @ 2 (INTR) - .long __dabt_priv @ 3 (PRIV) - -/* - * Prefetch abort dispatcher - * Enter in ABT mode, bsr = USER ASR, lr = USER PC - */ - vector_stub pabt, ABRT_MODE - - .long __pabt_user @ 0 (USER) - .long __invalid @ 1 - .long __invalid @ 2 (INTR) - .long __pabt_priv @ 3 (PRIV) - -/* - * Undef instr entry dispatcher - * Enter in EXTN mode, bsr = PRIV/USER ASR, lr = PRIV/USER PC - */ - vector_stub extn, EXTN_MODE - - .long __extn_user @ 0 (USER) - .long __invalid @ 1 - .long __invalid @ 2 (INTR) - .long __extn_priv @ 3 (PRIV) - -/* - * We group all the following data together to optimise - * for CPUs with separate I & D caches. 
- */ - .align 5 - -.LCvswi: - .word vector_swi - - .globl __stubs_end -__stubs_end: - - .equ stubs_offset, __vectors_start + 0x200 - __stubs_start - - .globl __vectors_start -__vectors_start: - jepriv SYS_ERROR0 - b vector_extn + stubs_offset - ldw pc, .LCvswi + stubs_offset - b vector_pabt + stubs_offset - b vector_dabt + stubs_offset - jepriv SYS_ERROR0 - b vector_intr + stubs_offset - jepriv SYS_ERROR0 - - .globl __vectors_end -__vectors_end: - - .data - - .globl cr_alignment - .globl cr_no_alignment -cr_alignment: - .space 4 -cr_no_alignment: - .space 4 diff --git a/arch/unicore32/kernel/fpu-ucf64.c b/arch/unicore32/kernel/fpu-ucf64.c deleted file mode 100644 index 85f0af29d29b..000000000000 --- a/arch/unicore32/kernel/fpu-ucf64.c +++ /dev/null @@ -1,117 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/fpu-ucf64.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include -#include -#include -#include - -#include - -/* - * A special flag to tell the normalisation code not to normalise. - */ -#define F64_NAN_FLAG 0x100 - -/* - * A bit pattern used to indicate the initial (unset) value of the - * exception mask, in case nothing handles an instruction. This - * doesn't include the NAN flag, which get masked out before - * we check for an error. - */ -#define F64_EXCEPTION_ERROR ((u32)-1 & ~F64_NAN_FLAG) - -/* - * Since we aren't building with -mfpu=f64, we need to code - * these instructions using their MRC/MCR equivalents. - */ -#define f64reg(_f64_) #_f64_ - -#define cff(_f64_) ({ \ - u32 __v; \ - asm("cff %0, " f64reg(_f64_) "@ fmrx %0, " #_f64_ \ - : "=r" (__v) : : "cc"); \ - __v; \ - }) - -#define ctf(_f64_, _var_) \ - asm("ctf %0, " f64reg(_f64_) "@ fmxr " #_f64_ ", %0" \ - : : "r" (_var_) : "cc") - -/* - * Raise a SIGFPE for the current process. - * sicode describes the signal being raised. 
- */ -void ucf64_raise_sigfpe(struct pt_regs *regs) -{ - /* - * This is the same as NWFPE, because it's not clear what - * this is used for - */ - current->thread.error_code = 0; - current->thread.trap_no = 6; - - send_sig_fault(SIGFPE, FPE_FLTUNK, - (void __user *)(instruction_pointer(regs) - 4), - current); -} - -/* - * Handle exceptions of UniCore-F64. - */ -void ucf64_exchandler(u32 inst, u32 fpexc, struct pt_regs *regs) -{ - u32 tmp = fpexc; - u32 exc = F64_EXCEPTION_ERROR & fpexc; - - pr_debug("UniCore-F64: instruction %08x fpscr %08x\n", - inst, fpexc); - - if (exc & FPSCR_CMPINSTR_BIT) { - if (exc & FPSCR_CON) - tmp |= FPSCR_CON; - else - tmp &= ~(FPSCR_CON); - exc &= ~(FPSCR_CMPINSTR_BIT | FPSCR_CON); - } else { - pr_debug("UniCore-F64 Error: unhandled exceptions\n"); - pr_debug("UniCore-F64 FPSCR 0x%08x INST 0x%08x\n", - cff(FPSCR), inst); - - ucf64_raise_sigfpe(regs); - return; - } - - /* - * Update the FPSCR with the additional exception flags. - * Comparison instructions always return at least one of - * these flags set. - */ - tmp &= ~(FPSCR_TRAP | FPSCR_IOS | FPSCR_OFS | FPSCR_UFS | - FPSCR_IXS | FPSCR_HIS | FPSCR_IOC | FPSCR_OFC | - FPSCR_UFC | FPSCR_IXC | FPSCR_HIC); - - tmp |= exc; - ctf(FPSCR, tmp); -} - -/* - * F64 support code initialisation. 
- */ -static int __init ucf64_init(void) -{ - ctf(FPSCR, 0x0); /* FPSCR_UFE | FPSCR_NDE perhaps better */ - - printk(KERN_INFO "Enable UniCore-F64 support.\n"); - - return 0; -} - -late_initcall(ucf64_init); diff --git a/arch/unicore32/kernel/gpio.c b/arch/unicore32/kernel/gpio.c deleted file mode 100644 index 36d395b54b7c..000000000000 --- a/arch/unicore32/kernel/gpio.c +++ /dev/null @@ -1,121 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/gpio.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Maintained by GUAN Xue-tao - * Copyright (C) 2001-2010 Guan Xuetao - */ -/* in FPGA, no GPIO support */ - -#include -#include -#include -/* FIXME: needed for gpio_set_value() - convert to use descriptors or hogs */ -#include -#include - -#ifdef CONFIG_LEDS -#include -#include - -static const struct gpio_led puv3_gpio_leds[] = { - { .name = "cpuhealth", .gpio = GPO_CPU_HEALTH, .active_low = 0, - .default_trigger = "heartbeat", }, - { .name = "hdd_led", .gpio = GPO_HDD_LED, .active_low = 1, - .default_trigger = "disk-activity", }, -}; - -static const struct gpio_led_platform_data puv3_gpio_led_data = { - .num_leds = ARRAY_SIZE(puv3_gpio_leds), - .leds = (void *) puv3_gpio_leds, -}; - -static struct platform_device puv3_gpio_gpio_leds = { - .name = "leds-gpio", - .id = -1, - .dev = { - .platform_data = (void *) &puv3_gpio_led_data, - } -}; - -static int __init puv3_gpio_leds_init(void) -{ - platform_device_register(&puv3_gpio_gpio_leds); - return 0; -} - -device_initcall(puv3_gpio_leds_init); -#endif - -static int puv3_gpio_get(struct gpio_chip *chip, unsigned offset) -{ - return !!(readl(GPIO_GPLR) & GPIO_GPIO(offset)); -} - -static void puv3_gpio_set(struct gpio_chip *chip, unsigned offset, int value) -{ - if (value) - writel(GPIO_GPIO(offset), GPIO_GPSR); - else - writel(GPIO_GPIO(offset), GPIO_GPCR); -} - -static int puv3_direction_input(struct gpio_chip *chip, unsigned offset) -{ - unsigned long flags; - - 
local_irq_save(flags); - writel(readl(GPIO_GPDR) & ~GPIO_GPIO(offset), GPIO_GPDR); - local_irq_restore(flags); - return 0; -} - -static int puv3_direction_output(struct gpio_chip *chip, unsigned offset, - int value) -{ - unsigned long flags; - - local_irq_save(flags); - puv3_gpio_set(chip, offset, value); - writel(readl(GPIO_GPDR) | GPIO_GPIO(offset), GPIO_GPDR); - local_irq_restore(flags); - return 0; -} - -static struct gpio_chip puv3_gpio_chip = { - .label = "gpio", - .direction_input = puv3_direction_input, - .direction_output = puv3_direction_output, - .set = puv3_gpio_set, - .get = puv3_gpio_get, - .base = 0, - .ngpio = GPIO_MAX + 1, -}; - -void __init puv3_init_gpio(void) -{ - writel(GPIO_DIR, GPIO_GPDR); -#if defined(CONFIG_PUV3_NB0916) || defined(CONFIG_PUV3_SMW0919) \ - || defined(CONFIG_PUV3_DB0913) - gpio_set_value(GPO_WIFI_EN, 1); - gpio_set_value(GPO_HDD_LED, 1); - gpio_set_value(GPO_VGA_EN, 1); - gpio_set_value(GPO_LCD_EN, 1); - gpio_set_value(GPO_CAM_PWR_EN, 0); - gpio_set_value(GPO_LCD_VCC_EN, 1); - gpio_set_value(GPO_SOFT_OFF, 1); - gpio_set_value(GPO_BT_EN, 1); - gpio_set_value(GPO_FAN_ON, 0); - gpio_set_value(GPO_SPKR, 0); - gpio_set_value(GPO_CPU_HEALTH, 1); - gpio_set_value(GPO_LAN_SEL, 1); -/* - * DO NOT modify the GPO_SET_V1 and GPO_SET_V2 in kernel - * gpio_set_value(GPO_SET_V1, 1); - * gpio_set_value(GPO_SET_V2, 1); - */ -#endif - gpiochip_add_data(&puv3_gpio_chip, NULL); -} diff --git a/arch/unicore32/kernel/head.S b/arch/unicore32/kernel/head.S deleted file mode 100644 index 9bbb8668f9f7..000000000000 --- a/arch/unicore32/kernel/head.S +++ /dev/null @@ -1,249 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/kernel/head.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#if (PHYS_OFFSET & 0x003fffff) -#error "PHYS_OFFSET must be at an even 4MiB boundary!" 
-#endif - -#define KERNEL_RAM_VADDR (PAGE_OFFSET + KERNEL_IMAGE_START) -#define KERNEL_RAM_PADDR (PHYS_OFFSET + KERNEL_IMAGE_START) - -#define KERNEL_PGD_PADDR (KERNEL_RAM_PADDR - 0x1000) -#define KERNEL_PGD_VADDR (KERNEL_RAM_VADDR - 0x1000) - -#define KERNEL_START KERNEL_RAM_VADDR -#define KERNEL_END _end - -/* - * swapper_pg_dir is the virtual address of the initial page table. - * We place the page tables 4K below KERNEL_RAM_VADDR. Therefore, we must - * make sure that KERNEL_RAM_VADDR is correctly set. Currently, we expect - * the least significant 16 bits to be 0x8000, but we could probably - * relax this restriction to KERNEL_RAM_VADDR >= PAGE_OFFSET + 0x1000. - */ -#if (KERNEL_RAM_VADDR & 0xffff) != 0x8000 -#error KERNEL_RAM_VADDR must start at 0xXXXX8000 -#endif - - .globl swapper_pg_dir - .equ swapper_pg_dir, KERNEL_RAM_VADDR - 0x1000 - -/* - * Kernel startup entry point. - * --------------------------- - * - * This is normally called from the decompressor code. The requirements - * are: MMU = off, D-cache = off, I-cache = dont care - * - * This code is mostly position independent, so if you link the kernel at - * 0xc0008000, you call this at __pa(0xc0008000). 
- */ - __HEAD -ENTRY(stext) - @ set asr - mov r0, #PRIV_MODE @ ensure priv mode - or r0, #PSR_R_BIT | PSR_I_BIT @ disable irqs - mov.a asr, r0 - - @ process identify - movc r0, p0.c0, #0 @ cpuid - movl r1, 0xff00ffff @ mask - movl r2, 0x4d000863 @ value - and r0, r1, r0 - cxor.a r0, r2 - bne __error_p @ invalid processor id - - /* - * Clear the 4K level 1 swapper page table - */ - movl r0, #KERNEL_PGD_PADDR @ page table address - mov r1, #0 - add r2, r0, #0x1000 -101: stw.w r1, [r0]+, #4 - stw.w r1, [r0]+, #4 - stw.w r1, [r0]+, #4 - stw.w r1, [r0]+, #4 - cxor.a r0, r2 - bne 101b - - movl r4, #KERNEL_PGD_PADDR @ page table address - mov r7, #PMD_TYPE_SECT | PMD_PRESENT @ page size: section - or r7, r7, #PMD_SECT_CACHEABLE @ cacheable - or r7, r7, #PMD_SECT_READ | PMD_SECT_WRITE | PMD_SECT_EXEC - - /* - * Create identity mapping for first 4MB of kernel to - * cater for the MMU enable. This identity mapping - * will be removed by paging_init(). We use our current program - * counter to determine corresponding section base address. - */ - mov r6, pc - mov r6, r6 >> #22 @ start of kernel section - or r1, r7, r6 << #22 @ flags + kernel base - stw r1, [r4+], r6 << #2 @ identity mapping - - /* - * Now setup the pagetables for our kernel direct - * mapped region. - */ - add r0, r4, #(KERNEL_START & 0xff000000) >> 20 - stw.w r1, [r0+], #(KERNEL_START & 0x00c00000) >> 20 - movl r6, #(KERNEL_END - 1) - add r0, r0, #4 - add r6, r4, r6 >> #20 -102: csub.a r0, r6 - add r1, r1, #1 << 22 - bua 103f - stw.w r1, [r0]+, #4 - b 102b -103: - /* - * Then map first 4MB of ram in case it contains our boot params. - */ - add r0, r4, #PAGE_OFFSET >> 20 - or r6, r7, #(PHYS_OFFSET & 0xffc00000) - stw r6, [r0] - - ldw r15, __switch_data @ address to jump to after - - /* - * Initialise TLB, Caches, and MMU state ready to switch the MMU - * on. - */ - mov r0, #0 - movc p0.c5, r0, #28 @ cache invalidate all - nop8 - movc p0.c6, r0, #6 @ TLB invalidate all - nop8 - - /* - * ..V. .... 
..TB IDAM - * ..1. .... ..01 1111 - */ - movl r0, #0x201f @ control register setting - - /* - * Setup common bits before finally enabling the MMU. Essentially - * this is just loading the page table pointer and domain access - * registers. - */ - #ifndef CONFIG_ALIGNMENT_TRAP - andn r0, r0, #CR_A - #endif - #ifdef CONFIG_CPU_DCACHE_DISABLE - andn r0, r0, #CR_D - #endif - #ifdef CONFIG_CPU_DCACHE_WRITETHROUGH - andn r0, r0, #CR_B - #endif - #ifdef CONFIG_CPU_ICACHE_DISABLE - andn r0, r0, #CR_I - #endif - - movc p0.c2, r4, #0 @ set pgd - b __turn_mmu_on -ENDPROC(stext) - -/* - * Enable the MMU. This completely changes the structure of the visible - * memory space. You will not be able to trace execution through this. - * - * r0 = cp#0 control register - * r15 = *virtual* address to jump to upon completion - */ - .align 5 -__turn_mmu_on: - mov r0, r0 - movc p0.c1, r0, #0 @ write control reg - nop @ fetch inst by phys addr - mov pc, r15 - nop8 @ fetch inst by phys addr -ENDPROC(__turn_mmu_on) - -/* - * Setup the initial page tables. We only setup the barest - * amount which are required to get the kernel running, which - * generally means mapping in the kernel code. - * - * r9 = cpuid - * r10 = procinfo - * - * Returns: - * r0, r3, r6, r7 corrupted - * r4 = physical page table address - */ - .ltorg - - .align 2 - .type __switch_data, %object -__switch_data: - .long __mmap_switched - .long __bss_start @ r6 - .long _end @ r7 - .long cr_alignment @ r8 - .long init_thread_union + THREAD_START_SP @ sp - -/* - * The following fragment of code is executed with the MMU on in MMU mode, - * and uses absolute addresses; this is not position independent. 
- * - * r0 = cp#0 control register - */ -__mmap_switched: - adr r3, __switch_data + 4 - - ldm.w (r6, r7, r8), [r3]+ - ldw sp, [r3] - - mov fp, #0 @ Clear BSS (and zero fp) -203: csub.a r6, r7 - bea 204f - stw.w fp, [r6]+,#4 - b 203b -204: - andn r1, r0, #CR_A @ Clear 'A' bit - stm (r0, r1), [r8]+ @ Save control register values - b start_kernel -ENDPROC(__mmap_switched) - -/* - * Exception handling. Something went wrong and we can't proceed. We - * ought to tell the user, but since we don't have any guarantee that - * we're even running on the right architecture, we do virtually nothing. - * - * If CONFIG_DEBUG_LL is set we try to print out something about the error - * and hope for the best (useful if bootloader fails to pass a proper - * machine ID for example). - */ -__error_p: -#ifdef CONFIG_DEBUG_LL - adr r0, str_p1 - b.l printascii - mov r0, r9 - b.l printhex8 - adr r0, str_p2 - b.l printascii -901: nop8 - b 901b -str_p1: .asciz "\nError: unrecognized processor variant (0x" -str_p2: .asciz ").\n" - .align -#endif -ENDPROC(__error_p) - diff --git a/arch/unicore32/kernel/hibernate.c b/arch/unicore32/kernel/hibernate.c deleted file mode 100644 index 4cdf3c846a2d..000000000000 --- a/arch/unicore32/kernel/hibernate.c +++ /dev/null @@ -1,159 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/hibernate.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Maintained by GUAN Xue-tao - * Copyright (C) 2001-2010 Guan Xuetao - */ - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "mach/pm.h" - -/* Pointer to the temporary resume page tables */ -pgd_t *resume_pg_dir; - -struct swsusp_arch_regs swsusp_arch_regs_cpu0; - -/* - * Create a middle page table on a resume-safe page and put a pointer to it in - * the given global directory entry. This only returns the gd entry - * in non-PAE compilation mode, since the middle layer is folded. 
- */ -static pmd_t *resume_one_md_table_init(pgd_t *pgd) -{ - pud_t *pud; - p4d_t *p4d; - pmd_t *pmd_table; - - p4d = p4d_offset(pgd, 0); - pud = pud_offset(p4d, 0); - pmd_table = pmd_offset(pud, 0); - - return pmd_table; -} - -/* - * Create a page table on a resume-safe page and place a pointer to it in - * a middle page directory entry. - */ -static pte_t *resume_one_page_table_init(pmd_t *pmd) -{ - if (pmd_none(*pmd)) { - pte_t *page_table = (pte_t *)get_safe_page(GFP_ATOMIC); - if (!page_table) - return NULL; - - set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_KERNEL_TABLE)); - - BUG_ON(page_table != pte_offset_kernel(pmd, 0)); - - return page_table; - } - - return pte_offset_kernel(pmd, 0); -} - -/* - * This maps the physical memory to kernel virtual address space, a total - * of max_low_pfn pages, by creating page tables starting from address - * PAGE_OFFSET. The page tables are allocated out of resume-safe pages. - */ -static int resume_physical_mapping_init(pgd_t *pgd_base) -{ - unsigned long pfn; - pgd_t *pgd; - pmd_t *pmd; - pte_t *pte; - int pgd_idx, pmd_idx; - - pgd_idx = pgd_index(PAGE_OFFSET); - pgd = pgd_base + pgd_idx; - pfn = 0; - - for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { - pmd = resume_one_md_table_init(pgd); - if (!pmd) - return -ENOMEM; - - if (pfn >= max_low_pfn) - continue; - - for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD; pmd++, pmd_idx++) { - pte_t *max_pte; - - if (pfn >= max_low_pfn) - break; - - /* Map with normal page tables. 
- * NOTE: We can mark everything as executable here - */ - pte = resume_one_page_table_init(pmd); - if (!pte) - return -ENOMEM; - - max_pte = pte + PTRS_PER_PTE; - for (; pte < max_pte; pte++, pfn++) { - if (pfn >= max_low_pfn) - break; - - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); - } - } - } - - return 0; -} - -static inline void resume_init_first_level_page_table(pgd_t *pg_dir) -{ -} - -int swsusp_arch_resume(void) -{ - int error; - - resume_pg_dir = (pgd_t *)get_safe_page(GFP_ATOMIC); - if (!resume_pg_dir) - return -ENOMEM; - - resume_init_first_level_page_table(resume_pg_dir); - error = resume_physical_mapping_init(resume_pg_dir); - if (error) - return error; - - /* We have got enough memory and from now on we cannot recover */ - restore_image(resume_pg_dir, restore_pblist); - return 0; -} - -/* - * pfn_is_nosave - check if given pfn is in the 'nosave' section - */ - -int pfn_is_nosave(unsigned long pfn) -{ - unsigned long begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; - unsigned long end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; - - return (pfn >= begin_pfn) && (pfn < end_pfn); -} - -void save_processor_state(void) -{ -} - -void restore_processor_state(void) -{ - local_flush_tlb_all(); -} diff --git a/arch/unicore32/kernel/hibernate_asm.S b/arch/unicore32/kernel/hibernate_asm.S deleted file mode 100644 index a589bc189e24..000000000000 --- a/arch/unicore32/kernel/hibernate_asm.S +++ /dev/null @@ -1,114 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/kernel/hibernate_asm.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Maintained by GUAN Xue-tao - * Copyright (C) 2001-2010 Guan Xuetao - */ - -#include -#include -#include -#include -#include -#include -#include - -@ restore_image(pgd_t *resume_pg_dir, struct pbe *restore_pblist) -@ r0: resume_pg_dir -@ r1: restore_pblist -@ copy restore_pblist pages -@ restore registers from swsusp_arch_regs_cpu0 -@ -ENTRY(restore_image) - sub r0, r0, #PAGE_OFFSET 
- mov r5, #0 - movc p0.c6, r5, #6 @invalidate ITLB & DTLB - movc p0.c2, r0, #0 - nop - nop - nop - nop - nop - nop - nop - - .p2align 4,,7 -101: - csub.a r1, #0 - beq 109f - - ldw r6, [r1+], #PBE_ADDRESS - ldw r7, [r1+], #PBE_ORIN_ADDRESS - - movl ip, #128 -102: ldm.w (r8 - r15), [r6]+ - stm.w (r8 - r15), [r7]+ - sub.a ip, ip, #1 - bne 102b - - ldw r1, [r1+], #PBE_NEXT - b 101b - - .p2align 4,,7 -109: - /* go back to the original page tables */ - ldw r0, =swapper_pg_dir - sub r0, r0, #PAGE_OFFSET - mov r5, #0 - movc p0.c6, r5, #6 - movc p0.c2, r0, #0 - nop - nop - nop - nop - nop - nop - nop - -#ifdef CONFIG_UNICORE_FPU_F64 - ldw ip, 1f - add ip, ip, #SWSUSP_FPSTATE - lfm.w (f0 - f7 ), [ip]+ - lfm.w (f8 - f15), [ip]+ - lfm.w (f16 - f23), [ip]+ - lfm.w (f24 - f31), [ip]+ - ldw r4, [ip] - ctf r4, s31 -#endif - mov r0, #0x0 - ldw ip, 1f - add ip, ip, #SWSUSP_CPU - ldm.w (r4 - r15), [ip]+ - ldm (r16 - r27, sp, pc), [ip]+ @ Load all regs saved previously - - .align 2 -1: .long swsusp_arch_regs_cpu0 - - -@ swsusp_arch_suspend() -@ - prepare pc for resume, return from function without swsusp_save on resume -@ - save registers in swsusp_arch_regs_cpu0 -@ - call swsusp_save write suspend image - -ENTRY(swsusp_arch_suspend) - ldw ip, 1f - add ip, ip, #SWSUSP_CPU - stm.w (r4 - r15), [ip]+ - stm.w (r16 - r27, sp, lr), [ip]+ - -#ifdef CONFIG_UNICORE_FPU_F64 - ldw ip, 1f - add ip, ip, #SWSUSP_FPSTATE - sfm.w (f0 - f7 ), [ip]+ - sfm.w (f8 - f15), [ip]+ - sfm.w (f16 - f23), [ip]+ - sfm.w (f24 - f31), [ip]+ - cff r4, s31 - stw r4, [ip] -#endif - b swsusp_save @ no return - -1: .long swsusp_arch_regs_cpu0 diff --git a/arch/unicore32/kernel/irq.c b/arch/unicore32/kernel/irq.c deleted file mode 100644 index c014ae3c3e48..000000000000 --- a/arch/unicore32/kernel/irq.c +++ /dev/null @@ -1,371 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/irq.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ 
-#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "setup.h" - -/* - * PKUnity GPIO edge detection for IRQs: - * IRQs are generated on Falling-Edge, Rising-Edge, or both. - * Use this instead of directly setting GRER/GFER. - */ -static int GPIO_IRQ_rising_edge; -static int GPIO_IRQ_falling_edge; -static int GPIO_IRQ_mask = 0; - -#define GPIO_MASK(irq) (1 << (irq - IRQ_GPIO0)) - -static int puv3_gpio_type(struct irq_data *d, unsigned int type) -{ - unsigned int mask; - - if (d->irq < IRQ_GPIOHIGH) - mask = 1 << d->irq; - else - mask = GPIO_MASK(d->irq); - - if (type == IRQ_TYPE_PROBE) { - if ((GPIO_IRQ_rising_edge | GPIO_IRQ_falling_edge) & mask) - return 0; - type = IRQ_TYPE_EDGE_RISING | IRQ_TYPE_EDGE_FALLING; - } - - if (type & IRQ_TYPE_EDGE_RISING) - GPIO_IRQ_rising_edge |= mask; - else - GPIO_IRQ_rising_edge &= ~mask; - if (type & IRQ_TYPE_EDGE_FALLING) - GPIO_IRQ_falling_edge |= mask; - else - GPIO_IRQ_falling_edge &= ~mask; - - writel(GPIO_IRQ_rising_edge & GPIO_IRQ_mask, GPIO_GRER); - writel(GPIO_IRQ_falling_edge & GPIO_IRQ_mask, GPIO_GFER); - - return 0; -} - -/* - * GPIO IRQs must be acknowledged. This is for IRQs from 0 to 7. 
- */ -static void puv3_low_gpio_ack(struct irq_data *d) -{ - writel((1 << d->irq), GPIO_GEDR); -} - -static void puv3_low_gpio_mask(struct irq_data *d) -{ - writel(readl(INTC_ICMR) & ~(1 << d->irq), INTC_ICMR); -} - -static void puv3_low_gpio_unmask(struct irq_data *d) -{ - writel(readl(INTC_ICMR) | (1 << d->irq), INTC_ICMR); -} - -static int puv3_low_gpio_wake(struct irq_data *d, unsigned int on) -{ - if (on) - writel(readl(PM_PWER) | (1 << d->irq), PM_PWER); - else - writel(readl(PM_PWER) & ~(1 << d->irq), PM_PWER); - return 0; -} - -static struct irq_chip puv3_low_gpio_chip = { - .name = "GPIO-low", - .irq_ack = puv3_low_gpio_ack, - .irq_mask = puv3_low_gpio_mask, - .irq_unmask = puv3_low_gpio_unmask, - .irq_set_type = puv3_gpio_type, - .irq_set_wake = puv3_low_gpio_wake, -}; - -/* - * IRQ8 (GPIO0 through 27) handler. We enter here with the - * irq_controller_lock held, and IRQs disabled. Decode the IRQ - * and call the handler. - */ -static void puv3_gpio_handler(struct irq_desc *desc) -{ - unsigned int mask, irq; - - mask = readl(GPIO_GEDR); - do { - /* - * clear down all currently active IRQ sources. - * We will be processing them all. - */ - writel(mask, GPIO_GEDR); - - irq = IRQ_GPIO0; - do { - if (mask & 1) - generic_handle_irq(irq); - mask >>= 1; - irq++; - } while (mask); - mask = readl(GPIO_GEDR); - } while (mask); -} - -/* - * GPIO0-27 edge IRQs need to be handled specially. - * In addition, the IRQs are all collected up into one bit in the - * interrupt controller registers. 
- */ -static void puv3_high_gpio_ack(struct irq_data *d) -{ - unsigned int mask = GPIO_MASK(d->irq); - - writel(mask, GPIO_GEDR); -} - -static void puv3_high_gpio_mask(struct irq_data *d) -{ - unsigned int mask = GPIO_MASK(d->irq); - - GPIO_IRQ_mask &= ~mask; - - writel(readl(GPIO_GRER) & ~mask, GPIO_GRER); - writel(readl(GPIO_GFER) & ~mask, GPIO_GFER); -} - -static void puv3_high_gpio_unmask(struct irq_data *d) -{ - unsigned int mask = GPIO_MASK(d->irq); - - GPIO_IRQ_mask |= mask; - - writel(GPIO_IRQ_rising_edge & GPIO_IRQ_mask, GPIO_GRER); - writel(GPIO_IRQ_falling_edge & GPIO_IRQ_mask, GPIO_GFER); -} - -static int puv3_high_gpio_wake(struct irq_data *d, unsigned int on) -{ - if (on) - writel(readl(PM_PWER) | PM_PWER_GPIOHIGH, PM_PWER); - else - writel(readl(PM_PWER) & ~PM_PWER_GPIOHIGH, PM_PWER); - return 0; -} - -static struct irq_chip puv3_high_gpio_chip = { - .name = "GPIO-high", - .irq_ack = puv3_high_gpio_ack, - .irq_mask = puv3_high_gpio_mask, - .irq_unmask = puv3_high_gpio_unmask, - .irq_set_type = puv3_gpio_type, - .irq_set_wake = puv3_high_gpio_wake, -}; - -/* - * We don't need to ACK IRQs on the PKUnity unless they're GPIOs - * this is for internal IRQs i.e. from 8 to 31. - */ -static void puv3_mask_irq(struct irq_data *d) -{ - writel(readl(INTC_ICMR) & ~(1 << d->irq), INTC_ICMR); -} - -static void puv3_unmask_irq(struct irq_data *d) -{ - writel(readl(INTC_ICMR) | (1 << d->irq), INTC_ICMR); -} - -/* - * Apart form GPIOs, only the RTC alarm can be a wakeup event. 
- */ -static int puv3_set_wake(struct irq_data *d, unsigned int on) -{ - if (d->irq == IRQ_RTCAlarm) { - if (on) - writel(readl(PM_PWER) | PM_PWER_RTC, PM_PWER); - else - writel(readl(PM_PWER) & ~PM_PWER_RTC, PM_PWER); - return 0; - } - return -EINVAL; -} - -static struct irq_chip puv3_normal_chip = { - .name = "PKUnity-v3", - .irq_ack = puv3_mask_irq, - .irq_mask = puv3_mask_irq, - .irq_unmask = puv3_unmask_irq, - .irq_set_wake = puv3_set_wake, -}; - -static struct resource irq_resource = { - .name = "irqs", - .start = io_v2p(PKUNITY_INTC_BASE), - .end = io_v2p(PKUNITY_INTC_BASE) + 0xFFFFF, -}; - -static struct puv3_irq_state { - unsigned int saved; - unsigned int icmr; - unsigned int iclr; - unsigned int iccr; -} puv3_irq_state; - -static int puv3_irq_suspend(void) -{ - struct puv3_irq_state *st = &puv3_irq_state; - - st->saved = 1; - st->icmr = readl(INTC_ICMR); - st->iclr = readl(INTC_ICLR); - st->iccr = readl(INTC_ICCR); - - /* - * Disable all GPIO-based interrupts. - */ - writel(readl(INTC_ICMR) & ~(0x1ff), INTC_ICMR); - - /* - * Set the appropriate edges for wakeup. - */ - writel(readl(PM_PWER) & GPIO_IRQ_rising_edge, GPIO_GRER); - writel(readl(PM_PWER) & GPIO_IRQ_falling_edge, GPIO_GFER); - - /* - * Clear any pending GPIO interrupts. 
- */ - writel(readl(GPIO_GEDR), GPIO_GEDR); - - return 0; -} - -static void puv3_irq_resume(void) -{ - struct puv3_irq_state *st = &puv3_irq_state; - - if (st->saved) { - writel(st->iccr, INTC_ICCR); - writel(st->iclr, INTC_ICLR); - - writel(GPIO_IRQ_rising_edge & GPIO_IRQ_mask, GPIO_GRER); - writel(GPIO_IRQ_falling_edge & GPIO_IRQ_mask, GPIO_GFER); - - writel(st->icmr, INTC_ICMR); - } -} - -static struct syscore_ops puv3_irq_syscore_ops = { - .suspend = puv3_irq_suspend, - .resume = puv3_irq_resume, -}; - -static int __init puv3_irq_init_syscore(void) -{ - register_syscore_ops(&puv3_irq_syscore_ops); - return 0; -} - -device_initcall(puv3_irq_init_syscore); - -void __init init_IRQ(void) -{ - unsigned int irq; - - request_resource(&iomem_resource, &irq_resource); - - /* disable all IRQs */ - writel(0, INTC_ICMR); - - /* all IRQs are IRQ, not REAL */ - writel(0, INTC_ICLR); - - /* clear all GPIO edge detects */ - writel(FMASK(8, 0) & ~FIELD(1, 1, GPI_SOFF_REQ), GPIO_GPIR); - writel(0, GPIO_GFER); - writel(0, GPIO_GRER); - writel(0x0FFFFFFF, GPIO_GEDR); - - writel(1, INTC_ICCR); - - for (irq = 0; irq < IRQ_GPIOHIGH; irq++) { - irq_set_chip(irq, &puv3_low_gpio_chip); - irq_set_handler(irq, handle_edge_irq); - irq_modify_status(irq, - IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN, - 0); - } - - for (irq = IRQ_GPIOHIGH + 1; irq < IRQ_GPIO0; irq++) { - irq_set_chip(irq, &puv3_normal_chip); - irq_set_handler(irq, handle_level_irq); - irq_modify_status(irq, - IRQ_NOREQUEST | IRQ_NOAUTOEN, - IRQ_NOPROBE); - } - - for (irq = IRQ_GPIO0; irq <= IRQ_GPIO27; irq++) { - irq_set_chip(irq, &puv3_high_gpio_chip); - irq_set_handler(irq, handle_edge_irq); - irq_modify_status(irq, - IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN, - 0); - } - - /* - * Install handler for GPIO 0-27 edge detect interrupts - */ - irq_set_chip(IRQ_GPIOHIGH, &puv3_normal_chip); - irq_set_chained_handler(IRQ_GPIOHIGH, puv3_gpio_handler); - -#ifdef CONFIG_PUV3_GPIO - puv3_init_gpio(); -#endif -} - -/* - * do_IRQ 
handles all hardware IRQ's. Decoded IRQs should not - * come via this function. Instead, they should provide their - * own 'handler' - */ -asmlinkage void asm_do_IRQ(unsigned int irq, struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - - irq_enter(); - - /* - * Some hardware gives randomly wrong interrupts. Rather - * than crashing, do something sensible. - */ - if (unlikely(irq >= nr_irqs)) { - if (printk_ratelimit()) - printk(KERN_WARNING "Bad IRQ%u\n", irq); - ack_bad_irq(irq); - } else { - generic_handle_irq(irq); - } - - irq_exit(); - set_irq_regs(old_regs); -} - diff --git a/arch/unicore32/kernel/ksyms.c b/arch/unicore32/kernel/ksyms.c deleted file mode 100644 index 731445008932..000000000000 --- a/arch/unicore32/kernel/ksyms.c +++ /dev/null @@ -1,57 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/ksyms.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "ksyms.h" - -EXPORT_SYMBOL(find_first_bit); -EXPORT_SYMBOL(find_first_zero_bit); -EXPORT_SYMBOL(find_next_zero_bit); -EXPORT_SYMBOL(find_next_bit); - - /* platform dependent support */ -EXPORT_SYMBOL(__udelay); -EXPORT_SYMBOL(__const_udelay); - - /* string / mem functions */ -EXPORT_SYMBOL(strchr); -EXPORT_SYMBOL(strrchr); -EXPORT_SYMBOL(memset); -EXPORT_SYMBOL(memcpy); -EXPORT_SYMBOL(memmove); -EXPORT_SYMBOL(memchr); - - /* user mem (segment) */ -EXPORT_SYMBOL(__strnlen_user); -EXPORT_SYMBOL(__strncpy_from_user); - -EXPORT_SYMBOL(copy_page); - -EXPORT_SYMBOL(raw_copy_from_user); -EXPORT_SYMBOL(raw_copy_to_user); -EXPORT_SYMBOL(__clear_user); - -EXPORT_SYMBOL(__ashldi3); -EXPORT_SYMBOL(__ashrdi3); -EXPORT_SYMBOL(__divsi3); -EXPORT_SYMBOL(__lshrdi3); -EXPORT_SYMBOL(__modsi3); -EXPORT_SYMBOL(__ucmpdi2); -EXPORT_SYMBOL(__udivsi3); -EXPORT_SYMBOL(__umodsi3); - diff --git 
a/arch/unicore32/kernel/ksyms.h b/arch/unicore32/kernel/ksyms.h deleted file mode 100644 index 5d2d5ba324ac..000000000000 --- a/arch/unicore32/kernel/ksyms.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * libgcc functions - functions that are used internally by the - * compiler... (prototypes are not correct though, but that - * doesn't really matter since they're not versioned). - */ -extern void __ashldi3(void); -extern void __ashrdi3(void); -extern void __divsi3(void); -extern void __lshrdi3(void); -extern void __modsi3(void); -extern void __ucmpdi2(void); -extern void __udivsi3(void); -extern void __umodsi3(void); diff --git a/arch/unicore32/kernel/module.c b/arch/unicore32/kernel/module.c deleted file mode 100644 index 67c89ef2d6ee..000000000000 --- a/arch/unicore32/kernel/module.c +++ /dev/null @@ -1,105 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/module.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -void *module_alloc(unsigned long size) -{ - return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, - __builtin_return_address(0)); -} - -int -apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, - unsigned int relindex, struct module *module) -{ - Elf32_Shdr *symsec = sechdrs + symindex; - Elf32_Shdr *relsec = sechdrs + relindex; - Elf32_Shdr *dstsec = sechdrs + relsec->sh_info; - Elf32_Rel *rel = (void *)relsec->sh_addr; - unsigned int i; - - for (i = 0; i < relsec->sh_size / sizeof(Elf32_Rel); i++, rel++) { - unsigned long loc; - Elf32_Sym *sym; - s32 offset; - - offset = ELF32_R_SYM(rel->r_info); - if (offset < 0 || offset > - (symsec->sh_size / sizeof(Elf32_Sym))) { - printk(KERN_ERR "%s: bad relocation, " - "section %d reloc %d\n", - 
module->name, relindex, i); - return -ENOEXEC; - } - - sym = ((Elf32_Sym *)symsec->sh_addr) + offset; - - if (rel->r_offset < 0 || rel->r_offset > - dstsec->sh_size - sizeof(u32)) { - printk(KERN_ERR "%s: out of bounds relocation, " - "section %d reloc %d offset %d size %d\n", - module->name, relindex, i, rel->r_offset, - dstsec->sh_size); - return -ENOEXEC; - } - - loc = dstsec->sh_addr + rel->r_offset; - - switch (ELF32_R_TYPE(rel->r_info)) { - case R_UNICORE_NONE: - /* ignore */ - break; - - case R_UNICORE_ABS32: - *(u32 *)loc += sym->st_value; - break; - - case R_UNICORE_PC24: - case R_UNICORE_CALL: - case R_UNICORE_JUMP24: - offset = (*(u32 *)loc & 0x00ffffff) << 2; - if (offset & 0x02000000) - offset -= 0x04000000; - - offset += sym->st_value - loc; - if (offset & 3 || - offset <= (s32)0xfe000000 || - offset >= (s32)0x02000000) { - printk(KERN_ERR - "%s: relocation out of range, section " - "%d reloc %d sym '%s'\n", module->name, - relindex, i, strtab + sym->st_name); - return -ENOEXEC; - } - - offset >>= 2; - - *(u32 *)loc &= 0xff000000; - *(u32 *)loc |= offset & 0x00ffffff; - break; - - default: - printk(KERN_ERR "%s: unknown relocation: %u\n", - module->name, ELF32_R_TYPE(rel->r_info)); - return -ENOEXEC; - } - } - return 0; -} diff --git a/arch/unicore32/kernel/pci.c b/arch/unicore32/kernel/pci.c deleted file mode 100644 index 0d098aa05b47..000000000000 --- a/arch/unicore32/kernel/pci.c +++ /dev/null @@ -1,371 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/pci.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * PCI bios-type initialisation for PCI machines - */ -#include -#include -#include -#include -#include -#include -#include - -static int debug_pci; - -#define CONFIG_CMD(bus, devfn, where) \ - (0x80000000 | (bus->number << 16) | (devfn << 8) | (where & ~3)) - -static int -puv3_read_config(struct pci_bus *bus, unsigned int devfn, int where, - int size, u32 
*value) -{ - writel(CONFIG_CMD(bus, devfn, where), PCICFG_ADDR); - switch (size) { - case 1: - *value = (readl(PCICFG_DATA) >> ((where & 3) * 8)) & 0xFF; - break; - case 2: - *value = (readl(PCICFG_DATA) >> ((where & 2) * 8)) & 0xFFFF; - break; - case 4: - *value = readl(PCICFG_DATA); - break; - } - return PCIBIOS_SUCCESSFUL; -} - -static int -puv3_write_config(struct pci_bus *bus, unsigned int devfn, int where, - int size, u32 value) -{ - writel(CONFIG_CMD(bus, devfn, where), PCICFG_ADDR); - switch (size) { - case 1: - writel((readl(PCICFG_DATA) & ~FMASK(8, (where&3)*8)) - | FIELD(value, 8, (where&3)*8), PCICFG_DATA); - break; - case 2: - writel((readl(PCICFG_DATA) & ~FMASK(16, (where&2)*8)) - | FIELD(value, 16, (where&2)*8), PCICFG_DATA); - break; - case 4: - writel(value, PCICFG_DATA); - break; - } - return PCIBIOS_SUCCESSFUL; -} - -struct pci_ops pci_puv3_ops = { - .read = puv3_read_config, - .write = puv3_write_config, -}; - -void pci_puv3_preinit(void) -{ - printk(KERN_DEBUG "PCI: PKUnity PCI Controller Initializing ...\n"); - /* config PCI bridge base */ - writel(io_v2p(PKUNITY_PCIBRI_BASE), PCICFG_BRIBASE); - - writel(0, PCIBRI_AHBCTL0); - writel(io_v2p(PKUNITY_PCIBRI_BASE) | PCIBRI_BARx_MEM, PCIBRI_AHBBAR0); - writel(0xFFFF0000, PCIBRI_AHBAMR0); - writel(0, PCIBRI_AHBTAR0); - - writel(PCIBRI_CTLx_AT, PCIBRI_AHBCTL1); - writel(io_v2p(PKUNITY_PCILIO_BASE) | PCIBRI_BARx_IO, PCIBRI_AHBBAR1); - writel(0xFFFF0000, PCIBRI_AHBAMR1); - writel(0x00000000, PCIBRI_AHBTAR1); - - writel(PCIBRI_CTLx_PREF, PCIBRI_AHBCTL2); - writel(io_v2p(PKUNITY_PCIMEM_BASE) | PCIBRI_BARx_MEM, PCIBRI_AHBBAR2); - writel(0xF8000000, PCIBRI_AHBAMR2); - writel(0, PCIBRI_AHBTAR2); - - writel(io_v2p(PKUNITY_PCIAHB_BASE) | PCIBRI_BARx_MEM, PCIBRI_BAR1); - - writel(PCIBRI_CTLx_AT | PCIBRI_CTLx_PREF, PCIBRI_PCICTL0); - writel(io_v2p(PKUNITY_PCIAHB_BASE) | PCIBRI_BARx_MEM, PCIBRI_PCIBAR0); - writel(0xF8000000, PCIBRI_PCIAMR0); - writel(PKUNITY_SDRAM_BASE, PCIBRI_PCITAR0); - - 
writel(readl(PCIBRI_CMD) | PCIBRI_CMD_IO | PCIBRI_CMD_MEM, PCIBRI_CMD); -} - -static int pci_puv3_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) -{ - if (dev->bus->number == 0) { -#ifdef CONFIG_ARCH_FPGA /* 4 pci slots */ - if (dev->devfn == 0x00) - return IRQ_PCIINTA; - else if (dev->devfn == 0x08) - return IRQ_PCIINTB; - else if (dev->devfn == 0x10) - return IRQ_PCIINTC; - else if (dev->devfn == 0x18) - return IRQ_PCIINTD; -#endif -#ifdef CONFIG_PUV3_DB0913 /* 3 pci slots */ - if (dev->devfn == 0x30) - return IRQ_PCIINTB; - else if (dev->devfn == 0x60) - return IRQ_PCIINTC; - else if (dev->devfn == 0x58) - return IRQ_PCIINTD; -#endif -#if defined(CONFIG_PUV3_NB0916) || defined(CONFIG_PUV3_SMW0919) - /* only support 2 pci devices */ - if (dev->devfn == 0x00) - return IRQ_PCIINTC; /* sata */ -#endif - } - return -1; -} - -/* - * Only first 128MB of memory can be accessed via PCI. - * We use GFP_DMA to allocate safe buffers to do map/unmap. - * This is really ugly and we need a better way of specifying - * DMA-capable regions of memory. - */ -void __init puv3_pci_adjust_zones(unsigned long max_zone_pfn) -{ - unsigned int sz = SZ_128M >> PAGE_SHIFT; - - max_zone_pfn[ZONE_DMA] = sz; -} - -/* - * If the bus contains any of these devices, then we must not turn on - * parity checking of any kind. - */ -static inline int pdev_bad_for_parity(struct pci_dev *dev) -{ - return 0; -} - -/* - * pcibios_fixup_bus - Called after each bus is probed, - * but before its children are examined. - */ -void pcibios_fixup_bus(struct pci_bus *bus) -{ - struct pci_dev *dev; - u16 features = PCI_COMMAND_SERR - | PCI_COMMAND_PARITY - | PCI_COMMAND_FAST_BACK; - - bus->resource[0] = &ioport_resource; - bus->resource[1] = &iomem_resource; - - /* - * Walk the devices on this bus, working out what we can - * and can't support. 
- */ - list_for_each_entry(dev, &bus->devices, bus_list) { - u16 status; - - pci_read_config_word(dev, PCI_STATUS, &status); - - /* - * If any device on this bus does not support fast back - * to back transfers, then the bus as a whole is not able - * to support them. Having fast back to back transfers - * on saves us one PCI cycle per transaction. - */ - if (!(status & PCI_STATUS_FAST_BACK)) - features &= ~PCI_COMMAND_FAST_BACK; - - if (pdev_bad_for_parity(dev)) - features &= ~(PCI_COMMAND_SERR - | PCI_COMMAND_PARITY); - - switch (dev->class >> 8) { - case PCI_CLASS_BRIDGE_PCI: - pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &status); - status |= PCI_BRIDGE_CTL_PARITY - | PCI_BRIDGE_CTL_MASTER_ABORT; - status &= ~(PCI_BRIDGE_CTL_BUS_RESET - | PCI_BRIDGE_CTL_FAST_BACK); - pci_write_config_word(dev, PCI_BRIDGE_CONTROL, status); - break; - - case PCI_CLASS_BRIDGE_CARDBUS: - pci_read_config_word(dev, PCI_CB_BRIDGE_CONTROL, - &status); - status |= PCI_CB_BRIDGE_CTL_PARITY - | PCI_CB_BRIDGE_CTL_MASTER_ABORT; - pci_write_config_word(dev, PCI_CB_BRIDGE_CONTROL, - status); - break; - } - } - - /* - * Now walk the devices again, this time setting them up. - */ - list_for_each_entry(dev, &bus->devices, bus_list) { - u16 cmd; - - pci_read_config_word(dev, PCI_COMMAND, &cmd); - cmd |= features; - pci_write_config_word(dev, PCI_COMMAND, cmd); - - pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE, - L1_CACHE_BYTES >> 2); - } - - /* - * Propagate the flags to the PCI bridge. - */ - if (bus->self && bus->self->hdr_type == PCI_HEADER_TYPE_BRIDGE) { - if (features & PCI_COMMAND_FAST_BACK) - bus->bridge_ctl |= PCI_BRIDGE_CTL_FAST_BACK; - if (features & PCI_COMMAND_PARITY) - bus->bridge_ctl |= PCI_BRIDGE_CTL_PARITY; - } - - /* - * Report what we did for this bus - */ - printk(KERN_INFO "PCI: bus%d: Fast back to back transfers %sabled\n", - bus->number, (features & PCI_COMMAND_FAST_BACK) ? 
"en" : "dis"); -} -EXPORT_SYMBOL(pcibios_fixup_bus); - -static struct resource busn_resource = { - .name = "PCI busn", - .start = 0, - .end = 255, - .flags = IORESOURCE_BUS, -}; - -static int __init pci_common_init(void) -{ - struct pci_bus *puv3_bus; - struct pci_host_bridge *bridge; - int ret; - - bridge = pci_alloc_host_bridge(0); - if (!bridge) - return -ENOMEM; - - pci_puv3_preinit(); - - pci_add_resource(&bridge->windows, &ioport_resource); - pci_add_resource(&bridge->windows, &iomem_resource); - pci_add_resource(&bridge->windows, &busn_resource); - bridge->sysdata = NULL; - bridge->busnr = 0; - bridge->ops = &pci_puv3_ops; - bridge->swizzle_irq = pci_common_swizzle; - bridge->map_irq = pci_puv3_map_irq; - - /* Scan our single hose. */ - ret = pci_scan_root_bus_bridge(bridge); - if (ret) { - pci_free_host_bridge(bridge); - return; - } - - puv3_bus = bridge->bus; - - if (!puv3_bus) - panic("PCI: unable to scan bus!"); - - pci_bus_size_bridges(puv3_bus); - pci_bus_assign_resources(puv3_bus); - pci_bus_add_devices(puv3_bus); - return 0; -} -subsys_initcall(pci_common_init); - -char * __init pcibios_setup(char *str) -{ - if (!strcmp(str, "debug")) { - debug_pci = 1; - return NULL; - } - return str; -} - -void pcibios_set_master(struct pci_dev *dev) -{ - /* No special bus mastering setup handling */ -} - -/* - * From arch/i386/kernel/pci-i386.c: - * - * We need to avoid collisions with `mirrored' VGA ports - * and other strange ISA hardware, so we always want the - * addresses to be allocated in the 0x000-0x0ff region - * modulo 0x400. - * - * Why? Because some silly external IO cards only decode - * the low 10 bits of the IO address. The 0x00-0xff region - * is reserved for motherboard devices that decode all 16 - * bits, so it's ok to allocate at, say, 0x2800-0x28ff, - * but we want to try to avoid allocating at 0x2900-0x2bff - * which might be mirrored at 0x0100-0x03ff.. 
- */ -resource_size_t pcibios_align_resource(void *data, const struct resource *res, - resource_size_t size, resource_size_t align) -{ - resource_size_t start = res->start; - - if (res->flags & IORESOURCE_IO && start & 0x300) - start = (start + 0x3ff) & ~0x3ff; - - start = (start + align - 1) & ~(align - 1); - - return start; -} - -/** - * pcibios_enable_device - Enable I/O and memory. - * @dev: PCI device to be enabled - */ -int pcibios_enable_device(struct pci_dev *dev, int mask) -{ - u16 cmd, old_cmd; - int idx; - struct resource *r; - - pci_read_config_word(dev, PCI_COMMAND, &cmd); - old_cmd = cmd; - for (idx = 0; idx < 6; idx++) { - /* Only set up the requested stuff */ - if (!(mask & (1 << idx))) - continue; - - r = dev->resource + idx; - if (!r->start && r->end) { - printk(KERN_ERR "PCI: Device %s not available because" - " of resource collisions\n", pci_name(dev)); - return -EINVAL; - } - if (r->flags & IORESOURCE_IO) - cmd |= PCI_COMMAND_IO; - if (r->flags & IORESOURCE_MEM) - cmd |= PCI_COMMAND_MEMORY; - } - - /* - * Bridges (eg, cardbus bridges) need to be fully enabled - */ - if ((dev->class >> 16) == PCI_BASE_CLASS_BRIDGE) - cmd |= PCI_COMMAND_IO | PCI_COMMAND_MEMORY; - - if (cmd != old_cmd) { - printk("PCI: enabling device %s (%04x -> %04x)\n", - pci_name(dev), old_cmd, cmd); - pci_write_config_word(dev, PCI_COMMAND, cmd); - } - return 0; -} diff --git a/arch/unicore32/kernel/pm.c b/arch/unicore32/kernel/pm.c deleted file mode 100644 index 94b7f9df6c1a..000000000000 --- a/arch/unicore32/kernel/pm.c +++ /dev/null @@ -1,121 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/pm.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Maintained by GUAN Xue-tao - * Copyright (C) 2001-2010 Guan Xuetao - */ -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "setup.h" - -struct puv3_cpu_pm_fns *puv3_cpu_pm_fns; -static unsigned long *sleep_save; - -int 
puv3_pm_enter(suspend_state_t state) -{ - unsigned long sleep_save_checksum = 0, checksum = 0; - int i; - - /* skip registers saving for standby */ - if (state != PM_SUSPEND_STANDBY) { - puv3_cpu_pm_fns->save(sleep_save); - /* before sleeping, calculate and save a checksum */ - for (i = 0; i < puv3_cpu_pm_fns->save_count - 1; i++) - sleep_save_checksum += sleep_save[i]; - } - - /* *** go zzz *** */ - puv3_cpu_pm_fns->enter(state); - cpu_init(); -#ifdef CONFIG_INPUT_KEYBOARD - puv3_ps2_init(); -#endif -#ifdef CONFIG_PCI - pci_puv3_preinit(); -#endif - if (state != PM_SUSPEND_STANDBY) { - /* after sleeping, validate the checksum */ - for (i = 0; i < puv3_cpu_pm_fns->save_count - 1; i++) - checksum += sleep_save[i]; - - /* if invalid, display message and wait for a hardware reset */ - if (checksum != sleep_save_checksum) { - while (1) - puv3_cpu_pm_fns->enter(state); - } - puv3_cpu_pm_fns->restore(sleep_save); - } - - pr_debug("*** made it back from resume\n"); - - return 0; -} -EXPORT_SYMBOL_GPL(puv3_pm_enter); - -unsigned long sleep_phys_sp(void *sp) -{ - return virt_to_phys(sp); -} - -static int puv3_pm_valid(suspend_state_t state) -{ - if (puv3_cpu_pm_fns) - return puv3_cpu_pm_fns->valid(state); - - return -EINVAL; -} - -static int puv3_pm_prepare(void) -{ - int ret = 0; - - if (puv3_cpu_pm_fns && puv3_cpu_pm_fns->prepare) - ret = puv3_cpu_pm_fns->prepare(); - - return ret; -} - -static void puv3_pm_finish(void) -{ - if (puv3_cpu_pm_fns && puv3_cpu_pm_fns->finish) - puv3_cpu_pm_fns->finish(); -} - -static struct platform_suspend_ops puv3_pm_ops = { - .valid = puv3_pm_valid, - .enter = puv3_pm_enter, - .prepare = puv3_pm_prepare, - .finish = puv3_pm_finish, -}; - -static int __init puv3_pm_init(void) -{ - if (!puv3_cpu_pm_fns) { - printk(KERN_ERR "no valid puv3_cpu_pm_fns defined\n"); - return -EINVAL; - } - - sleep_save = kmalloc_array(puv3_cpu_pm_fns->save_count, - sizeof(unsigned long), - GFP_KERNEL); - if (!sleep_save) { - printk(KERN_ERR "failed to alloc 
memory for pm save\n"); - return -ENOMEM; - } - - suspend_set_ops(&puv3_pm_ops); - return 0; -} - -device_initcall(puv3_pm_init); diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c deleted file mode 100644 index b4fd3a604a18..000000000000 --- a/arch/unicore32/kernel/process.c +++ /dev/null @@ -1,319 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/process.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "setup.h" - -static const char * const processor_modes[] = { - "UK00", "UK01", "UK02", "UK03", "UK04", "UK05", "UK06", "UK07", - "UK08", "UK09", "UK0A", "UK0B", "UK0C", "UK0D", "UK0E", "UK0F", - "USER", "REAL", "INTR", "PRIV", "UK14", "UK15", "UK16", "ABRT", - "UK18", "UK19", "UK1A", "EXTN", "UK1C", "UK1D", "UK1E", "SUSR" -}; - -void arch_cpu_idle(void) -{ - cpu_do_idle(); - local_irq_enable(); -} - -void machine_halt(void) -{ - gpio_set_value(GPO_SOFT_OFF, 0); -} - -/* - * Function pointers to optional machine specific functions - */ -void (*pm_power_off)(void) = NULL; -EXPORT_SYMBOL(pm_power_off); - -void machine_power_off(void) -{ - if (pm_power_off) - pm_power_off(); - machine_halt(); -} - -void machine_restart(char *cmd) -{ - /* Disable interrupts first */ - local_irq_disable(); - - /* - * Tell the mm system that we are going to reboot - - * we may need it to insert some 1:1 mappings so that - * soft boot works. - */ - setup_mm_for_reboot(); - - /* Clean and invalidate caches */ - flush_cache_all(); - - /* Turn off caching */ - cpu_proc_fin(); - - /* Push out any further dirty data, and ensure cache is empty */ - flush_cache_all(); - - /* - * Now handle reboot code. 
- */ - if (reboot_mode == REBOOT_SOFT) { - /* Jump into ROM at address 0xffff0000 */ - cpu_reset(VECTORS_BASE); - } else { - writel(0x00002001, PM_PLLSYSCFG); /* cpu clk = 250M */ - writel(0x00100800, PM_PLLDDRCFG); /* ddr clk = 44M */ - writel(0x00002001, PM_PLLVGACFG); /* vga clk = 250M */ - - /* Use on-chip reset capability */ - /* following instructions must be in one icache line */ - __asm__ __volatile__( - " .align 5\n\t" - " stw %1, [%0]\n\t" - "201: ldw r0, [%0]\n\t" - " cmpsub.a r0, #0\n\t" - " bne 201b\n\t" - " stw %3, [%2]\n\t" - " nop; nop; nop\n\t" - /* prefetch 3 instructions at most */ - : - : "r" (PM_PMCR), - "r" (PM_PMCR_CFBSYS | PM_PMCR_CFBDDR - | PM_PMCR_CFBVGA), - "r" (RESETC_SWRR), - "r" (RESETC_SWRR_SRB) - : "r0", "memory"); - } - - /* - * Whoops - the architecture was unable to reboot. - * Tell the user! - */ - mdelay(1000); - printk(KERN_EMERG "Reboot failed -- System halted\n"); - do { } while (1); -} - -void __show_regs(struct pt_regs *regs) -{ - unsigned long flags; - char buf[64]; - - show_regs_print_info(KERN_DEFAULT); - printk("PC is at %pS\n", (void *)instruction_pointer(regs)); - printk("LR is at %pS\n", (void *)regs->UCreg_lr); - printk(KERN_DEFAULT "pc : [<%08lx>] lr : [<%08lx>] psr: %08lx\n" - "sp : %08lx ip : %08lx fp : %08lx\n", - regs->UCreg_pc, regs->UCreg_lr, regs->UCreg_asr, - regs->UCreg_sp, regs->UCreg_ip, regs->UCreg_fp); - printk(KERN_DEFAULT "r26: %08lx r25: %08lx r24: %08lx\n", - regs->UCreg_26, regs->UCreg_25, - regs->UCreg_24); - printk(KERN_DEFAULT "r23: %08lx r22: %08lx r21: %08lx r20: %08lx\n", - regs->UCreg_23, regs->UCreg_22, - regs->UCreg_21, regs->UCreg_20); - printk(KERN_DEFAULT "r19: %08lx r18: %08lx r17: %08lx r16: %08lx\n", - regs->UCreg_19, regs->UCreg_18, - regs->UCreg_17, regs->UCreg_16); - printk(KERN_DEFAULT "r15: %08lx r14: %08lx r13: %08lx r12: %08lx\n", - regs->UCreg_15, regs->UCreg_14, - regs->UCreg_13, regs->UCreg_12); - printk(KERN_DEFAULT "r11: %08lx r10: %08lx r9 : %08lx r8 : %08lx\n", - 
regs->UCreg_11, regs->UCreg_10, - regs->UCreg_09, regs->UCreg_08); - printk(KERN_DEFAULT "r7 : %08lx r6 : %08lx r5 : %08lx r4 : %08lx\n", - regs->UCreg_07, regs->UCreg_06, - regs->UCreg_05, regs->UCreg_04); - printk(KERN_DEFAULT "r3 : %08lx r2 : %08lx r1 : %08lx r0 : %08lx\n", - regs->UCreg_03, regs->UCreg_02, - regs->UCreg_01, regs->UCreg_00); - - flags = regs->UCreg_asr; - buf[0] = flags & PSR_S_BIT ? 'S' : 's'; - buf[1] = flags & PSR_Z_BIT ? 'Z' : 'z'; - buf[2] = flags & PSR_C_BIT ? 'C' : 'c'; - buf[3] = flags & PSR_V_BIT ? 'V' : 'v'; - buf[4] = '\0'; - - printk(KERN_DEFAULT "Flags: %s INTR o%s REAL o%s Mode %s Segment %s\n", - buf, interrupts_enabled(regs) ? "n" : "ff", - fast_interrupts_enabled(regs) ? "n" : "ff", - processor_modes[processor_mode(regs)], - uaccess_kernel() ? "kernel" : "user"); - { - unsigned int ctrl; - - buf[0] = '\0'; - { - unsigned int transbase; - asm("movc %0, p0.c2, #0\n" - : "=r" (transbase)); - snprintf(buf, sizeof(buf), " Table: %08x", transbase); - } - asm("movc %0, p0.c1, #0\n" : "=r" (ctrl)); - - printk(KERN_DEFAULT "Control: %08x%s\n", ctrl, buf); - } -} - -void show_regs(struct pt_regs *regs) -{ - printk(KERN_DEFAULT "\n"); - printk(KERN_DEFAULT "Pid: %d, comm: %20s\n", - task_pid_nr(current), current->comm); - __show_regs(regs); - __backtrace(); -} - -void flush_thread(void) -{ - struct thread_info *thread = current_thread_info(); - struct task_struct *tsk = current; - - memset(thread->used_cp, 0, sizeof(thread->used_cp)); - memset(&tsk->thread.debug, 0, sizeof(struct debug_info)); -#ifdef CONFIG_UNICORE_FPU_F64 - memset(&thread->fpstate, 0, sizeof(struct fp_state)); -#endif -} - -void release_thread(struct task_struct *dead_task) -{ -} - -asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); -asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread"); - -int -copy_thread(unsigned long clone_flags, unsigned long stack_start, - unsigned long stk_sz, struct task_struct *p) -{ - struct thread_info 
*thread = task_thread_info(p); - struct pt_regs *childregs = task_pt_regs(p); - - memset(&thread->cpu_context, 0, sizeof(struct cpu_context_save)); - thread->cpu_context.sp = (unsigned long)childregs; - if (unlikely(p->flags & PF_KTHREAD)) { - thread->cpu_context.pc = (unsigned long)ret_from_kernel_thread; - thread->cpu_context.r4 = stack_start; - thread->cpu_context.r5 = stk_sz; - memset(childregs, 0, sizeof(struct pt_regs)); - } else { - thread->cpu_context.pc = (unsigned long)ret_from_fork; - *childregs = *current_pt_regs(); - childregs->UCreg_00 = 0; - if (stack_start) - childregs->UCreg_sp = stack_start; - - if (clone_flags & CLONE_SETTLS) - childregs->UCreg_16 = childregs->UCreg_03; - } - return 0; -} - -/* - * Fill in the task's elfregs structure for a core dump. - */ -int dump_task_regs(struct task_struct *t, elf_gregset_t *elfregs) -{ - elf_core_copy_regs(elfregs, task_pt_regs(t)); - return 1; -} - -/* - * fill in the fpe structure for a core dump... - */ -int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fp) -{ - struct thread_info *thread = current_thread_info(); - int used_math = thread->used_cp[1] | thread->used_cp[2]; - -#ifdef CONFIG_UNICORE_FPU_F64 - if (used_math) - memcpy(fp, &thread->fpstate, sizeof(*fp)); -#endif - return used_math != 0; -} -EXPORT_SYMBOL(dump_fpu); - -unsigned long get_wchan(struct task_struct *p) -{ - struct stackframe frame; - int count = 0; - if (!p || p == current || p->state == TASK_RUNNING) - return 0; - - frame.fp = thread_saved_fp(p); - frame.sp = thread_saved_sp(p); - frame.lr = 0; /* recovered from the stack */ - frame.pc = thread_saved_pc(p); - do { - int ret = unwind_frame(&frame); - if (ret < 0) - return 0; - if (!in_sched_functions(frame.pc)) - return frame.pc; - } while ((count++) < 16); - return 0; -} - -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - return randomize_page(mm->brk, 0x02000000); -} - -/* - * The vectors page is always readable from user space for the - * atomic helpers and the signal 
restart code. Let's declare a mapping - * for it so it is visible through ptrace and /proc//mem. - */ - -int vectors_user_mapping(void) -{ - struct mm_struct *mm = current->mm; - return install_special_mapping(mm, 0xffff0000, PAGE_SIZE, - VM_READ | VM_EXEC | - VM_MAYREAD | VM_MAYEXEC | - VM_DONTEXPAND | VM_DONTDUMP, - NULL); -} - -const char *arch_vma_name(struct vm_area_struct *vma) -{ - return (vma->vm_start == 0xffff0000) ? "[vectors]" : NULL; -} diff --git a/arch/unicore32/kernel/ptrace.c b/arch/unicore32/kernel/ptrace.c deleted file mode 100644 index 0f216567b90a..000000000000 --- a/arch/unicore32/kernel/ptrace.c +++ /dev/null @@ -1,147 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/ptrace.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * By Ross Biro 1/23/92 - */ -#include -#include -#include -#include -#include - -/* - * this routine will get a word off of the processes privileged stack. - * the offset is how far from the base addr as stored in the THREAD. - * this routine assumes that all the privileged stacks are in our - * data space. - */ -static inline long get_user_reg(struct task_struct *task, int offset) -{ - return task_pt_regs(task)->uregs[offset]; -} - -/* - * this routine will put a word on the processes privileged stack. - * the offset is how far from the base addr as stored in the THREAD. - * this routine assumes that all the privileged stacks are in our - * data space. - */ -static inline int -put_user_reg(struct task_struct *task, int offset, long data) -{ - struct pt_regs newregs, *regs = task_pt_regs(task); - int ret = -EINVAL; - - newregs = *regs; - newregs.uregs[offset] = data; - - if (valid_user_regs(&newregs)) { - regs->uregs[offset] = data; - ret = 0; - } - - return ret; -} - -/* - * Called by kernel/ptrace.c when detaching.. 
- */ -void ptrace_disable(struct task_struct *child) -{ -} - -/* - * We actually access the pt_regs stored on the kernel stack. - */ -static int ptrace_read_user(struct task_struct *tsk, unsigned long off, - unsigned long __user *ret) -{ - unsigned long tmp; - - tmp = 0; - if (off < sizeof(struct pt_regs)) - tmp = get_user_reg(tsk, off >> 2); - - return put_user(tmp, ret); -} - -/* - * We actually access the pt_regs stored on the kernel stack. - */ -static int ptrace_write_user(struct task_struct *tsk, unsigned long off, - unsigned long val) -{ - if (off >= sizeof(struct pt_regs)) - return 0; - - return put_user_reg(tsk, off >> 2, val); -} - -long arch_ptrace(struct task_struct *child, long request, - unsigned long addr, unsigned long data) -{ - int ret; - unsigned long __user *datap = (unsigned long __user *) data; - - switch (request) { - case PTRACE_PEEKUSR: - ret = ptrace_read_user(child, addr, datap); - break; - - case PTRACE_POKEUSR: - ret = ptrace_write_user(child, addr, data); - break; - - case PTRACE_GET_THREAD_AREA: - ret = put_user(task_pt_regs(child)->UCreg_16, - datap); - break; - - default: - ret = ptrace_request(child, request, addr, data); - break; - } - - return ret; -} - -asmlinkage int syscall_trace(int why, struct pt_regs *regs, int scno) -{ - unsigned long ip; - - if (!test_thread_flag(TIF_SYSCALL_TRACE)) - return scno; - if (!(current->ptrace & PT_PTRACED)) - return scno; - - /* - * Save IP. IP is used to denote syscall entry/exit: - * IP = 0 -> entry, = 1 -> exit - */ - ip = regs->UCreg_ip; - regs->UCreg_ip = why; - - current_thread_info()->syscall = scno; - - /* the 0x80 provides a way for the tracing parent to distinguish - between a syscall stop and SIGTRAP delivery */ - ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) - ? 0x80 : 0)); - /* - * this isn't the same as continuing with a signal, but it will do - * for normal use. strace only continues with a signal if the - * stopping signal is not SIGTRAP. 
-brl - */ - if (current->exit_code) { - send_sig(current->exit_code, current, 1); - current->exit_code = 0; - } - regs->UCreg_ip = ip; - - return current_thread_info()->syscall; -} diff --git a/arch/unicore32/kernel/puv3-core.c b/arch/unicore32/kernel/puv3-core.c deleted file mode 100644 index 78f12e627365..000000000000 --- a/arch/unicore32/kernel/puv3-core.c +++ /dev/null @@ -1,276 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/puv3-core.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Maintained by GUAN Xue-tao - * Copyright (C) 2001-2010 Guan Xuetao - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -/* - * This is the PKUnity sched_clock implementation. This has - * a resolution of 271ns, and a maximum value of 32025597s (370 days). - * - * The return value is guaranteed to be monotonic in that range as - * long as there is always less than 582 seconds between successive - * calls to this function. - * - * ( * 1E9 / CLOCK_TICK_RATE ) -> about 2235/32 - */ -unsigned long long sched_clock(void) -{ - unsigned long long v = cnt32_to_63(readl(OST_OSCR)); - - /* original conservative method, but overflow frequently - * v *= NSEC_PER_SEC >> 12; - * do_div(v, CLOCK_TICK_RATE >> 12); - */ - v = ((v & 0x7fffffffffffffffULL) * 2235) >> 5; - - return v; -} - -static struct resource puv3_usb_resources[] = { - /* order is significant! 
*/ - { - .start = io_v2p(PKUNITY_USB_BASE), - .end = io_v2p(PKUNITY_USB_BASE) + 0x3ff, - .flags = IORESOURCE_MEM, - }, { - .start = IRQ_USB, - .flags = IORESOURCE_IRQ, - }, { - .start = IRQ_USB, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct musb_hdrc_config puv3_usb_config[] = { - { - .num_eps = 16, - .multipoint = 1, -#ifdef CONFIG_USB_INVENTRA_DMA - .dma = 1, - .dma_channels = 8, -#endif - }, -}; - -static struct musb_hdrc_platform_data puv3_usb_plat = { - .mode = MUSB_HOST, - .min_power = 100, - .clock = 0, - .config = puv3_usb_config, -}; - -static struct resource puv3_mmc_resources[] = { - [0] = { - .start = io_v2p(PKUNITY_SDC_BASE), - .end = io_v2p(PKUNITY_SDC_BASE) + 0xfff, - .flags = IORESOURCE_MEM, - }, - [1] = { - .start = IRQ_SDC, - .end = IRQ_SDC, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct resource puv3_unigfx_resources[] = { - [0] = { - .start = io_v2p(PKUNITY_UNIGFX_BASE), - .end = io_v2p(PKUNITY_UNIGFX_BASE) + 0xfff, - .flags = IORESOURCE_MEM, - }, -}; - -static struct resource puv3_rtc_resources[] = { - [0] = { - .start = io_v2p(PKUNITY_RTC_BASE), - .end = io_v2p(PKUNITY_RTC_BASE) + 0xff, - .flags = IORESOURCE_MEM, - }, - [1] = { - .start = IRQ_RTCAlarm, - .end = IRQ_RTCAlarm, - .flags = IORESOURCE_IRQ, - }, - [2] = { - .start = IRQ_RTC, - .end = IRQ_RTC, - .flags = IORESOURCE_IRQ - } -}; - -static struct resource puv3_pwm_resources[] = { - [0] = { - .start = io_v2p(PKUNITY_OST_BASE) + 0x80, - .end = io_v2p(PKUNITY_OST_BASE) + 0xff, - .flags = IORESOURCE_MEM, - }, -}; - -static struct resource puv3_uart0_resources[] = { - [0] = { - .start = io_v2p(PKUNITY_UART0_BASE), - .end = io_v2p(PKUNITY_UART0_BASE) + 0xff, - .flags = IORESOURCE_MEM, - }, - [1] = { - .start = IRQ_UART0, - .end = IRQ_UART0, - .flags = IORESOURCE_IRQ - } -}; - -static struct resource puv3_uart1_resources[] = { - [0] = { - .start = io_v2p(PKUNITY_UART1_BASE), - .end = io_v2p(PKUNITY_UART1_BASE) + 0xff, - .flags = IORESOURCE_MEM, - }, - [1] = { - .start = IRQ_UART1, 
- .end = IRQ_UART1, - .flags = IORESOURCE_IRQ - } -}; - -static struct resource puv3_umal_resources[] = { - [0] = { - .start = io_v2p(PKUNITY_UMAL_BASE), - .end = io_v2p(PKUNITY_UMAL_BASE) + 0x1fff, - .flags = IORESOURCE_MEM, - }, - [1] = { - .start = IRQ_UMAL, - .end = IRQ_UMAL, - .flags = IORESOURCE_IRQ - } -}; - -#ifdef CONFIG_PUV3_PM - -#define SAVE(x) sleep_save[SLEEP_SAVE_##x] = x -#define RESTORE(x) x = sleep_save[SLEEP_SAVE_##x] - -/* - * List of global PXA peripheral registers to preserve. - * More ones like CP and general purpose register values are preserved - * with the stack pointer in sleep.S. - */ -enum { - SLEEP_SAVE_PM_PLLDDRCFG, - SLEEP_SAVE_COUNT -}; - - -static void puv3_cpu_pm_save(unsigned long *sleep_save) -{ -/* SAVE(PM_PLLDDRCFG); */ -} - -static void puv3_cpu_pm_restore(unsigned long *sleep_save) -{ -/* RESTORE(PM_PLLDDRCFG); */ -} - -static int puv3_cpu_pm_prepare(void) -{ - /* set resume return address */ - writel(virt_to_phys(puv3_cpu_resume), PM_DIVCFG); - return 0; -} - -static void puv3_cpu_pm_enter(suspend_state_t state) -{ - /* Clear reset status */ - writel(RESETC_RSSR_HWR | RESETC_RSSR_WDR - | RESETC_RSSR_SMR | RESETC_RSSR_SWR, RESETC_RSSR); - - switch (state) { -/* case PM_SUSPEND_ON: - puv3_cpu_idle(); - break; */ - case PM_SUSPEND_MEM: - puv3_cpu_pm_prepare(); - puv3_cpu_suspend(PM_PMCR_SFB); - break; - } -} - -static int puv3_cpu_pm_valid(suspend_state_t state) -{ - return state == PM_SUSPEND_MEM; -} - -static void puv3_cpu_pm_finish(void) -{ - /* ensure not to come back here if it wasn't intended */ - /* PSPR = 0; */ -} - -static struct puv3_cpu_pm_fns puv3_cpu_pm_fnss = { - .save_count = SLEEP_SAVE_COUNT, - .valid = puv3_cpu_pm_valid, - .save = puv3_cpu_pm_save, - .restore = puv3_cpu_pm_restore, - .enter = puv3_cpu_pm_enter, - .prepare = puv3_cpu_pm_prepare, - .finish = puv3_cpu_pm_finish, -}; - -static void __init puv3_init_pm(void) -{ - puv3_cpu_pm_fns = &puv3_cpu_pm_fnss; -} -#else -static inline void puv3_init_pm(void) 
{} -#endif - -void puv3_ps2_init(void) -{ - struct clk *bclk32; - - bclk32 = clk_get(NULL, "BUS32_CLK"); - writel(clk_get_rate(bclk32) / 200000, PS2_CNT); /* should > 5us */ -} - -void __init puv3_core_init(void) -{ - puv3_init_pm(); - puv3_ps2_init(); - - platform_device_register_simple("PKUnity-v3-RTC", -1, - puv3_rtc_resources, ARRAY_SIZE(puv3_rtc_resources)); - platform_device_register_simple("PKUnity-v3-UMAL", -1, - puv3_umal_resources, ARRAY_SIZE(puv3_umal_resources)); - platform_device_register_simple("PKUnity-v3-MMC", -1, - puv3_mmc_resources, ARRAY_SIZE(puv3_mmc_resources)); - platform_device_register_simple("PKUnity-v3-UNIGFX", -1, - puv3_unigfx_resources, ARRAY_SIZE(puv3_unigfx_resources)); - platform_device_register_simple("PKUnity-v3-PWM", -1, - puv3_pwm_resources, ARRAY_SIZE(puv3_pwm_resources)); - platform_device_register_simple("PKUnity-v3-UART", 0, - puv3_uart0_resources, ARRAY_SIZE(puv3_uart0_resources)); - platform_device_register_simple("PKUnity-v3-UART", 1, - puv3_uart1_resources, ARRAY_SIZE(puv3_uart1_resources)); - platform_device_register_simple("PKUnity-v3-AC97", -1, NULL, 0); - platform_device_register_resndata(NULL, "musb_hdrc", -1, - puv3_usb_resources, ARRAY_SIZE(puv3_usb_resources), - &puv3_usb_plat, sizeof(puv3_usb_plat)); -} - diff --git a/arch/unicore32/kernel/puv3-nb0916.c b/arch/unicore32/kernel/puv3-nb0916.c deleted file mode 100644 index e251f5028396..000000000000 --- a/arch/unicore32/kernel/puv3-nb0916.c +++ /dev/null @@ -1,147 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/puv3-nb0916.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Maintained by GUAN Xue-tao - * Copyright (C) 2001-2010 Guan Xuetao - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -static struct physmap_flash_data physmap_flash_data = { - .width = 1, -}; - -static struct resource physmap_flash_resource = { - 
.start = 0xFFF80000, - .end = 0xFFFFFFFF, - .flags = IORESOURCE_MEM, -}; - -static struct resource puv3_i2c_resources[] = { - [0] = { - .start = io_v2p(PKUNITY_I2C_BASE), - .end = io_v2p(PKUNITY_I2C_BASE) + 0xff, - .flags = IORESOURCE_MEM, - }, - [1] = { - .start = IRQ_I2C, - .end = IRQ_I2C, - .flags = IORESOURCE_IRQ, - } -}; - -static struct pwm_lookup nb0916_pwm_lookup[] = { - PWM_LOOKUP("PKUnity-v3-PWM", 0, "pwm-backlight", NULL, 70 * 1024, - PWM_POLARITY_NORMAL), -}; - -static struct platform_pwm_backlight_data nb0916_backlight_data = { - .max_brightness = 100, - .dft_brightness = 100, -}; - -static struct gpio_keys_button nb0916_gpio_keys[] = { - { - .type = EV_KEY, - .code = KEY_POWER, - .gpio = GPI_SOFF_REQ, - .desc = "Power Button", - .wakeup = 1, - .active_low = 1, - }, - { - .type = EV_KEY, - .code = BTN_TOUCH, - .gpio = GPI_BTN_TOUCH, - .desc = "Touchpad Button", - .wakeup = 1, - .active_low = 1, - }, -}; - -static struct gpio_keys_platform_data nb0916_gpio_button_data = { - .buttons = nb0916_gpio_keys, - .nbuttons = ARRAY_SIZE(nb0916_gpio_keys), -}; - -static irqreturn_t nb0916_lcdcaseoff_handler(int irq, void *dev_id) -{ - if (gpio_get_value(GPI_LCD_CASE_OFF)) - gpio_set_value(GPO_LCD_EN, 1); - else - gpio_set_value(GPO_LCD_EN, 0); - - return IRQ_HANDLED; -} - -static irqreturn_t nb0916_overheat_handler(int irq, void *dev_id) -{ - machine_halt(); - /* SYSTEM HALT, NO RETURN */ - return IRQ_HANDLED; -} - -static struct i2c_board_info __initdata puv3_i2c_devices[] = { - { I2C_BOARD_INFO("lm75", I2C_TAR_THERMAL), }, - { I2C_BOARD_INFO("bq27200", I2C_TAR_PWIC), }, - { I2C_BOARD_INFO("24c02", I2C_TAR_EEPROM), }, -}; - -int __init mach_nb0916_init(void) -{ - i2c_register_board_info(0, puv3_i2c_devices, - ARRAY_SIZE(puv3_i2c_devices)); - - platform_device_register_simple("PKUnity-v3-I2C", -1, - puv3_i2c_resources, ARRAY_SIZE(puv3_i2c_resources)); - - pwm_add_table(nb0916_pwm_lookup, ARRAY_SIZE(nb0916_pwm_lookup)); - - platform_device_register_data(NULL, 
"pwm-backlight", -1, - &nb0916_backlight_data, sizeof(nb0916_backlight_data)); - - platform_device_register_data(NULL, "gpio-keys", -1, - &nb0916_gpio_button_data, sizeof(nb0916_gpio_button_data)); - - platform_device_register_resndata(NULL, "physmap-flash", -1, - &physmap_flash_resource, 1, - &physmap_flash_data, sizeof(physmap_flash_data)); - - if (request_irq(gpio_to_irq(GPI_LCD_CASE_OFF), - &nb0916_lcdcaseoff_handler, - IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING, - "NB0916 lcd case off", NULL) < 0) { - - printk(KERN_DEBUG "LCD-Case-OFF IRQ %d not available\n", - gpio_to_irq(GPI_LCD_CASE_OFF)); - } - - if (request_irq(gpio_to_irq(GPI_OTP_INT), &nb0916_overheat_handler, - IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING, - "NB0916 overheating protection", NULL) < 0) { - - printk(KERN_DEBUG "Overheating Protection IRQ %d not available\n", - gpio_to_irq(GPI_OTP_INT)); - } - - return 0; -} - -subsys_initcall_sync(mach_nb0916_init); diff --git a/arch/unicore32/kernel/setup.c b/arch/unicore32/kernel/setup.c deleted file mode 100644 index 0c4242a5ee1d..000000000000 --- a/arch/unicore32/kernel/setup.c +++ /dev/null @@ -1,352 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/setup.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "setup.h" - -#ifndef MEM_SIZE -#define MEM_SIZE (16*1024*1024) -#endif - -struct stack { - u32 irq[3]; - u32 abt[3]; - u32 und[3]; -} ____cacheline_aligned; - -static struct stack stacks[NR_CPUS]; - -#ifdef CONFIG_VGA_CONSOLE -struct screen_info screen_info; -#endif - -char elf_platform[ELF_PLATFORM_SIZE]; -EXPORT_SYMBOL(elf_platform); - -static char __initdata 
cmd_line[COMMAND_LINE_SIZE]; - -static char default_command_line[COMMAND_LINE_SIZE] __initdata = CONFIG_CMDLINE; - -/* - * Standard memory resources - */ -static struct resource mem_res[] = { - { - .name = "Kernel code", - .start = 0, - .end = 0, - .flags = IORESOURCE_SYSTEM_RAM - }, - { - .name = "Kernel data", - .start = 0, - .end = 0, - .flags = IORESOURCE_SYSTEM_RAM - } -}; - -#define kernel_code mem_res[0] -#define kernel_data mem_res[1] - -/* - * These functions re-use the assembly code in head.S, which - * already provide the required functionality. - */ -static void __init setup_processor(void) -{ - printk(KERN_DEFAULT "CPU: UniCore-II [%08x] revision %d, cr=%08lx\n", - uc32_cpuid, (int)(uc32_cpuid >> 16) & 15, cr_alignment); - - sprintf(init_utsname()->machine, "puv3"); - sprintf(elf_platform, "ucv2"); -} - -/* - * cpu_init - initialise one CPU. - * - * cpu_init sets up the per-CPU stacks. - */ -void cpu_init(void) -{ - unsigned int cpu = smp_processor_id(); - struct stack *stk = &stacks[cpu]; - - /* - * setup stacks for re-entrant exception handlers - */ - __asm__ ( - "mov.a asr, %1\n\t" - "add sp, %0, %2\n\t" - "mov.a asr, %3\n\t" - "add sp, %0, %4\n\t" - "mov.a asr, %5\n\t" - "add sp, %0, %6\n\t" - "mov.a asr, %7" - : - : "r" (stk), - "r" (PSR_R_BIT | PSR_I_BIT | INTR_MODE), - "I" (offsetof(struct stack, irq[0])), - "r" (PSR_R_BIT | PSR_I_BIT | ABRT_MODE), - "I" (offsetof(struct stack, abt[0])), - "r" (PSR_R_BIT | PSR_I_BIT | EXTN_MODE), - "I" (offsetof(struct stack, und[0])), - "r" (PSR_R_BIT | PSR_I_BIT | PRIV_MODE) - : "r30", "cc"); -} - -static int __init uc32_add_memory(unsigned long start, unsigned long size) -{ - struct membank *bank = &meminfo.bank[meminfo.nr_banks]; - - if (meminfo.nr_banks >= NR_BANKS) { - printk(KERN_CRIT "NR_BANKS too low, " - "ignoring memory at %#lx\n", start); - return -EINVAL; - } - - /* - * Ensure that start/size are aligned to a page boundary. - * Size is appropriately rounded down, start is rounded up. 
- */ - size -= start & ~PAGE_MASK; - - bank->start = PAGE_ALIGN(start); - bank->size = size & PAGE_MASK; - - /* - * Check whether this memory region has non-zero size or - * invalid node number. - */ - if (bank->size == 0) - return -EINVAL; - - meminfo.nr_banks++; - return 0; -} - -/* - * Pick out the memory size. We look for mem=size@start, - * where start and size are "size[KkMm]" - */ -static int __init early_mem(char *p) -{ - static int usermem __initdata = 1; - unsigned long size, start; - char *endp; - - /* - * If the user specifies memory size, we - * blow away any automatically generated - * size. - */ - if (usermem) { - usermem = 0; - meminfo.nr_banks = 0; - } - - start = PHYS_OFFSET; - size = memparse(p, &endp); - if (*endp == '@') - start = memparse(endp + 1, NULL); - - uc32_add_memory(start, size); - - return 0; -} -early_param("mem", early_mem); - -static void __init -request_standard_resources(struct meminfo *mi) -{ - struct resource *res; - int i; - - kernel_code.start = virt_to_phys(_stext); - kernel_code.end = virt_to_phys(_etext - 1); - kernel_data.start = virt_to_phys(_sdata); - kernel_data.end = virt_to_phys(_end - 1); - - for (i = 0; i < mi->nr_banks; i++) { - if (mi->bank[i].size == 0) - continue; - - res = memblock_alloc_low(sizeof(*res), SMP_CACHE_BYTES); - if (!res) - panic("%s: Failed to allocate %zu bytes align=%x\n", - __func__, sizeof(*res), SMP_CACHE_BYTES); - - res->name = "System RAM"; - res->start = mi->bank[i].start; - res->end = mi->bank[i].start + mi->bank[i].size - 1; - res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - - request_resource(&iomem_resource, res); - - if (kernel_code.start >= res->start && - kernel_code.end <= res->end) - request_resource(res, &kernel_code); - if (kernel_data.start >= res->start && - kernel_data.end <= res->end) - request_resource(res, &kernel_data); - } -} - -static void (*init_machine)(void) __initdata; - -static int __init customize_machine(void) -{ - /* customizes platform devices, or adds 
new ones */ - if (init_machine) - init_machine(); - return 0; -} -arch_initcall(customize_machine); - -void __init setup_arch(char **cmdline_p) -{ - char *from = default_command_line; - - setup_processor(); - - init_mm.start_code = (unsigned long) _stext; - init_mm.end_code = (unsigned long) _etext; - init_mm.end_data = (unsigned long) _edata; - init_mm.brk = (unsigned long) _end; - - /* parse_early_param needs a boot_command_line */ - strlcpy(boot_command_line, from, COMMAND_LINE_SIZE); - - /* populate cmd_line too for later use, preserving boot_command_line */ - strlcpy(cmd_line, boot_command_line, COMMAND_LINE_SIZE); - *cmdline_p = cmd_line; - - parse_early_param(); - - uc32_memblock_init(&meminfo); - - paging_init(); - request_standard_resources(&meminfo); - - cpu_init(); - - /* - * Set up various architecture-specific pointers - */ - init_machine = puv3_core_init; - -#ifdef CONFIG_VT -#if defined(CONFIG_VGA_CONSOLE) - conswitchp = &vga_con; -#endif -#endif - early_trap_init(); -} - -static struct cpu cpuinfo_unicore; - -static int __init topology_init(void) -{ - int i; - - for_each_possible_cpu(i) - register_cpu(&cpuinfo_unicore, i); - - return 0; -} -subsys_initcall(topology_init); - -#ifdef CONFIG_HAVE_PROC_CPU -static int __init proc_cpu_init(void) -{ - struct proc_dir_entry *res; - - res = proc_mkdir("cpu", NULL); - if (!res) - return -ENOMEM; - return 0; -} -fs_initcall(proc_cpu_init); -#endif - -static int c_show(struct seq_file *m, void *v) -{ - seq_printf(m, "Processor\t: UniCore-II rev %d (%s)\n", - (int)(uc32_cpuid >> 16) & 15, elf_platform); - - seq_printf(m, "BogoMIPS\t: %lu.%02lu\n", - loops_per_jiffy / (500000/HZ), - (loops_per_jiffy / (5000/HZ)) % 100); - - /* dump out the processor features */ - seq_puts(m, "Features\t: CMOV UC-F64"); - - seq_printf(m, "\nCPU implementer\t: 0x%02x\n", uc32_cpuid >> 24); - seq_printf(m, "CPU architecture: 2\n"); - seq_printf(m, "CPU revision\t: %d\n", (uc32_cpuid >> 16) & 15); - - seq_printf(m, "Cache type\t: 
write-back\n" - "Cache clean\t: cp0 c5 ops\n" - "Cache lockdown\t: not support\n" - "Cache format\t: Harvard\n"); - - seq_puts(m, "\n"); - - seq_printf(m, "Hardware\t: PKUnity v3\n"); - - return 0; -} - -static void *c_start(struct seq_file *m, loff_t *pos) -{ - return *pos < 1 ? (void *)1 : NULL; -} - -static void *c_next(struct seq_file *m, void *v, loff_t *pos) -{ - ++*pos; - return NULL; -} - -static void c_stop(struct seq_file *m, void *v) -{ -} - -const struct seq_operations cpuinfo_op = { - .start = c_start, - .next = c_next, - .stop = c_stop, - .show = c_show -}; diff --git a/arch/unicore32/kernel/setup.h b/arch/unicore32/kernel/setup.h deleted file mode 100644 index 967352323185..000000000000 --- a/arch/unicore32/kernel/setup.h +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/kernel/setup.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_KERNEL_SETUP_H__ -#define __UNICORE_KERNEL_SETUP_H__ - -#include - -extern void paging_init(void); -extern void puv3_core_init(void); -extern void cpu_init(void); - -extern void puv3_ps2_init(void); -extern void pci_puv3_preinit(void); -extern void __init puv3_init_gpio(void); - -extern void setup_mm_for_reboot(void); - -extern char __stubs_start[], __stubs_end[]; -extern char __vectors_start[], __vectors_end[]; - -extern void kernel_thread_helper(void); - -extern void __init early_signal_init(void); - -extern asmlinkage void __backtrace(void); -extern asmlinkage void c_backtrace(unsigned long fp, const char *loglvl); - -extern void __show_regs(struct pt_regs *); - -#endif diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c deleted file mode 100644 index 3946182a835d..000000000000 --- a/arch/unicore32/kernel/signal.c +++ /dev/null @@ -1,424 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/signal.c - * - * Code specific to PKUnity SoC 
and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -/* - * For UniCore syscalls, we encode the syscall number into the instruction. - */ -#define SWI_SYS_SIGRETURN (0xff000000) /* error number for new abi */ -#define SWI_SYS_RT_SIGRETURN (0xff000000 | (__NR_rt_sigreturn)) -#define SWI_SYS_RESTART (0xff000000 | (__NR_restart_syscall)) - -#define KERN_SIGRETURN_CODE (KUSER_VECPAGE_BASE + 0x00000500) -#define KERN_RESTART_CODE (KERN_SIGRETURN_CODE + sizeof(sigreturn_codes)) - -const unsigned long sigreturn_codes[3] = { - SWI_SYS_SIGRETURN, SWI_SYS_RT_SIGRETURN, -}; - -const unsigned long syscall_restart_code[2] = { - SWI_SYS_RESTART, /* swi __NR_restart_syscall */ - 0x69efc004, /* ldr pc, [sp], #4 */ -}; - -/* - * Do a signal return; undo the signal stack. These are aligned to 64-bit. - */ -struct sigframe { - struct ucontext uc; - unsigned long retcode[2]; -}; - -struct rt_sigframe { - struct siginfo info; - struct sigframe sig; -}; - -static int restore_sigframe(struct pt_regs *regs, struct sigframe __user *sf) -{ - sigset_t set; - int err; - - err = __copy_from_user(&set, &sf->uc.uc_sigmask, sizeof(set)); - if (err == 0) - set_current_blocked(&set); - - err |= __get_user(regs->UCreg_00, &sf->uc.uc_mcontext.regs.UCreg_00); - err |= __get_user(regs->UCreg_01, &sf->uc.uc_mcontext.regs.UCreg_01); - err |= __get_user(regs->UCreg_02, &sf->uc.uc_mcontext.regs.UCreg_02); - err |= __get_user(regs->UCreg_03, &sf->uc.uc_mcontext.regs.UCreg_03); - err |= __get_user(regs->UCreg_04, &sf->uc.uc_mcontext.regs.UCreg_04); - err |= __get_user(regs->UCreg_05, &sf->uc.uc_mcontext.regs.UCreg_05); - err |= __get_user(regs->UCreg_06, &sf->uc.uc_mcontext.regs.UCreg_06); - err |= __get_user(regs->UCreg_07, &sf->uc.uc_mcontext.regs.UCreg_07); - err |= __get_user(regs->UCreg_08, &sf->uc.uc_mcontext.regs.UCreg_08); - err |= __get_user(regs->UCreg_09, &sf->uc.uc_mcontext.regs.UCreg_09); 
- err |= __get_user(regs->UCreg_10, &sf->uc.uc_mcontext.regs.UCreg_10); - err |= __get_user(regs->UCreg_11, &sf->uc.uc_mcontext.regs.UCreg_11); - err |= __get_user(regs->UCreg_12, &sf->uc.uc_mcontext.regs.UCreg_12); - err |= __get_user(regs->UCreg_13, &sf->uc.uc_mcontext.regs.UCreg_13); - err |= __get_user(regs->UCreg_14, &sf->uc.uc_mcontext.regs.UCreg_14); - err |= __get_user(regs->UCreg_15, &sf->uc.uc_mcontext.regs.UCreg_15); - err |= __get_user(regs->UCreg_16, &sf->uc.uc_mcontext.regs.UCreg_16); - err |= __get_user(regs->UCreg_17, &sf->uc.uc_mcontext.regs.UCreg_17); - err |= __get_user(regs->UCreg_18, &sf->uc.uc_mcontext.regs.UCreg_18); - err |= __get_user(regs->UCreg_19, &sf->uc.uc_mcontext.regs.UCreg_19); - err |= __get_user(regs->UCreg_20, &sf->uc.uc_mcontext.regs.UCreg_20); - err |= __get_user(regs->UCreg_21, &sf->uc.uc_mcontext.regs.UCreg_21); - err |= __get_user(regs->UCreg_22, &sf->uc.uc_mcontext.regs.UCreg_22); - err |= __get_user(regs->UCreg_23, &sf->uc.uc_mcontext.regs.UCreg_23); - err |= __get_user(regs->UCreg_24, &sf->uc.uc_mcontext.regs.UCreg_24); - err |= __get_user(regs->UCreg_25, &sf->uc.uc_mcontext.regs.UCreg_25); - err |= __get_user(regs->UCreg_26, &sf->uc.uc_mcontext.regs.UCreg_26); - err |= __get_user(regs->UCreg_fp, &sf->uc.uc_mcontext.regs.UCreg_fp); - err |= __get_user(regs->UCreg_ip, &sf->uc.uc_mcontext.regs.UCreg_ip); - err |= __get_user(regs->UCreg_sp, &sf->uc.uc_mcontext.regs.UCreg_sp); - err |= __get_user(regs->UCreg_lr, &sf->uc.uc_mcontext.regs.UCreg_lr); - err |= __get_user(regs->UCreg_pc, &sf->uc.uc_mcontext.regs.UCreg_pc); - err |= __get_user(regs->UCreg_asr, &sf->uc.uc_mcontext.regs.UCreg_asr); - - err |= !valid_user_regs(regs); - - return err; -} - -asmlinkage int __sys_rt_sigreturn(struct pt_regs *regs) -{ - struct rt_sigframe __user *frame; - - /* Always make any pending restarted system calls return -EINTR */ - current->restart_block.fn = do_no_restart_syscall; - - /* - * Since we stacked the signal on a 64-bit boundary, - * 
then 'sp' should be word aligned here. If it's - * not, then the user is trying to mess with us. - */ - if (regs->UCreg_sp & 7) - goto badframe; - - frame = (struct rt_sigframe __user *)regs->UCreg_sp; - - if (!access_ok(frame, sizeof(*frame))) - goto badframe; - - if (restore_sigframe(regs, &frame->sig)) - goto badframe; - - if (restore_altstack(&frame->sig.uc.uc_stack)) - goto badframe; - - return regs->UCreg_00; - -badframe: - force_sig(SIGSEGV); - return 0; -} - -static int setup_sigframe(struct sigframe __user *sf, struct pt_regs *regs, - sigset_t *set) -{ - int err = 0; - - err |= __put_user(regs->UCreg_00, &sf->uc.uc_mcontext.regs.UCreg_00); - err |= __put_user(regs->UCreg_01, &sf->uc.uc_mcontext.regs.UCreg_01); - err |= __put_user(regs->UCreg_02, &sf->uc.uc_mcontext.regs.UCreg_02); - err |= __put_user(regs->UCreg_03, &sf->uc.uc_mcontext.regs.UCreg_03); - err |= __put_user(regs->UCreg_04, &sf->uc.uc_mcontext.regs.UCreg_04); - err |= __put_user(regs->UCreg_05, &sf->uc.uc_mcontext.regs.UCreg_05); - err |= __put_user(regs->UCreg_06, &sf->uc.uc_mcontext.regs.UCreg_06); - err |= __put_user(regs->UCreg_07, &sf->uc.uc_mcontext.regs.UCreg_07); - err |= __put_user(regs->UCreg_08, &sf->uc.uc_mcontext.regs.UCreg_08); - err |= __put_user(regs->UCreg_09, &sf->uc.uc_mcontext.regs.UCreg_09); - err |= __put_user(regs->UCreg_10, &sf->uc.uc_mcontext.regs.UCreg_10); - err |= __put_user(regs->UCreg_11, &sf->uc.uc_mcontext.regs.UCreg_11); - err |= __put_user(regs->UCreg_12, &sf->uc.uc_mcontext.regs.UCreg_12); - err |= __put_user(regs->UCreg_13, &sf->uc.uc_mcontext.regs.UCreg_13); - err |= __put_user(regs->UCreg_14, &sf->uc.uc_mcontext.regs.UCreg_14); - err |= __put_user(regs->UCreg_15, &sf->uc.uc_mcontext.regs.UCreg_15); - err |= __put_user(regs->UCreg_16, &sf->uc.uc_mcontext.regs.UCreg_16); - err |= __put_user(regs->UCreg_17, &sf->uc.uc_mcontext.regs.UCreg_17); - err |= __put_user(regs->UCreg_18, &sf->uc.uc_mcontext.regs.UCreg_18); - err |= __put_user(regs->UCreg_19, 
&sf->uc.uc_mcontext.regs.UCreg_19); - err |= __put_user(regs->UCreg_20, &sf->uc.uc_mcontext.regs.UCreg_20); - err |= __put_user(regs->UCreg_21, &sf->uc.uc_mcontext.regs.UCreg_21); - err |= __put_user(regs->UCreg_22, &sf->uc.uc_mcontext.regs.UCreg_22); - err |= __put_user(regs->UCreg_23, &sf->uc.uc_mcontext.regs.UCreg_23); - err |= __put_user(regs->UCreg_24, &sf->uc.uc_mcontext.regs.UCreg_24); - err |= __put_user(regs->UCreg_25, &sf->uc.uc_mcontext.regs.UCreg_25); - err |= __put_user(regs->UCreg_26, &sf->uc.uc_mcontext.regs.UCreg_26); - err |= __put_user(regs->UCreg_fp, &sf->uc.uc_mcontext.regs.UCreg_fp); - err |= __put_user(regs->UCreg_ip, &sf->uc.uc_mcontext.regs.UCreg_ip); - err |= __put_user(regs->UCreg_sp, &sf->uc.uc_mcontext.regs.UCreg_sp); - err |= __put_user(regs->UCreg_lr, &sf->uc.uc_mcontext.regs.UCreg_lr); - err |= __put_user(regs->UCreg_pc, &sf->uc.uc_mcontext.regs.UCreg_pc); - err |= __put_user(regs->UCreg_asr, &sf->uc.uc_mcontext.regs.UCreg_asr); - - err |= __put_user(current->thread.trap_no, - &sf->uc.uc_mcontext.trap_no); - err |= __put_user(current->thread.error_code, - &sf->uc.uc_mcontext.error_code); - err |= __put_user(current->thread.address, - &sf->uc.uc_mcontext.fault_address); - err |= __put_user(set->sig[0], &sf->uc.uc_mcontext.oldmask); - - err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set)); - - return err; -} - -static inline void __user *get_sigframe(struct k_sigaction *ka, - struct pt_regs *regs, int framesize) -{ - unsigned long sp = regs->UCreg_sp; - void __user *frame; - - /* - * This is the X/Open sanctioned signal stack switching. - */ - if ((ka->sa.sa_flags & SA_ONSTACK) && !sas_ss_flags(sp)) - sp = current->sas_ss_sp + current->sas_ss_size; - - /* - * ATPCS B01 mandates 8-byte alignment - */ - frame = (void __user *)((sp - framesize) & ~7); - - /* - * Check that we can actually write to the signal frame. 
- */ - if (!access_ok(frame, framesize)) - frame = NULL; - - return frame; -} - -static int setup_return(struct pt_regs *regs, struct k_sigaction *ka, - unsigned long __user *rc, void __user *frame, int usig) -{ - unsigned long handler = (unsigned long)ka->sa.sa_handler; - unsigned long retcode; - unsigned long asr = regs->UCreg_asr & ~PSR_f; - - unsigned int idx = 0; - - if (ka->sa.sa_flags & SA_SIGINFO) - idx += 1; - - if (__put_user(sigreturn_codes[idx], rc) || - __put_user(sigreturn_codes[idx+1], rc+1)) - return 1; - - retcode = KERN_SIGRETURN_CODE + (idx << 2); - - regs->UCreg_00 = usig; - regs->UCreg_sp = (unsigned long)frame; - regs->UCreg_lr = retcode; - regs->UCreg_pc = handler; - regs->UCreg_asr = asr; - - return 0; -} - -static int setup_frame(struct ksignal *ksig, sigset_t *set, - struct pt_regs *regs) -{ - struct sigframe __user *frame = get_sigframe(&ksig->ka, regs, sizeof(*frame)); - int err = 0; - - if (!frame) - return 1; - - /* - * Set uc.uc_flags to a value which sc.trap_no would never have. - */ - err |= __put_user(0x5ac3c35a, &frame->uc.uc_flags); - - err |= setup_sigframe(frame, regs, set); - if (err == 0) - err |= setup_return(regs, &ksig->ka, frame->retcode, frame, - ksig->sig); - - return err; -} - -static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, - struct pt_regs *regs) -{ - struct rt_sigframe __user *frame = - get_sigframe(&ksig->ka, regs, sizeof(*frame)); - int err = 0; - - if (!frame) - return 1; - - err |= copy_siginfo_to_user(&frame->info, &ksig->info); - - err |= __put_user(0, &frame->sig.uc.uc_flags); - err |= __put_user(NULL, &frame->sig.uc.uc_link); - err |= __save_altstack(&frame->sig.uc.uc_stack, regs->UCreg_sp); - err |= setup_sigframe(&frame->sig, regs, set); - if (err == 0) - err |= setup_return(regs, &ksig->ka, frame->sig.retcode, frame, - ksig->sig); - - if (err == 0) { - /* - * For realtime signals we must also set the second and third - * arguments for the signal handler. 
- */ - regs->UCreg_01 = (unsigned long)&frame->info; - regs->UCreg_02 = (unsigned long)&frame->sig.uc; - } - - return err; -} - -static inline void setup_syscall_restart(struct pt_regs *regs) -{ - regs->UCreg_00 = regs->UCreg_ORIG_00; - regs->UCreg_pc -= 4; -} - -/* - * OK, we're invoking a handler - */ -static void handle_signal(struct ksignal *ksig, struct pt_regs *regs, - int syscall) -{ - struct thread_info *thread = current_thread_info(); - sigset_t *oldset = sigmask_to_save(); - int usig = ksig->sig; - int ret; - - /* - * If we were from a system call, check for system call restarting... - */ - if (syscall) { - switch (regs->UCreg_00) { - case -ERESTART_RESTARTBLOCK: - case -ERESTARTNOHAND: - regs->UCreg_00 = -EINTR; - break; - case -ERESTARTSYS: - if (!(ksig->ka.sa.sa_flags & SA_RESTART)) { - regs->UCreg_00 = -EINTR; - break; - } - /* fallthrough */ - case -ERESTARTNOINTR: - setup_syscall_restart(regs); - } - } - - /* - * Set up the stack frame - */ - if (ksig->ka.sa.sa_flags & SA_SIGINFO) - ret = setup_rt_frame(ksig, oldset, regs); - else - ret = setup_frame(ksig, oldset, regs); - - /* - * Check that the resulting registers are actually sane. - */ - ret |= !valid_user_regs(regs); - - signal_setup_done(ret, ksig, 0); -} - -/* - * Note that 'init' is a special process: it doesn't get signals it doesn't - * want to handle. Thus you cannot kill init even with a SIGKILL even by - * mistake. - * - * Note that we go through the signals twice: once to check the signals that - * the kernel can handle, and then we build all the user-level signal handling - * stack-frames in one go after that. - */ -static void do_signal(struct pt_regs *regs, int syscall) -{ - struct ksignal ksig; - - /* - * We want the common case to go fast, which - * is why we may in certain cases get here from - * kernel mode. Just return without doing anything - * if so. 
- */ - if (!user_mode(regs)) - return; - - if (get_signal(&ksig)) { - handle_signal(&ksig, regs, syscall); - return; - } - - /* - * No signal to deliver to the process - restart the syscall. - */ - if (syscall) { - if (regs->UCreg_00 == -ERESTART_RESTARTBLOCK) { - u32 __user *usp; - - regs->UCreg_sp -= 4; - usp = (u32 __user *)regs->UCreg_sp; - - if (put_user(regs->UCreg_pc, usp) == 0) { - regs->UCreg_pc = KERN_RESTART_CODE; - } else { - regs->UCreg_sp += 4; - force_sigsegv(0); - } - } - if (regs->UCreg_00 == -ERESTARTNOHAND || - regs->UCreg_00 == -ERESTARTSYS || - regs->UCreg_00 == -ERESTARTNOINTR) { - setup_syscall_restart(regs); - } - } - /* If there's no signal to deliver, we just put the saved - * sigmask back. - */ - restore_saved_sigmask(); -} - -asmlinkage void do_notify_resume(struct pt_regs *regs, - unsigned int thread_flags, int syscall) -{ - if (thread_flags & _TIF_SIGPENDING) - do_signal(regs, syscall); - - if (thread_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); - tracehook_notify_resume(regs); - } -} - -/* - * Copy signal return handlers into the vector page, and - * set sigreturn to be a pointer to these. - */ -void __init early_signal_init(void) -{ - memcpy((void *)kuser_vecpage_to_vectors(KERN_SIGRETURN_CODE), - sigreturn_codes, sizeof(sigreturn_codes)); - memcpy((void *)kuser_vecpage_to_vectors(KERN_RESTART_CODE), - syscall_restart_code, sizeof(syscall_restart_code)); - /* Need not to flush icache, since early_trap_init will do it last. 
*/ -} diff --git a/arch/unicore32/kernel/sleep.S b/arch/unicore32/kernel/sleep.S deleted file mode 100644 index 23151abe53c6..000000000000 --- a/arch/unicore32/kernel/sleep.S +++ /dev/null @@ -1,199 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/kernel/sleep.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Maintained by GUAN Xue-tao - * Copyright (C) 2001-2010 Guan Xuetao - */ - -#include -#include -#include - - .text - -pkunity_cpu_save_cp: - - @ get coprocessor registers - - movc r3, p0.c7, #0 @ PID - movc r4, p0.c2, #0 @ translation table base addr - movc r5, p0.c1, #0 @ control reg - - - @ store them plus current virtual stack ptr on stack - mov r6, sp - stm.w (r3 - r6), [sp-] - - mov pc, lr - -pkunity_cpu_save_sp: - @ preserve phys address of stack - mov r0, sp - stw.w lr, [sp+], #-4 - b.l sleep_phys_sp - ldw r1, =sleep_save_sp - stw r0, [r1] - ldw.w pc, [sp]+, #4 - -/* - * puv3_cpu_suspend() - * - * Forces CPU into sleep state. - * - * r0 = value for PWRMODE M field for desired sleep state - */ - -ENTRY(puv3_cpu_suspend) - stm.w (r16 - r27, lr), [sp-] @ save registers on stack - stm.w (r4 - r15), [sp-] @ save registers on stack - -#ifdef CONFIG_UNICORE_FPU_F64 - sfm.w (f0 - f7 ), [sp-] - sfm.w (f8 - f15), [sp-] - sfm.w (f16 - f23), [sp-] - sfm.w (f24 - f31), [sp-] - cff r4, s31 - stm.w (r4), [sp-] -#endif - b.l pkunity_cpu_save_cp - - b.l pkunity_cpu_save_sp - - @ clean data cache - mov r1, #0 - movc p0.c5, r1, #14 - nop - nop - nop - nop - - - - @ DDR2 BaseAddr - ldw r0, =(PKUNITY_DDR2CTRL_BASE) - - @ PM BaseAddr - ldw r1, =(PKUNITY_PM_BASE) - - @ set PLL_SYS_CFG reg, 275 - movl r6, #0x00002401 - stw r6, [r1+], #0x18 - @ set PLL_DDR_CFG reg, 66MHz - movl r6, #0x00100c00 - stw r6, [r1+], #0x1c - - @ set wake up source - movl r8, #0x800001ff @ epip4d - stw r8, [r1+], #0xc - - @ set PGSR - movl r5, #0x40000 - stw r5, [r1+], #0x10 - - @ prepare DDR2 refresh settings - ldw r5, [r0+], #0x24 - or r5, r5, #0x00000001 
- - @ prepare PMCR for PLL changing - movl r6, #0xc - - @ prepare for closing PLL - movl r7, #0x1 - - @ prepare sleep mode - mov r8, #0x1 - -@ movl r0, 0x11111111 -@ put_word_ocd r0 - b pkunity_cpu_do_suspend - - .ltorg - .align 5 -pkunity_cpu_do_suspend: - b 101f - @ put DDR2 into self-refresh -100: stw r5, [r0+], #0x24 - @ change PLL - stw r6, [r1] - b 1f - - .ltorg - .align 5 -101: b 102f - @ wait for PLL changing complete -1: ldw r6, [r1+], #0x44 - csub.a r6, #0x1 - bne 1b - b 2f - - .ltorg - .align 5 -102: b 100b - @ close PLL -2: stw r7, [r1+], #0x4 - @ enter sleep mode - stw r8, [r1] -3: b 3b - - - - -/* - * puv3_cpu_resume() - * - * entry point from bootloader into kernel during resume - * - * Note: Yes, part of the following code is located into the .data section. - * This is to allow sleep_save_sp to be accessed with a relative load - * while we can't rely on any MMU translation. We could have put - * sleep_save_sp in the .text section as well, but some setups might - * insist on it to be truly read-only. 
- */ - - .data - .align 5 -ENTRY(puv3_cpu_resume) -@ movl r0, 0x20202020 -@ put_word_ocd r0 - - ldw r0, sleep_save_sp @ stack phys addr - ldw r2, =resume_after_mmu @ its absolute virtual address - ldm (r3 - r6), [r0]+ @ CP regs + virt stack ptr - mov sp, r6 @ CP regs + virt stack ptr - - mov r1, #0 - movc p0.c6, r1, #6 @ invalidate I & D TLBs - movc p0.c5, r1, #28 @ invalidate I & D caches, BTB - - movc p0.c7, r3, #0 @ PID - movc p0.c2, r4, #0 @ translation table base addr - movc p0.c1, r5, #0 @ control reg, turn on mmu - nop - jump r2 - nop - nop - nop - nop - nop - -sleep_save_sp: - .word 0 @ preserve stack phys ptr here - - .text -resume_after_mmu: -@ movl r0, 0x30303030 -@ put_word_ocd r0 - -#ifdef CONFIG_UNICORE_FPU_F64 - lfm.w (f0 - f7 ), [sp]+ - lfm.w (f8 - f15), [sp]+ - lfm.w (f16 - f23), [sp]+ - lfm.w (f24 - f31), [sp]+ - ldm.w (r4), [sp]+ - ctf r4, s31 -#endif - ldm.w (r4 - r15), [sp]+ @ restore registers from stack - ldm.w (r16 - r27, pc), [sp]+ @ return to caller diff --git a/arch/unicore32/kernel/stacktrace.c b/arch/unicore32/kernel/stacktrace.c deleted file mode 100644 index c9d8650e9d78..000000000000 --- a/arch/unicore32/kernel/stacktrace.c +++ /dev/null @@ -1,127 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/stacktrace.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include -#include - -#include - -#if defined(CONFIG_FRAME_POINTER) -/* - * Unwind the current stack frame and store the new register values in the - * structure passed as argument. Unwinding is equivalent to a function return, - * hence the new PC value rather than LR should be used for backtrace. 
- * - * With framepointer enabled, a simple function prologue looks like this: - * mov ip, sp - * stmdb sp!, {fp, ip, lr, pc} - * sub fp, ip, #4 - * - * A simple function epilogue looks like this: - * ldm sp, {fp, sp, pc} - * - * Note that with framepointer enabled, even the leaf functions have the same - * prologue and epilogue, therefore we can ignore the LR value in this case. - */ -int notrace unwind_frame(struct stackframe *frame) -{ - unsigned long high, low; - unsigned long fp = frame->fp; - - /* only go to a higher address on the stack */ - low = frame->sp; - high = ALIGN(low, THREAD_SIZE); - - /* check current frame pointer is within bounds */ - if (fp < (low + 12) || fp + 4 >= high) - return -EINVAL; - - /* restore the registers from the stack frame */ - frame->fp = *(unsigned long *)(fp - 12); - frame->sp = *(unsigned long *)(fp - 8); - frame->pc = *(unsigned long *)(fp - 4); - - return 0; -} -#endif - -void notrace walk_stackframe(struct stackframe *frame, - int (*fn)(struct stackframe *, void *), void *data) -{ - while (1) { - int ret; - - if (fn(frame, data)) - break; - ret = unwind_frame(frame); - if (ret < 0) - break; - } -} -EXPORT_SYMBOL(walk_stackframe); - -#ifdef CONFIG_STACKTRACE -struct stack_trace_data { - struct stack_trace *trace; - unsigned int no_sched_functions; - unsigned int skip; -}; - -static int save_trace(struct stackframe *frame, void *d) -{ - struct stack_trace_data *data = d; - struct stack_trace *trace = data->trace; - unsigned long addr = frame->pc; - - if (data->no_sched_functions && in_sched_functions(addr)) - return 0; - if (data->skip) { - data->skip--; - return 0; - } - - trace->entries[trace->nr_entries++] = addr; - - return trace->nr_entries >= trace->max_entries; -} - -void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) -{ - struct stack_trace_data data; - struct stackframe frame; - - data.trace = trace; - data.skip = trace->skip; - - if (tsk != current) { - data.no_sched_functions = 1; - 
frame.fp = thread_saved_fp(tsk); - frame.sp = thread_saved_sp(tsk); - frame.lr = 0; /* recovered from the stack */ - frame.pc = thread_saved_pc(tsk); - } else { - register unsigned long current_sp asm("sp"); - - data.no_sched_functions = 0; - frame.fp = (unsigned long)__builtin_frame_address(0); - frame.sp = current_sp; - frame.lr = (unsigned long)__builtin_return_address(0); - frame.pc = (unsigned long)save_stack_trace_tsk; - } - - walk_stackframe(&frame, save_trace, &data); -} - -void save_stack_trace(struct stack_trace *trace) -{ - save_stack_trace_tsk(current, trace); -} -EXPORT_SYMBOL_GPL(save_stack_trace); -#endif diff --git a/arch/unicore32/kernel/sys.c b/arch/unicore32/kernel/sys.c deleted file mode 100644 index 256fb4082296..000000000000 --- a/arch/unicore32/kernel/sys.c +++ /dev/null @@ -1,37 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/sys.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -/* Provide the actual syscall number to call mapping. */ -#undef __SYSCALL -#define __SYSCALL(nr, call) [nr] = (call), - -#define sys_mmap2 sys_mmap_pgoff -/* Note that we don't include but */ -void *sys_call_table[__NR_syscalls] = { - [0 ... 
__NR_syscalls-1] = sys_ni_syscall, -#include -}; diff --git a/arch/unicore32/kernel/time.c b/arch/unicore32/kernel/time.c deleted file mode 100644 index c3a37edf4d40..000000000000 --- a/arch/unicore32/kernel/time.c +++ /dev/null @@ -1,128 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/time.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Maintained by GUAN Xue-tao - * Copyright (C) 2001-2010 Guan Xuetao - */ -#include -#include -#include -#include -#include -#include - -#include - -#define MIN_OSCR_DELTA 2 - -static irqreturn_t puv3_ost0_interrupt(int irq, void *dev_id) -{ - struct clock_event_device *c = dev_id; - - /* Disarm the compare/match, signal the event. */ - writel(readl(OST_OIER) & ~OST_OIER_E0, OST_OIER); - writel(readl(OST_OSSR) & ~OST_OSSR_M0, OST_OSSR); - c->event_handler(c); - - return IRQ_HANDLED; -} - -static int -puv3_osmr0_set_next_event(unsigned long delta, struct clock_event_device *c) -{ - unsigned long next, oscr; - - writel(readl(OST_OIER) | OST_OIER_E0, OST_OIER); - next = readl(OST_OSCR) + delta; - writel(next, OST_OSMR0); - oscr = readl(OST_OSCR); - - return (signed)(next - oscr) <= MIN_OSCR_DELTA ? 
-ETIME : 0; -} - -static int puv3_osmr0_shutdown(struct clock_event_device *evt) -{ - writel(readl(OST_OIER) & ~OST_OIER_E0, OST_OIER); - writel(readl(OST_OSSR) & ~OST_OSSR_M0, OST_OSSR); - return 0; -} - -static struct clock_event_device ckevt_puv3_osmr0 = { - .name = "osmr0", - .features = CLOCK_EVT_FEAT_ONESHOT, - .rating = 200, - .set_next_event = puv3_osmr0_set_next_event, - .set_state_shutdown = puv3_osmr0_shutdown, - .set_state_oneshot = puv3_osmr0_shutdown, -}; - -static u64 puv3_read_oscr(struct clocksource *cs) -{ - return readl(OST_OSCR); -} - -static struct clocksource cksrc_puv3_oscr = { - .name = "oscr", - .rating = 200, - .read = puv3_read_oscr, - .mask = CLOCKSOURCE_MASK(32), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; - -void __init time_init(void) -{ - writel(0, OST_OIER); /* disable any timer interrupts */ - writel(0, OST_OSSR); /* clear status on all timers */ - - clockevents_calc_mult_shift(&ckevt_puv3_osmr0, CLOCK_TICK_RATE, 5); - - ckevt_puv3_osmr0.max_delta_ns = - clockevent_delta2ns(0x7fffffff, &ckevt_puv3_osmr0); - ckevt_puv3_osmr0.max_delta_ticks = 0x7fffffff; - ckevt_puv3_osmr0.min_delta_ns = - clockevent_delta2ns(MIN_OSCR_DELTA * 2, &ckevt_puv3_osmr0) + 1; - ckevt_puv3_osmr0.min_delta_ticks = MIN_OSCR_DELTA * 2; - ckevt_puv3_osmr0.cpumask = cpumask_of(0); - - if (request_irq(IRQ_TIMER0, puv3_ost0_interrupt, - IRQF_TIMER | IRQF_IRQPOLL, "ost0", &ckevt_puv3_osmr0)) - pr_err("Failed to register ost0 interrupt\n"); - - clocksource_register_hz(&cksrc_puv3_oscr, CLOCK_TICK_RATE); - clockevents_register_device(&ckevt_puv3_osmr0); -} - -#ifdef CONFIG_PM -unsigned long osmr[4], oier; - -void puv3_timer_suspend(void) -{ - osmr[0] = readl(OST_OSMR0); - osmr[1] = readl(OST_OSMR1); - osmr[2] = readl(OST_OSMR2); - osmr[3] = readl(OST_OSMR3); - oier = readl(OST_OIER); -} - -void puv3_timer_resume(void) -{ - writel(0, OST_OSSR); - writel(osmr[0], OST_OSMR0); - writel(osmr[1], OST_OSMR1); - writel(osmr[2], OST_OSMR2); - writel(osmr[3], OST_OSMR3); - 
writel(oier, OST_OIER); - - /* - * OSMR0 is the system timer: make sure OSCR is sufficiently behind - */ - writel(readl(OST_OSMR0) - LATCH, OST_OSCR); -} -#else -void puv3_timer_suspend(void) { }; -void puv3_timer_resume(void) { }; -#endif - diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c deleted file mode 100644 index a3ac01df1a2e..000000000000 --- a/arch/unicore32/kernel/traps.c +++ /dev/null @@ -1,322 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/traps.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * 'traps.c' handles hardware exceptions after we have saved some state. - * Mostly a debugging aid, but will probably kill the offending process. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "setup.h" - -static void dump_mem(const char *, const char *, unsigned long, unsigned long); - -void dump_backtrace_entry(unsigned long where, - unsigned long from, unsigned long frame) -{ -#ifdef CONFIG_KALLSYMS - printk(KERN_DEFAULT "[<%08lx>] (%pS) from [<%08lx>] (%pS)\n", - where, (void *)where, from, (void *)from); -#else - printk(KERN_DEFAULT "Function entered at [<%08lx>] from [<%08lx>]\n", - where, from); -#endif -} - -/* - * Stack pointers should always be within the kernels view of - * physical memory. If it is not there, then we can't dump - * out any information relating to the stack. - */ -static int verify_stack(unsigned long sp) -{ - if (sp < PAGE_OFFSET || - (sp > (unsigned long)high_memory && high_memory != NULL)) - return -EFAULT; - - return 0; -} - -/* - * Dump out the contents of some memory nicely... 
- */ -static void dump_mem(const char *lvl, const char *str, unsigned long bottom, - unsigned long top) -{ - unsigned long first; - mm_segment_t fs; - int i; - - /* - * We need to switch to kernel mode so that we can use __get_user - * to safely read from kernel space. Note that we now dump the - * code first, just in case the backtrace kills us. - */ - fs = get_fs(); - set_fs(KERNEL_DS); - - printk(KERN_DEFAULT "%s%s(0x%08lx to 0x%08lx)\n", - lvl, str, bottom, top); - - for (first = bottom & ~31; first < top; first += 32) { - unsigned long p; - char str[sizeof(" 12345678") * 8 + 1]; - - memset(str, ' ', sizeof(str)); - str[sizeof(str) - 1] = '\0'; - - for (p = first, i = 0; i < 8 && p < top; i++, p += 4) { - if (p >= bottom && p < top) { - unsigned long val; - if (__get_user(val, (unsigned long *)p) == 0) - sprintf(str + i * 9, " %08lx", val); - else - sprintf(str + i * 9, " ????????"); - } - } - printk(KERN_DEFAULT "%s%04lx:%s\n", lvl, first & 0xffff, str); - } - - set_fs(fs); -} - -static void dump_instr(const char *lvl, struct pt_regs *regs) -{ - unsigned long addr = instruction_pointer(regs); - const int width = 8; - mm_segment_t fs; - char str[sizeof("00000000 ") * 5 + 2 + 1], *p = str; - int i; - - /* - * We need to switch to kernel mode so that we can use __get_user - * to safely read from kernel space. Note that we now dump the - * code first, just in case the backtrace kills us. - */ - fs = get_fs(); - set_fs(KERNEL_DS); - - for (i = -4; i < 1; i++) { - unsigned int val, bad; - - bad = __get_user(val, &((u32 *)addr)[i]); - - if (!bad) - p += sprintf(p, i == 0 ? 
"(%0*x) " : "%0*x ", - width, val); - else { - p += sprintf(p, "bad PC value"); - break; - } - } - printk(KERN_DEFAULT "%sCode: %s\n", lvl, str); - - set_fs(fs); -} - -static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, - const char *loglvl) -{ - unsigned int fp; - int ok = 1; - - printk("%sBacktrace: ", loglvl); - - if (!tsk) - tsk = current; - - if (regs) - fp = regs->UCreg_fp; - else if (tsk != current) - fp = thread_saved_fp(tsk); - else - asm("mov %0, fp" : "=r" (fp) : : "cc"); - - if (!fp) { - printk("%sno frame pointer", loglvl); - ok = 0; - } else if (verify_stack(fp)) { - printk("%sinvalid frame pointer 0x%08x", loglvl, fp); - ok = 0; - } else if (fp < (unsigned long)end_of_stack(tsk)) - printk("%sframe pointer underflow", loglvl); - printk("%s\n", loglvl); - - if (ok) - c_backtrace(fp, loglvl); -} - -void show_stack(struct task_struct *tsk, unsigned long *sp, - const char *loglvl) -{ - dump_backtrace(NULL, tsk, loglvl); - barrier(); -} - -static int __die(const char *str, int err, struct thread_info *thread, - struct pt_regs *regs) -{ - struct task_struct *tsk = thread->task; - static int die_counter; - int ret; - - printk(KERN_EMERG "Internal error: %s: %x [#%d]\n", - str, err, ++die_counter); - - /* trap and error numbers are mostly meaningless on UniCore */ - ret = notify_die(DIE_OOPS, str, regs, err, tsk->thread.trap_no, \ - SIGSEGV); - if (ret == NOTIFY_STOP) - return ret; - - print_modules(); - __show_regs(regs); - printk(KERN_EMERG "Process %.*s (pid: %d, stack limit = 0x%p)\n", - TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), thread + 1); - - if (!user_mode(regs) || in_interrupt()) { - dump_mem(KERN_EMERG, "Stack: ", regs->UCreg_sp, - THREAD_SIZE + (unsigned long)task_stack_page(tsk)); - dump_backtrace(regs, tsk, KERN_EMERG); - dump_instr(KERN_EMERG, regs); - } - - return ret; -} - -DEFINE_SPINLOCK(die_lock); - -/* - * This function is protected against re-entrancy. 
- */ -void die(const char *str, struct pt_regs *regs, int err) -{ - struct thread_info *thread = current_thread_info(); - int ret; - - oops_enter(); - - spin_lock_irq(&die_lock); - console_verbose(); - bust_spinlocks(1); - ret = __die(str, err, thread, regs); - - bust_spinlocks(0); - add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); - spin_unlock_irq(&die_lock); - oops_exit(); - - if (in_interrupt()) - panic("Fatal exception in interrupt"); - if (panic_on_oops) - panic("Fatal exception"); - if (ret != NOTIFY_STOP) - do_exit(SIGSEGV); -} - -void uc32_notify_die(const char *str, struct pt_regs *regs, - int sig, int code, void __user *addr, - unsigned long err, unsigned long trap) -{ - if (user_mode(regs)) { - current->thread.error_code = err; - current->thread.trap_no = trap; - - force_sig_fault(sig, code, addr); - } else - die(str, regs, err); -} - -/* - * bad_mode handles the impossible case in the vectors. If you see one of - * these, then it's extremely serious, and could mean you have buggy hardware. - * It never returns, and never tries to sync. We hope that we can at least - * dump out some state information... 
- */ -asmlinkage void bad_mode(struct pt_regs *regs, unsigned int reason) -{ - console_verbose(); - - printk(KERN_CRIT "Bad mode detected with reason 0x%x\n", reason); - - die("Oops - bad mode", regs, 0); - local_irq_disable(); - panic("bad mode"); -} - -void __pte_error(const char *file, int line, unsigned long val) -{ - printk(KERN_DEFAULT "%s:%d: bad pte %08lx.\n", file, line, val); -} - -void __pmd_error(const char *file, int line, unsigned long val) -{ - printk(KERN_DEFAULT "%s:%d: bad pmd %08lx.\n", file, line, val); -} - -void __pgd_error(const char *file, int line, unsigned long val) -{ - printk(KERN_DEFAULT "%s:%d: bad pgd %08lx.\n", file, line, val); -} - -asmlinkage void __div0(void) -{ - printk(KERN_DEFAULT "Division by zero in kernel.\n"); - dump_stack(); -} -EXPORT_SYMBOL(__div0); - -void abort(void) -{ - BUG(); - - /* if that doesn't kill us, halt */ - panic("Oops failed to kill thread"); -} - -void __init trap_init(void) -{ - return; -} - -void __init early_trap_init(void) -{ - unsigned long vectors = VECTORS_BASE; - - /* - * Copy the vectors, stubs (in entry-unicore.S) - * into the vector page, mapped at 0xffff0000, and ensure these - * are visible to the instruction stream. 
- */ - memcpy((void *)vectors, - __vectors_start, - __vectors_end - __vectors_start); - memcpy((void *)vectors + 0x200, - __stubs_start, - __stubs_end - __stubs_start); - - early_signal_init(); - - flush_icache_range(vectors, vectors + PAGE_SIZE); -} diff --git a/arch/unicore32/kernel/vmlinux.lds.S b/arch/unicore32/kernel/vmlinux.lds.S deleted file mode 100644 index 6fb320b337ef..000000000000 --- a/arch/unicore32/kernel/vmlinux.lds.S +++ /dev/null @@ -1,59 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/kernel/vmlinux.lds.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#include -#include -#include -#include -#include - -OUTPUT_ARCH(unicore32) -ENTRY(stext) - -jiffies = jiffies_64; - -SECTIONS -{ - . = PAGE_OFFSET + KERNEL_IMAGE_START; - - _text = .; - __init_begin = .; - HEAD_TEXT_SECTION - INIT_TEXT_SECTION(PAGE_SIZE) - INIT_DATA_SECTION(16) - PERCPU_SECTION(L1_CACHE_BYTES) - __init_end = .; - - _stext = .; - .text : { /* Real text segment */ - TEXT_TEXT - SCHED_TEXT - CPUIDLE_TEXT - LOCK_TEXT - - *(.fixup) - *(.gnu.warning) - } - _etext = .; - - _sdata = .; - RO_DATA(PAGE_SIZE) - RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) - _edata = .; - - EXCEPTION_TABLE(L1_CACHE_BYTES) - - BSS_SECTION(0, 0, 0) - _end = .; - - STABS_DEBUG - DWARF_DEBUG - - DISCARDS /* Exit code and data */ -} diff --git a/arch/unicore32/lib/Makefile b/arch/unicore32/lib/Makefile deleted file mode 100644 index 5af06645b8f0..000000000000 --- a/arch/unicore32/lib/Makefile +++ /dev/null @@ -1,28 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# linux/arch/unicore32/lib/Makefile -# -# Copyright (C) 2001-2010 GUAN Xue-tao -# - -lib-y := backtrace.o delay.o findbit.o -lib-y += strncpy_from_user.o strnlen_user.o -lib-y += clear_user.o copy_page.o -lib-y += copy_from_user.o copy_to_user.o - -GNU_LIBC_A = $(shell $(CC) $(KBUILD_CFLAGS) -print-file-name=libc.a) -GNU_LIBC_A_OBJS := memchr.o memcpy.o memmove.o 
memset.o -GNU_LIBC_A_OBJS += strchr.o strrchr.o -GNU_LIBC_A_OBJS += rawmemchr.o # needed by strrchr.o - -GNU_LIBGCC_A = $(shell $(CC) $(KBUILD_CFLAGS) -print-file-name=libgcc.a) -GNU_LIBGCC_A_OBJS := _ashldi3.o _ashrdi3.o _lshrdi3.o -GNU_LIBGCC_A_OBJS += _divsi3.o _modsi3.o _ucmpdi2.o _umodsi3.o _udivsi3.o - -lib-y += $(GNU_LIBC_A_OBJS) $(GNU_LIBGCC_A_OBJS) - -$(addprefix $(obj)/, $(GNU_LIBC_A_OBJS)): - $(Q)$(AR) p $(GNU_LIBC_A) $(notdir $@) > $@ - -$(addprefix $(obj)/, $(GNU_LIBGCC_A_OBJS)): - $(Q)$(AR) p $(GNU_LIBGCC_A) $(notdir $@) > $@ diff --git a/arch/unicore32/lib/backtrace.S b/arch/unicore32/lib/backtrace.S deleted file mode 100644 index 6221944b81f3..000000000000 --- a/arch/unicore32/lib/backtrace.S +++ /dev/null @@ -1,168 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/lib/backtrace.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include - .text - -@ fp is 0 or stack frame - -#define frame v4 -#define sv_fp v5 -#define sv_pc v6 -#define offset v8 -#define loglvl v9 - -ENTRY(__backtrace) - mov r0, fp - -ENTRY(c_backtrace) - -#if !defined(CONFIG_FRAME_POINTER) || !defined(CONFIG_PRINTK) - mov pc, lr -ENDPROC(__backtrace) -ENDPROC(c_backtrace) -#else - stm.w (v4 - v10, lr), [sp-] @ Save an extra register - @ so we have a location... 
- mov.a frame, r0 @ if frame pointer is zero - beq no_frame @ we have no stack frames - mov loglvl, r1 - -1: stm.w (pc), [sp-] @ calculate offset of PC stored - ldw.w r0, [sp]+, #4 @ by stmfd for this CPU - adr r1, 1b - sub offset, r0, r1 - -/* - * Stack frame layout: - * optionally saved caller registers (r4 - r10) - * saved fp - * saved sp - * saved lr - * frame => saved pc - * optionally saved arguments (r0 - r3) - * saved sp => - * - * Functions start with the following code sequence: - * mov ip, sp - * stm.w (r0 - r3), [sp-] (optional) - * corrected pc => stm.w sp, (..., fp, ip, lr, pc) - */ -for_each_frame: - -1001: ldw sv_pc, [frame+], #0 @ get saved pc -1002: ldw sv_fp, [frame+], #-12 @ get saved fp - - sub sv_pc, sv_pc, offset @ Correct PC for prefetching - -1003: ldw r2, [sv_pc+], #-4 @ if stmfd sp, {args} exists, - ldw r3, .Ldsi+4 @ adjust saved 'pc' back one - cxor.a r3, r2 >> #14 @ instruction - beq 201f - sub r0, sv_pc, #4 @ allow for mov - b 202f -201: - sub r0, sv_pc, #8 @ allow for mov + stmia -202: - ldw r1, [frame+], #-4 @ get saved lr - mov r2, frame - b.l dump_backtrace_entry - - ldw r1, [sv_pc+], #-4 @ if stmfd sp, {args} exists, - ldw r3, .Ldsi+4 - cxor.a r3, r1 >> #14 - bne 1004f - ldw r0, [frame+], #-8 @ get sp - sub r0, r0, #4 @ point at the last arg - b.l .Ldumpstm @ dump saved registers - -1004: ldw r1, [sv_pc+], #0 @ if stmfd {, fp, ip, lr, pc} - ldw r3, .Ldsi @ instruction exists, - cxor.a r3, r1 >> #14 - bne 201f - sub r0, frame, #16 - b.l .Ldumpstm @ dump saved registers -201: - cxor.a sv_fp, #0 @ zero saved fp means - beq no_frame @ no further frames - - csub.a sv_fp, frame @ next frame must be - mov frame, sv_fp @ above the current frame - bua for_each_frame - -1006: adr r0, .Lbad - mov r1, loglvl - mov r2, frame - b.l printk -no_frame: ldm.w (v4 - v10, pc), [sp]+ -ENDPROC(__backtrace) -ENDPROC(c_backtrace) - - .pushsection __ex_table,"a" - .align 3 - .long 1001b, 1006b - .long 1002b, 1006b - .long 1003b, 1006b - .long 1004b, 1006b 
- .popsection - -#define instr v4 -#define reg v5 -#define stack v6 - -.Ldumpstm: stm.w (instr, reg, stack, v7, lr), [sp-] - mov stack, r0 - mov instr, r1 - mov reg, #14 - mov v7, #0 -1: mov r3, #1 - csub.a reg, #8 - bne 201f - sub reg, reg, #3 -201: - cand.a instr, r3 << reg - beq 2f - add v7, v7, #1 - cxor.a v7, #6 - cmoveq v7, #1 - bne 201f - adr r0, .Lcr - mov r1, loglvl - b.l printk -201: - ldw.w r3, [stack]+, #-4 - mov r2, reg - csub.a r2, #8 - bsl 201f - sub r2, r2, #3 -201: - cand.a instr, #0x40 @ if H is 1, high 16 regs - beq 201f - add r2, r2, #0x10 @ so r2 need add 16 -201: - adr r0, .Lfp - mov r1, loglvl - b.l printk -2: sub.a reg, reg, #1 - bns 1b - cxor.a v7, #0 - beq 201f - adr r0, .Lcr - mov r1, loglvl - b.l printk -201: ldm.w (instr, reg, stack, v7, pc), [sp]+ - -.Lfp: .asciz "%sr%d:%08x " -.Lcr: .asciz "%s\n" -.Lbad: .asciz "%sBacktrace aborted due to bad frame pointer <%p>\n" - .align -.Ldsi: .word 0x92eec000 >> 14 @ stm.w sp, (... fp, ip, lr, pc) - .word 0x92e10000 >> 14 @ stm.w sp, () - -#endif diff --git a/arch/unicore32/lib/clear_user.S b/arch/unicore32/lib/clear_user.S deleted file mode 100644 index c6ca431b1090..000000000000 --- a/arch/unicore32/lib/clear_user.S +++ /dev/null @@ -1,54 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/lib/clear_user.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include - - .text - -/* Prototype: int __clear_user(void *addr, size_t sz) - * Purpose : clear some user memory - * Params : addr - user memory address to clear - * : sz - number of bytes to clear - * Returns : number of bytes NOT cleared - */ -WEAK(__clear_user) - stm.w (lr), [sp-] - stm.w (r1), [sp-] - mov r2, #0 - csub.a r1, #4 - bsl 2f - and.a ip, r0, #3 - beq 1f - csub.a ip, #2 - strusr r2, r0, 1 - strusr r2, r0, 1, el - strusr r2, r0, 1, sl - rsub ip, ip, #4 - sub r1, r1, ip @ 7 6 5 4 3 2 1 -1: sub.a r1, r1, #8 @ -1 -2 -3 -4 -5 -6 -7 - strusr 
r2, r0, 4, ns, rept=2 - bns 1b - add.a r1, r1, #4 @ 3 2 1 0 -1 -2 -3 - strusr r2, r0, 4, ns -2: cand.a r1, #2 @ 1x 1x 0x 0x 1x 1x 0x - strusr r2, r0, 1, ne, rept=2 - cand.a r1, #1 @ x1 x0 x1 x0 x1 x0 x1 - beq 3f -USER( stb.u r2, [r0]) -3: mov r0, #0 - ldm.w (r1), [sp]+ - ldm.w (pc), [sp]+ -ENDPROC(__clear_user) - - .pushsection .fixup,"ax" - .align 0 -9001: ldm.w (r0), [sp]+ - ldm.w (pc), [sp]+ - .popsection - diff --git a/arch/unicore32/lib/copy_from_user.S b/arch/unicore32/lib/copy_from_user.S deleted file mode 100644 index affb43920ac0..000000000000 --- a/arch/unicore32/lib/copy_from_user.S +++ /dev/null @@ -1,101 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/lib/copy_from_user.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#include -#include - -/* - * Prototype: - * - * size_t raw_copy_from_user(void *to, const void *from, size_t n) - * - * Purpose: - * - * copy a block to kernel memory from user memory - * - * Params: - * - * to = kernel memory - * from = user memory - * n = number of bytes to copy - * - * Return value: - * - * Number of bytes NOT copied. 
- */ - - .macro ldr1w ptr reg abort - ldrusr \reg, \ptr, 4, abort=\abort - .endm - - .macro ldr4w ptr reg1 reg2 reg3 reg4 abort -100: ldm.w (\reg1, \reg2, \reg3, \reg4), [\ptr]+ - .pushsection __ex_table, "a" - .align 3 - .long 100b, \abort - .popsection - .endm - - .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort -100: ldm.w (\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8), [\ptr]+ - .pushsection __ex_table, "a" - .align 3 - .long 100b, \abort - .popsection - .endm - - .macro ldr1b ptr reg cond=al abort - ldrusr \reg, \ptr, 1, \cond, abort=\abort - .endm - - .macro str1w ptr reg abort - stw.w \reg, [\ptr]+, #4 - .endm - - .macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort - stm.w (\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8), [\ptr]+ - .endm - - .macro str1b ptr reg cond=al abort - .ifnc \cond, al - b\cond 201f - b 202f - .endif -201: stb.w \reg, [\ptr]+, #1 -202: - .endm - - .macro enter - mov r3, #0 - stm.w (r0, r2, r3), [sp-] - .endm - - .macro exit - add sp, sp, #8 - ldm.w (r0), [sp]+ - mov pc, lr - .endm - - .text - -ENTRY(raw_copy_from_user) - -#include "copy_template.S" - -ENDPROC(raw_copy_from_user) - - .pushsection .fixup,"ax" - .align 0 - copy_abort_preamble - ldm.w (r1, r2, r3), [sp]+ - sub r0, r0, r1 - rsub r0, r0, r2 - copy_abort_end - .popsection - diff --git a/arch/unicore32/lib/copy_page.S b/arch/unicore32/lib/copy_page.S deleted file mode 100644 index dc163f2d1af0..000000000000 --- a/arch/unicore32/lib/copy_page.S +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/lib/copy_page.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * ASM optimised string functions - */ -#include -#include -#include -#include - -#define COPY_COUNT (PAGE_SZ/256) - - .text - .align 5 -/* - * UniCore optimised copy_page routine - */ -ENTRY(copy_page) - stm.w (r17 - r19, lr), [sp-] - mov r17, r0 - mov r18, r1 - mov r19, 
#COPY_COUNT -1: - .rept 4 - ldm.w (r0 - r15), [r18]+ - stm.w (r0 - r15), [r17]+ - .endr - sub.a r19, r19, #1 - bne 1b - ldm.w (r17 - r19, pc), [sp]+ -ENDPROC(copy_page) diff --git a/arch/unicore32/lib/copy_template.S b/arch/unicore32/lib/copy_template.S deleted file mode 100644 index 02a7aef83fbf..000000000000 --- a/arch/unicore32/lib/copy_template.S +++ /dev/null @@ -1,211 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/lib/copy_template.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -/* - * Theory of operation - * ------------------- - * - * This file provides the core code for a forward memory copy used in - * the implementation of memcopy(), copy_to_user() and copy_from_user(). - * - * The including file must define the following accessor macros - * according to the need of the given function: - * - * ldr1w ptr reg abort - * - * This loads one word from 'ptr', stores it in 'reg' and increments - * 'ptr' to the next word. The 'abort' argument is used for fixup tables. - * - * ldr4w ptr reg1 reg2 reg3 reg4 abort - * ldr8w ptr, reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort - * - * This loads four or eight words starting from 'ptr', stores them - * in provided registers and increments 'ptr' past those words. - * The'abort' argument is used for fixup tables. - * - * ldr1b ptr reg cond abort - * - * Similar to ldr1w, but it loads a byte and increments 'ptr' one byte. - * It also must apply the condition code if provided, otherwise the - * "al" condition is assumed by default. - * - * str1w ptr reg abort - * str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort - * str1b ptr reg cond abort - * - * Same as their ldr* counterparts, but data is stored to 'ptr' location - * rather than being loaded. - * - * enter - * - * Preserve the provided registers on the stack plus any additional - * data as needed by the implementation including this code. Called - * upon code entry. 
- * - * exit - * - * Restore registers with the values previously saved with the - * 'preserv' macro. Called upon code termination. - */ - - - enter - - sub.a r2, r2, #4 - bsl 8f - and.a ip, r0, #3 - bne 9f - and.a ip, r1, #3 - bne 10f - -1: sub.a r2, r2, #(28) - stm.w (r5 - r8), [sp-] - bsl 5f - -3: -4: ldr8w r1, r3, r4, r5, r6, r7, r8, r10, r11, abort=20f - sub.a r2, r2, #32 - str8w r0, r3, r4, r5, r6, r7, r8, r10, r11, abort=20f - beg 3b - -5: and.a ip, r2, #28 - rsub ip, ip, #32 - beq 7f - add pc, pc, ip @ C is always clear here - nop - - ldr1w r1, r3, abort=20f - ldr1w r1, r4, abort=20f - ldr1w r1, r5, abort=20f - ldr1w r1, r6, abort=20f - ldr1w r1, r7, abort=20f - ldr1w r1, r8, abort=20f - ldr1w r1, r11, abort=20f - - add pc, pc, ip - nop - - str1w r0, r3, abort=20f - str1w r0, r4, abort=20f - str1w r0, r5, abort=20f - str1w r0, r6, abort=20f - str1w r0, r7, abort=20f - str1w r0, r8, abort=20f - str1w r0, r11, abort=20f - -7: ldm.w (r5 - r8), [sp]+ - -8: mov.a r2, r2 << #31 - ldr1b r1, r3, ne, abort=21f - ldr1b r1, r4, ea, abort=21f - ldr1b r1, r10, ea, abort=21f - str1b r0, r3, ne, abort=21f - str1b r0, r4, ea, abort=21f - str1b r0, r10, ea, abort=21f - - exit - -9: rsub ip, ip, #4 - csub.a ip, #2 - ldr1b r1, r3, sg, abort=21f - ldr1b r1, r4, eg, abort=21f - ldr1b r1, r11, abort=21f - str1b r0, r3, sg, abort=21f - str1b r0, r4, eg, abort=21f - sub.a r2, r2, ip - str1b r0, r11, abort=21f - bsl 8b - and.a ip, r1, #3 - beq 1b - -10: andn r1, r1, #3 - csub.a ip, #2 - ldr1w r1, r11, abort=21f - beq 17f - bsg 18f - - - .macro forward_copy_shift a b - - sub.a r2, r2, #28 - bsl 14f - -11: stm.w (r5 - r9), [sp-] - -12: - ldr4w r1, r4, r5, r6, r7, abort=19f - mov r3, r11 pull #\a - sub.a r2, r2, #32 - ldr4w r1, r8, r9, r10, r11, abort=19f - or r3, r3, r4 push #\b - mov r4, r4 pull #\a - or r4, r4, r5 push #\b - mov r5, r5 pull #\a - or r5, r5, r6 push #\b - mov r6, r6 pull #\a - or r6, r6, r7 push #\b - mov r7, r7 pull #\a - or r7, r7, r8 push #\b - mov r8, r8 pull 
#\a - or r8, r8, r9 push #\b - mov r9, r9 pull #\a - or r9, r9, r10 push #\b - mov r10, r10 pull #\a - or r10, r10, r11 push #\b - str8w r0, r3, r4, r5, r6, r7, r8, r9, r10, , abort=19f - beg 12b - - ldm.w (r5 - r9), [sp]+ - -14: and.a ip, r2, #28 - beq 16f - -15: mov r3, r11 pull #\a - ldr1w r1, r11, abort=21f - sub.a ip, ip, #4 - or r3, r3, r11 push #\b - str1w r0, r3, abort=21f - bsg 15b - -16: sub r1, r1, #(\b / 8) - b 8b - - .endm - - - forward_copy_shift a=8 b=24 - -17: forward_copy_shift a=16 b=16 - -18: forward_copy_shift a=24 b=8 - - -/* - * Abort preamble and completion macros. - * If a fixup handler is required then those macros must surround it. - * It is assumed that the fixup code will handle the private part of - * the exit macro. - */ - - .macro copy_abort_preamble -19: ldm.w (r5 - r9), [sp]+ - b 21f -299: .word 0 @ store lr - @ to avoid function call in fixup -20: ldm.w (r5 - r8), [sp]+ -21: - adr r1, 299b - stw lr, [r1] - .endm - - .macro copy_abort_end - adr lr, 299b - ldw pc, [lr] - .endm - diff --git a/arch/unicore32/lib/copy_to_user.S b/arch/unicore32/lib/copy_to_user.S deleted file mode 100644 index c867f08f89ce..000000000000 --- a/arch/unicore32/lib/copy_to_user.S +++ /dev/null @@ -1,93 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/lib/copy_to_user.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#include -#include - -/* - * Prototype: - * - * size_t raw_copy_to_user(void *to, const void *from, size_t n) - * - * Purpose: - * - * copy a block to user memory from kernel memory - * - * Params: - * - * to = user memory - * from = kernel memory - * n = number of bytes to copy - * - * Return value: - * - * Number of bytes NOT copied. 
- */ - - .macro ldr1w ptr reg abort - ldw.w \reg, [\ptr]+, #4 - .endm - - .macro ldr4w ptr reg1 reg2 reg3 reg4 abort - ldm.w (\reg1, \reg2, \reg3, \reg4), [\ptr]+ - .endm - - .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort - ldm.w (\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8), [\ptr]+ - .endm - - .macro ldr1b ptr reg cond=al abort - notcond \cond, .+8 - ldb.w \reg, [\ptr]+, #1 - .endm - - .macro str1w ptr reg abort - strusr \reg, \ptr, 4, abort=\abort - .endm - - .macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort -100: stm.w (\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8), [\ptr]+ - - .pushsection __ex_table, "a" - .long 100b, \abort - .popsection - .endm - - .macro str1b ptr reg cond=al abort - strusr \reg, \ptr, 1, \cond, abort=\abort - .endm - - .macro enter - mov r3, #0 - stm.w (r0, r2, r3), [sp-] - .endm - - .macro exit - add sp, sp, #8 - ldm.w (r0), [sp]+ - mov pc, lr - .endm - - .text - -WEAK(raw_copy_to_user) - -#include "copy_template.S" - -ENDPROC(raw_copy_to_user) - - .pushsection .fixup,"ax" - .align 0 - copy_abort_preamble - ldm.w (r1, r2, r3), [sp]+ - sub r0, r0, r1 - rsub r0, r0, r2 - copy_abort_end - .popsection - diff --git a/arch/unicore32/lib/delay.S b/arch/unicore32/lib/delay.S deleted file mode 100644 index 6a359dd034e5..000000000000 --- a/arch/unicore32/lib/delay.S +++ /dev/null @@ -1,48 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/lib/delay.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include - .text - -.LC0: .word loops_per_jiffy -.LC1: .word (2199023*HZ)>>11 - -/* - * r0 <= 2000 - * lpj <= 0x01ffffff (max. 
3355 bogomips) - * HZ <= 1000 - */ - -ENTRY(__udelay) - ldw r2, .LC1 - mul r0, r2, r0 -ENTRY(__const_udelay) @ 0 <= r0 <= 0x7fffff06 - ldw r2, .LC0 - ldw r2, [r2] @ max = 0x01ffffff - mov r0, r0 >> #14 @ max = 0x0001ffff - mov r2, r2 >> #10 @ max = 0x00007fff - mul r0, r2, r0 @ max = 2^32-1 - mov.a r0, r0 >> #6 - cmoveq pc, lr - -/* - * loops = r0 * HZ * loops_per_jiffy / 1000000 - * - * Oh, if only we had a cycle counter... - */ - -@ Delay routine -ENTRY(__delay) - sub.a r0, r0, #2 - bua __delay - mov pc, lr -ENDPROC(__udelay) -ENDPROC(__const_udelay) -ENDPROC(__delay) diff --git a/arch/unicore32/lib/findbit.S b/arch/unicore32/lib/findbit.S deleted file mode 100644 index 42f1282670d2..000000000000 --- a/arch/unicore32/lib/findbit.S +++ /dev/null @@ -1,97 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/lib/findbit.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include - .text - -/* - * Purpose : Find a 'zero' bit - * Prototype: int find_first_zero_bit(void *addr, unsigned int maxbit); - */ -ENTRY(find_first_zero_bit) - cxor.a r1, #0 - beq 3f - mov r2, #0 -1: ldb r3, [r0+], r2 >> #3 - xor.a r3, r3, #0xff @ invert bits - bne .L_found @ any now set - found zero bit - add r2, r2, #8 @ next bit pointer -2: csub.a r2, r1 @ any more? 
- bub 1b -3: mov r0, r1 @ no free bits - mov pc, lr -ENDPROC(find_first_zero_bit) - -/* - * Purpose : Find next 'zero' bit - * Prototype: int find_next_zero_bit - * (void *addr, unsigned int maxbit, int offset) - */ -ENTRY(find_next_zero_bit) - cxor.a r1, #0 - beq 3b - and.a ip, r2, #7 - beq 1b @ If new byte, goto old routine - ldb r3, [r0+], r2 >> #3 - xor r3, r3, #0xff @ now looking for a 1 bit - mov.a r3, r3 >> ip @ shift off unused bits - bne .L_found - or r2, r2, #7 @ if zero, then no bits here - add r2, r2, #1 @ align bit pointer - b 2b @ loop for next bit -ENDPROC(find_next_zero_bit) - -/* - * Purpose : Find a 'one' bit - * Prototype: int find_first_bit - * (const unsigned long *addr, unsigned int maxbit); - */ -ENTRY(find_first_bit) - cxor.a r1, #0 - beq 3f - mov r2, #0 -1: ldb r3, [r0+], r2 >> #3 - mov.a r3, r3 - bne .L_found @ any now set - found zero bit - add r2, r2, #8 @ next bit pointer -2: csub.a r2, r1 @ any more? - bub 1b -3: mov r0, r1 @ no free bits - mov pc, lr -ENDPROC(find_first_bit) - -/* - * Purpose : Find next 'one' bit - * Prototype: int find_next_zero_bit - * (void *addr, unsigned int maxbit, int offset) - */ -ENTRY(find_next_bit) - cxor.a r1, #0 - beq 3b - and.a ip, r2, #7 - beq 1b @ If new byte, goto old routine - ldb r3, [r0+], r2 >> #3 - mov.a r3, r3 >> ip @ shift off unused bits - bne .L_found - or r2, r2, #7 @ if zero, then no bits here - add r2, r2, #1 @ align bit pointer - b 2b @ loop for next bit -ENDPROC(find_next_bit) - -/* - * One or more bits in the LSB of r3 are assumed to be set. 
- */ -.L_found: - rsub r1, r3, #0 - and r3, r3, r1 - cntlz r3, r3 - rsub r3, r3, #31 - add r0, r2, r3 - mov pc, lr - diff --git a/arch/unicore32/lib/strncpy_from_user.S b/arch/unicore32/lib/strncpy_from_user.S deleted file mode 100644 index f227b8227a4c..000000000000 --- a/arch/unicore32/lib/strncpy_from_user.S +++ /dev/null @@ -1,42 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/lib/strncpy_from_user.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include - - .text - .align 5 - -/* - * Copy a string from user space to kernel space. - * r0 = dst, r1 = src, r2 = byte length - * returns the number of characters copied (strlen of copied string), - * -EFAULT on exception, or "len" if we fill the whole buffer - */ -ENTRY(__strncpy_from_user) - mov ip, r1 -1: sub.a r2, r2, #1 - ldrusr r3, r1, 1, ns - bfs 2f - stb.w r3, [r0]+, #1 - cxor.a r3, #0 - bne 1b - sub r1, r1, #1 @ take NUL character out of count -2: sub r0, r1, ip - mov pc, lr -ENDPROC(__strncpy_from_user) - - .pushsection .fixup,"ax" - .align 0 -9001: mov r3, #0 - stb r3, [r0+], #0 @ null terminate - mov r0, #-EFAULT - mov pc, lr - .popsection - diff --git a/arch/unicore32/lib/strnlen_user.S b/arch/unicore32/lib/strnlen_user.S deleted file mode 100644 index c836b12776fe..000000000000 --- a/arch/unicore32/lib/strnlen_user.S +++ /dev/null @@ -1,39 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/lib/strnlen_user.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include - - .text - .align 5 - -/* Prototype: unsigned long __strnlen_user(const char *str, long n) - * Purpose : get length of a string in user memory - * Params : str - address of string in user memory - * Returns : length of string *including terminator* - * or zero on exception, or n + 1 if too long - */ -ENTRY(__strnlen_user) - mov 
r2, r0 -1: - ldrusr r3, r0, 1 - cxor.a r3, #0 - beq 2f - sub.a r1, r1, #1 - bne 1b - add r0, r0, #1 -2: sub r0, r0, r2 - mov pc, lr -ENDPROC(__strnlen_user) - - .pushsection .fixup,"ax" - .align 0 -9001: mov r0, #0 - mov pc, lr - .popsection diff --git a/arch/unicore32/mm/Kconfig b/arch/unicore32/mm/Kconfig deleted file mode 100644 index 82759b6aba67..000000000000 --- a/arch/unicore32/mm/Kconfig +++ /dev/null @@ -1,41 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -comment "Processor Type" - -# Select CPU types depending on the architecture selected. This selects -# which CPUs we support in the kernel image, and the compiler instruction -# optimiser behaviour. - -config CPU_UCV2 - def_bool y - -comment "Processor Features" - -config CPU_ICACHE_DISABLE - bool "Disable I-Cache (I-bit)" - help - Say Y here to disable the processor instruction cache. Unless - you have a reason not to or are unsure, say N. - -config CPU_DCACHE_DISABLE - bool "Disable D-Cache (D-bit)" - help - Say Y here to disable the processor data cache. Unless - you have a reason not to or are unsure, say N. - -config CPU_DCACHE_WRITETHROUGH - bool "Force write through D-cache" - help - Say Y here to use the data cache in writethrough mode. Unless you - specifically require this or are unsure, say N. - -config CPU_DCACHE_LINE_DISABLE - bool "Disable D-cache line ops" - default y - help - Say Y here to disable the data cache line operations. - -config CPU_TLB_SINGLE_ENTRY_DISABLE - bool "Disable TLB single entry ops" - default y - help - Say Y here to disable the TLB single entry operations. diff --git a/arch/unicore32/mm/Makefile b/arch/unicore32/mm/Makefile deleted file mode 100644 index 8106260583ab..000000000000 --- a/arch/unicore32/mm/Makefile +++ /dev/null @@ -1,14 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile for the linux unicore-specific parts of the memory manager. 
-# - -obj-y := extable.o fault.o init.o pgd.o mmu.o -obj-y += flush.o ioremap.o - -obj-$(CONFIG_MODULES) += proc-syms.o - -obj-$(CONFIG_ALIGNMENT_TRAP) += alignment.o - -obj-$(CONFIG_CPU_UCV2) += cache-ucv2.o tlb-ucv2.o proc-ucv2.o - diff --git a/arch/unicore32/mm/alignment.c b/arch/unicore32/mm/alignment.c deleted file mode 100644 index 2ea98f7a4156..000000000000 --- a/arch/unicore32/mm/alignment.c +++ /dev/null @@ -1,524 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/mm/alignment.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -/* - * TODO: - * FPU ldm/stm not handling - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "mm.h" - -#define CODING_BITS(i) (i & 0xe0000120) - -#define LDST_P_BIT(i) (i & (1 << 28)) /* Preindex */ -#define LDST_U_BIT(i) (i & (1 << 27)) /* Add offset */ -#define LDST_W_BIT(i) (i & (1 << 25)) /* Writeback */ -#define LDST_L_BIT(i) (i & (1 << 24)) /* Load */ - -#define LDST_P_EQ_U(i) ((((i) ^ ((i) >> 1)) & (1 << 27)) == 0) - -#define LDSTH_I_BIT(i) (i & (1 << 26)) /* half-word immed */ -#define LDM_S_BIT(i) (i & (1 << 26)) /* write ASR from BSR */ -#define LDM_H_BIT(i) (i & (1 << 6)) /* select r0-r15 or r16-r31 */ - -#define RN_BITS(i) ((i >> 19) & 31) /* Rn */ -#define RD_BITS(i) ((i >> 14) & 31) /* Rd */ -#define RM_BITS(i) (i & 31) /* Rm */ - -#define REGMASK_BITS(i) (((i & 0x7fe00) >> 3) | (i & 0x3f)) -#define OFFSET_BITS(i) (i & 0x03fff) - -#define SHIFT_BITS(i) ((i >> 9) & 0x1f) -#define SHIFT_TYPE(i) (i & 0xc0) -#define SHIFT_LSL 0x00 -#define SHIFT_LSR 0x40 -#define SHIFT_ASR 0x80 -#define SHIFT_RORRRX 0xc0 - -union offset_union { - unsigned long un; - signed long sn; -}; - -#define TYPE_ERROR 0 -#define TYPE_FAULT 1 -#define TYPE_LDST 2 -#define TYPE_DONE 3 -#define TYPE_SWAP 4 -#define TYPE_COLS 5 /* Coprocessor load/store */ - -#define get8_unaligned_check(val, 
addr, err) \ - __asm__( \ - "1: ldb.u %1, [%2], #1\n" \ - "2:\n" \ - " .pushsection .fixup,\"ax\"\n" \ - " .align 2\n" \ - "3: mov %0, #1\n" \ - " b 2b\n" \ - " .popsection\n" \ - " .pushsection __ex_table,\"a\"\n" \ - " .align 3\n" \ - " .long 1b, 3b\n" \ - " .popsection\n" \ - : "=r" (err), "=&r" (val), "=r" (addr) \ - : "0" (err), "2" (addr)) - -#define get8t_unaligned_check(val, addr, err) \ - __asm__( \ - "1: ldb.u %1, [%2], #1\n" \ - "2:\n" \ - " .pushsection .fixup,\"ax\"\n" \ - " .align 2\n" \ - "3: mov %0, #1\n" \ - " b 2b\n" \ - " .popsection\n" \ - " .pushsection __ex_table,\"a\"\n" \ - " .align 3\n" \ - " .long 1b, 3b\n" \ - " .popsection\n" \ - : "=r" (err), "=&r" (val), "=r" (addr) \ - : "0" (err), "2" (addr)) - -#define get16_unaligned_check(val, addr) \ - do { \ - unsigned int err = 0, v, a = addr; \ - get8_unaligned_check(val, a, err); \ - get8_unaligned_check(v, a, err); \ - val |= v << 8; \ - if (err) \ - goto fault; \ - } while (0) - -#define put16_unaligned_check(val, addr) \ - do { \ - unsigned int err = 0, v = val, a = addr; \ - __asm__( \ - "1: stb.u %1, [%2], #1\n" \ - " mov %1, %1 >> #8\n" \ - "2: stb.u %1, [%2]\n" \ - "3:\n" \ - " .pushsection .fixup,\"ax\"\n" \ - " .align 2\n" \ - "4: mov %0, #1\n" \ - " b 3b\n" \ - " .popsection\n" \ - " .pushsection __ex_table,\"a\"\n" \ - " .align 3\n" \ - " .long 1b, 4b\n" \ - " .long 2b, 4b\n" \ - " .popsection\n" \ - : "=r" (err), "=&r" (v), "=&r" (a) \ - : "0" (err), "1" (v), "2" (a)); \ - if (err) \ - goto fault; \ - } while (0) - -#define __put32_unaligned_check(ins, val, addr) \ - do { \ - unsigned int err = 0, v = val, a = addr; \ - __asm__( \ - "1: "ins" %1, [%2], #1\n" \ - " mov %1, %1 >> #8\n" \ - "2: "ins" %1, [%2], #1\n" \ - " mov %1, %1 >> #8\n" \ - "3: "ins" %1, [%2], #1\n" \ - " mov %1, %1 >> #8\n" \ - "4: "ins" %1, [%2]\n" \ - "5:\n" \ - " .pushsection .fixup,\"ax\"\n" \ - " .align 2\n" \ - "6: mov %0, #1\n" \ - " b 5b\n" \ - " .popsection\n" \ - " .pushsection __ex_table,\"a\"\n" \ - 
" .align 3\n" \ - " .long 1b, 6b\n" \ - " .long 2b, 6b\n" \ - " .long 3b, 6b\n" \ - " .long 4b, 6b\n" \ - " .popsection\n" \ - : "=r" (err), "=&r" (v), "=&r" (a) \ - : "0" (err), "1" (v), "2" (a)); \ - if (err) \ - goto fault; \ - } while (0) - -#define get32_unaligned_check(val, addr) \ - do { \ - unsigned int err = 0, v, a = addr; \ - get8_unaligned_check(val, a, err); \ - get8_unaligned_check(v, a, err); \ - val |= v << 8; \ - get8_unaligned_check(v, a, err); \ - val |= v << 16; \ - get8_unaligned_check(v, a, err); \ - val |= v << 24; \ - if (err) \ - goto fault; \ - } while (0) - -#define put32_unaligned_check(val, addr) \ - __put32_unaligned_check("stb.u", val, addr) - -#define get32t_unaligned_check(val, addr) \ - do { \ - unsigned int err = 0, v, a = addr; \ - get8t_unaligned_check(val, a, err); \ - get8t_unaligned_check(v, a, err); \ - val |= v << 8; \ - get8t_unaligned_check(v, a, err); \ - val |= v << 16; \ - get8t_unaligned_check(v, a, err); \ - val |= v << 24; \ - if (err) \ - goto fault; \ - } while (0) - -#define put32t_unaligned_check(val, addr) \ - __put32_unaligned_check("stb.u", val, addr) - -static void -do_alignment_finish_ldst(unsigned long addr, unsigned long instr, - struct pt_regs *regs, union offset_union offset) -{ - if (!LDST_U_BIT(instr)) - offset.un = -offset.un; - - if (!LDST_P_BIT(instr)) - addr += offset.un; - - if (!LDST_P_BIT(instr) || LDST_W_BIT(instr)) - regs->uregs[RN_BITS(instr)] = addr; -} - -static int -do_alignment_ldrhstrh(unsigned long addr, unsigned long instr, - struct pt_regs *regs) -{ - unsigned int rd = RD_BITS(instr); - - /* old value 0x40002120, can't judge swap instr correctly */ - if ((instr & 0x4b003fe0) == 0x40000120) - goto swp; - - if (LDST_L_BIT(instr)) { - unsigned long val; - get16_unaligned_check(val, addr); - - /* signed half-word? 
*/ - if (instr & 0x80) - val = (signed long)((signed short)val); - - regs->uregs[rd] = val; - } else - put16_unaligned_check(regs->uregs[rd], addr); - - return TYPE_LDST; - -swp: - /* only handle swap word - * for swap byte should not active this alignment exception */ - get32_unaligned_check(regs->uregs[RD_BITS(instr)], addr); - put32_unaligned_check(regs->uregs[RM_BITS(instr)], addr); - return TYPE_SWAP; - -fault: - return TYPE_FAULT; -} - -static int -do_alignment_ldrstr(unsigned long addr, unsigned long instr, - struct pt_regs *regs) -{ - unsigned int rd = RD_BITS(instr); - - if (!LDST_P_BIT(instr) && LDST_W_BIT(instr)) - goto trans; - - if (LDST_L_BIT(instr)) - get32_unaligned_check(regs->uregs[rd], addr); - else - put32_unaligned_check(regs->uregs[rd], addr); - return TYPE_LDST; - -trans: - if (LDST_L_BIT(instr)) - get32t_unaligned_check(regs->uregs[rd], addr); - else - put32t_unaligned_check(regs->uregs[rd], addr); - return TYPE_LDST; - -fault: - return TYPE_FAULT; -} - -/* - * LDM/STM alignment handler. - * - * There are 4 variants of this instruction: - * - * B = rn pointer before instruction, A = rn pointer after instruction - * ------ increasing address -----> - * | | r0 | r1 | ... 
| rx | | - * PU = 01 B A - * PU = 11 B A - * PU = 00 A B - * PU = 10 A B - */ -static int -do_alignment_ldmstm(unsigned long addr, unsigned long instr, - struct pt_regs *regs) -{ - unsigned int rd, rn, pc_correction, reg_correction, nr_regs, regbits; - unsigned long eaddr, newaddr; - - if (LDM_S_BIT(instr)) - goto bad; - - pc_correction = 4; /* processor implementation defined */ - - /* count the number of registers in the mask to be transferred */ - nr_regs = hweight16(REGMASK_BITS(instr)) * 4; - - rn = RN_BITS(instr); - newaddr = eaddr = regs->uregs[rn]; - - if (!LDST_U_BIT(instr)) - nr_regs = -nr_regs; - newaddr += nr_regs; - if (!LDST_U_BIT(instr)) - eaddr = newaddr; - - if (LDST_P_EQ_U(instr)) /* U = P */ - eaddr += 4; - - /* - * This is a "hint" - we already have eaddr worked out by the - * processor for us. - */ - if (addr != eaddr) { - printk(KERN_ERR "LDMSTM: PC = %08lx, instr = %08lx, " - "addr = %08lx, eaddr = %08lx\n", - instruction_pointer(regs), instr, addr, eaddr); - show_regs(regs); - } - - if (LDM_H_BIT(instr)) - reg_correction = 0x10; - else - reg_correction = 0x00; - - for (regbits = REGMASK_BITS(instr), rd = 0; regbits; - regbits >>= 1, rd += 1) - if (regbits & 1) { - if (LDST_L_BIT(instr)) - get32_unaligned_check(regs-> - uregs[rd + reg_correction], eaddr); - else - put32_unaligned_check(regs-> - uregs[rd + reg_correction], eaddr); - eaddr += 4; - } - - if (LDST_W_BIT(instr)) - regs->uregs[rn] = newaddr; - return TYPE_DONE; - -fault: - regs->UCreg_pc -= pc_correction; - return TYPE_FAULT; - -bad: - printk(KERN_ERR "Alignment trap: not handling ldm with s-bit set\n"); - return TYPE_ERROR; -} - -static int -do_alignment(unsigned long addr, unsigned int error_code, struct pt_regs *regs) -{ - union offset_union offset; - unsigned long instr, instrptr; - int (*handler) (unsigned long addr, unsigned long instr, - struct pt_regs *regs); - unsigned int type; - - instrptr = instruction_pointer(regs); - if (instrptr >= PAGE_OFFSET) - instr = *(unsigned 
long *)instrptr; - else { - __asm__ __volatile__( - "ldw.u %0, [%1]\n" - : "=&r"(instr) - : "r"(instrptr)); - } - - regs->UCreg_pc += 4; - - switch (CODING_BITS(instr)) { - case 0x40000120: /* ldrh or strh */ - if (LDSTH_I_BIT(instr)) - offset.un = (instr & 0x3e00) >> 4 | (instr & 31); - else - offset.un = regs->uregs[RM_BITS(instr)]; - handler = do_alignment_ldrhstrh; - break; - - case 0x60000000: /* ldr or str immediate */ - case 0x60000100: /* ldr or str immediate */ - case 0x60000020: /* ldr or str immediate */ - case 0x60000120: /* ldr or str immediate */ - offset.un = OFFSET_BITS(instr); - handler = do_alignment_ldrstr; - break; - - case 0x40000000: /* ldr or str register */ - offset.un = regs->uregs[RM_BITS(instr)]; - { - unsigned int shiftval = SHIFT_BITS(instr); - - switch (SHIFT_TYPE(instr)) { - case SHIFT_LSL: - offset.un <<= shiftval; - break; - - case SHIFT_LSR: - offset.un >>= shiftval; - break; - - case SHIFT_ASR: - offset.sn >>= shiftval; - break; - - case SHIFT_RORRRX: - if (shiftval == 0) { - offset.un >>= 1; - if (regs->UCreg_asr & PSR_C_BIT) - offset.un |= 1 << 31; - } else - offset.un = offset.un >> shiftval | - offset.un << (32 - shiftval); - break; - } - } - handler = do_alignment_ldrstr; - break; - - case 0x80000000: /* ldm or stm */ - case 0x80000020: /* ldm or stm */ - handler = do_alignment_ldmstm; - break; - - default: - goto bad; - } - - type = handler(addr, instr, regs); - - if (type == TYPE_ERROR || type == TYPE_FAULT) - goto bad_or_fault; - - if (type == TYPE_LDST) - do_alignment_finish_ldst(addr, instr, regs, offset); - - return 0; - -bad_or_fault: - if (type == TYPE_ERROR) - goto bad; - regs->UCreg_pc -= 4; - /* - * We got a fault - fix it up, or die. - */ - do_bad_area(addr, error_code, regs); - return 0; - -bad: - /* - * Oops, we didn't handle the instruction. - * However, we must handle fpu instr firstly. 
- */ -#ifdef CONFIG_UNICORE_FPU_F64 - /* handle co.load/store */ -#define CODING_COLS 0xc0000000 -#define COLS_OFFSET_BITS(i) (i & 0x1FF) -#define COLS_L_BITS(i) (i & (1<<24)) -#define COLS_FN_BITS(i) ((i>>14) & 31) - if ((instr & 0xe0000000) == CODING_COLS) { - unsigned int fn = COLS_FN_BITS(instr); - unsigned long val = 0; - if (COLS_L_BITS(instr)) { - get32t_unaligned_check(val, addr); - switch (fn) { -#define ASM_MTF(n) case n: \ - __asm__ __volatile__("MTF %0, F" __stringify(n) \ - : : "r"(val)); \ - break; - ASM_MTF(0); ASM_MTF(1); ASM_MTF(2); ASM_MTF(3); - ASM_MTF(4); ASM_MTF(5); ASM_MTF(6); ASM_MTF(7); - ASM_MTF(8); ASM_MTF(9); ASM_MTF(10); ASM_MTF(11); - ASM_MTF(12); ASM_MTF(13); ASM_MTF(14); ASM_MTF(15); - ASM_MTF(16); ASM_MTF(17); ASM_MTF(18); ASM_MTF(19); - ASM_MTF(20); ASM_MTF(21); ASM_MTF(22); ASM_MTF(23); - ASM_MTF(24); ASM_MTF(25); ASM_MTF(26); ASM_MTF(27); - ASM_MTF(28); ASM_MTF(29); ASM_MTF(30); ASM_MTF(31); -#undef ASM_MTF - } - } else { - switch (fn) { -#define ASM_MFF(n) case n: \ - __asm__ __volatile__("MFF %0, F" __stringify(n) \ - : : "r"(val)); \ - break; - ASM_MFF(0); ASM_MFF(1); ASM_MFF(2); ASM_MFF(3); - ASM_MFF(4); ASM_MFF(5); ASM_MFF(6); ASM_MFF(7); - ASM_MFF(8); ASM_MFF(9); ASM_MFF(10); ASM_MFF(11); - ASM_MFF(12); ASM_MFF(13); ASM_MFF(14); ASM_MFF(15); - ASM_MFF(16); ASM_MFF(17); ASM_MFF(18); ASM_MFF(19); - ASM_MFF(20); ASM_MFF(21); ASM_MFF(22); ASM_MFF(23); - ASM_MFF(24); ASM_MFF(25); ASM_MFF(26); ASM_MFF(27); - ASM_MFF(28); ASM_MFF(29); ASM_MFF(30); ASM_MFF(31); -#undef ASM_MFF - } - put32t_unaligned_check(val, addr); - } - return TYPE_COLS; - } -fault: - return TYPE_FAULT; -#endif - printk(KERN_ERR "Alignment trap: not handling instruction " - "%08lx at [<%08lx>]\n", instr, instrptr); - return 1; -} - -/* - * This needs to be done after sysctl_init, otherwise sys/ will be - * overwritten. Actually, this shouldn't be in sys/ at all since - * it isn't a sysctl, and it doesn't contain sysctl information. 
- */ -static int __init alignment_init(void) -{ - hook_fault_code(1, do_alignment, SIGBUS, BUS_ADRALN, - "alignment exception"); - - return 0; -} - -fs_initcall(alignment_init); diff --git a/arch/unicore32/mm/cache-ucv2.S b/arch/unicore32/mm/cache-ucv2.S deleted file mode 100644 index 2108837d6f4f..000000000000 --- a/arch/unicore32/mm/cache-ucv2.S +++ /dev/null @@ -1,209 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/mm/cache-ucv2.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * This is the "shell" of the UniCore-v2 processor support. - */ -#include -#include -#include -#include - -#include "proc-macros.S" - -/* - * __cpuc_flush_icache_all() - * __cpuc_flush_kern_all() - * __cpuc_flush_user_all() - * - * Flush the entire cache. - */ -ENTRY(__cpuc_flush_icache_all) - /*FALLTHROUGH*/ -ENTRY(__cpuc_flush_kern_all) - /*FALLTHROUGH*/ -ENTRY(__cpuc_flush_user_all) - mov r0, #0 - movc p0.c5, r0, #14 @ Dcache flush all - nop8 - - mov r0, #0 - movc p0.c5, r0, #20 @ Icache invalidate all - nop8 - - mov pc, lr - -/* - * __cpuc_flush_user_range(start, end, flags) - * - * Flush a range of TLB entries in the specified address space. 
- * - * - start - start address (may not be aligned) - * - end - end address (exclusive, may not be aligned) - * - flags - vm_area_struct flags describing address space - */ -ENTRY(__cpuc_flush_user_range) - cxor.a r2, #0 - beq __cpuc_dma_flush_range - -#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE - andn r0, r0, #CACHE_LINESIZE - 1 @ Safety check - sub r1, r1, r0 - csub.a r1, #MAX_AREA_SIZE - bsg 2f - - andn r1, r1, #CACHE_LINESIZE - 1 - add r1, r1, #CACHE_LINESIZE - -101: dcacheline_flush r0, r11, r12 - - add r0, r0, #CACHE_LINESIZE - sub.a r1, r1, #CACHE_LINESIZE - bns 101b - b 3f -#endif -2: mov ip, #0 - movc p0.c5, ip, #14 @ Dcache flush all - nop8 - -3: mov ip, #0 - movc p0.c5, ip, #20 @ Icache invalidate all - nop8 - - mov pc, lr - -/* - * __cpuc_coherent_kern_range(start,end) - * __cpuc_coherent_user_range(start,end) - * - * Ensure that the I and D caches are coherent within specified - * region. This is typically used when code has been written to - * a memory region, and will be executed. 
- * - * - start - virtual start address of region - * - end - virtual end address of region - */ -ENTRY(__cpuc_coherent_kern_range) - /* FALLTHROUGH */ -ENTRY(__cpuc_coherent_user_range) -#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE - andn r0, r0, #CACHE_LINESIZE - 1 @ Safety check - sub r1, r1, r0 - csub.a r1, #MAX_AREA_SIZE - bsg 2f - - andn r1, r1, #CACHE_LINESIZE - 1 - add r1, r1, #CACHE_LINESIZE - - @ r0 va2pa r10 - mov r9, #PAGE_SZ - sub r9, r9, #1 @ PAGE_MASK -101: va2pa r0, r10, r11, r12, r13, 2f @ r10 is PA - b 103f -102: cand.a r0, r9 - beq 101b - -103: movc p0.c5, r10, #11 @ Dcache clean line of R10 - nop8 - - add r0, r0, #CACHE_LINESIZE - add r10, r10, #CACHE_LINESIZE - sub.a r1, r1, #CACHE_LINESIZE - bns 102b - b 3f -#endif -2: mov ip, #0 - movc p0.c5, ip, #10 @ Dcache clean all - nop8 - -3: mov ip, #0 - movc p0.c5, ip, #20 @ Icache invalidate all - nop8 - - mov pc, lr - -/* - * __cpuc_flush_kern_dcache_area(void *addr, size_t size) - * - * - addr - kernel address - * - size - region size - */ -ENTRY(__cpuc_flush_kern_dcache_area) - mov ip, #0 - movc p0.c5, ip, #14 @ Dcache flush all - nop8 - mov pc, lr - -/* - * __cpuc_dma_clean_range(start,end) - * - start - virtual start address of region - * - end - virtual end address of region - */ -ENTRY(__cpuc_dma_clean_range) -#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE - andn r0, r0, #CACHE_LINESIZE - 1 - sub r1, r1, r0 - andn r1, r1, #CACHE_LINESIZE - 1 - add r1, r1, #CACHE_LINESIZE - - csub.a r1, #MAX_AREA_SIZE - bsg 2f - - @ r0 va2pa r10 - mov r9, #PAGE_SZ - sub r9, r9, #1 @ PAGE_MASK -101: va2pa r0, r10, r11, r12, r13, 2f @ r10 is PA - b 1f -102: cand.a r0, r9 - beq 101b - -1: movc p0.c5, r10, #11 @ Dcache clean line of R10 - nop8 - add r0, r0, #CACHE_LINESIZE - add r10, r10, #CACHE_LINESIZE - sub.a r1, r1, #CACHE_LINESIZE - bns 102b - mov pc, lr -#endif -2: mov ip, #0 - movc p0.c5, ip, #10 @ Dcache clean all - nop8 - - mov pc, lr - -/* - * __cpuc_dma_inv_range(start,end) - * __cpuc_dma_flush_range(start,end) - * - 
start - virtual start address of region - * - end - virtual end address of region - */ -__cpuc_dma_inv_range: - /* FALLTHROUGH */ -ENTRY(__cpuc_dma_flush_range) -#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE - andn r0, r0, #CACHE_LINESIZE - 1 - sub r1, r1, r0 - andn r1, r1, #CACHE_LINESIZE - 1 - add r1, r1, #CACHE_LINESIZE - - csub.a r1, #MAX_AREA_SIZE - bsg 2f - - @ r0 va2pa r10 -101: dcacheline_flush r0, r11, r12 - - add r0, r0, #CACHE_LINESIZE - sub.a r1, r1, #CACHE_LINESIZE - bns 101b - mov pc, lr -#endif -2: mov ip, #0 - movc p0.c5, ip, #14 @ Dcache flush all - nop8 - - mov pc, lr - diff --git a/arch/unicore32/mm/extable.c b/arch/unicore32/mm/extable.c deleted file mode 100644 index e53352b41c4a..000000000000 --- a/arch/unicore32/mm/extable.c +++ /dev/null @@ -1,21 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/mm/extable.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include - -int fixup_exception(struct pt_regs *regs) -{ - const struct exception_table_entry *fixup; - - fixup = search_exception_tables(instruction_pointer(regs)); - if (fixup) - regs->UCreg_pc = fixup->fixup; - - return fixup != NULL; -} diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c deleted file mode 100644 index 7654bddde133..000000000000 --- a/arch/unicore32/mm/fault.c +++ /dev/null @@ -1,481 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/mm/fault.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -/* - * Fault status register encodings. We steal bit 31 for our own purposes. 
- */ -#define FSR_LNX_PF (1 << 31) - -static inline int fsr_fs(unsigned int fsr) -{ - /* xyabcde will be abcde+xy */ - return (fsr & 31) + ((fsr & (3 << 5)) >> 5); -} - -/* - * This is useful to dump out the page tables associated with - * 'addr' in mm 'mm'. - */ -void show_pte(struct mm_struct *mm, unsigned long addr) -{ - pgd_t *pgd; - - if (!mm) - mm = &init_mm; - - printk(KERN_ALERT "pgd = %p\n", mm->pgd); - pgd = pgd_offset(mm, addr); - printk(KERN_ALERT "[%08lx] *pgd=%08lx", addr, pgd_val(*pgd)); - - do { - pmd_t *pmd; - pte_t *pte; - - if (pgd_none(*pgd)) - break; - - if (pgd_bad(*pgd)) { - printk("(bad)"); - break; - } - - pmd = pmd_offset((pud_t *) pgd, addr); - if (PTRS_PER_PMD != 1) - printk(", *pmd=%08lx", pmd_val(*pmd)); - - if (pmd_none(*pmd)) - break; - - if (pmd_bad(*pmd)) { - printk("(bad)"); - break; - } - - /* We must not map this if we have highmem enabled */ - if (PageHighMem(pfn_to_page(pmd_val(*pmd) >> PAGE_SHIFT))) - break; - - pte = pte_offset_map(pmd, addr); - printk(", *pte=%08lx", pte_val(*pte)); - pte_unmap(pte); - } while (0); - - printk("\n"); -} - -/* - * Oops. The kernel tried to access some page that wasn't present. - */ -static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr, - unsigned int fsr, struct pt_regs *regs) -{ - /* - * Are we prepared to handle this kernel fault? - */ - if (fixup_exception(regs)) - return; - - /* - * No handler, we'll have to terminate things with extreme prejudice. - */ - bust_spinlocks(1); - printk(KERN_ALERT - "Unable to handle kernel %s at virtual address %08lx\n", - (addr < PAGE_SIZE) ? "NULL pointer dereference" : - "paging request", addr); - - show_pte(mm, addr); - die("Oops", regs, fsr); - bust_spinlocks(0); - do_exit(SIGKILL); -} - -/* - * Something tried to access memory that isn't in our memory map.. 
- * User mode accesses just cause a SIGSEGV - */ -static void __do_user_fault(unsigned long addr, unsigned int fsr, - unsigned int sig, int code, struct pt_regs *regs) -{ - struct task_struct *tsk = current; - - tsk->thread.address = addr; - tsk->thread.error_code = fsr; - tsk->thread.trap_no = 14; - force_sig_fault(sig, code, (void __user *)addr); -} - -void do_bad_area(unsigned long addr, unsigned int fsr, struct pt_regs *regs) -{ - struct task_struct *tsk = current; - struct mm_struct *mm = tsk->active_mm; - - /* - * If we are in kernel mode at this point, we - * have no context to handle this fault with. - */ - if (user_mode(regs)) - __do_user_fault(addr, fsr, SIGSEGV, SEGV_MAPERR, regs); - else - __do_kernel_fault(mm, addr, fsr, regs); -} - -#define VM_FAULT_BADMAP 0x010000 -#define VM_FAULT_BADACCESS 0x020000 - -/* - * Check that the permissions on the VMA allow for the fault which occurred. - * If we encountered a write fault, we must have write permission, otherwise - * we allow any permission. - */ -static inline bool access_error(unsigned int fsr, struct vm_area_struct *vma) -{ - unsigned int mask = VM_ACCESS_FLAGS; - - if (!(fsr ^ 0x12)) /* write? */ - mask = VM_WRITE; - if (fsr & FSR_LNX_PF) - mask = VM_EXEC; - - return vma->vm_flags & mask ? false : true; -} - -static vm_fault_t __do_pf(struct mm_struct *mm, unsigned long addr, - unsigned int fsr, unsigned int flags, struct task_struct *tsk) -{ - struct vm_area_struct *vma; - vm_fault_t fault; - - vma = find_vma(mm, addr); - fault = VM_FAULT_BADMAP; - if (unlikely(!vma)) - goto out; - if (unlikely(vma->vm_start > addr)) - goto check_stack; - - /* - * Ok, we have a good vm_area for this - * memory access, so we can handle it. - */ -good_area: - if (access_error(fsr, vma)) { - fault = VM_FAULT_BADACCESS; - goto out; - } - - /* - * If for any reason at all we couldn't handle the fault, make - * sure we exit gracefully rather than endlessly redo the fault. 
- */ - fault = handle_mm_fault(vma, addr & PAGE_MASK, flags); - return fault; - -check_stack: - if (vma->vm_flags & VM_GROWSDOWN && !expand_stack(vma, addr)) - goto good_area; -out: - return fault; -} - -static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs) -{ - struct task_struct *tsk; - struct mm_struct *mm; - int sig, code; - vm_fault_t fault; - unsigned int flags = FAULT_FLAG_DEFAULT; - - tsk = current; - mm = tsk->mm; - - /* - * If we're in an interrupt or have no user - * context, we must not take the fault.. - */ - if (faulthandler_disabled() || !mm) - goto no_context; - - if (user_mode(regs)) - flags |= FAULT_FLAG_USER; - if (!(fsr ^ 0x12)) - flags |= FAULT_FLAG_WRITE; - - /* - * As per x86, we may deadlock here. However, since the kernel only - * validly references user space from well defined areas of the code, - * we can bug out early if this is from code which shouldn't. - */ - if (!mmap_read_trylock(mm)) { - if (!user_mode(regs) - && !search_exception_tables(regs->UCreg_pc)) - goto no_context; -retry: - mmap_read_lock(mm); - } else { - /* - * The above down_read_trylock() might have succeeded in - * which case, we'll have missed the might_sleep() from - * down_read() - */ - might_sleep(); -#ifdef CONFIG_DEBUG_VM - if (!user_mode(regs) && - !search_exception_tables(regs->UCreg_pc)) - goto no_context; -#endif - } - - fault = __do_pf(mm, addr, fsr, flags, tsk); - - /* If we need to retry but a fatal signal is pending, handle the - * signal first. We do not need to release the mmap_lock because - * it would already be released in __lock_page_or_retry in - * mm/filemap.c. 
*/ - if (fault_signal_pending(fault, regs)) - return 0; - - if (!(fault & VM_FAULT_ERROR) && (flags & FAULT_FLAG_ALLOW_RETRY)) { - if (fault & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; - if (fault & VM_FAULT_RETRY) { - flags |= FAULT_FLAG_TRIED; - goto retry; - } - } - - mmap_read_unlock(mm); - - /* - * Handle the "normal" case first - VM_FAULT_MAJOR - */ - if (likely(!(fault & - (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS)))) - return 0; - - /* - * If we are in kernel mode at this point, we - * have no context to handle this fault with. - */ - if (!user_mode(regs)) - goto no_context; - - if (fault & VM_FAULT_OOM) { - /* - * We ran out of memory, call the OOM killer, and return to - * userspace (which will retry the fault, or kill us if we - * got oom-killed) - */ - pagefault_out_of_memory(); - return 0; - } - - if (fault & VM_FAULT_SIGBUS) { - /* - * We had some memory, but were unable to - * successfully fix up this page fault. - */ - sig = SIGBUS; - code = BUS_ADRERR; - } else { - /* - * Something tried to access memory that - * isn't in our memory map.. - */ - sig = SIGSEGV; - code = fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR; - } - - __do_user_fault(addr, fsr, sig, code, regs); - return 0; - -no_context: - __do_kernel_fault(mm, addr, fsr, regs); - return 0; -} - -/* - * First Level Translation Fault Handler - * - * We enter here because the first level page table doesn't contain - * a valid entry for the address. - * - * If the address is in kernel space (>= TASK_SIZE), then we are - * probably faulting in the vmalloc() area. - * - * If the init_task's first level page tables contains the relevant - * entry, we copy the it to this task. If not, we send the process - * a signal, fixup the exception, or oops the kernel. - * - * NOTE! We MUST NOT take any locks for this case. We may be in an - * interrupt or a critical region, and should only copy the information - * from the master page table, nothing more. 
- */ -static int do_ifault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) -{ - unsigned int index; - pgd_t *pgd, *pgd_k; - pmd_t *pmd, *pmd_k; - - if (addr < TASK_SIZE) - return do_pf(addr, fsr, regs); - - if (user_mode(regs)) - goto bad_area; - - index = pgd_index(addr); - - pgd = cpu_get_pgd() + index; - pgd_k = init_mm.pgd + index; - - if (pgd_none(*pgd_k)) - goto bad_area; - - pmd_k = pmd_offset((pud_t *) pgd_k, addr); - pmd = pmd_offset((pud_t *) pgd, addr); - - if (pmd_none(*pmd_k)) - goto bad_area; - - set_pmd(pmd, *pmd_k); - flush_pmd_entry(pmd); - return 0; - -bad_area: - do_bad_area(addr, fsr, regs); - return 0; -} - -/* - * This abort handler always returns "fault". - */ -static int do_bad(unsigned long addr, unsigned int fsr, struct pt_regs *regs) -{ - return 1; -} - -static int do_good(unsigned long addr, unsigned int fsr, struct pt_regs *regs) -{ - unsigned int res1, res2; - - printk("dabt exception but no error!\n"); - - __asm__ __volatile__( - "mff %0,f0\n" - "mff %1,f1\n" - : "=r"(res1), "=r"(res2) - : - : "memory"); - - printk(KERN_EMERG "r0 :%08x r1 :%08x\n", res1, res2); - panic("shut up\n"); - return 0; -} - -static struct fsr_info { - int (*fn) (unsigned long addr, unsigned int fsr, struct pt_regs *regs); - int sig; - int code; - const char *name; -} fsr_info[] = { - /* - * The following are the standard Unicore-I and UniCore-II aborts. 
- */ - { do_good, SIGBUS, 0, "no error" }, - { do_bad, SIGBUS, BUS_ADRALN, "alignment exception" }, - { do_bad, SIGBUS, BUS_OBJERR, "external exception" }, - { do_bad, SIGBUS, 0, "burst operation" }, - { do_bad, SIGBUS, 0, "unknown 00100" }, - { do_ifault, SIGSEGV, SEGV_MAPERR, "2nd level pt non-exist"}, - { do_bad, SIGBUS, 0, "2nd lvl large pt non-exist" }, - { do_bad, SIGBUS, 0, "invalid pte" }, - { do_pf, SIGSEGV, SEGV_MAPERR, "page miss" }, - { do_bad, SIGBUS, 0, "middle page miss" }, - { do_bad, SIGBUS, 0, "large page miss" }, - { do_pf, SIGSEGV, SEGV_MAPERR, "super page (section) miss" }, - { do_bad, SIGBUS, 0, "unknown 01100" }, - { do_bad, SIGBUS, 0, "unknown 01101" }, - { do_bad, SIGBUS, 0, "unknown 01110" }, - { do_bad, SIGBUS, 0, "unknown 01111" }, - { do_bad, SIGBUS, 0, "addr: up 3G or IO" }, - { do_pf, SIGSEGV, SEGV_ACCERR, "read unreadable addr" }, - { do_pf, SIGSEGV, SEGV_ACCERR, "write unwriteable addr"}, - { do_pf, SIGSEGV, SEGV_ACCERR, "exec unexecutable addr"}, - { do_bad, SIGBUS, 0, "unknown 10100" }, - { do_bad, SIGBUS, 0, "unknown 10101" }, - { do_bad, SIGBUS, 0, "unknown 10110" }, - { do_bad, SIGBUS, 0, "unknown 10111" }, - { do_bad, SIGBUS, 0, "unknown 11000" }, - { do_bad, SIGBUS, 0, "unknown 11001" }, - { do_bad, SIGBUS, 0, "unknown 11010" }, - { do_bad, SIGBUS, 0, "unknown 11011" }, - { do_bad, SIGBUS, 0, "unknown 11100" }, - { do_bad, SIGBUS, 0, "unknown 11101" }, - { do_bad, SIGBUS, 0, "unknown 11110" }, - { do_bad, SIGBUS, 0, "unknown 11111" } -}; - -void __init hook_fault_code(int nr, - int (*fn) (unsigned long, unsigned int, struct pt_regs *), - int sig, int code, const char *name) -{ - if (nr < 0 || nr >= ARRAY_SIZE(fsr_info)) - BUG(); - - fsr_info[nr].fn = fn; - fsr_info[nr].sig = sig; - fsr_info[nr].code = code; - fsr_info[nr].name = name; -} - -/* - * Dispatch a data abort to the relevant handler. 
- */ -asmlinkage void do_DataAbort(unsigned long addr, unsigned int fsr, - struct pt_regs *regs) -{ - const struct fsr_info *inf = fsr_info + fsr_fs(fsr); - - if (!inf->fn(addr, fsr & ~FSR_LNX_PF, regs)) - return; - - printk(KERN_ALERT "Unhandled fault: %s (0x%03x) at 0x%08lx\n", - inf->name, fsr, addr); - - uc32_notify_die("", regs, inf->sig, inf->code, (void __user *)addr, - fsr, 0); -} - -asmlinkage void do_PrefetchAbort(unsigned long addr, - unsigned int ifsr, struct pt_regs *regs) -{ - const struct fsr_info *inf = fsr_info + fsr_fs(ifsr); - - if (!inf->fn(addr, ifsr | FSR_LNX_PF, regs)) - return; - - printk(KERN_ALERT "Unhandled prefetch abort: %s (0x%03x) at 0x%08lx\n", - inf->name, ifsr, addr); - - uc32_notify_die("", regs, inf->sig, inf->code, (void __user *)addr, - ifsr, 0); -} diff --git a/arch/unicore32/mm/flush.c b/arch/unicore32/mm/flush.c deleted file mode 100644 index 65954f8d89a2..000000000000 --- a/arch/unicore32/mm/flush.c +++ /dev/null @@ -1,94 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/mm/flush.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include - -#include -#include - -void flush_cache_mm(struct mm_struct *mm) -{ -} - -void flush_cache_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) -{ - if (vma->vm_flags & VM_EXEC) - __flush_icache_all(); -} - -void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr, - unsigned long pfn) -{ -} - -static void flush_ptrace_access(struct vm_area_struct *vma, struct page *page, - unsigned long uaddr, void *kaddr, unsigned long len) -{ - /* VIPT non-aliasing D-cache */ - if (vma->vm_flags & VM_EXEC) { - unsigned long addr = (unsigned long)kaddr; - - __cpuc_coherent_kern_range(addr, addr + len); - } -} - -/* - * Copy user data from/to a page which is mapped into a different - * processes address space. 
Really, we want to allow our "user - * space" model to handle this. - * - * Note that this code needs to run on the current CPU. - */ -void copy_to_user_page(struct vm_area_struct *vma, struct page *page, - unsigned long uaddr, void *dst, const void *src, - unsigned long len) -{ - memcpy(dst, src, len); - flush_ptrace_access(vma, page, uaddr, dst, len); -} - -void __flush_dcache_page(struct address_space *mapping, struct page *page) -{ - /* - * Writeback any data associated with the kernel mapping of this - * page. This ensures that data in the physical page is mutually - * coherent with the kernels mapping. - */ - __cpuc_flush_kern_dcache_area(page_address(page), PAGE_SIZE); -} - -/* - * Ensure cache coherency between kernel mapping and userspace mapping - * of this page. - */ -void flush_dcache_page(struct page *page) -{ - struct address_space *mapping; - - /* - * The zero page is never written to, so never has any dirty - * cache lines, and therefore never needs to be flushed. - */ - if (page == ZERO_PAGE(0)) - return; - - mapping = page_mapping_file(page); - - if (mapping && !mapping_mapped(mapping)) - clear_bit(PG_dcache_clean, &page->flags); - else { - __flush_dcache_page(mapping, page); - if (mapping) - __flush_icache_all(); - set_bit(PG_dcache_clean, &page->flags); - } -} -EXPORT_SYMBOL(flush_dcache_page); diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c deleted file mode 100644 index 52425d383cea..000000000000 --- a/arch/unicore32/mm/init.c +++ /dev/null @@ -1,261 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/mm/init.c - * - * Copyright (C) 2010 GUAN Xue-tao - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "mm.h" - -/* - * This keeps memory configuration data used by a couple memory - * initialization functions, as well as show_mem() for the skipping 
- * of holes in the memory map. It is populated by uc32_add_memory(). - */ -struct meminfo meminfo; - -static void __init find_limits(unsigned long *min, unsigned long *max_low, - unsigned long *max_high) -{ - struct meminfo *mi = &meminfo; - int i; - - *min = -1UL; - *max_low = *max_high = 0; - - for_each_bank(i, mi) { - struct membank *bank = &mi->bank[i]; - unsigned long start, end; - - start = bank_pfn_start(bank); - end = bank_pfn_end(bank); - - if (*min > start) - *min = start; - if (*max_high < end) - *max_high = end; - if (bank->highmem) - continue; - if (*max_low < end) - *max_low = end; - } -} - -static void __init uc32_bootmem_free(unsigned long max_low) -{ - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - - max_zone_pfn[ZONE_DMA] = max_low; - max_zone_pfn[ZONE_NORMAL] = max_low; - - /* - * Adjust the sizes according to any special requirements for - * this machine type. - * This might lower ZONE_DMA limit. - */ - arch_adjust_zones(max_zone_pfn); - - free_area_init(max_zone_pfn); -} - -int pfn_valid(unsigned long pfn) -{ - return memblock_is_memory(pfn << PAGE_SHIFT); -} -EXPORT_SYMBOL(pfn_valid); - -static void uc32_memory_present(void) -{ -} - -static int __init meminfo_cmp(const void *_a, const void *_b) -{ - const struct membank *a = _a, *b = _b; - long cmp = bank_pfn_start(a) - bank_pfn_start(b); - return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; -} - -void __init uc32_memblock_init(struct meminfo *mi) -{ - int i; - - sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]), - meminfo_cmp, NULL); - - for (i = 0; i < mi->nr_banks; i++) - memblock_add(mi->bank[i].start, mi->bank[i].size); - - /* Register the kernel text, kernel data and initrd with memblock. 
*/ - memblock_reserve(__pa(_text), _end - _text); - -#ifdef CONFIG_BLK_DEV_INITRD - if (!phys_initrd_size) { - phys_initrd_start = 0x01000000; - phys_initrd_size = SZ_8M; - } - - if (phys_initrd_size) { - memblock_reserve(phys_initrd_start, phys_initrd_size); - - /* Now convert initrd to virtual addresses */ - initrd_start = __phys_to_virt(phys_initrd_start); - initrd_end = initrd_start + phys_initrd_size; - } -#endif - - uc32_mm_memblock_reserve(); - - memblock_allow_resize(); - memblock_dump_all(); -} - -void __init bootmem_init(void) -{ - unsigned long min, max_low, max_high; - - max_low = max_high = 0; - - find_limits(&min, &max_low, &max_high); - - node_set_online(0); - - /* - * Sparsemem tries to allocate bootmem in memory_present(), - * so must be done after the fixed reservations - */ - uc32_memory_present(); - - /* - * sparse_init() needs the bootmem allocator up and running. - */ - sparse_init(); - - /* - * Now free the memory - free_area_init needs - * the sparse mem_map arrays initialized by sparse_init() - * for memmap_init_zone(), otherwise all PFNs are invalid. - */ - uc32_bootmem_free(max_low); - - high_memory = __va((max_low << PAGE_SHIFT) - 1) + 1; - - /* - * This doesn't seem to be used by the Linux memory manager any - * more, but is used by ll_rw_block. If we can get rid of it, we - * also get rid of some of the stuff above as well. - * - * Note: max_low_pfn and max_pfn reflect the number of _pages_ in - * the system, not the maximum PFN. - */ - max_low_pfn = max_low - PHYS_PFN_OFFSET; - max_pfn = max_high - PHYS_PFN_OFFSET; -} - -static inline void -free_memmap(unsigned long start_pfn, unsigned long end_pfn) -{ - struct page *start_pg, *end_pg; - unsigned long pg, pgend; - - /* - * Convert start_pfn/end_pfn to a struct page pointer. - */ - start_pg = pfn_to_page(start_pfn - 1) + 1; - end_pg = pfn_to_page(end_pfn); - - /* - * Convert to physical addresses, and - * round start upwards and end downwards. 
- */ - pg = PAGE_ALIGN(__pa(start_pg)); - pgend = __pa(end_pg) & PAGE_MASK; - - /* - * If there are free pages between these, - * free the section of the memmap array. - */ - if (pg < pgend) - memblock_free(pg, pgend - pg); -} - -/* - * The mem_map array can get very big. Free the unused area of the memory map. - */ -static void __init free_unused_memmap(struct meminfo *mi) -{ - unsigned long bank_start, prev_bank_end = 0; - unsigned int i; - - /* - * This relies on each bank being in address order. - * The banks are sorted previously in bootmem_init(). - */ - for_each_bank(i, mi) { - struct membank *bank = &mi->bank[i]; - - bank_start = bank_pfn_start(bank); - - /* - * If we had a previous bank, and there is a space - * between the current bank and the previous, free it. - */ - if (prev_bank_end && prev_bank_end < bank_start) - free_memmap(prev_bank_end, bank_start); - - /* - * Align up here since the VM subsystem insists that the - * memmap entries are valid from the bank end aligned to - * MAX_ORDER_NR_PAGES. - */ - prev_bank_end = ALIGN(bank_pfn_end(bank), MAX_ORDER_NR_PAGES); - } -} - -/* - * mem_init() marks the free areas in the mem_map and tells us how much - * memory is free. This is done after various parts of the system have - * claimed their memory after the kernel image. - */ -void __init mem_init(void) -{ - max_mapnr = pfn_to_page(max_pfn + PHYS_PFN_OFFSET) - mem_map; - - free_unused_memmap(&meminfo); - - /* this will put all unused low memory onto the freelists */ - memblock_free_all(); - - mem_init_print_info(NULL); - - BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR); - BUG_ON(TASK_SIZE > MODULES_VADDR); - - if (PAGE_SIZE >= 16384 && get_num_physpages() <= 128) { - /* - * On a machine this small we won't get - * anywhere without overcommit, so turn - * it on by default. 
- */ - sysctl_overcommit_memory = OVERCOMMIT_ALWAYS; - } -} diff --git a/arch/unicore32/mm/ioremap.c b/arch/unicore32/mm/ioremap.c deleted file mode 100644 index 46a64bd6156a..000000000000 --- a/arch/unicore32/mm/ioremap.c +++ /dev/null @@ -1,242 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/mm/ioremap.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * Re-map IO memory to kernel address space so that we can access it. - * - * This allows a driver to remap an arbitrary region of bus memory into - * virtual space. One should *only* use readl, writel, memcpy_toio and - * so on with such remapped areas. - * - * Because UniCore only has a 32-bit address space we can't address the - * whole of the (physical) PCI space at once. PCI huge-mode addressing - * allows us to circumvent this restriction by splitting PCI space into - * two 2GB chunks and mapping only one at a time into processor memory. - * We use MMU protection domains to trap any attempt to access the bank - * that is not currently mapped. (This isn't fully implemented yet.) - */ -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include "mm.h" - -/* - * Used by ioremap() and iounmap() code to mark (super)section-mapped - * I/O regions in vm_struct->flags field. - */ -#define VM_UNICORE_SECTION_MAPPING 0x80000000 - -int ioremap_page(unsigned long virt, unsigned long phys, - const struct mem_type *mtype) -{ - return ioremap_page_range(virt, virt + PAGE_SIZE, phys, - __pgprot(mtype->prot_pte)); -} -EXPORT_SYMBOL(ioremap_page); - -/* - * Section support is unsafe on SMP - If you iounmap and ioremap a region, - * the other CPUs will not see this change until their next context switch. 
- * Meanwhile, (eg) if an interrupt comes in on one of those other CPUs - * which requires the new ioremap'd region to be referenced, the CPU will - * reference the _old_ region. - * - * Note that get_vm_area_caller() allocates a guard 4K page, so we need to - * mask the size back to 4MB aligned or we will overflow in the loop below. - */ -static void unmap_area_sections(unsigned long virt, unsigned long size) -{ - unsigned long addr = virt, end = virt + (size & ~(SZ_4M - 1)); - pgd_t *pgd; - - flush_cache_vunmap(addr, end); - pgd = pgd_offset_k(addr); - do { - pmd_t pmd, *pmdp = pmd_offset((pud_t *)pgd, addr); - - pmd = *pmdp; - if (!pmd_none(pmd)) { - /* - * Clear the PMD from the page table, and - * increment the kvm sequence so others - * notice this change. - * - * Note: this is still racy on SMP machines. - */ - pmd_clear(pmdp); - - /* - * Free the page table, if there was one. - */ - if ((pmd_val(pmd) & PMD_TYPE_MASK) == PMD_TYPE_TABLE) - pte_free_kernel(&init_mm, pmd_page_vaddr(pmd)); - } - - addr += PGDIR_SIZE; - pgd++; - } while (addr < end); - - flush_tlb_kernel_range(virt, end); -} - -static int -remap_area_sections(unsigned long virt, unsigned long pfn, - size_t size, const struct mem_type *type) -{ - unsigned long addr = virt, end = virt + size; - pgd_t *pgd; - - /* - * Remove and free any PTE-based mapping, and - * sync the current kernel mapping. 
- */ - unmap_area_sections(virt, size); - - pgd = pgd_offset_k(addr); - do { - pmd_t *pmd = pmd_offset((pud_t *)pgd, addr); - - set_pmd(pmd, __pmd(__pfn_to_phys(pfn) | type->prot_sect)); - pfn += SZ_4M >> PAGE_SHIFT; - flush_pmd_entry(pmd); - - addr += PGDIR_SIZE; - pgd++; - } while (addr < end); - - return 0; -} - -void __iomem *__uc32_ioremap_pfn_caller(unsigned long pfn, - unsigned long offset, size_t size, unsigned int mtype, void *caller) -{ - const struct mem_type *type; - int err; - unsigned long addr; - struct vm_struct *area; - - /* - * High mappings must be section aligned - */ - if (pfn >= 0x100000 && (__pfn_to_phys(pfn) & ~SECTION_MASK)) - return NULL; - - /* - * Don't allow RAM to be mapped - */ - if (pfn_valid(pfn)) { - WARN(1, "BUG: Your driver calls ioremap() on\n" - "system memory. This leads to architecturally\n" - "unpredictable behaviour, and ioremap() will fail in\n" - "the next kernel release. Please fix your driver.\n"); - return NULL; - } - - type = get_mem_type(mtype); - if (!type) - return NULL; - - /* - * Page align the mapping size, taking account of any offset. 
- */ - size = PAGE_ALIGN(offset + size); - - area = get_vm_area_caller(size, VM_IOREMAP, caller); - if (!area) - return NULL; - addr = (unsigned long)area->addr; - - if (!((__pfn_to_phys(pfn) | size | addr) & ~PMD_MASK)) { - area->flags |= VM_UNICORE_SECTION_MAPPING; - err = remap_area_sections(addr, pfn, size, type); - } else - err = ioremap_page_range(addr, addr + size, __pfn_to_phys(pfn), - __pgprot(type->prot_pte)); - - if (err) { - vunmap((void *)addr); - return NULL; - } - - flush_cache_vmap(addr, addr + size); - return (void __iomem *) (offset + addr); -} - -void __iomem *__uc32_ioremap_caller(unsigned long phys_addr, size_t size, - unsigned int mtype, void *caller) -{ - unsigned long last_addr; - unsigned long offset = phys_addr & ~PAGE_MASK; - unsigned long pfn = __phys_to_pfn(phys_addr); - - /* - * Don't allow wraparound or zero size - */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) - return NULL; - - return __uc32_ioremap_pfn_caller(pfn, offset, size, mtype, caller); -} - -/* - * Remap an arbitrary physical address space into the kernel virtual - * address space. Needed when the kernel wants to access high addresses - * directly. - * - * NOTE! We need to allow non-page-aligned mappings too: we will obviously - * have to convert them into an offset in a page-aligned mapping, but the - * caller shouldn't need to know that small detail. 
- */ -void __iomem * -__uc32_ioremap_pfn(unsigned long pfn, unsigned long offset, size_t size, - unsigned int mtype) -{ - return __uc32_ioremap_pfn_caller(pfn, offset, size, mtype, - __builtin_return_address(0)); -} -EXPORT_SYMBOL(__uc32_ioremap_pfn); - -void __iomem * -__uc32_ioremap(unsigned long phys_addr, size_t size) -{ - return __uc32_ioremap_caller(phys_addr, size, MT_DEVICE, - __builtin_return_address(0)); -} -EXPORT_SYMBOL(__uc32_ioremap); - -void __uc32_iounmap(volatile void __iomem *io_addr) -{ - void *addr = (void *)(PAGE_MASK & (unsigned long)io_addr); - struct vm_struct *vm; - - /* - * If this is a section based mapping we need to handle it - * specially as the VM subsystem does not know how to handle - * such a beast. We need the lock here b/c we need to clear - * all the mappings before the area can be reclaimed - * by someone else. - */ - vm = find_vm_area(addr); - if (vm && (vm->flags & VM_IOREMAP) && - (vm->flags & VM_UNICORE_SECTION_MAPPING)) - unmap_area_sections((unsigned long)vm->addr, vm->size); - - vunmap(addr); -} -EXPORT_SYMBOL(__uc32_iounmap); diff --git a/arch/unicore32/mm/mm.h b/arch/unicore32/mm/mm.h deleted file mode 100644 index f157f5d249ab..000000000000 --- a/arch/unicore32/mm/mm.h +++ /dev/null @@ -1,31 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/mm/mm.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include - -/* the upper-most page table pointer */ -extern pmd_t *top_pmd; -extern int sysctl_overcommit_memory; - -#define TOP_PTE(x) pte_offset_kernel(top_pmd, x) - -struct mem_type { - unsigned int prot_pte; - unsigned int prot_l1; - unsigned int prot_sect; -}; - -const struct mem_type *get_mem_type(unsigned int type); - -extern void __flush_dcache_page(struct address_space *, struct page *); -extern void hook_fault_code(int nr, int (*fn) - (unsigned long, unsigned int, struct pt_regs *), - int sig, int code, const char *name); - -void 
__init bootmem_init(void); -void uc32_mm_memblock_reserve(void); diff --git a/arch/unicore32/mm/mmu.c b/arch/unicore32/mm/mmu.c deleted file mode 100644 index 183d5b056814..000000000000 --- a/arch/unicore32/mm/mmu.c +++ /dev/null @@ -1,513 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/mm/mmu.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include - -#include "mm.h" - -/* - * empty_zero_page is a special page that is used for - * zero-initialized data and COW. - */ -struct page *empty_zero_page; -EXPORT_SYMBOL(empty_zero_page); - -/* - * The pmd table for the upper-most set of pages. - */ -pmd_t *top_pmd; - -pgprot_t pgprot_user; -EXPORT_SYMBOL(pgprot_user); - -pgprot_t pgprot_kernel; -EXPORT_SYMBOL(pgprot_kernel); - -static int __init noalign_setup(char *__unused) -{ - cr_alignment &= ~CR_A; - cr_no_alignment &= ~CR_A; - set_cr(cr_alignment); - return 1; -} -__setup("noalign", noalign_setup); - -void adjust_cr(unsigned long mask, unsigned long set) -{ - unsigned long flags; - - mask &= ~CR_A; - - set &= mask; - - local_irq_save(flags); - - cr_no_alignment = (cr_no_alignment & ~mask) | set; - cr_alignment = (cr_alignment & ~mask) | set; - - set_cr((get_cr() & ~mask) | set); - - local_irq_restore(flags); -} - -struct map_desc { - unsigned long virtual; - unsigned long pfn; - unsigned long length; - unsigned int type; -}; - -#define PROT_PTE_DEVICE (PTE_PRESENT | PTE_YOUNG | \ - PTE_DIRTY | PTE_READ | PTE_WRITE) -#define PROT_SECT_DEVICE (PMD_TYPE_SECT | PMD_PRESENT | \ - PMD_SECT_READ | PMD_SECT_WRITE) - -static struct mem_type mem_types[] = { - [MT_DEVICE] = { /* Strongly ordered */ - .prot_pte = PROT_PTE_DEVICE, - .prot_l1 = PMD_TYPE_TABLE | PMD_PRESENT, - .prot_sect = PROT_SECT_DEVICE, - }, - /* - * MT_KUSER: pte for 
vecpage -- cacheable, - * and sect for unigfx mmap -- noncacheable - */ - [MT_KUSER] = { - .prot_pte = PTE_PRESENT | PTE_YOUNG | PTE_DIRTY | - PTE_CACHEABLE | PTE_READ | PTE_EXEC, - .prot_l1 = PMD_TYPE_TABLE | PMD_PRESENT, - .prot_sect = PROT_SECT_DEVICE, - }, - [MT_HIGH_VECTORS] = { - .prot_pte = PTE_PRESENT | PTE_YOUNG | PTE_DIRTY | - PTE_CACHEABLE | PTE_READ | PTE_WRITE | - PTE_EXEC, - .prot_l1 = PMD_TYPE_TABLE | PMD_PRESENT, - }, - [MT_MEMORY] = { - .prot_pte = PTE_PRESENT | PTE_YOUNG | PTE_DIRTY | - PTE_WRITE | PTE_EXEC, - .prot_l1 = PMD_TYPE_TABLE | PMD_PRESENT, - .prot_sect = PMD_TYPE_SECT | PMD_PRESENT | PMD_SECT_CACHEABLE | - PMD_SECT_READ | PMD_SECT_WRITE | PMD_SECT_EXEC, - }, - [MT_ROM] = { - .prot_sect = PMD_TYPE_SECT | PMD_PRESENT | PMD_SECT_CACHEABLE | - PMD_SECT_READ, - }, -}; - -const struct mem_type *get_mem_type(unsigned int type) -{ - return type < ARRAY_SIZE(mem_types) ? &mem_types[type] : NULL; -} -EXPORT_SYMBOL(get_mem_type); - -/* - * Adjust the PMD section entries according to the CPU in use. - */ -static void __init build_mem_type_table(void) -{ - pgprot_user = __pgprot(PTE_PRESENT | PTE_YOUNG | PTE_CACHEABLE); - pgprot_kernel = __pgprot(PTE_PRESENT | PTE_YOUNG | - PTE_DIRTY | PTE_READ | PTE_WRITE | - PTE_EXEC | PTE_CACHEABLE); -} - -#define vectors_base() (vectors_high() ? 
0xffff0000 : 0) - -static pte_t * __init early_pte_alloc(pmd_t *pmd, unsigned long addr, - unsigned long prot) -{ - if (pmd_none(*pmd)) { - size_t size = PTRS_PER_PTE * sizeof(pte_t); - pte_t *pte = memblock_alloc(size, size); - - if (!pte) - panic("%s: Failed to allocate %zu bytes align=%zx\n", - __func__, size, size); - - __pmd_populate(pmd, __pa(pte) | prot); - } - BUG_ON(pmd_bad(*pmd)); - return pte_offset_kernel(pmd, addr); -} - -static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr, - unsigned long end, unsigned long pfn, - const struct mem_type *type) -{ - pte_t *pte = early_pte_alloc(pmd, addr, type->prot_l1); - do { - set_pte(pte, pfn_pte(pfn, __pgprot(type->prot_pte))); - pfn++; - } while (pte++, addr += PAGE_SIZE, addr != end); -} - -static void __init alloc_init_section(pgd_t *pgd, unsigned long addr, - unsigned long end, unsigned long phys, - const struct mem_type *type) -{ - pmd_t *pmd = pmd_offset((pud_t *)pgd, addr); - - /* - * Try a section mapping - end, addr and phys must all be aligned - * to a section boundary. - */ - if (((addr | end | phys) & ~SECTION_MASK) == 0) { - pmd_t *p = pmd; - - do { - set_pmd(pmd, __pmd(phys | type->prot_sect)); - phys += SECTION_SIZE; - } while (pmd++, addr += SECTION_SIZE, addr != end); - - flush_pmd_entry(p); - } else { - /* - * No need to loop; pte's aren't interested in the - * individual L1 entries. - */ - alloc_init_pte(pmd, addr, end, __phys_to_pfn(phys), type); - } -} - -/* - * Create the page directory entries and any necessary - * page tables for the mapping specified by `md'. We - * are able to cope here with varying sizes and address - * offsets, and we take full advantage of sections. 
- */ -static void __init create_mapping(struct map_desc *md) -{ - unsigned long phys, addr, length, end; - const struct mem_type *type; - pgd_t *pgd; - - if (md->virtual != vectors_base() && md->virtual < TASK_SIZE) { - printk(KERN_WARNING "BUG: not creating mapping for " - "0x%08llx at 0x%08lx in user region\n", - __pfn_to_phys((u64)md->pfn), md->virtual); - return; - } - - if ((md->type == MT_DEVICE || md->type == MT_ROM) && - md->virtual >= PAGE_OFFSET && md->virtual < VMALLOC_END) { - printk(KERN_WARNING "BUG: mapping for 0x%08llx at 0x%08lx " - "overlaps vmalloc space\n", - __pfn_to_phys((u64)md->pfn), md->virtual); - } - - type = &mem_types[md->type]; - - addr = md->virtual & PAGE_MASK; - phys = (unsigned long)__pfn_to_phys(md->pfn); - length = PAGE_ALIGN(md->length + (md->virtual & ~PAGE_MASK)); - - if (type->prot_l1 == 0 && ((addr | phys | length) & ~SECTION_MASK)) { - printk(KERN_WARNING "BUG: map for 0x%08lx at 0x%08lx can not " - "be mapped using pages, ignoring.\n", - __pfn_to_phys(md->pfn), addr); - return; - } - - pgd = pgd_offset_k(addr); - end = addr + length; - do { - unsigned long next = pgd_addr_end(addr, end); - - alloc_init_section(pgd, addr, next, phys, type); - - phys += next - addr; - addr = next; - } while (pgd++, addr != end); -} - -static void * __initdata vmalloc_min = (void *)(VMALLOC_END - SZ_128M); - -/* - * vmalloc=size forces the vmalloc area to be exactly 'size' - * bytes. This can be used to increase (or decrease) the vmalloc - * area - the default is 128m. 
- */ -static int __init early_vmalloc(char *arg) -{ - unsigned long vmalloc_reserve = memparse(arg, NULL); - - if (vmalloc_reserve < SZ_16M) { - vmalloc_reserve = SZ_16M; - printk(KERN_WARNING - "vmalloc area too small, limiting to %luMB\n", - vmalloc_reserve >> 20); - } - - if (vmalloc_reserve > VMALLOC_END - (PAGE_OFFSET + SZ_32M)) { - vmalloc_reserve = VMALLOC_END - (PAGE_OFFSET + SZ_32M); - printk(KERN_WARNING - "vmalloc area is too big, limiting to %luMB\n", - vmalloc_reserve >> 20); - } - - vmalloc_min = (void *)(VMALLOC_END - vmalloc_reserve); - return 0; -} -early_param("vmalloc", early_vmalloc); - -static phys_addr_t lowmem_limit __initdata = SZ_1G; - -static void __init sanity_check_meminfo(void) -{ - int i, j; - - lowmem_limit = __pa(vmalloc_min - 1) + 1; - memblock_set_current_limit(lowmem_limit); - - for (i = 0, j = 0; i < meminfo.nr_banks; i++) { - struct membank *bank = &meminfo.bank[j]; - *bank = meminfo.bank[i]; - j++; - } - meminfo.nr_banks = j; -} - -static inline void prepare_page_table(void) -{ - unsigned long addr; - phys_addr_t end; - - /* - * Clear out all the mappings below the kernel image. - */ - for (addr = 0; addr < MODULES_VADDR; addr += PGDIR_SIZE) - pmd_clear(pmd_off_k(addr)); - - for ( ; addr < PAGE_OFFSET; addr += PGDIR_SIZE) - pmd_clear(pmd_off_k(addr)); - - /* - * Find the end of the first block of lowmem. - */ - end = memblock.memory.regions[0].base + memblock.memory.regions[0].size; - if (end >= lowmem_limit) - end = lowmem_limit; - - /* - * Clear out all the kernel space mappings, except for the first - * memory bank, up to the end of the vmalloc region. - */ - for (addr = __phys_to_virt(end); - addr < VMALLOC_END; addr += PGDIR_SIZE) - pmd_clear(pmd_off_k(addr)); -} - -/* - * Reserve the special regions of memory - */ -void __init uc32_mm_memblock_reserve(void) -{ - /* - * Reserve the page tables. These are already in use, - * and can only be in node 0. 
- */ - memblock_reserve(__pa(swapper_pg_dir), PTRS_PER_PGD * sizeof(pgd_t)); -} - -/* - * Set up device the mappings. Since we clear out the page tables for all - * mappings above VMALLOC_END, we will remove any debug device mappings. - * This means you have to be careful how you debug this function, or any - * called function. This means you can't use any function or debugging - * method which may touch any device, otherwise the kernel _will_ crash. - */ -static void __init devicemaps_init(void) -{ - struct map_desc map; - unsigned long addr; - void *vectors; - - /* - * Allocate the vector page early. - */ - vectors = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!vectors) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); - - for (addr = VMALLOC_END; addr; addr += PGDIR_SIZE) - pmd_clear(pmd_off_k(addr)); - - /* - * Create a mapping for the machine vectors at the high-vectors - * location (0xffff0000). If we aren't using high-vectors, also - * create a mapping at the low-vectors virtual address. - */ - map.pfn = __phys_to_pfn(virt_to_phys(vectors)); - map.virtual = VECTORS_BASE; - map.length = PAGE_SIZE; - map.type = MT_HIGH_VECTORS; - create_mapping(&map); - - /* - * Create a mapping for the kuser page at the special - * location (0xbfff0000) to the same vectors location. - */ - map.pfn = __phys_to_pfn(virt_to_phys(vectors)); - map.virtual = KUSER_VECPAGE_BASE; - map.length = PAGE_SIZE; - map.type = MT_KUSER; - create_mapping(&map); - - /* - * Finally flush the caches and tlb to ensure that we're in a - * consistent state wrt the writebuffer. This also ensures that - * any write-allocated cache lines in the vector page are written - * back. After this point, we can start to touch devices again. - */ - local_flush_tlb_all(); - flush_cache_all(); -} - -static void __init map_lowmem(void) -{ - struct memblock_region *reg; - - /* Map all the lowmem memory banks. 
*/ - for_each_memblock(memory, reg) { - phys_addr_t start = reg->base; - phys_addr_t end = start + reg->size; - struct map_desc map; - - if (end > lowmem_limit) - end = lowmem_limit; - if (start >= end) - break; - - map.pfn = __phys_to_pfn(start); - map.virtual = __phys_to_virt(start); - map.length = end - start; - map.type = MT_MEMORY; - - create_mapping(&map); - } -} - -/* - * paging_init() sets up the page tables, initialises the zone memory - * maps, and sets up the zero page, bad page and bad page tables. - */ -void __init paging_init(void) -{ - void *zero_page; - - build_mem_type_table(); - sanity_check_meminfo(); - prepare_page_table(); - map_lowmem(); - devicemaps_init(); - - top_pmd = pmd_off_k(0xffff0000); - - /* allocate the zero page. */ - zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!zero_page) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); - - bootmem_init(); - - empty_zero_page = virt_to_page(zero_page); - __flush_dcache_page(NULL, empty_zero_page); -} - -/* - * In order to soft-boot, we need to insert a 1:1 mapping in place of - * the user-mode pages. This will then ensure that we have predictable - * results when turning the mmu off - */ -void setup_mm_for_reboot(void) -{ - unsigned long base_pmdval; - pgd_t *pgd; - int i; - - /* - * We need to access to user-mode page tables here. For kernel threads - * we don't have any user-mode mappings so we use the context that we - * "borrowed". - */ - pgd = current->active_mm->pgd; - - base_pmdval = PMD_SECT_WRITE | PMD_SECT_READ | PMD_TYPE_SECT; - - for (i = 0; i < FIRST_USER_PGD_NR + USER_PTRS_PER_PGD; i++, pgd++) { - unsigned long pmdval = (i << PGDIR_SHIFT) | base_pmdval; - pmd_t *pmd; - - pmd = pmd_off(pgd, i << PGDIR_SHIFT); - set_pmd(pmd, __pmd(pmdval)); - flush_pmd_entry(pmd); - } - - local_flush_tlb_all(); -} - -/* - * Take care of architecture specific things when placing a new PTE into - * a page table, or changing an existing PTE. 
Basically, there are two - * things that we need to take care of: - * - * 1. If PG_dcache_clean is not set for the page, we need to ensure - * that any cache entries for the kernels virtual memory - * range are written back to the page. - * 2. If we have multiple shared mappings of the same space in - * an object, we need to deal with the cache aliasing issues. - * - * Note that the pte lock will be held. - */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep) -{ - unsigned long pfn = pte_pfn(*ptep); - struct address_space *mapping; - struct page *page; - - if (!pfn_valid(pfn)) - return; - - /* - * The zero page is never written to, so never has any dirty - * cache lines, and therefore never needs to be flushed. - */ - page = pfn_to_page(pfn); - if (page == ZERO_PAGE(0)) - return; - - mapping = page_mapping_file(page); - if (!test_and_set_bit(PG_dcache_clean, &page->flags)) - __flush_dcache_page(mapping, page); - if (mapping) - if (vma->vm_flags & VM_EXEC) - __flush_icache_all(); -} diff --git a/arch/unicore32/mm/pgd.c b/arch/unicore32/mm/pgd.c deleted file mode 100644 index f01c73e04836..000000000000 --- a/arch/unicore32/mm/pgd.c +++ /dev/null @@ -1,102 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/mm/pgd.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include - -#include -#include -#include - -#include "mm.h" - -#define FIRST_KERNEL_PGD_NR (FIRST_USER_PGD_NR + USER_PTRS_PER_PGD) - -/* - * need to get a 4k page for level 1 - */ -pgd_t *get_pgd_slow(struct mm_struct *mm) -{ - pgd_t *new_pgd, *init_pgd; - pmd_t *new_pmd, *init_pmd; - pte_t *new_pte, *init_pte; - - new_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, 0); - if (!new_pgd) - goto no_pgd; - - memset(new_pgd, 0, FIRST_KERNEL_PGD_NR * sizeof(pgd_t)); - - /* - * Copy over the kernel and IO PGD entries - */ - init_pgd = pgd_offset_k(0); - memcpy(new_pgd + 
FIRST_KERNEL_PGD_NR, init_pgd + FIRST_KERNEL_PGD_NR, - (PTRS_PER_PGD - FIRST_KERNEL_PGD_NR) * sizeof(pgd_t)); - - clean_dcache_area(new_pgd, PTRS_PER_PGD * sizeof(pgd_t)); - - if (!vectors_high()) { - /* - * On UniCore, first page must always be allocated since it - * contains the machine vectors. - */ - new_pmd = pmd_alloc(mm, (pud_t *)new_pgd, 0); - if (!new_pmd) - goto no_pmd; - - new_pte = pte_alloc_map(mm, new_pmd, 0); - if (!new_pte) - goto no_pte; - - init_pmd = pmd_offset((pud_t *)init_pgd, 0); - init_pte = pte_offset_map(init_pmd, 0); - set_pte(new_pte, *init_pte); - pte_unmap(init_pte); - pte_unmap(new_pte); - } - - return new_pgd; - -no_pte: - pmd_free(mm, new_pmd); - mm_dec_nr_pmds(mm); -no_pmd: - free_pages((unsigned long)new_pgd, 0); -no_pgd: - return NULL; -} - -void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd) -{ - pmd_t *pmd; - pgtable_t pte; - - if (!pgd) - return; - - /* pgd is always present and good */ - pmd = pmd_off(pgd, 0); - if (pmd_none(*pmd)) - goto free; - if (pmd_bad(*pmd)) { - pmd_ERROR(*pmd); - pmd_clear(pmd); - goto free; - } - - pte = pmd_pgtable(*pmd); - pmd_clear(pmd); - pte_free(mm, pte); - mm_dec_nr_ptes(mm); - pmd_free(mm, pmd); - mm_dec_nr_pmds(mm); -free: - free_pages((unsigned long) pgd, 0); -} diff --git a/arch/unicore32/mm/proc-macros.S b/arch/unicore32/mm/proc-macros.S deleted file mode 100644 index 3b0ae7d5bd80..000000000000 --- a/arch/unicore32/mm/proc-macros.S +++ /dev/null @@ -1,142 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/mm/proc-macros.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * We need constants.h for: - * VMA_VM_MM - * VMA_VM_FLAGS - * VM_EXEC - */ -#include -#include -#include - -/* - * the cache line sizes of the I and D cache are the same - */ -#define CACHE_LINESIZE 32 - -/* - * This is the maximum size of an area which will be invalidated - * using the single invalidate entry instructions. 
Anything larger - * than this, and we go for the whole cache. - * - * This value should be chosen such that we choose the cheapest - * alternative. - */ -#ifdef CONFIG_CPU_UCV2 -#define MAX_AREA_SIZE 0x800 /* 64 cache line */ -#endif - -/* - * vma_vm_mm - get mm pointer from vma pointer (vma->vm_mm) - */ - .macro vma_vm_mm, rd, rn - ldw \rd, [\rn+], #VMA_VM_MM - .endm - -/* - * vma_vm_flags - get vma->vm_flags - */ - .macro vma_vm_flags, rd, rn - ldw \rd, [\rn+], #VMA_VM_FLAGS - .endm - - .macro tsk_mm, rd, rn - ldw \rd, [\rn+], #TI_TASK - ldw \rd, [\rd+], #TSK_ACTIVE_MM - .endm - -/* - * act_mm - get current->active_mm - */ - .macro act_mm, rd - andn \rd, sp, #8128 - andn \rd, \rd, #63 - ldw \rd, [\rd+], #TI_TASK - ldw \rd, [\rd+], #TSK_ACTIVE_MM - .endm - -/* - * mmid - get context id from mm pointer (mm->context.id) - */ - .macro mmid, rd, rn - ldw \rd, [\rn+], #MM_CONTEXT_ID - .endm - -/* - * mask_asid - mask the ASID from the context ID - */ - .macro asid, rd, rn - and \rd, \rn, #255 - .endm - - .macro crval, clear, mmuset, ucset - .word \clear - .word \mmuset - .endm - -#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE -/* - * va2pa va, pa, tbl, msk, off, err - * This macro is used to translate virtual address to its physical address. 
- * - * va: virtual address - * pa: physical address, result is stored in this register - * tbl, msk, off: temp registers, will be destroyed - * err: jump to error label if the physical address not exist - * NOTE: all regs must be different - */ - .macro va2pa, va, pa, tbl, msk, off, err=990f - movc \pa, p0.c2, #0 - mov \off, \va >> #22 @ off <- index of 1st page table - adr \tbl, 910f @ tbl <- table of 1st page table -900: @ ---- handle 1, 2 page table - add \pa, \pa, #PAGE_OFFSET @ pa <- virt addr of page table - ldw \pa, [\pa+], \off << #2 @ pa <- the content of pt - cand.a \pa, #4 @ test exist bit - beq \err @ if not exist - and \off, \pa, #3 @ off <- the last 2 bits - add \tbl, \tbl, \off << #3 @ cmove table pointer - ldw \msk, [\tbl+], #0 @ get the mask - ldw pc, [\tbl+], #4 -930: @ ---- handle 2nd page table - and \pa, \pa, \msk @ pa <- phys addr of 2nd pt - mov \off, \va << #10 - cntlo \tbl, \msk @ use tbl as temp reg - mov \off, \off >> \tbl - mov \off, \off >> #2 @ off <- index of 2nd pt - adr \tbl, 920f @ tbl <- table of 2nd pt - b 900b -910: @ 1st level page table - .word 0xfffff000, 930b @ second level page table - .word 0xfffffc00, 930b @ second level large page table - .word 0x00000000, \err @ invalid - .word 0xffc00000, 980f @ super page - -920: @ 2nd level page table - .word 0xfffff000, 980f @ page - .word 0xffffc000, 980f @ middle page - .word 0xffff0000, 980f @ large page - .word 0x00000000, \err @ invalid -980: - andn \tbl, \va, \msk - and \pa, \pa, \msk - or \pa, \pa, \tbl -990: - .endm -#endif - - .macro dcacheline_flush, addr, t1, t2 - mov \t1, \addr << #20 - ldw \t2, =_stext @ _stext must ALIGN(4096) - add \t2, \t2, \t1 >> #20 - ldw \t1, [\t2+], #0x0000 - ldw \t1, [\t2+], #0x1000 - ldw \t1, [\t2+], #0x2000 - ldw \t1, [\t2+], #0x3000 - .endm diff --git a/arch/unicore32/mm/proc-syms.c b/arch/unicore32/mm/proc-syms.c deleted file mode 100644 index 6c081616fc3c..000000000000 --- a/arch/unicore32/mm/proc-syms.c +++ /dev/null @@ -1,19 +0,0 @@ -// 
SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/mm/proc-syms.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include - -#include -#include -#include - -EXPORT_SYMBOL(cpu_dcache_clean_area); -EXPORT_SYMBOL(cpu_set_pte); - -EXPORT_SYMBOL(__cpuc_coherent_kern_range); diff --git a/arch/unicore32/mm/proc-ucv2.S b/arch/unicore32/mm/proc-ucv2.S deleted file mode 100644 index 18f8c4fb21a0..000000000000 --- a/arch/unicore32/mm/proc-ucv2.S +++ /dev/null @@ -1,131 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/mm/proc-ucv2.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include -#include -#include -#include - -#include "proc-macros.S" - -ENTRY(cpu_proc_fin) - stm.w (lr), [sp-] - mov ip, #PSR_R_BIT | PSR_I_BIT | PRIV_MODE - mov.a asr, ip - b.l __cpuc_flush_kern_all - ldm.w (pc), [sp]+ - -/* - * cpu_reset(loc) - * - * Perform a soft reset of the system. Put the CPU into the - * same state as it would be if it had been reset, and branch - * to what would be the reset vector. - * - * - loc - location to jump to for soft reset - */ - .align 5 -ENTRY(cpu_reset) - mov ip, #0 - movc p0.c5, ip, #28 @ Cache invalidate all - nop8 - - movc p0.c6, ip, #6 @ TLB invalidate all - nop8 - - movc ip, p0.c1, #0 @ ctrl register - or ip, ip, #0x2000 @ vector base address - andn ip, ip, #0x000f @ ............idam - movc p0.c1, ip, #0 @ disable caches and mmu - nop - mov pc, r0 @ jump to loc - nop8 - -/* - * cpu_do_idle() - * - * Idle the processor (eg, wait for interrupt). - * - * IRQs are already disabled. 
- */ -ENTRY(cpu_do_idle) - mov r0, #0 @ PCI address - .rept 8 - ldw r1, [r0] - .endr - mov pc, lr - -ENTRY(cpu_dcache_clean_area) -#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE - csub.a r1, #MAX_AREA_SIZE - bsg 101f - mov r9, #PAGE_SZ - sub r9, r9, #1 @ PAGE_MASK -1: va2pa r0, r10, r11, r12, r13 @ r10 is PA - b 3f -2: cand.a r0, r9 - beq 1b -3: movc p0.c5, r10, #11 @ clean D entry - nop8 - add r0, r0, #CACHE_LINESIZE - add r10, r10, #CACHE_LINESIZE - sub.a r1, r1, #CACHE_LINESIZE - bua 2b - mov pc, lr -#endif -101: mov ip, #0 - movc p0.c5, ip, #10 @ Dcache clean all - nop8 - - mov pc, lr - -/* - * cpu_do_switch_mm(pgd_phys) - * - * Set the translation table base pointer to be pgd_phys - * - * - pgd_phys - physical address of new pgd - * - * It is assumed that: - * - we are not using split page tables - */ - .align 5 -ENTRY(cpu_do_switch_mm) - movc p0.c2, r0, #0 @ update page table ptr - nop8 - - movc p0.c6, ip, #6 @ TLB invalidate all - nop8 - - mov pc, lr - -/* - * cpu_set_pte(ptep, pte) - * - * Set a level 2 translation table entry. 
- * - * - ptep - pointer to level 2 translation table entry - * - pte - PTE value to store - */ - .align 5 -ENTRY(cpu_set_pte) - stw r1, [r0] -#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE - sub r2, r0, #PAGE_OFFSET - movc p0.c5, r2, #11 @ Dcache clean line - nop8 -#else - mov ip, #0 - movc p0.c5, ip, #10 @ Dcache clean all - nop8 - @dcacheline_flush r0, r2, ip -#endif - mov pc, lr - diff --git a/arch/unicore32/mm/tlb-ucv2.S b/arch/unicore32/mm/tlb-ucv2.S deleted file mode 100644 index 0ce9c6b6f1db..000000000000 --- a/arch/unicore32/mm/tlb-ucv2.S +++ /dev/null @@ -1,86 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/mm/tlb-ucv2.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include -#include -#include -#include "proc-macros.S" - -/* - * __cpu_flush_user_tlb_range(start, end, vma) - * - * Invalidate a range of TLB entries in the specified address space. - * - * - start - start address (may not be aligned) - * - end - end address (exclusive, may not be aligned) - * - vma - vma_struct describing address range - */ -ENTRY(__cpu_flush_user_tlb_range) -#ifndef CONFIG_CPU_TLB_SINGLE_ENTRY_DISABLE - mov r0, r0 >> #PAGE_SHIFT @ align address - mov r0, r0 << #PAGE_SHIFT - vma_vm_flags r2, r2 @ get vma->vm_flags -1: - movc p0.c6, r0, #3 - nop8 - - cand.a r2, #VM_EXEC @ Executable area ? - beq 2f - - movc p0.c6, r0, #5 - nop8 -2: - add r0, r0, #PAGE_SZ - csub.a r0, r1 - beb 1b -#else - movc p0.c6, r0, #2 - nop8 - - cand.a r2, #VM_EXEC @ Executable area ? 
- beq 2f - - movc p0.c6, r0, #4 - nop8 -2: -#endif - mov pc, lr - -/* - * __cpu_flush_kern_tlb_range(start,end) - * - * Invalidate a range of kernel TLB entries - * - * - start - start address (may not be aligned) - * - end - end address (exclusive, may not be aligned) - */ -ENTRY(__cpu_flush_kern_tlb_range) -#ifndef CONFIG_CPU_TLB_SINGLE_ENTRY_DISABLE - mov r0, r0 >> #PAGE_SHIFT @ align address - mov r0, r0 << #PAGE_SHIFT -1: - movc p0.c6, r0, #3 - nop8 - - movc p0.c6, r0, #5 - nop8 - - add r0, r0, #PAGE_SZ - csub.a r0, r1 - beb 1b -#else - movc p0.c6, r0, #2 - nop8 - - movc p0.c6, r0, #4 - nop8 -#endif - mov pc, lr - diff --git a/kernel/reboot.c b/kernel/reboot.c index 491f1347bf43..e7b78d5ae1ab 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -26,7 +26,7 @@ int C_A_D = 1; struct pid *cad_pid; EXPORT_SYMBOL(cad_pid); -#if defined(CONFIG_ARM) || defined(CONFIG_UNICORE32) +#if defined(CONFIG_ARM) #define DEFAULT_REBOOT_MODE = REBOOT_HARD #else #define DEFAULT_REBOOT_MODE From 5853d602dc581b891cc591ec5f67024bcaacdfcf Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 10 Jun 2020 09:57:50 +0300 Subject: [PATCH 230/502] cpufreq: remove unicore32 driver The unicore32 port is removed from the kernel. There is no point to keep stale cpufreq driver for this architecture. 
Signed-off-by: Mike Rapoport Acked-by: Arnd Bergmann Acked-by: Guenter Roeck --- drivers/cpufreq/Makefile | 1 - drivers/cpufreq/unicore2-cpufreq.c | 76 ------------------------------ 2 files changed, 77 deletions(-) delete mode 100644 drivers/cpufreq/unicore2-cpufreq.c diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile index f6670c4abbb0..089938ead681 100644 --- a/drivers/cpufreq/Makefile +++ b/drivers/cpufreq/Makefile @@ -108,4 +108,3 @@ obj-$(CONFIG_LOONGSON1_CPUFREQ) += loongson1-cpufreq.o obj-$(CONFIG_SH_CPU_FREQ) += sh-cpufreq.o obj-$(CONFIG_SPARC_US2E_CPUFREQ) += sparc-us2e-cpufreq.o obj-$(CONFIG_SPARC_US3_CPUFREQ) += sparc-us3-cpufreq.o -obj-$(CONFIG_UNICORE32) += unicore2-cpufreq.o diff --git a/drivers/cpufreq/unicore2-cpufreq.c b/drivers/cpufreq/unicore2-cpufreq.c deleted file mode 100644 index 98d392196df2..000000000000 --- a/drivers/cpufreq/unicore2-cpufreq.c +++ /dev/null @@ -1,76 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * clock scaling for the UniCore-II - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Maintained by GUAN Xue-tao - * Copyright (C) 2001-2010 Guan Xuetao - */ - -#include -#include -#include -#include -#include -#include - -#include - -static struct cpufreq_driver ucv2_driver; - -/* make sure that only the "userspace" governor is run - * -- anything else wouldn't make sense on this platform, anyway. 
- */ -static int ucv2_verify_speed(struct cpufreq_policy_data *policy) -{ - if (policy->cpu) - return -EINVAL; - - cpufreq_verify_within_cpu_limits(policy); - return 0; -} - -static int ucv2_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - struct cpufreq_freqs freqs; - int ret; - - freqs.old = policy->cur; - freqs.new = target_freq; - - cpufreq_freq_transition_begin(policy, &freqs); - ret = clk_set_rate(policy->clk, target_freq * 1000); - cpufreq_freq_transition_end(policy, &freqs, ret); - - return ret; -} - -static int __init ucv2_cpu_init(struct cpufreq_policy *policy) -{ - if (policy->cpu != 0) - return -EINVAL; - - policy->min = policy->cpuinfo.min_freq = 250000; - policy->max = policy->cpuinfo.max_freq = 1000000; - policy->clk = clk_get(NULL, "MAIN_CLK"); - return PTR_ERR_OR_ZERO(policy->clk); -} - -static struct cpufreq_driver ucv2_driver = { - .flags = CPUFREQ_STICKY | CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING, - .verify = ucv2_verify_speed, - .target = ucv2_target, - .get = cpufreq_generic_get, - .init = ucv2_cpu_init, - .name = "UniCore-II", -}; - -static int __init ucv2_cpufreq_init(void) -{ - return cpufreq_register_driver(&ucv2_driver); -} - -arch_initcall(ucv2_cpufreq_init); From c59e68250c4b317c99f1d1a1e8f990fd8e608afd Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 10 Jun 2020 09:57:50 +0300 Subject: [PATCH 231/502] i2c/buses: remove i2c-puv3 driver The unicore32 port is removed from the kernel. There is no point to keep stale i2c bus driver for this architecture. 
Signed-off-by: Mike Rapoport Acked-by: Arnd Bergmann Acked-by: Guenter Roeck --- MAINTAINERS | 1 - drivers/i2c/busses/Kconfig | 11 -- drivers/i2c/busses/Makefile | 1 - drivers/i2c/busses/i2c-puv3.c | 275 ---------------------------------- 4 files changed, 288 deletions(-) delete mode 100644 drivers/i2c/busses/i2c-puv3.c diff --git a/MAINTAINERS b/MAINTAINERS index 1de95aa44bbb..ec65e063e258 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13585,7 +13585,6 @@ M: Guan Xuetao S: Maintained W: http://mprc.pku.edu.cn/~guanxuetao/linux T: git git://github.com/gxt/linux.git -F: drivers/i2c/busses/i2c-puv3.c F: drivers/input/serio/i8042-unicore32io.h F: drivers/rtc/rtc-puv3.c F: drivers/video/fbdev/fb-puv3.c diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 735bf31a3fdf..88639e52c73a 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -866,17 +866,6 @@ config I2C_PNX This driver can also be built as a module. If so, the module will be called i2c-pnx. -config I2C_PUV3 - tristate "PKUnity v3 I2C bus support" - depends on UNICORE32 && ARCH_PUV3 - select I2C_ALGOBIT - help - This driver supports the I2C IP inside the PKUnity-v3 SoC. - This I2C bus controller is under AMBA/AXI bus. - - This driver can also be built as a module. If so, the module - will be called i2c-puv3. 
- config I2C_PXA tristate "Intel PXA2XX I2C adapter" depends on ARCH_PXA || ARCH_MMP || ARCH_MVEBU || (X86_32 && PCI && OF) || COMPILE_TEST diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile index 306d5dc3f417..19aff0e45cb5 100644 --- a/drivers/i2c/busses/Makefile +++ b/drivers/i2c/busses/Makefile @@ -88,7 +88,6 @@ obj-$(CONFIG_I2C_PASEMI) += i2c-pasemi.o obj-$(CONFIG_I2C_PCA_PLATFORM) += i2c-pca-platform.o obj-$(CONFIG_I2C_PMCMSP) += i2c-pmcmsp.o obj-$(CONFIG_I2C_PNX) += i2c-pnx.o -obj-$(CONFIG_I2C_PUV3) += i2c-puv3.o obj-$(CONFIG_I2C_PXA) += i2c-pxa.o obj-$(CONFIG_I2C_PXA_PCI) += i2c-pxa-pci.o obj-$(CONFIG_I2C_QCOM_CCI) += i2c-qcom-cci.o diff --git a/drivers/i2c/busses/i2c-puv3.c b/drivers/i2c/busses/i2c-puv3.c deleted file mode 100644 index 5cec5a36807d..000000000000 --- a/drivers/i2c/busses/i2c-puv3.c +++ /dev/null @@ -1,275 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * I2C driver for PKUnity-v3 SoC - * Code specific to PKUnity SoC and UniCore ISA - * - * Maintained by GUAN Xue-tao - * Copyright (C) 2001-2010 Guan Xuetao - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Poll the i2c status register until the specified bit is set. - * Returns 0 if timed out (100 msec). 
- */ -static short poll_status(unsigned long bit) -{ - int loop_cntr = 1000; - - if (bit & I2C_STATUS_TFNF) { - do { - udelay(10); - } while (!(readl(I2C_STATUS) & bit) && (--loop_cntr > 0)); - } else { - /* RXRDY handler */ - do { - if (readl(I2C_TAR) == I2C_TAR_EEPROM) - msleep(20); - else - udelay(10); - } while (!(readl(I2C_RXFLR) & 0xf) && (--loop_cntr > 0)); - } - - return (loop_cntr > 0); -} - -static int xfer_read(struct i2c_adapter *adap, unsigned char *buf, int length) -{ - int i2c_reg = *buf; - - /* Read data */ - while (length--) { - if (!poll_status(I2C_STATUS_TFNF)) { - dev_dbg(&adap->dev, "Tx FIFO Not Full timeout\n"); - return -ETIMEDOUT; - } - - /* send addr */ - writel(i2c_reg | I2C_DATACMD_WRITE, I2C_DATACMD); - - /* get ready to next write */ - i2c_reg++; - - /* send read CMD */ - writel(I2C_DATACMD_READ, I2C_DATACMD); - - /* wait until the Rx FIFO have available */ - if (!poll_status(I2C_STATUS_RFNE)) { - dev_dbg(&adap->dev, "RXRDY timeout\n"); - return -ETIMEDOUT; - } - - /* read the data to buf */ - *buf = (readl(I2C_DATACMD) & I2C_DATACMD_DAT_MASK); - buf++; - } - - return 0; -} - -static int xfer_write(struct i2c_adapter *adap, unsigned char *buf, int length) -{ - int i2c_reg = *buf; - - /* Do nothing but storing the reg_num to a static variable */ - if (i2c_reg == -1) { - printk(KERN_WARNING "Error i2c reg\n"); - return -ETIMEDOUT; - } - - if (length == 1) - return 0; - - buf++; - length--; - while (length--) { - /* send addr */ - writel(i2c_reg | I2C_DATACMD_WRITE, I2C_DATACMD); - - /* send write CMD */ - writel(*buf | I2C_DATACMD_WRITE, I2C_DATACMD); - - /* wait until the Rx FIFO have available */ - msleep(20); - - /* read the data to buf */ - i2c_reg++; - buf++; - } - - return 0; -} - -/* - * Generic i2c master transfer entrypoint. 
- * - */ -static int puv3_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg *pmsg, - int num) -{ - int i, ret; - unsigned char swap; - - /* Disable i2c */ - writel(I2C_ENABLE_DISABLE, I2C_ENABLE); - - /* Set the work mode and speed*/ - writel(I2C_CON_MASTER | I2C_CON_SPEED_STD | I2C_CON_SLAVEDISABLE, I2C_CON); - - writel(pmsg->addr, I2C_TAR); - - /* Enable i2c */ - writel(I2C_ENABLE_ENABLE, I2C_ENABLE); - - dev_dbg(&adap->dev, "puv3_i2c_xfer: processing %d messages:\n", num); - - for (i = 0; i < num; i++) { - dev_dbg(&adap->dev, " #%d: %sing %d byte%s %s 0x%02x\n", i, - pmsg->flags & I2C_M_RD ? "read" : "writ", - pmsg->len, pmsg->len > 1 ? "s" : "", - pmsg->flags & I2C_M_RD ? "from" : "to", pmsg->addr); - - if (pmsg->len && pmsg->buf) { /* sanity check */ - if (pmsg->flags & I2C_M_RD) - ret = xfer_read(adap, pmsg->buf, pmsg->len); - else - ret = xfer_write(adap, pmsg->buf, pmsg->len); - - if (ret) - return ret; - - } - dev_dbg(&adap->dev, "transfer complete\n"); - pmsg++; /* next message */ - } - - /* XXX: fixup be16_to_cpu in bq27x00_battery.c */ - if (pmsg->addr == I2C_TAR_PWIC) { - swap = pmsg->buf[0]; - pmsg->buf[0] = pmsg->buf[1]; - pmsg->buf[1] = swap; - } - - return i; -} - -/* - * Return list of supported functionality. - */ -static u32 puv3_i2c_func(struct i2c_adapter *adapter) -{ - return I2C_FUNC_I2C | I2C_FUNC_SMBUS_EMUL; -} - -static const struct i2c_algorithm puv3_i2c_algorithm = { - .master_xfer = puv3_i2c_xfer, - .functionality = puv3_i2c_func, -}; - -/* - * Main initialization routine. 
- */ -static int puv3_i2c_probe(struct platform_device *pdev) -{ - struct i2c_adapter *adapter; - struct resource *mem; - int rc; - - mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!mem) - return -ENODEV; - - if (!request_mem_region(mem->start, resource_size(mem), "puv3_i2c")) - return -EBUSY; - - adapter = kzalloc(sizeof(struct i2c_adapter), GFP_KERNEL); - if (adapter == NULL) { - dev_err(&pdev->dev, "can't allocate interface!\n"); - rc = -ENOMEM; - goto fail_nomem; - } - snprintf(adapter->name, sizeof(adapter->name), "PUV3-I2C at 0x%08x", - mem->start); - adapter->algo = &puv3_i2c_algorithm; - adapter->class = I2C_CLASS_HWMON; - adapter->dev.parent = &pdev->dev; - - platform_set_drvdata(pdev, adapter); - - adapter->nr = pdev->id; - rc = i2c_add_numbered_adapter(adapter); - if (rc) - goto fail_add_adapter; - - dev_info(&pdev->dev, "PKUnity v3 i2c bus adapter.\n"); - return 0; - -fail_add_adapter: - kfree(adapter); -fail_nomem: - release_mem_region(mem->start, resource_size(mem)); - - return rc; -} - -static int puv3_i2c_remove(struct platform_device *pdev) -{ - struct i2c_adapter *adapter = platform_get_drvdata(pdev); - struct resource *mem; - - i2c_del_adapter(adapter); - - put_device(&pdev->dev); - - mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); - release_mem_region(mem->start, resource_size(mem)); - - return 0; -} - -#ifdef CONFIG_PM_SLEEP -static int puv3_i2c_suspend(struct device *dev) -{ - int poll_count; - /* Disable the IIC */ - writel(I2C_ENABLE_DISABLE, I2C_ENABLE); - for (poll_count = 0; poll_count < 50; poll_count++) { - if (readl(I2C_ENSTATUS) & I2C_ENSTATUS_ENABLE) - udelay(25); - } - - return 0; -} - -static SIMPLE_DEV_PM_OPS(puv3_i2c_pm, puv3_i2c_suspend, NULL); -#define PUV3_I2C_PM (&puv3_i2c_pm) - -#else -#define PUV3_I2C_PM NULL -#endif - -static struct platform_driver puv3_i2c_driver = { - .probe = puv3_i2c_probe, - .remove = puv3_i2c_remove, - .driver = { - .name = "PKUnity-v3-I2C", - .pm = PUV3_I2C_PM, - } -}; - 
-module_platform_driver(puv3_i2c_driver); - -MODULE_DESCRIPTION("PKUnity v3 I2C driver"); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS("platform:puv3_i2c"); From a559063a6865357f5ae2c407a092a75ae9f1c84d Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 10 Jun 2020 09:57:50 +0300 Subject: [PATCH 232/502] input: i8042: remove support for 8042-unicore32io The unicore32 port is removed from the kernel. There is no point to keep stale definitions to support this architecture. Signed-off-by: Mike Rapoport Acked-by: Arnd Bergmann Acked-by: Guenter Roeck --- MAINTAINERS | 1 - drivers/input/serio/i8042-unicore32io.h | 70 ------------------------- drivers/input/serio/i8042.h | 2 - 3 files changed, 73 deletions(-) delete mode 100644 drivers/input/serio/i8042-unicore32io.h diff --git a/MAINTAINERS b/MAINTAINERS index ec65e063e258..e5035fda296e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13585,7 +13585,6 @@ M: Guan Xuetao S: Maintained W: http://mprc.pku.edu.cn/~guanxuetao/linux T: git git://github.com/gxt/linux.git -F: drivers/input/serio/i8042-unicore32io.h F: drivers/rtc/rtc-puv3.c F: drivers/video/fbdev/fb-puv3.c diff --git a/drivers/input/serio/i8042-unicore32io.h b/drivers/input/serio/i8042-unicore32io.h deleted file mode 100644 index 50bb3ed94b56..000000000000 --- a/drivers/input/serio/i8042-unicore32io.h +++ /dev/null @@ -1,70 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Code specific to PKUnity SoC and UniCore ISA - * - * Maintained by GUAN Xue-tao - * Copyright (C) 2001-2011 Guan Xuetao - */ -#ifndef _I8042_UNICORE32_H -#define _I8042_UNICORE32_H - -#include - -/* - * Names. - */ -#define I8042_KBD_PHYS_DESC "isa0060/serio0" -#define I8042_AUX_PHYS_DESC "isa0060/serio1" -#define I8042_MUX_PHYS_DESC "isa0060/serio%d" - -/* - * IRQs. - */ -#define I8042_KBD_IRQ IRQ_PS2_KBD -#define I8042_AUX_IRQ IRQ_PS2_AUX - -/* - * Register numbers. 
- */ -#define I8042_COMMAND_REG PS2_COMMAND -#define I8042_STATUS_REG PS2_STATUS -#define I8042_DATA_REG PS2_DATA - -#define I8042_REGION_START (resource_size_t)(PS2_DATA) -#define I8042_REGION_SIZE (resource_size_t)(16) - -static inline int i8042_read_data(void) -{ - return readb(I8042_DATA_REG); -} - -static inline int i8042_read_status(void) -{ - return readb(I8042_STATUS_REG); -} - -static inline void i8042_write_data(int val) -{ - writeb(val, I8042_DATA_REG); -} - -static inline void i8042_write_command(int val) -{ - writeb(val, I8042_COMMAND_REG); -} - -static inline int i8042_platform_init(void) -{ - if (!request_mem_region(I8042_REGION_START, I8042_REGION_SIZE, "i8042")) - return -EBUSY; - - i8042_reset = I8042_RESET_ALWAYS; - return 0; -} - -static inline void i8042_platform_exit(void) -{ - release_mem_region(I8042_REGION_START, I8042_REGION_SIZE); -} - -#endif /* _I8042_UNICORE32_H */ diff --git a/drivers/input/serio/i8042.h b/drivers/input/serio/i8042.h index eb376700dfff..55381783dc82 100644 --- a/drivers/input/serio/i8042.h +++ b/drivers/input/serio/i8042.h @@ -21,8 +21,6 @@ #include "i8042-sparcio.h" #elif defined(CONFIG_X86) || defined(CONFIG_IA64) #include "i8042-x86ia64io.h" -#elif defined(CONFIG_UNICORE32) -#include "i8042-unicore32io.h" #else #include "i8042-io.h" #endif From a2022e1cf368c5d8794b75a9b5eb5f078a9bdb76 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 10 Jun 2020 09:57:50 +0300 Subject: [PATCH 233/502] pwm: remove pwm-puv3 driver The unicore32 port is removed from the kernel. There is no point to keep stale PWM driver for this architecture. 
Signed-off-by: Mike Rapoport Acked-by: Arnd Bergmann Acked-by: Guenter Roeck --- drivers/pwm/Kconfig | 9 --- drivers/pwm/Makefile | 1 - drivers/pwm/pwm-puv3.c | 150 ----------------------------------------- 3 files changed, 160 deletions(-) delete mode 100644 drivers/pwm/pwm-puv3.c diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig index cb8d739067d2..7dbcf6973d33 100644 --- a/drivers/pwm/Kconfig +++ b/drivers/pwm/Kconfig @@ -370,15 +370,6 @@ config PWM_PCA9685 To compile this driver as a module, choose M here: the module will be called pwm-pca9685. -config PWM_PUV3 - tristate "PKUnity NetBook-0916 PWM support" - depends on ARCH_PUV3 - help - Generic PWM framework driver for PKUnity NetBook-0916. - - To compile this driver as a module, choose M here: the module - will be called pwm-puv3. - config PWM_PXA tristate "PXA PWM support" depends on ARCH_PXA || COMPILE_TEST diff --git a/drivers/pwm/Makefile b/drivers/pwm/Makefile index a59c710e98c7..2c2ba0a03557 100644 --- a/drivers/pwm/Makefile +++ b/drivers/pwm/Makefile @@ -34,7 +34,6 @@ obj-$(CONFIG_PWM_MTK_DISP) += pwm-mtk-disp.o obj-$(CONFIG_PWM_MXS) += pwm-mxs.o obj-$(CONFIG_PWM_OMAP_DMTIMER) += pwm-omap-dmtimer.o obj-$(CONFIG_PWM_PCA9685) += pwm-pca9685.o -obj-$(CONFIG_PWM_PUV3) += pwm-puv3.o obj-$(CONFIG_PWM_PXA) += pwm-pxa.o obj-$(CONFIG_PWM_RCAR) += pwm-rcar.o obj-$(CONFIG_PWM_RENESAS_TPU) += pwm-renesas-tpu.o diff --git a/drivers/pwm/pwm-puv3.c b/drivers/pwm/pwm-puv3.c deleted file mode 100644 index 9d0bd87a425e..000000000000 --- a/drivers/pwm/pwm-puv3.c +++ /dev/null @@ -1,150 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/pwm.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Maintained by GUAN Xue-tao - * Copyright (C) 2001-2010 Guan Xuetao - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -struct puv3_pwm_chip { - struct pwm_chip chip; - void __iomem *base; - struct clk *clk; -}; - -static 
inline struct puv3_pwm_chip *to_puv3(struct pwm_chip *chip) -{ - return container_of(chip, struct puv3_pwm_chip, chip); -} - -/* - * period_ns = 10^9 * (PRESCALE + 1) * (PV + 1) / PWM_CLK_RATE - * duty_ns = 10^9 * (PRESCALE + 1) * DC / PWM_CLK_RATE - */ -static int puv3_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, - int duty_ns, int period_ns) -{ - unsigned long period_cycles, prescale, pv, dc; - struct puv3_pwm_chip *puv3 = to_puv3(chip); - unsigned long long c; - - c = clk_get_rate(puv3->clk); - c = c * period_ns; - do_div(c, 1000000000); - period_cycles = c; - - if (period_cycles < 1) - period_cycles = 1; - - prescale = (period_cycles - 1) / 1024; - pv = period_cycles / (prescale + 1) - 1; - - if (prescale > 63) - return -EINVAL; - - if (duty_ns == period_ns) - dc = OST_PWMDCCR_FDCYCLE; - else - dc = (pv + 1) * duty_ns / period_ns; - - /* - * NOTE: the clock to PWM has to be enabled first - * before writing to the registers - */ - clk_prepare_enable(puv3->clk); - - writel(prescale, puv3->base + OST_PWM_PWCR); - writel(pv - dc, puv3->base + OST_PWM_DCCR); - writel(pv, puv3->base + OST_PWM_PCR); - - clk_disable_unprepare(puv3->clk); - - return 0; -} - -static int puv3_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) -{ - struct puv3_pwm_chip *puv3 = to_puv3(chip); - - return clk_prepare_enable(puv3->clk); -} - -static void puv3_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) -{ - struct puv3_pwm_chip *puv3 = to_puv3(chip); - - clk_disable_unprepare(puv3->clk); -} - -static const struct pwm_ops puv3_pwm_ops = { - .config = puv3_pwm_config, - .enable = puv3_pwm_enable, - .disable = puv3_pwm_disable, - .owner = THIS_MODULE, -}; - -static int pwm_probe(struct platform_device *pdev) -{ - struct puv3_pwm_chip *puv3; - struct resource *r; - int ret; - - puv3 = devm_kzalloc(&pdev->dev, sizeof(*puv3), GFP_KERNEL); - if (!puv3) - return -ENOMEM; - - puv3->clk = devm_clk_get(&pdev->dev, "OST_CLK"); - if (IS_ERR(puv3->clk)) - return 
PTR_ERR(puv3->clk); - - r = platform_get_resource(pdev, IORESOURCE_MEM, 0); - puv3->base = devm_ioremap_resource(&pdev->dev, r); - if (IS_ERR(puv3->base)) - return PTR_ERR(puv3->base); - - puv3->chip.dev = &pdev->dev; - puv3->chip.ops = &puv3_pwm_ops; - puv3->chip.base = -1; - puv3->chip.npwm = 1; - - ret = pwmchip_add(&puv3->chip); - if (ret < 0) { - dev_err(&pdev->dev, "pwmchip_add() failed: %d\n", ret); - return ret; - } - - platform_set_drvdata(pdev, puv3); - return 0; -} - -static int pwm_remove(struct platform_device *pdev) -{ - struct puv3_pwm_chip *puv3 = platform_get_drvdata(pdev); - - return pwmchip_remove(&puv3->chip); -} - -static struct platform_driver puv3_pwm_driver = { - .driver = { - .name = "PKUnity-v3-PWM", - }, - .probe = pwm_probe, - .remove = pwm_remove, -}; -module_platform_driver(puv3_pwm_driver); - -MODULE_LICENSE("GPL v2"); From e26e59190ecd0b09a8778bbdc8239d0db78903c9 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 10 Jun 2020 09:57:50 +0300 Subject: [PATCH 234/502] video: fbdev: remove fb-puv3 driver The unicore32 port is removed from the kernel. There is no point to keep stale fbdev driver for this architecture. 
Signed-off-by: Mike Rapoport Acked-by: Arnd Bergmann Acked-by: Guenter Roeck --- MAINTAINERS | 1 - drivers/video/fbdev/Kconfig | 11 - drivers/video/fbdev/Makefile | 1 - drivers/video/fbdev/fb-puv3.c | 836 ---------------------------------- 4 files changed, 849 deletions(-) delete mode 100644 drivers/video/fbdev/fb-puv3.c diff --git a/MAINTAINERS b/MAINTAINERS index e5035fda296e..79d70acdf119 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13586,7 +13586,6 @@ S: Maintained W: http://mprc.pku.edu.cn/~guanxuetao/linux T: git git://github.com/gxt/linux.git F: drivers/rtc/rtc-puv3.c -F: drivers/video/fbdev/fb-puv3.c PLANTOWER PMS7003 AIR POLLUTION SENSOR DRIVER M: Tomasz Duszynski diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig index 0f559aeaf469..32a2698914c3 100644 --- a/drivers/video/fbdev/Kconfig +++ b/drivers/video/fbdev/Kconfig @@ -2198,17 +2198,6 @@ config FB_BROADSHEET and could also have been called by other names when coupled with a bridge adapter. -config FB_PUV3_UNIGFX - tristate "PKUnity v3 Unigfx framebuffer support" - depends on FB && UNICORE32 && ARCH_PUV3 - select FB_SYS_FILLRECT - select FB_SYS_COPYAREA - select FB_SYS_IMAGEBLIT - select FB_SYS_FOPS - help - Choose this option if you want to use the Unigfx device as a - framebuffer device. Without the support of PCI & AGP. 
- config FB_HYPERV tristate "Microsoft Hyper-V Synthetic Video support" depends on FB && HYPERV diff --git a/drivers/video/fbdev/Makefile b/drivers/video/fbdev/Makefile index aa6352798cf4..a0705b99e643 100644 --- a/drivers/video/fbdev/Makefile +++ b/drivers/video/fbdev/Makefile @@ -116,7 +116,6 @@ obj-y += omap2/ obj-$(CONFIG_XEN_FBDEV_FRONTEND) += xen-fbfront.o obj-$(CONFIG_FB_CARMINE) += carminefb.o obj-$(CONFIG_FB_MB862XX) += mb862xx/ -obj-$(CONFIG_FB_PUV3_UNIGFX) += fb-puv3.o obj-$(CONFIG_FB_HYPERV) += hyperv_fb.o obj-$(CONFIG_FB_OPENCORES) += ocfb.o obj-$(CONFIG_FB_SM712) += sm712fb.o diff --git a/drivers/video/fbdev/fb-puv3.c b/drivers/video/fbdev/fb-puv3.c deleted file mode 100644 index 030e85c11a78..000000000000 --- a/drivers/video/fbdev/fb-puv3.c +++ /dev/null @@ -1,836 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Frame Buffer Driver for PKUnity-v3 Unigfx - * Code specific to PKUnity SoC and UniCore ISA - * - * Maintained by GUAN Xue-tao - * Copyright (C) 2001-2010 Guan Xuetao - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -/* Platform_data reserved for unifb registers. */ -#define UNIFB_REGS_NUM 10 -/* RAM reserved for the frame buffer. 
*/ -#define UNIFB_MEMSIZE (SZ_4M) /* 4 MB for 1024*768*32b */ - -/* - * cause UNIGFX don not have EDID - * all the modes are organized as follow - */ -static const struct fb_videomode unifb_modes[] = { - /* 0 640x480-60 VESA */ - { "640x480@60", 60, 640, 480, 25175000, 48, 16, 34, 10, 96, 1, - 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, - /* 1 640x480-75 VESA */ - { "640x480@75", 75, 640, 480, 31500000, 120, 16, 18, 1, 64, 1, - 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, - /* 2 800x600-60 VESA */ - { "800x600@60", 60, 800, 600, 40000000, 88, 40, 26, 1, 128, 1, - 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, - /* 3 800x600-75 VESA */ - { "800x600@75", 75, 800, 600, 49500000, 160, 16, 23, 1, 80, 1, - 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, - /* 4 1024x768-60 VESA */ - { "1024x768@60", 60, 1024, 768, 65000000, 160, 24, 34, 3, 136, 1, - 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, - /* 5 1024x768-75 VESA */ - { "1024x768@75", 75, 1024, 768, 78750000, 176, 16, 30, 1, 96, 1, - 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, - /* 6 1280x960-60 VESA */ - { "1280x960@60", 60, 1280, 960, 108000000, 312, 96, 38, 1, 112, 1, - 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, - /* 7 1440x900-60 VESA */ - { "1440x900@60", 60, 1440, 900, 106500000, 232, 80, 30, 3, 152, 1, - 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, - /* 8 FIXME 9 1024x600-60 VESA UNTESTED */ - { "1024x600@60", 60, 1024, 600, 50650000, 160, 24, 26, 1, 136, 1, - 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, - /* 9 FIXME 10 1024x600-75 VESA UNTESTED */ - { "1024x600@75", 75, 1024, 600, 61500000, 176, 16, 23, 1, 96, 1, - 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, - /* 10 FIXME 11 1366x768-60 VESA UNTESTED */ - { "1366x768@60", 60, 1366, 768, 85500000, 256, 58, 18, 1, 112, 3, - 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, -}; - -static const struct fb_var_screeninfo unifb_default = { - .xres = 640, - .yres = 480, - .xres_virtual = 640, - .yres_virtual = 480, - .bits_per_pixel = 16, - .red = { 11, 5, 0 }, 
- .green = { 5, 6, 0 }, - .blue = { 0, 5, 0 }, - .activate = FB_ACTIVATE_NOW, - .height = -1, - .width = -1, - .pixclock = 25175000, - .left_margin = 48, - .right_margin = 16, - .upper_margin = 33, - .lower_margin = 10, - .hsync_len = 96, - .vsync_len = 2, - .vmode = FB_VMODE_NONINTERLACED, -}; - -static struct fb_fix_screeninfo unifb_fix = { - .id = "UNIGFX FB", - .type = FB_TYPE_PACKED_PIXELS, - .visual = FB_VISUAL_TRUECOLOR, - .xpanstep = 1, - .ypanstep = 1, - .ywrapstep = 1, - .accel = FB_ACCEL_NONE, -}; - -static void unifb_sync(struct fb_info *info) -{ - /* TODO: may, this can be replaced by interrupt */ - int cnt; - - for (cnt = 0; cnt < 0x10000000; cnt++) { - if (readl(UGE_COMMAND) & 0x1000000) - return; - } - - if (cnt > 0x8000000) - dev_warn(info->device, "Warning: UniGFX GE time out ...\n"); -} - -static void unifb_prim_fillrect(struct fb_info *info, - const struct fb_fillrect *region) -{ - int awidth = region->width; - int aheight = region->height; - int m_iBpp = info->var.bits_per_pixel; - int screen_width = info->var.xres; - int src_sel = 1; /* from fg_color */ - int pat_sel = 1; - int src_x0 = 0; - int dst_x0 = region->dx; - int src_y0 = 0; - int dst_y0 = region->dy; - int rop_alpha_sel = 0; - int rop_alpha_code = 0xCC; - int x_dir = 1; - int y_dir = 1; - int alpha_r = 0; - int alpha_sel = 0; - int dst_pitch = screen_width * (m_iBpp / 8); - int dst_offset = dst_y0 * dst_pitch + dst_x0 * (m_iBpp / 8); - int src_pitch = screen_width * (m_iBpp / 8); - int src_offset = src_y0 * src_pitch + src_x0 * (m_iBpp / 8); - unsigned int command = 0; - int clip_region = 0; - int clip_en = 0; - int tp_en = 0; - int fg_color = 0; - int bottom = info->var.yres - 1; - int right = info->var.xres - 1; - int top = 0; - - bottom = (bottom << 16) | right; - command = (rop_alpha_sel << 26) | (pat_sel << 18) | (src_sel << 16) - | (x_dir << 20) | (y_dir << 21) | (command << 24) - | (clip_region << 23) | (clip_en << 22) | (tp_en << 27); - src_pitch = (dst_pitch << 16) | 
src_pitch; - awidth = awidth | (aheight << 16); - alpha_r = ((rop_alpha_code & 0xff) << 8) | (alpha_r & 0xff) - | (alpha_sel << 16); - src_x0 = (src_x0 & 0x1fff) | ((src_y0 & 0x1fff) << 16); - dst_x0 = (dst_x0 & 0x1fff) | ((dst_y0 & 0x1fff) << 16); - fg_color = region->color; - - unifb_sync(info); - - writel(((u32 *)(info->pseudo_palette))[fg_color], UGE_FCOLOR); - writel(0, UGE_BCOLOR); - writel(src_pitch, UGE_PITCH); - writel(src_offset, UGE_SRCSTART); - writel(dst_offset, UGE_DSTSTART); - writel(awidth, UGE_WIDHEIGHT); - writel(top, UGE_CLIP0); - writel(bottom, UGE_CLIP1); - writel(alpha_r, UGE_ROPALPHA); - writel(src_x0, UGE_SRCXY); - writel(dst_x0, UGE_DSTXY); - writel(command, UGE_COMMAND); -} - -static void unifb_fillrect(struct fb_info *info, - const struct fb_fillrect *region) -{ - struct fb_fillrect modded; - int vxres, vyres; - - if (info->flags & FBINFO_HWACCEL_DISABLED) { - sys_fillrect(info, region); - return; - } - - vxres = info->var.xres_virtual; - vyres = info->var.yres_virtual; - - memcpy(&modded, region, sizeof(struct fb_fillrect)); - - if (!modded.width || !modded.height || - modded.dx >= vxres || modded.dy >= vyres) - return; - - if (modded.dx + modded.width > vxres) - modded.width = vxres - modded.dx; - if (modded.dy + modded.height > vyres) - modded.height = vyres - modded.dy; - - unifb_prim_fillrect(info, &modded); -} - -static void unifb_prim_copyarea(struct fb_info *info, - const struct fb_copyarea *area) -{ - int awidth = area->width; - int aheight = area->height; - int m_iBpp = info->var.bits_per_pixel; - int screen_width = info->var.xres; - int src_sel = 2; /* from mem */ - int pat_sel = 0; - int src_x0 = area->sx; - int dst_x0 = area->dx; - int src_y0 = area->sy; - int dst_y0 = area->dy; - - int rop_alpha_sel = 0; - int rop_alpha_code = 0xCC; - int x_dir = 1; - int y_dir = 1; - - int alpha_r = 0; - int alpha_sel = 0; - int dst_pitch = screen_width * (m_iBpp / 8); - int dst_offset = dst_y0 * dst_pitch + dst_x0 * (m_iBpp / 8); - int 
src_pitch = screen_width * (m_iBpp / 8); - int src_offset = src_y0 * src_pitch + src_x0 * (m_iBpp / 8); - unsigned int command = 0; - int clip_region = 0; - int clip_en = 1; - int tp_en = 0; - int top = 0; - int bottom = info->var.yres; - int right = info->var.xres; - int fg_color = 0; - int bg_color = 0; - - if (src_x0 < 0) - src_x0 = 0; - if (src_y0 < 0) - src_y0 = 0; - - if (src_y0 - dst_y0 > 0) { - y_dir = 1; - } else { - y_dir = 0; - src_offset = (src_y0 + aheight) * src_pitch + - src_x0 * (m_iBpp / 8); - dst_offset = (dst_y0 + aheight) * dst_pitch + - dst_x0 * (m_iBpp / 8); - src_y0 += aheight; - dst_y0 += aheight; - } - - command = (rop_alpha_sel << 26) | (pat_sel << 18) | (src_sel << 16) | - (x_dir << 20) | (y_dir << 21) | (command << 24) | - (clip_region << 23) | (clip_en << 22) | (tp_en << 27); - src_pitch = (dst_pitch << 16) | src_pitch; - awidth = awidth | (aheight << 16); - alpha_r = ((rop_alpha_code & 0xff) << 8) | (alpha_r & 0xff) | - (alpha_sel << 16); - src_x0 = (src_x0 & 0x1fff) | ((src_y0 & 0x1fff) << 16); - dst_x0 = (dst_x0 & 0x1fff) | ((dst_y0 & 0x1fff) << 16); - bottom = (bottom << 16) | right; - - unifb_sync(info); - - writel(src_pitch, UGE_PITCH); - writel(src_offset, UGE_SRCSTART); - writel(dst_offset, UGE_DSTSTART); - writel(awidth, UGE_WIDHEIGHT); - writel(top, UGE_CLIP0); - writel(bottom, UGE_CLIP1); - writel(bg_color, UGE_BCOLOR); - writel(fg_color, UGE_FCOLOR); - writel(alpha_r, UGE_ROPALPHA); - writel(src_x0, UGE_SRCXY); - writel(dst_x0, UGE_DSTXY); - writel(command, UGE_COMMAND); -} - -static void unifb_copyarea(struct fb_info *info, const struct fb_copyarea *area) -{ - struct fb_copyarea modded; - u32 vxres, vyres; - modded.sx = area->sx; - modded.sy = area->sy; - modded.dx = area->dx; - modded.dy = area->dy; - modded.width = area->width; - modded.height = area->height; - - if (info->flags & FBINFO_HWACCEL_DISABLED) { - sys_copyarea(info, area); - return; - } - - vxres = info->var.xres_virtual; - vyres = info->var.yres_virtual; - - 
if (!modded.width || !modded.height || - modded.sx >= vxres || modded.sy >= vyres || - modded.dx >= vxres || modded.dy >= vyres) - return; - - if (modded.sx + modded.width > vxres) - modded.width = vxres - modded.sx; - if (modded.dx + modded.width > vxres) - modded.width = vxres - modded.dx; - if (modded.sy + modded.height > vyres) - modded.height = vyres - modded.sy; - if (modded.dy + modded.height > vyres) - modded.height = vyres - modded.dy; - - unifb_prim_copyarea(info, &modded); -} - -static void unifb_imageblit(struct fb_info *info, const struct fb_image *image) -{ - sys_imageblit(info, image); -} - -static u_long get_line_length(int xres_virtual, int bpp) -{ - u_long length; - - length = xres_virtual * bpp; - length = (length + 31) & ~31; - length >>= 3; - return length; -} - -/* - * Setting the video mode has been split into two parts. - * First part, xxxfb_check_var, must not write anything - * to hardware, it should only verify and adjust var. - * This means it doesn't alter par but it does use hardware - * data from it to check this var. - */ -static int unifb_check_var(struct fb_var_screeninfo *var, - struct fb_info *info) -{ - u_long line_length; - - /* - * FB_VMODE_CONUPDATE and FB_VMODE_SMOOTH_XPAN are equal! 
- * as FB_VMODE_SMOOTH_XPAN is only used internally - */ - - if (var->vmode & FB_VMODE_CONUPDATE) { - var->vmode |= FB_VMODE_YWRAP; - var->xoffset = info->var.xoffset; - var->yoffset = info->var.yoffset; - } - - /* - * Some very basic checks - */ - if (!var->xres) - var->xres = 1; - if (!var->yres) - var->yres = 1; - if (var->xres > var->xres_virtual) - var->xres_virtual = var->xres; - if (var->yres > var->yres_virtual) - var->yres_virtual = var->yres; - if (var->bits_per_pixel <= 1) - var->bits_per_pixel = 1; - else if (var->bits_per_pixel <= 8) - var->bits_per_pixel = 8; - else if (var->bits_per_pixel <= 16) - var->bits_per_pixel = 16; - else if (var->bits_per_pixel <= 24) - var->bits_per_pixel = 24; - else if (var->bits_per_pixel <= 32) - var->bits_per_pixel = 32; - else - return -EINVAL; - - if (var->xres_virtual < var->xoffset + var->xres) - var->xres_virtual = var->xoffset + var->xres; - if (var->yres_virtual < var->yoffset + var->yres) - var->yres_virtual = var->yoffset + var->yres; - - /* - * Memory limit - */ - line_length = - get_line_length(var->xres_virtual, var->bits_per_pixel); - if (line_length * var->yres_virtual > UNIFB_MEMSIZE) - return -ENOMEM; - - /* - * Now that we checked it we alter var. The reason being is that the - * video mode passed in might not work but slight changes to it might - * make it work. This way we let the user know what is acceptable. 
- */ - switch (var->bits_per_pixel) { - case 1: - case 8: - var->red.offset = 0; - var->red.length = 8; - var->green.offset = 0; - var->green.length = 8; - var->blue.offset = 0; - var->blue.length = 8; - var->transp.offset = 0; - var->transp.length = 0; - break; - case 16: /* RGBA 5551 */ - if (var->transp.length) { - var->red.offset = 0; - var->red.length = 5; - var->green.offset = 5; - var->green.length = 5; - var->blue.offset = 10; - var->blue.length = 5; - var->transp.offset = 15; - var->transp.length = 1; - } else { /* RGB 565 */ - var->red.offset = 11; - var->red.length = 5; - var->green.offset = 5; - var->green.length = 6; - var->blue.offset = 0; - var->blue.length = 5; - var->transp.offset = 0; - var->transp.length = 0; - } - break; - case 24: /* RGB 888 */ - var->red.offset = 0; - var->red.length = 8; - var->green.offset = 8; - var->green.length = 8; - var->blue.offset = 16; - var->blue.length = 8; - var->transp.offset = 0; - var->transp.length = 0; - break; - case 32: /* RGBA 8888 */ - var->red.offset = 16; - var->red.length = 8; - var->green.offset = 8; - var->green.length = 8; - var->blue.offset = 0; - var->blue.length = 8; - var->transp.offset = 24; - var->transp.length = 8; - break; - } - var->red.msb_right = 0; - var->green.msb_right = 0; - var->blue.msb_right = 0; - var->transp.msb_right = 0; - - return 0; -} - -/* - * This routine actually sets the video mode. It's in here where we - * the hardware state info->par and fix which can be affected by the - * change in par. For this driver it doesn't do much. 
- */ -static int unifb_set_par(struct fb_info *info) -{ - int hTotal, vTotal, hSyncStart, hSyncEnd, vSyncStart, vSyncEnd; - int format; - -#ifdef CONFIG_PUV3_PM - struct clk *clk_vga; - u32 pixclk = 0; - int i; - - for (i = 0; i <= 10; i++) { - if (info->var.xres == unifb_modes[i].xres - && info->var.yres == unifb_modes[i].yres - && info->var.upper_margin == unifb_modes[i].upper_margin - && info->var.lower_margin == unifb_modes[i].lower_margin - && info->var.left_margin == unifb_modes[i].left_margin - && info->var.right_margin == unifb_modes[i].right_margin - && info->var.hsync_len == unifb_modes[i].hsync_len - && info->var.vsync_len == unifb_modes[i].vsync_len) { - pixclk = unifb_modes[i].pixclock; - break; - } - } - - /* set clock rate */ - clk_vga = clk_get(info->device, "VGA_CLK"); - if (clk_vga == ERR_PTR(-ENOENT)) - return -ENOENT; - - if (pixclk != 0) { - if (clk_set_rate(clk_vga, pixclk)) { /* set clock failed */ - info->fix = unifb_fix; - info->var = unifb_default; - if (clk_set_rate(clk_vga, unifb_default.pixclock)) - return -EINVAL; - } - } -#endif - - info->fix.line_length = get_line_length(info->var.xres_virtual, - info->var.bits_per_pixel); - - hSyncStart = info->var.xres + info->var.right_margin; - hSyncEnd = hSyncStart + info->var.hsync_len; - hTotal = hSyncEnd + info->var.left_margin; - - vSyncStart = info->var.yres + info->var.lower_margin; - vSyncEnd = vSyncStart + info->var.vsync_len; - vTotal = vSyncEnd + info->var.upper_margin; - - switch (info->var.bits_per_pixel) { - case 8: - format = UDE_CFG_DST8; - break; - case 16: - format = UDE_CFG_DST16; - break; - case 24: - format = UDE_CFG_DST24; - break; - case 32: - format = UDE_CFG_DST32; - break; - default: - return -EINVAL; - } - - writel(info->fix.smem_start, UDE_FSA); - writel(info->var.yres, UDE_LS); - writel(get_line_length(info->var.xres, - info->var.bits_per_pixel) >> 3, UDE_PS); - /* >> 3 for hardware required. 
*/ - writel((hTotal << 16) | (info->var.xres), UDE_HAT); - writel(((hTotal - 1) << 16) | (info->var.xres - 1), UDE_HBT); - writel(((hSyncEnd - 1) << 16) | (hSyncStart - 1), UDE_HST); - writel((vTotal << 16) | (info->var.yres), UDE_VAT); - writel(((vTotal - 1) << 16) | (info->var.yres - 1), UDE_VBT); - writel(((vSyncEnd - 1) << 16) | (vSyncStart - 1), UDE_VST); - writel(UDE_CFG_GDEN_ENABLE | UDE_CFG_TIMEUP_ENABLE - | format | 0xC0000001, UDE_CFG); - - return 0; -} - -/* - * Set a single color register. The values supplied are already - * rounded down to the hardware's capabilities (according to the - * entries in the var structure). Return != 0 for invalid regno. - */ -static int unifb_setcolreg(u_int regno, u_int red, u_int green, u_int blue, - u_int transp, struct fb_info *info) -{ - if (regno >= 256) /* no. of hw registers */ - return 1; - - /* grayscale works only partially under directcolor */ - if (info->var.grayscale) { - /* grayscale = 0.30*R + 0.59*G + 0.11*B */ - red = green = blue = - (red * 77 + green * 151 + blue * 28) >> 8; - } - -#define CNVT_TOHW(val, width) ((((val)<<(width))+0x7FFF-(val))>>16) - switch (info->fix.visual) { - case FB_VISUAL_TRUECOLOR: - case FB_VISUAL_PSEUDOCOLOR: - red = CNVT_TOHW(red, info->var.red.length); - green = CNVT_TOHW(green, info->var.green.length); - blue = CNVT_TOHW(blue, info->var.blue.length); - transp = CNVT_TOHW(transp, info->var.transp.length); - break; - case FB_VISUAL_DIRECTCOLOR: - red = CNVT_TOHW(red, 8); /* expect 8 bit DAC */ - green = CNVT_TOHW(green, 8); - blue = CNVT_TOHW(blue, 8); - /* hey, there is bug in transp handling... 
*/ - transp = CNVT_TOHW(transp, 8); - break; - } -#undef CNVT_TOHW - /* Truecolor has hardware independent palette */ - if (info->fix.visual == FB_VISUAL_TRUECOLOR) { - u32 v; - - if (regno >= 16) - return 1; - - v = (red << info->var.red.offset) | - (green << info->var.green.offset) | - (blue << info->var.blue.offset) | - (transp << info->var.transp.offset); - switch (info->var.bits_per_pixel) { - case 8: - break; - case 16: - case 24: - case 32: - ((u32 *) (info->pseudo_palette))[regno] = v; - break; - default: - return 1; - } - return 0; - } - return 0; -} - -/* - * Pan or Wrap the Display - * - * This call looks only at xoffset, yoffset and the FB_VMODE_YWRAP flag - */ -static int unifb_pan_display(struct fb_var_screeninfo *var, - struct fb_info *info) -{ - if (var->vmode & FB_VMODE_YWRAP) { - if (var->yoffset < 0 - || var->yoffset >= info->var.yres_virtual - || var->xoffset) - return -EINVAL; - } else { - if (var->xoffset + info->var.xres > info->var.xres_virtual || - var->yoffset + info->var.yres > info->var.yres_virtual) - return -EINVAL; - } - info->var.xoffset = var->xoffset; - info->var.yoffset = var->yoffset; - if (var->vmode & FB_VMODE_YWRAP) - info->var.vmode |= FB_VMODE_YWRAP; - else - info->var.vmode &= ~FB_VMODE_YWRAP; - return 0; -} - -int unifb_mmap(struct fb_info *info, - struct vm_area_struct *vma) -{ - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - - return vm_iomap_memory(vma, info->fix.smem_start, info->fix.smem_len); -} - -static const struct fb_ops unifb_ops = { - .fb_read = fb_sys_read, - .fb_write = fb_sys_write, - .fb_check_var = unifb_check_var, - .fb_set_par = unifb_set_par, - .fb_setcolreg = unifb_setcolreg, - .fb_pan_display = unifb_pan_display, - .fb_fillrect = unifb_fillrect, - .fb_copyarea = unifb_copyarea, - .fb_imageblit = unifb_imageblit, - .fb_mmap = unifb_mmap, -}; - -/* - * Initialisation - */ -static int unifb_probe(struct platform_device *dev) -{ - struct fb_info *info; - u32 unifb_regs[UNIFB_REGS_NUM]; - int 
retval = -ENOMEM; - struct resource *iomem; - void *videomemory; - - videomemory = (void *)__get_free_pages(GFP_KERNEL | __GFP_COMP, - get_order(UNIFB_MEMSIZE)); - if (!videomemory) - goto err; - - memset(videomemory, 0, UNIFB_MEMSIZE); - - unifb_fix.smem_start = virt_to_phys(videomemory); - unifb_fix.smem_len = UNIFB_MEMSIZE; - - iomem = platform_get_resource(dev, IORESOURCE_MEM, 0); - unifb_fix.mmio_start = iomem->start; - - info = framebuffer_alloc(sizeof(u32)*256, &dev->dev); - if (!info) - goto err; - - info->screen_base = (char __iomem *)videomemory; - info->fbops = &unifb_ops; - - retval = fb_find_mode(&info->var, info, NULL, - unifb_modes, 10, &unifb_modes[0], 16); - - if (!retval || (retval == 4)) - info->var = unifb_default; - - info->fix = unifb_fix; - info->pseudo_palette = info->par; - info->par = NULL; - info->flags = FBINFO_FLAG_DEFAULT; -#ifdef FB_ACCEL_PUV3_UNIGFX - info->fix.accel = FB_ACCEL_PUV3_UNIGFX; -#endif - - retval = fb_alloc_cmap(&info->cmap, 256, 0); - if (retval < 0) - goto err1; - - retval = register_framebuffer(info); - if (retval < 0) - goto err2; - platform_set_drvdata(dev, info); - platform_device_add_data(dev, unifb_regs, sizeof(u32) * UNIFB_REGS_NUM); - - fb_info(info, "Virtual frame buffer device, using %dM of video memory\n", - UNIFB_MEMSIZE >> 20); - return 0; -err2: - fb_dealloc_cmap(&info->cmap); -err1: - framebuffer_release(info); -err: - return retval; -} - -static int unifb_remove(struct platform_device *dev) -{ - struct fb_info *info = platform_get_drvdata(dev); - - if (info) { - unregister_framebuffer(info); - fb_dealloc_cmap(&info->cmap); - framebuffer_release(info); - } - return 0; -} - -#ifdef CONFIG_PM -static int unifb_resume(struct platform_device *dev) -{ - int rc = 0; - u32 *unifb_regs = dev->dev.platform_data; - - if (dev->dev.power.power_state.event == PM_EVENT_ON) - return 0; - - console_lock(); - - if (dev->dev.power.power_state.event == PM_EVENT_SUSPEND) { - writel(unifb_regs[0], UDE_FSA); - 
writel(unifb_regs[1], UDE_LS); - writel(unifb_regs[2], UDE_PS); - writel(unifb_regs[3], UDE_HAT); - writel(unifb_regs[4], UDE_HBT); - writel(unifb_regs[5], UDE_HST); - writel(unifb_regs[6], UDE_VAT); - writel(unifb_regs[7], UDE_VBT); - writel(unifb_regs[8], UDE_VST); - writel(unifb_regs[9], UDE_CFG); - } - dev->dev.power.power_state = PMSG_ON; - - console_unlock(); - - return rc; -} - -static int unifb_suspend(struct platform_device *dev, pm_message_t mesg) -{ - u32 *unifb_regs = dev->dev.platform_data; - - unifb_regs[0] = readl(UDE_FSA); - unifb_regs[1] = readl(UDE_LS); - unifb_regs[2] = readl(UDE_PS); - unifb_regs[3] = readl(UDE_HAT); - unifb_regs[4] = readl(UDE_HBT); - unifb_regs[5] = readl(UDE_HST); - unifb_regs[6] = readl(UDE_VAT); - unifb_regs[7] = readl(UDE_VBT); - unifb_regs[8] = readl(UDE_VST); - unifb_regs[9] = readl(UDE_CFG); - - if (mesg.event == dev->dev.power.power_state.event) - return 0; - - switch (mesg.event) { - case PM_EVENT_FREEZE: /* about to take snapshot */ - case PM_EVENT_PRETHAW: /* before restoring snapshot */ - goto done; - } - - console_lock(); - - /* do nothing... 
*/ - - console_unlock(); - -done: - dev->dev.power.power_state = mesg; - - return 0; -} -#else -#define unifb_resume NULL -#define unifb_suspend NULL -#endif - -static struct platform_driver unifb_driver = { - .probe = unifb_probe, - .remove = unifb_remove, - .resume = unifb_resume, - .suspend = unifb_suspend, - .driver = { - .name = "PKUnity-v3-UNIGFX", - }, -}; - -static int __init unifb_init(void) -{ -#ifndef MODULE - if (fb_get_options("unifb", NULL)) - return -ENODEV; -#endif - - return platform_driver_register(&unifb_driver); -} - -module_init(unifb_init); - -static void __exit unifb_exit(void) -{ - platform_driver_unregister(&unifb_driver); -} - -module_exit(unifb_exit); - -MODULE_LICENSE("GPL v2"); From fa4b9519f074646252f6aeb33d9329a384439632 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 10 Jun 2020 09:57:50 +0300 Subject: [PATCH 235/502] rtc: remove fb-puv3 driver The unicore32 port is removed from the kernel. There is no point to keep stale RTC driver for this architecture. Signed-off-by: Mike Rapoport Acked-by: Arnd Bergmann Acked-by: Guenter Roeck --- MAINTAINERS | 1 - drivers/rtc/Kconfig | 9 -- drivers/rtc/Makefile | 1 - drivers/rtc/rtc-puv3.c | 286 ----------------------------------------- 4 files changed, 297 deletions(-) delete mode 100644 drivers/rtc/rtc-puv3.c diff --git a/MAINTAINERS b/MAINTAINERS index 79d70acdf119..e4787ac42153 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13585,7 +13585,6 @@ M: Guan Xuetao S: Maintained W: http://mprc.pku.edu.cn/~guanxuetao/linux T: git git://github.com/gxt/linux.git -F: drivers/rtc/rtc-puv3.c PLANTOWER PMS7003 AIR POLLUTION SENSOR DRIVER M: Tomasz Duszynski diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index b54d87d45c89..f3b8e6dcd879 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -1729,15 +1729,6 @@ config RTC_DRV_TEGRA This drive can also be built as a module. If so, the module will be called rtc-tegra. 
-config RTC_DRV_PUV3 - tristate "PKUnity v3 RTC support" - depends on ARCH_PUV3 - help - This enables support for the RTC in the PKUnity-v3 SoCs. - - This drive can also be built as a module. If so, the module - will be called rtc-puv3. - config RTC_DRV_LOONGSON1 tristate "loongson1 RTC support" depends on MACH_LOONGSON32 diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index 0721752c6ed4..880e08a409c3 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -128,7 +128,6 @@ obj-$(CONFIG_RTC_DRV_PL030) += rtc-pl030.o obj-$(CONFIG_RTC_DRV_PL031) += rtc-pl031.o obj-$(CONFIG_RTC_DRV_PM8XXX) += rtc-pm8xxx.o obj-$(CONFIG_RTC_DRV_PS3) += rtc-ps3.o -obj-$(CONFIG_RTC_DRV_PUV3) += rtc-puv3.o obj-$(CONFIG_RTC_DRV_PXA) += rtc-pxa.o obj-$(CONFIG_RTC_DRV_R7301) += rtc-r7301.o obj-$(CONFIG_RTC_DRV_R9701) += rtc-r9701.o diff --git a/drivers/rtc/rtc-puv3.c b/drivers/rtc/rtc-puv3.c deleted file mode 100644 index 954b88d2485f..000000000000 --- a/drivers/rtc/rtc-puv3.c +++ /dev/null @@ -1,286 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * RTC driver code specific to PKUnity SoC and UniCore ISA - * - * Maintained by GUAN Xue-tao - * Copyright (C) 2001-2010 Guan Xuetao - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -static struct resource *puv3_rtc_mem; - -static int puv3_rtc_alarmno = IRQ_RTCAlarm; -static int puv3_rtc_tickno = IRQ_RTC; - -static DEFINE_SPINLOCK(puv3_rtc_pie_lock); - -/* IRQ Handlers */ -static irqreturn_t puv3_rtc_alarmirq(int irq, void *id) -{ - struct rtc_device *rdev = id; - - writel(readl(RTC_RTSR) | RTC_RTSR_AL, RTC_RTSR); - rtc_update_irq(rdev, 1, RTC_AF | RTC_IRQF); - return IRQ_HANDLED; -} - -static irqreturn_t puv3_rtc_tickirq(int irq, void *id) -{ - struct rtc_device *rdev = id; - - writel(readl(RTC_RTSR) | RTC_RTSR_HZ, RTC_RTSR); - rtc_update_irq(rdev, 1, RTC_PF | RTC_IRQF); - return IRQ_HANDLED; -} - -/* Update 
control registers */ -static void puv3_rtc_setaie(struct device *dev, int to) -{ - unsigned int tmp; - - dev_dbg(dev, "%s: aie=%d\n", __func__, to); - - tmp = readl(RTC_RTSR) & ~RTC_RTSR_ALE; - - if (to) - tmp |= RTC_RTSR_ALE; - - writel(tmp, RTC_RTSR); -} - -static int puv3_rtc_setpie(struct device *dev, int enabled) -{ - unsigned int tmp; - - dev_dbg(dev, "%s: pie=%d\n", __func__, enabled); - - spin_lock_irq(&puv3_rtc_pie_lock); - tmp = readl(RTC_RTSR) & ~RTC_RTSR_HZE; - - if (enabled) - tmp |= RTC_RTSR_HZE; - - writel(tmp, RTC_RTSR); - spin_unlock_irq(&puv3_rtc_pie_lock); - - return 0; -} - -/* Time read/write */ -static int puv3_rtc_gettime(struct device *dev, struct rtc_time *rtc_tm) -{ - rtc_time64_to_tm(readl(RTC_RCNR), rtc_tm); - - dev_dbg(dev, "read time %ptRr\n", rtc_tm); - - return 0; -} - -static int puv3_rtc_settime(struct device *dev, struct rtc_time *tm) -{ - dev_dbg(dev, "set time %ptRr\n", tm); - - writel(rtc_tm_to_time64(tm), RTC_RCNR); - - return 0; -} - -static int puv3_rtc_getalarm(struct device *dev, struct rtc_wkalrm *alrm) -{ - struct rtc_time *alm_tm = &alrm->time; - - rtc_time64_to_tm(readl(RTC_RTAR), alm_tm); - - alrm->enabled = readl(RTC_RTSR) & RTC_RTSR_ALE; - - dev_dbg(dev, "read alarm: %d, %ptRr\n", alrm->enabled, alm_tm); - - return 0; -} - -static int puv3_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm) -{ - struct rtc_time *tm = &alrm->time; - - dev_dbg(dev, "set alarm: %d, %ptRr\n", alrm->enabled, tm); - - writel(rtc_tm_to_time64(tm), RTC_RTAR); - - puv3_rtc_setaie(dev, alrm->enabled); - - if (alrm->enabled) - enable_irq_wake(puv3_rtc_alarmno); - else - disable_irq_wake(puv3_rtc_alarmno); - - return 0; -} - -static int puv3_rtc_proc(struct device *dev, struct seq_file *seq) -{ - seq_printf(seq, "periodic_IRQ\t: %s\n", - (readl(RTC_RTSR) & RTC_RTSR_HZE) ? 
"yes" : "no"); - return 0; -} - -static const struct rtc_class_ops puv3_rtcops = { - .read_time = puv3_rtc_gettime, - .set_time = puv3_rtc_settime, - .read_alarm = puv3_rtc_getalarm, - .set_alarm = puv3_rtc_setalarm, - .proc = puv3_rtc_proc, -}; - -static void puv3_rtc_enable(struct device *dev, int en) -{ - if (!en) { - writel(readl(RTC_RTSR) & ~RTC_RTSR_HZE, RTC_RTSR); - } else { - /* re-enable the device, and check it is ok */ - if ((readl(RTC_RTSR) & RTC_RTSR_HZE) == 0) { - dev_info(dev, "rtc disabled, re-enabling\n"); - writel(readl(RTC_RTSR) | RTC_RTSR_HZE, RTC_RTSR); - } - } -} - -static int puv3_rtc_remove(struct platform_device *dev) -{ - puv3_rtc_setpie(&dev->dev, 0); - puv3_rtc_setaie(&dev->dev, 0); - - release_resource(puv3_rtc_mem); - kfree(puv3_rtc_mem); - - return 0; -} - -static int puv3_rtc_probe(struct platform_device *pdev) -{ - struct rtc_device *rtc; - struct resource *res; - int ret; - - dev_dbg(&pdev->dev, "%s: probe=%p\n", __func__, pdev); - - /* find the IRQs */ - puv3_rtc_tickno = platform_get_irq(pdev, 1); - if (puv3_rtc_tickno < 0) - return -ENOENT; - - puv3_rtc_alarmno = platform_get_irq(pdev, 0); - if (puv3_rtc_alarmno < 0) - return -ENOENT; - - dev_dbg(&pdev->dev, "PKUnity_rtc: tick irq %d, alarm irq %d\n", - puv3_rtc_tickno, puv3_rtc_alarmno); - - rtc = devm_rtc_allocate_device(&pdev->dev); - if (IS_ERR(rtc)) - return PTR_ERR(rtc); - - ret = devm_request_irq(&pdev->dev, puv3_rtc_alarmno, puv3_rtc_alarmirq, - 0, "pkunity-rtc alarm", rtc); - if (ret) { - dev_err(&pdev->dev, "IRQ%d error %d\n", puv3_rtc_alarmno, ret); - return ret; - } - - ret = devm_request_irq(&pdev->dev, puv3_rtc_tickno, puv3_rtc_tickirq, - 0, "pkunity-rtc tick", rtc); - if (ret) { - dev_err(&pdev->dev, "IRQ%d error %d\n", puv3_rtc_tickno, ret); - return ret; - } - - /* get the memory region */ - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (res == NULL) { - dev_err(&pdev->dev, "failed to get memory region resource\n"); - return -ENOENT; - } - - 
puv3_rtc_mem = request_mem_region(res->start, resource_size(res), - pdev->name); - - if (puv3_rtc_mem == NULL) { - dev_err(&pdev->dev, "failed to reserve memory region\n"); - ret = -ENOENT; - goto err_nores; - } - - puv3_rtc_enable(&pdev->dev, 1); - - /* register RTC and exit */ - rtc->ops = &puv3_rtcops; - rtc->range_max = U32_MAX; - ret = rtc_register_device(rtc); - if (ret) - goto err_nortc; - - /* platform setup code should have handled this; sigh */ - if (!device_can_wakeup(&pdev->dev)) - device_init_wakeup(&pdev->dev, 1); - - platform_set_drvdata(pdev, rtc); - return 0; - - err_nortc: - puv3_rtc_enable(&pdev->dev, 0); - release_resource(puv3_rtc_mem); - - err_nores: - return ret; -} - -#ifdef CONFIG_PM_SLEEP -static int ticnt_save; - -static int puv3_rtc_suspend(struct device *dev) -{ - /* save RTAR for anyone using periodic interrupts */ - ticnt_save = readl(RTC_RTAR); - puv3_rtc_enable(dev, 0); - return 0; -} - -static int puv3_rtc_resume(struct device *dev) -{ - puv3_rtc_enable(dev, 1); - writel(ticnt_save, RTC_RTAR); - return 0; -} -#endif - -static SIMPLE_DEV_PM_OPS(puv3_rtc_pm_ops, puv3_rtc_suspend, puv3_rtc_resume); - -static struct platform_driver puv3_rtc_driver = { - .probe = puv3_rtc_probe, - .remove = puv3_rtc_remove, - .driver = { - .name = "PKUnity-v3-RTC", - .pm = &puv3_rtc_pm_ops, - } -}; - -module_platform_driver(puv3_rtc_driver); - -MODULE_DESCRIPTION("RTC Driver for the PKUnity v3 chip"); -MODULE_AUTHOR("Hu Dongliang"); -MODULE_LICENSE("GPL v2"); From 3346dd99fb4cd174fdbfb68dc62cd109e4323f0f Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 10 Jun 2020 10:24:55 +0300 Subject: [PATCH 236/502] MAINTAINERS: remove "PKUNITY SOC DRIVERS" entry There are no PkUnity drivers left, so remove the MAINTAINERS entry.
Signed-off-by: Mike Rapoport Acked-by: Arnd Bergmann Acked-by: Guenter Roeck --- MAINTAINERS | 6 ------ 1 file changed, 6 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index e4787ac42153..6f8c204cb60e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13580,12 +13580,6 @@ F: drivers/block/pktcdvd.c F: include/linux/pktcdvd.h F: include/uapi/linux/pktcdvd.h -PKUNITY SOC DRIVERS -M: Guan Xuetao -S: Maintained -W: http://mprc.pku.edu.cn/~guanxuetao/linux -T: git git://github.com/gxt/linux.git - PLANTOWER PMS7003 AIR POLLUTION SENSOR DRIVER M: Tomasz Duszynski S: Maintained From 66a049b764a71dc32031b7b533f98fc0299e6e11 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Thu, 25 Jun 2020 21:53:17 +0200 Subject: [PATCH 237/502] s390/stp: allow group and users to read stp sysfs files There are no secrets in these files, so allow all users to read it. Signed-off-by: Sven Schnelle Acked-by: Heiko Carstens Signed-off-by: Heiko Carstens --- arch/s390/kernel/time.c | 49 ++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 6bc20861fff9..700127ba689d 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -683,7 +683,7 @@ static struct bus_type stp_subsys = { .dev_name = "stp", }; -static ssize_t stp_ctn_id_show(struct device *dev, +static ssize_t ctn_id_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -693,9 +693,9 @@ static ssize_t stp_ctn_id_show(struct device *dev, *(unsigned long long *) stp_info.ctnid); } -static DEVICE_ATTR(ctn_id, 0400, stp_ctn_id_show, NULL); +static DEVICE_ATTR_RO(ctn_id); -static ssize_t stp_ctn_type_show(struct device *dev, +static ssize_t ctn_type_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -704,9 +704,9 @@ static ssize_t stp_ctn_type_show(struct device *dev, return sprintf(buf, "%i\n", stp_info.ctn); } -static DEVICE_ATTR(ctn_type, 0400, stp_ctn_type_show, NULL); +static 
DEVICE_ATTR_RO(ctn_type); -static ssize_t stp_dst_offset_show(struct device *dev, +static ssize_t dst_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -715,9 +715,9 @@ static ssize_t stp_dst_offset_show(struct device *dev, return sprintf(buf, "%i\n", (int)(s16) stp_info.dsto); } -static DEVICE_ATTR(dst_offset, 0400, stp_dst_offset_show, NULL); +static DEVICE_ATTR_RO(dst_offset); -static ssize_t stp_leap_seconds_show(struct device *dev, +static ssize_t leap_seconds_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -726,9 +726,9 @@ static ssize_t stp_leap_seconds_show(struct device *dev, return sprintf(buf, "%i\n", (int)(s16) stp_info.leaps); } -static DEVICE_ATTR(leap_seconds, 0400, stp_leap_seconds_show, NULL); +static DEVICE_ATTR_RO(leap_seconds); -static ssize_t stp_stratum_show(struct device *dev, +static ssize_t stratum_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -737,9 +737,9 @@ static ssize_t stp_stratum_show(struct device *dev, return sprintf(buf, "%i\n", (int)(s16) stp_info.stratum); } -static DEVICE_ATTR(stratum, 0400, stp_stratum_show, NULL); +static DEVICE_ATTR_RO(stratum); -static ssize_t stp_time_offset_show(struct device *dev, +static ssize_t time_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -748,9 +748,9 @@ static ssize_t stp_time_offset_show(struct device *dev, return sprintf(buf, "%i\n", (int) stp_info.tto); } -static DEVICE_ATTR(time_offset, 0400, stp_time_offset_show, NULL); +static DEVICE_ATTR_RO(time_offset); -static ssize_t stp_time_zone_offset_show(struct device *dev, +static ssize_t time_zone_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -759,10 +759,9 @@ static ssize_t stp_time_zone_offset_show(struct device *dev, return sprintf(buf, "%i\n", (int)(s16) stp_info.tzo); } -static DEVICE_ATTR(time_zone_offset, 0400, - stp_time_zone_offset_show, NULL); +static DEVICE_ATTR_RO(time_zone_offset); -static 
ssize_t stp_timing_mode_show(struct device *dev, +static ssize_t timing_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -771,9 +770,9 @@ static ssize_t stp_timing_mode_show(struct device *dev, return sprintf(buf, "%i\n", stp_info.tmd); } -static DEVICE_ATTR(timing_mode, 0400, stp_timing_mode_show, NULL); +static DEVICE_ATTR_RO(timing_mode); -static ssize_t stp_timing_state_show(struct device *dev, +static ssize_t timing_state_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -782,16 +781,16 @@ static ssize_t stp_timing_state_show(struct device *dev, return sprintf(buf, "%i\n", stp_info.tst); } -static DEVICE_ATTR(timing_state, 0400, stp_timing_state_show, NULL); +static DEVICE_ATTR_RO(timing_state); -static ssize_t stp_online_show(struct device *dev, +static ssize_t online_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%i\n", stp_online); } -static ssize_t stp_online_store(struct device *dev, +static ssize_t online_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { @@ -817,18 +816,14 @@ static ssize_t stp_online_store(struct device *dev, * Can't use DEVICE_ATTR because the attribute should be named * stp/online but dev_attr_online already exists in this file .. 
*/ -static struct device_attribute dev_attr_stp_online = { - .attr = { .name = "online", .mode = 0600 }, - .show = stp_online_show, - .store = stp_online_store, -}; +static DEVICE_ATTR_RW(online); static struct device_attribute *stp_attributes[] = { &dev_attr_ctn_id, &dev_attr_ctn_type, &dev_attr_dst_offset, &dev_attr_leap_seconds, - &dev_attr_stp_online, + &dev_attr_online, &dev_attr_stratum, &dev_attr_time_offset, &dev_attr_time_zone_offset, From f05f62d04271faa265c7a4f75638ebc380d182fa Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 25 Jun 2020 17:00:29 +0200 Subject: [PATCH 238/502] s390/vmem: get rid of memory segment list I can't come up with a satisfying reason why we still need the memory segment list. We used to represent in the list: - boot memory - standby memory added via add_memory() - loaded dcss segments When loading/unloading dcss segments, we already track them in a separate list and check for overlaps (arch/s390/mm/extmem.c:segment_overlaps_others()) when loading segments. The overlap check was introduced for some segments in commit b2300b9efe1b ("[S390] dcssblk: add >2G DCSSs support and stacked contiguous DCSSs support.") and was extended to cover all dcss segments in commit ca57114609d1 ("s390/extmem: remove code for 31 bit addressing mode"). Although I doubt that overlaps with boot memory and standby memory are relevant, let's reshuffle the checks in load_segment() to request the resource first. This will bail out in case we have overlaps with other resources (esp. boot memory and standby memory). The order is now different compared to segment_unload() and segment_unload(), but that should not matter. This smells like a leftover from ancient times, let's get rid of it. We can now convert vmem_remove_mapping() into a void function - everybody ignored the return value already. 
Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Andrew Morton Signed-off-by: David Hildenbrand Message-Id: <20200625150029.45019-1-david@redhat.com> Reviewed-by: Gerald Schaefer Tested-by: Gerald Schaefer [DCSS] Signed-off-by: Heiko Carstens --- arch/s390/include/asm/pgtable.h | 2 +- arch/s390/mm/extmem.c | 25 +++---- arch/s390/mm/vmem.c | 115 ++------------------------------ 3 files changed, 21 insertions(+), 121 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 19d603bd1f36..7eb01a5459cd 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1669,7 +1669,7 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset) #define kern_addr_valid(addr) (1) extern int vmem_add_mapping(unsigned long start, unsigned long size); -extern int vmem_remove_mapping(unsigned long start, unsigned long size); +extern void vmem_remove_mapping(unsigned long start, unsigned long size); extern int s390_enable_sie(void); extern int s390_enable_skey(void); extern void s390_reset_cmma(struct mm_struct *mm); diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c index 9e0aa7aa03ba..105c09282f8c 100644 --- a/arch/s390/mm/extmem.c +++ b/arch/s390/mm/extmem.c @@ -313,15 +313,10 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long goto out_free; } - rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1); - - if (rc) - goto out_free; - seg->res = kzalloc(sizeof(struct resource), GFP_KERNEL); if (seg->res == NULL) { rc = -ENOMEM; - goto out_shared; + goto out_free; } seg->res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; seg->res->start = seg->start_addr; @@ -335,12 +330,17 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long if (rc == SEG_TYPE_SC || ((rc == SEG_TYPE_SR || rc == SEG_TYPE_ER) && !do_nonshared)) seg->res->flags |= IORESOURCE_READONLY; + + /* Check for overlapping resources before adding the mapping. 
*/ if (request_resource(&iomem_resource, seg->res)) { rc = -EBUSY; - kfree(seg->res); - goto out_shared; + goto out_free_resource; } + rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1); + if (rc) + goto out_resource; + if (do_nonshared) diag_cc = dcss_diag(&loadnsr_scode, seg->dcss_name, &start_addr, &end_addr); @@ -351,14 +351,14 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); rc = diag_cc; - goto out_resource; + goto out_mapping; } if (diag_cc > 1) { pr_warn("Loading DCSS %s failed with rc=%ld\n", name, end_addr); rc = dcss_diag_translate_rc(end_addr); dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); - goto out_resource; + goto out_mapping; } seg->start_addr = start_addr; seg->end = end_addr; @@ -377,11 +377,12 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long (void*) seg->end, segtype_string[seg->vm_segtype]); } goto out; + out_mapping: + vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1); out_resource: release_resource(seg->res); + out_free_resource: kfree(seg->res); - out_shared: - vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1); out_free: kfree(seg); out: diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 8b6282cf7d13..3b9e71654c37 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -20,14 +20,6 @@ static DEFINE_MUTEX(vmem_mutex); -struct memory_segment { - struct list_head list; - unsigned long start; - unsigned long size; -}; - -static LIST_HEAD(mem_segs); - static void __ref *vmem_alloc_pages(unsigned int order) { unsigned long size = PAGE_SIZE << order; @@ -300,94 +292,25 @@ void vmemmap_free(unsigned long start, unsigned long end, { } -/* - * Add memory segment to the segment list if it doesn't overlap with - * an already present segment. 
- */ -static int insert_memory_segment(struct memory_segment *seg) +void vmem_remove_mapping(unsigned long start, unsigned long size) { - struct memory_segment *tmp; - - if (seg->start + seg->size > VMEM_MAX_PHYS || - seg->start + seg->size < seg->start) - return -ERANGE; - - list_for_each_entry(tmp, &mem_segs, list) { - if (seg->start >= tmp->start + tmp->size) - continue; - if (seg->start + seg->size <= tmp->start) - continue; - return -ENOSPC; - } - list_add(&seg->list, &mem_segs); - return 0; -} - -/* - * Remove memory segment from the segment list. - */ -static void remove_memory_segment(struct memory_segment *seg) -{ - list_del(&seg->list); -} - -static void __remove_shared_memory(struct memory_segment *seg) -{ - remove_memory_segment(seg); - vmem_remove_range(seg->start, seg->size); -} - -int vmem_remove_mapping(unsigned long start, unsigned long size) -{ - struct memory_segment *seg; - int ret; - mutex_lock(&vmem_mutex); - - ret = -ENOENT; - list_for_each_entry(seg, &mem_segs, list) { - if (seg->start == start && seg->size == size) - break; - } - - if (seg->start != start || seg->size != size) - goto out; - - ret = 0; - __remove_shared_memory(seg); - kfree(seg); -out: + vmem_remove_range(start, size); mutex_unlock(&vmem_mutex); - return ret; } int vmem_add_mapping(unsigned long start, unsigned long size) { - struct memory_segment *seg; int ret; + if (start + size > VMEM_MAX_PHYS || + start + size < start) + return -ERANGE; + mutex_lock(&vmem_mutex); - ret = -ENOMEM; - seg = kzalloc(sizeof(*seg), GFP_KERNEL); - if (!seg) - goto out; - seg->start = start; - seg->size = size; - - ret = insert_memory_segment(seg); - if (ret) - goto out_free; - ret = vmem_add_mem(start, size); if (ret) - goto out_remove; - goto out; - -out_remove: - __remove_shared_memory(seg); -out_free: - kfree(seg); -out: + vmem_remove_range(start, size); mutex_unlock(&vmem_mutex); return ret; } @@ -421,27 +344,3 @@ void __init vmem_map_init(void) pr_info("Write protected kernel read-only 
data: %luk\n", (unsigned long)(__end_rodata - _stext) >> 10); } - -/* - * Convert memblock.memory to a memory segment list so there is a single - * list that contains all memory segments. - */ -static int __init vmem_convert_memory_chunk(void) -{ - struct memblock_region *reg; - struct memory_segment *seg; - - mutex_lock(&vmem_mutex); - for_each_memblock(memory, reg) { - seg = kzalloc(sizeof(*seg), GFP_KERNEL); - if (!seg) - panic("Out of memory...\n"); - seg->start = reg->base; - seg->size = reg->size; - insert_memory_segment(seg); - } - mutex_unlock(&vmem_mutex); - return 0; -} - -core_initcall(vmem_convert_memory_chunk); From 5cdfbdce5de6b5b56e104676409762fc1289a9c2 Mon Sep 17 00:00:00 2001 From: Oscar Carter Date: Sat, 27 Jun 2020 14:54:17 +0200 Subject: [PATCH 239/502] s390/tty3270: remove function callback casts In an effort to enable -Wcast-function-type in the top-level Makefile to support Control Flow Integrity builds, remove all the function callback casts. To do this modify the function prototypes accordingly. Signed-off-by: Oscar Carter Message-Id: <20200627125417.18887-1-oscar.carter@gmx.com> Reviewed-by: Kees Cook [heiko.carstens@de.ibm.com: coding style changes] Signed-off-by: Heiko Carstens --- drivers/s390/char/tty3270.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/s390/char/tty3270.c b/drivers/s390/char/tty3270.c index 98d7fc152e32..aec996de44d9 100644 --- a/drivers/s390/char/tty3270.c +++ b/drivers/s390/char/tty3270.c @@ -556,8 +556,9 @@ tty3270_scroll_backward(struct kbd_data *kbd) * Pass input line to tty. 
*/ static void -tty3270_read_tasklet(struct raw3270_request *rrq) +tty3270_read_tasklet(unsigned long data) { + struct raw3270_request *rrq = (struct raw3270_request *)data; static char kreset_data = TW_KR; struct tty3270 *tp = container_of(rrq->view, struct tty3270, view); char *input; @@ -652,8 +653,9 @@ tty3270_issue_read(struct tty3270 *tp, int lock) * Hang up the tty */ static void -tty3270_hangup_tasklet(struct tty3270 *tp) +tty3270_hangup_tasklet(unsigned long data) { + struct tty3270 *tp = (struct tty3270 *)data; tty_port_tty_hangup(&tp->port, true); raw3270_put_view(&tp->view); } @@ -752,11 +754,9 @@ tty3270_alloc_view(void) tty_port_init(&tp->port); timer_setup(&tp->timer, tty3270_update, 0); - tasklet_init(&tp->readlet, - (void (*)(unsigned long)) tty3270_read_tasklet, + tasklet_init(&tp->readlet, tty3270_read_tasklet, (unsigned long) tp->read); - tasklet_init(&tp->hanglet, - (void (*)(unsigned long)) tty3270_hangup_tasklet, + tasklet_init(&tp->hanglet, tty3270_hangup_tasklet, (unsigned long) tp); INIT_WORK(&tp->resize_work, tty3270_resize_work); From d4e0340919fb9190a57e879fb3125c4acce0d9b2 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Mon, 22 Jun 2020 18:18:02 -0700 Subject: [PATCH 240/502] arm64/module: Optimize module load time by optimizing PLT counting When loading a module, module_frob_arch_sections() tries to figure out the number of PLTs that'll be needed to handle all the RELAs. While doing this, it tries to dedupe PLT allocations for multiple R_AARCH64_CALL26 relocations to the same symbol. It does the same for R_AARCH64_JUMP26 relocations. To make checks for duplicates easier/faster, it sorts the relocation list by type, symbol and addend. That way, to check for a duplicate relocation, it just needs to compare with the previous entry. However, sorting the entire relocation array is unnecessary and expensive (O(n log n)) because there are a lot of other relocation types that don't need deduping or can't be deduped. 
So this commit partitions the array into entries that need deduping and those that don't. And then sorts just the part that needs deduping. And when CONFIG_RANDOMIZE_BASE is disabled, the sorting is skipped entirely because PLTs are not allocated for R_AARCH64_CALL26 and R_AARCH64_JUMP26 if it's disabled. This gives significant reduction in module load time for modules with large number of relocations with no measurable impact on modules with a small number of relocations. In my test setup with CONFIG_RANDOMIZE_BASE enabled, these were the results for a few downstream modules: Module Size (MB) wlan 14 video codec 3.8 drm 1.8 IPA 2.5 audio 1.2 gpu 1.8 Without this patch: Module Number of entries sorted Module load time (ms) wlan 243739 283 video codec 74029 138 drm 53837 67 IPA 42800 90 audio 21326 27 gpu 20967 32 Total time to load all these module: 637 ms With this patch: Module Number of entries sorted Module load time (ms) wlan 22454 61 video codec 10150 47 drm 13014 40 IPA 8097 63 audio 4606 16 gpu 6527 20 Total time to load all these modules: 247 Time saved during boot for just these 6 modules: 390 ms Signed-off-by: Saravana Kannan Acked-by: Will Deacon Cc: Ard Biesheuvel Link: https://lore.kernel.org/r/20200623011803.91232-1-saravanak@google.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/module-plts.c | 46 ++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/module-plts.c b/arch/arm64/kernel/module-plts.c index 65b08a74aec6..0ce3a28e3347 100644 --- a/arch/arm64/kernel/module-plts.c +++ b/arch/arm64/kernel/module-plts.c @@ -253,6 +253,40 @@ static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num, return ret; } +static bool branch_rela_needs_plt(Elf64_Sym *syms, Elf64_Rela *rela, + Elf64_Word dstidx) +{ + + Elf64_Sym *s = syms + ELF64_R_SYM(rela->r_info); + + if (s->st_shndx == dstidx) + return false; + + return ELF64_R_TYPE(rela->r_info) == R_AARCH64_JUMP26 || + 
ELF64_R_TYPE(rela->r_info) == R_AARCH64_CALL26; +} + +/* Group branch PLT relas at the front end of the array. */ +static int partition_branch_plt_relas(Elf64_Sym *syms, Elf64_Rela *rela, + int numrels, Elf64_Word dstidx) +{ + int i = 0, j = numrels - 1; + + if (!IS_ENABLED(CONFIG_RANDOMIZE_BASE)) + return 0; + + while (i < j) { + if (branch_rela_needs_plt(syms, &rela[i], dstidx)) + i++; + else if (branch_rela_needs_plt(syms, &rela[j], dstidx)) + swap(rela[i], rela[j]); + else + j--; + } + + return i; +} + int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings, struct module *mod) { @@ -290,7 +324,7 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, for (i = 0; i < ehdr->e_shnum; i++) { Elf64_Rela *rels = (void *)ehdr + sechdrs[i].sh_offset; - int numrels = sechdrs[i].sh_size / sizeof(Elf64_Rela); + int nents, numrels = sechdrs[i].sh_size / sizeof(Elf64_Rela); Elf64_Shdr *dstsec = sechdrs + sechdrs[i].sh_info; if (sechdrs[i].sh_type != SHT_RELA) @@ -300,8 +334,14 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, if (!(dstsec->sh_flags & SHF_EXECINSTR)) continue; - /* sort by type, symbol index and addend */ - sort(rels, numrels, sizeof(Elf64_Rela), cmp_rela, NULL); + /* + * sort branch relocations requiring a PLT by type, symbol index + * and addend + */ + nents = partition_branch_plt_relas(syms, rels, numrels, + sechdrs[i].sh_info); + if (nents) + sort(rels, nents, sizeof(Elf64_Rela), cmp_rela, NULL); if (!str_has_prefix(secstrings + dstsec->sh_name, ".init")) core_plts += count_plts(syms, rels, numrels, From 3cb9d5464c1ceea86f6225089b2f7965989cf316 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Sat, 13 Jun 2020 16:09:46 +0800 Subject: [PATCH 241/502] perf/x86: Fix variable types for LBR registers The MSR variable type can be 'unsigned int', which uses less memory than the longer 'unsigned long'. Fix 'struct x86_pmu' for that. The lbr_nr won't be a negative number, so make it 'unsigned int' as well. 
Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Wei Wang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200613080958.132489-2-like.xu@linux.intel.com --- arch/x86/events/perf_event.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index e17a3d8a47ed..eb37f6c43c96 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -673,8 +673,8 @@ struct x86_pmu { /* * Intel LBR */ - unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ - int lbr_nr; /* hardware stack size */ + unsigned int lbr_tos, lbr_from, lbr_to, + lbr_nr; /* LBR base regs and size */ u64 lbr_sel_mask; /* LBR_SELECT valid bits */ const int *lbr_sel_map; /* lbr_select mappings */ bool lbr_double_abort; /* duplicated lbr aborts */ From 027440b5d426a51f33b515bbd236cc479d1e051f Mon Sep 17 00:00:00 2001 From: Like Xu Date: Sat, 13 Jun 2020 16:09:47 +0800 Subject: [PATCH 242/502] perf/x86/core: Refactor hw->idx checks and cleanup For intel_pmu_en/disable_event(), reorder the branches checks for hw->idx and make them sorted by probability: gp,fixed,bts,others. Clean up the x86_assign_hw_event() by converting multiple if-else statements to a switch statement. To skip x86_perf_event_update() and x86_perf_event_set_period(), it's generic to replace "idx == INTEL_PMC_IDX_FIXED_BTS" check with '!hwc->event_base' because that should be 0 for all non-gp/fixed cases. Wrap related bit operations into intel_set/clear_masks() and make the main path cleaner and more readable. No functional changes.
Signed-off-by: Like Xu Original-by: Peter Zijlstra (Intel) Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200613080958.132489-3-like.xu@linux.intel.com --- arch/x86/events/core.c | 25 +++++++---- arch/x86/events/intel/core.c | 85 +++++++++++++++++++----------------- 2 files changed, 62 insertions(+), 48 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 4103665c6e03..15cb7af7db18 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -71,10 +71,9 @@ u64 x86_perf_event_update(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; int shift = 64 - x86_pmu.cntval_bits; u64 prev_raw_count, new_raw_count; - int idx = hwc->idx; u64 delta; - if (idx == INTEL_PMC_IDX_FIXED_BTS) + if (unlikely(!hwc->event_base)) return 0; /* @@ -1097,22 +1096,30 @@ static inline void x86_assign_hw_event(struct perf_event *event, struct cpu_hw_events *cpuc, int i) { struct hw_perf_event *hwc = &event->hw; + int idx; - hwc->idx = cpuc->assign[i]; + idx = hwc->idx = cpuc->assign[i]; hwc->last_cpu = smp_processor_id(); hwc->last_tag = ++cpuc->tags[i]; - if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) { + switch (hwc->idx) { + case INTEL_PMC_IDX_FIXED_BTS: hwc->config_base = 0; hwc->event_base = 0; - } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) { + break; + + case INTEL_PMC_IDX_FIXED ... 
INTEL_PMC_IDX_FIXED_BTS-1: hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED); - hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30; - } else { + hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + + (idx - INTEL_PMC_IDX_FIXED); + hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) | 1<<30; + break; + + default: hwc->config_base = x86_pmu_config_addr(hwc->idx); hwc->event_base = x86_pmu_event_addr(hwc->idx); hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx); + break; } } @@ -1233,7 +1240,7 @@ int x86_perf_event_set_period(struct perf_event *event) s64 period = hwc->sample_period; int ret = 0, idx = hwc->idx; - if (idx == INTEL_PMC_IDX_FIXED_BTS) + if (unlikely(!hwc->event_base)) return 0; /* diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index ca35c8b5ee10..8dac4c61bf76 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2136,8 +2136,35 @@ static inline void intel_pmu_ack_status(u64 ack) wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); } -static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) +static inline bool event_is_checkpointed(struct perf_event *event) { + return unlikely(event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0; +} + +static inline void intel_set_masks(struct perf_event *event, int idx) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (event->attr.exclude_host) + __set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_guest_mask); + if (event->attr.exclude_guest) + __set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_host_mask); + if (event_is_checkpointed(event)) + __set_bit(idx, (unsigned long *)&cpuc->intel_cp_status); +} + +static inline void intel_clear_masks(struct perf_event *event, int idx) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + __clear_bit(idx, (unsigned long *)&cpuc->intel_ctrl_guest_mask); + __clear_bit(idx, (unsigned long 
*)&cpuc->intel_ctrl_host_mask); + __clear_bit(idx, (unsigned long *)&cpuc->intel_cp_status); +} + +static void intel_pmu_disable_fixed(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx - INTEL_PMC_IDX_FIXED; u64 ctrl_val, mask; @@ -2148,31 +2175,22 @@ static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) wrmsrl(hwc->config_base, ctrl_val); } -static inline bool event_is_checkpointed(struct perf_event *event) -{ - return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0; -} - static void intel_pmu_disable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int idx = hwc->idx; - if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { + if (idx < INTEL_PMC_IDX_FIXED) { + intel_clear_masks(event, idx); + x86_pmu_disable_event(event); + } else if (idx < INTEL_PMC_IDX_FIXED_BTS) { + intel_clear_masks(event, idx); + intel_pmu_disable_fixed(event); + } else if (idx == INTEL_PMC_IDX_FIXED_BTS) { intel_pmu_disable_bts(); intel_pmu_drain_bts_buffer(); - return; } - cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx); - cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); - cpuc->intel_cp_status &= ~(1ull << hwc->idx); - - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) - intel_pmu_disable_fixed(hwc); - else - x86_pmu_disable_event(event); - /* * Needs to be called after x86_pmu_disable_event, * so we don't trigger the event without PEBS bit set. 
@@ -2238,33 +2256,22 @@ static void intel_pmu_enable_fixed(struct perf_event *event) static void intel_pmu_enable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { - if (!__this_cpu_read(cpu_hw_events.enabled)) - return; - - intel_pmu_enable_bts(hwc->config); - return; - } - - if (event->attr.exclude_host) - cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx); - if (event->attr.exclude_guest) - cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx); - - if (unlikely(event_is_checkpointed(event))) - cpuc->intel_cp_status |= (1ull << hwc->idx); + int idx = hwc->idx; if (unlikely(event->attr.precise_ip)) intel_pmu_pebs_enable(event); - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + if (idx < INTEL_PMC_IDX_FIXED) { + intel_set_masks(event, idx); + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); + } else if (idx < INTEL_PMC_IDX_FIXED_BTS) { + intel_set_masks(event, idx); intel_pmu_enable_fixed(event); - return; + } else if (idx == INTEL_PMC_IDX_FIXED_BTS) { + if (!__this_cpu_read(cpu_hw_events.enabled)) + return; + intel_pmu_enable_bts(hwc->config); } - - __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); } static void intel_pmu_add_event(struct perf_event *event) From b2d6504761a50b9493eb4b20f6e188b673f20c32 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Sat, 13 Jun 2020 16:09:48 +0800 Subject: [PATCH 243/502] perf/x86/lbr: Add interface to get LBR information The LBR records msrs are model specific. The perf subsystem has already obtained the base addresses of LBR records based on the cpu model. Therefore, an interface is added to allow callers outside the perf subsystem to obtain these LBR information. It's useful for hypervisors to emulate the LBR feature for guests with less code. 
Signed-off-by: Like Xu Signed-off-by: Wei Wang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200613080958.132489-4-like.xu@linux.intel.com --- arch/x86/events/intel/lbr.c | 20 ++++++++++++++++++++ arch/x86/include/asm/perf_event.h | 12 ++++++++++++ 2 files changed, 32 insertions(+) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 65113b16804a..2ed3f2a51bdf 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1343,3 +1343,23 @@ void intel_pmu_lbr_init_knl(void) if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_LIP) x86_pmu.intel_cap.lbr_format = LBR_FORMAT_EIP_FLAGS; } + +/** + * x86_perf_get_lbr - get the LBR records information + * + * @lbr: the caller's memory to store the LBR records information + * + * Returns: 0 indicates the LBR info has been successfully obtained + */ +int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) +{ + int lbr_fmt = x86_pmu.intel_cap.lbr_format; + + lbr->nr = x86_pmu.lbr_nr; + lbr->from = x86_pmu.lbr_from; + lbr->to = x86_pmu.lbr_to; + lbr->info = (lbr_fmt == LBR_FORMAT_INFO) ? 
MSR_LBR_INFO_0 : 0; + + return 0; +} +EXPORT_SYMBOL_GPL(x86_perf_get_lbr); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index e855e9cf2c37..5d2c30f0df02 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -333,6 +333,13 @@ struct perf_guest_switch_msr { u64 host, guest; }; +struct x86_pmu_lbr { + unsigned int nr; + unsigned int from; + unsigned int to; + unsigned int info; +}; + extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); extern void perf_check_microcode(void); extern int x86_perf_rdpmc_index(struct perf_event *event); @@ -348,12 +355,17 @@ static inline void perf_check_microcode(void) { } #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); +extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr); #else static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) { *nr = 0; return NULL; } +static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) +{ + return -1; +} #endif #ifdef CONFIG_CPU_SUP_INTEL From 097e4311cda952dfb047f2a49d35aa5de500d474 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Sat, 13 Jun 2020 16:09:49 +0800 Subject: [PATCH 244/502] perf/x86: Add constraint to create guest LBR event without hw counter The hypervisor may request the perf subsystem to schedule a time window to directly access the LBR records msrs for its own use. Normally, it would create a guest LBR event with callstack mode enabled, which is scheduled along with other ordinary LBR events on the host but in an exclusive way. To avoid wasting a counter for the guest LBR event, the perf tracks its hw->idx via INTEL_PMC_IDX_FIXED_VLBR and assigns it with a fake VLBR counter with the help of new vlbr_constraint. As with the BTS event, there is actually no hardware counter assigned for the guest LBR event. 
Signed-off-by: Like Xu Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200514083054.62538-5-like.xu@linux.intel.com --- arch/x86/events/core.c | 1 + arch/x86/events/intel/core.c | 18 ++++++++++++++++++ arch/x86/events/intel/lbr.c | 4 ++++ arch/x86/events/perf_event.h | 1 + arch/x86/include/asm/perf_event.h | 22 +++++++++++++++++++++- 5 files changed, 45 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 15cb7af7db18..d740c861724c 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1104,6 +1104,7 @@ static inline void x86_assign_hw_event(struct perf_event *event, switch (hwc->idx) { case INTEL_PMC_IDX_FIXED_BTS: + case INTEL_PMC_IDX_FIXED_VLBR: hwc->config_base = 0; hwc->event_base = 0; break; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 8dac4c61bf76..51e1fba7b1d1 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2621,6 +2621,20 @@ intel_bts_constraints(struct perf_event *event) return NULL; } +/* + * Note: matches a fake event, like Fixed2. 
+ */ +static struct event_constraint * +intel_vlbr_constraints(struct perf_event *event) +{ + struct event_constraint *c = &vlbr_constraint; + + if (unlikely(constraint_match(c, event->hw.config))) + return c; + + return NULL; +} + static int intel_alt_er(int idx, u64 config) { int alt_idx = idx; @@ -2811,6 +2825,10 @@ __intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, { struct event_constraint *c; + c = intel_vlbr_constraints(event); + if (c) + return c; + c = intel_bts_constraints(event); if (c) return c; diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 2ed3f2a51bdf..d285d26c1578 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1363,3 +1363,7 @@ int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) return 0; } EXPORT_SYMBOL_GPL(x86_perf_get_lbr); + +struct event_constraint vlbr_constraint = + FIXED_EVENT_CONSTRAINT(INTEL_FIXED_VLBR_EVENT, + (INTEL_PMC_IDX_FIXED_VLBR - INTEL_PMC_IDX_FIXED)); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index eb37f6c43c96..77a6dd66bd9a 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -990,6 +990,7 @@ void release_ds_buffers(void); void reserve_ds_buffers(void); extern struct event_constraint bts_constraint; +extern struct event_constraint vlbr_constraint; void intel_pmu_enable_bts(u64 config); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 5d2c30f0df02..2df707311d17 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -192,9 +192,29 @@ struct x86_pmu_capability { #define GLOBAL_STATUS_UNC_OVF BIT_ULL(61) #define GLOBAL_STATUS_ASIF BIT_ULL(60) #define GLOBAL_STATUS_COUNTERS_FROZEN BIT_ULL(59) -#define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(58) +#define GLOBAL_STATUS_LBRS_FROZEN_BIT 58 +#define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(GLOBAL_STATUS_LBRS_FROZEN_BIT) #define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(55) +/* + * We model guest LBR event 
tracing as another fixed-mode PMC like BTS. + * + * We choose bit 58 because it's used to indicate LBR stack frozen state + * for architectural perfmon v4, also we unconditionally mask that bit in + * the handle_pmi_common(), so it'll never be set in the overflow handling. + * + * With this fake counter assigned, the guest LBR event user (such as KVM), + * can program the LBR registers on its own, and we don't actually do anything + * with then in the host context. + */ +#define INTEL_PMC_IDX_FIXED_VLBR (GLOBAL_STATUS_LBRS_FROZEN_BIT) + +/* + * Pseudo-encoding the guest LBR event as event=0x00,umask=0x1b, + * since it would claim bit 58 which is effectively Fixed26. + */ +#define INTEL_FIXED_VLBR_EVENT 0x1b00 + /* * Adaptive PEBS v4 */ From e1ad1ac2deb8f90af9f12ff316989dd5675dec11 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Sat, 13 Jun 2020 16:09:50 +0800 Subject: [PATCH 245/502] perf/x86: Keep LBR records unchanged in host context for guest usage When a guest wants to use the LBR registers, its hypervisor creates a guest LBR event and lets host perf schedule it. The LBR records msrs are accessible to the guest when its guest LBR event is scheduled on by the perf subsystem. Before scheduling this event out, we should avoid host changes on IA32_DEBUGCTLMSR or LBR_SELECT. Otherwise, some unexpected branch operations may interfere with guest behavior, pollute LBR records, and even cause host branch leakage. In addition, the read operation on host is also avoidable. To ensure that guest LBR records are not lost during the context switch, the guest LBR event would enable the callstack mode which could save/restore guest unread LBR records with the help of intel_pmu_lbr_sched_task() naturally. However, the guest LBR_SELECT may change for its own use and the host LBR event doesn't save/restore it.
To ensure that we don't lose the guest LBR_SELECT value when the guest LBR event is running, the vlbr_constraint is bound up with a new constraint flag PERF_X86_EVENT_LBR_SELECT. Signed-off-by: Like Xu Signed-off-by: Wei Wang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200514083054.62538-6-like.xu@linux.intel.com --- arch/x86/events/intel/core.c | 6 ++++-- arch/x86/events/intel/lbr.c | 31 ++++++++++++++++++++++++++----- arch/x86/events/perf_event.h | 3 +++ 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 51e1fba7b1d1..582ddff9a359 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2189,7 +2189,8 @@ static void intel_pmu_disable_event(struct perf_event *event) } else if (idx == INTEL_PMC_IDX_FIXED_BTS) { intel_pmu_disable_bts(); intel_pmu_drain_bts_buffer(); - } + } else if (idx == INTEL_PMC_IDX_FIXED_VLBR) + intel_clear_masks(event, idx); /* * Needs to be called after x86_pmu_disable_event, @@ -2271,7 +2272,8 @@ static void intel_pmu_enable_event(struct perf_event *event) if (!__this_cpu_read(cpu_hw_events.enabled)) return; intel_pmu_enable_bts(hwc->config); - } + } else if (idx == INTEL_PMC_IDX_FIXED_VLBR) + intel_set_masks(event, idx); } static void intel_pmu_add_event(struct perf_event *event) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index d285d26c1578..d03de7539957 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -383,6 +383,9 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) wrmsrl(x86_pmu.lbr_tos, tos); task_ctx->lbr_stack_state = LBR_NONE; + + if (cpuc->lbr_select) + wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) @@ -415,6 +418,9 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) cpuc->last_task_ctx = task_ctx; cpuc->last_log_id =
++task_ctx->log_id; + + if (cpuc->lbr_select) + rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, @@ -485,6 +491,9 @@ void intel_pmu_lbr_add(struct perf_event *event) if (!x86_pmu.lbr_nr) return; + if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT) + cpuc->lbr_select = 1; + cpuc->br_sel = event->hw.branch_reg.reg; if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) { @@ -532,6 +541,9 @@ void intel_pmu_lbr_del(struct perf_event *event) task_ctx->lbr_callstack_users--; } + if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT) + cpuc->lbr_select = 0; + if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0) cpuc->lbr_pebs_users--; cpuc->lbr_users--; @@ -540,11 +552,19 @@ void intel_pmu_lbr_del(struct perf_event *event) perf_sched_cb_dec(event->ctx->pmu); } +static inline bool vlbr_exclude_host(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + return test_bit(INTEL_PMC_IDX_FIXED_VLBR, + (unsigned long *)&cpuc->intel_ctrl_guest_mask); +} + void intel_pmu_lbr_enable_all(bool pmi) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - if (cpuc->lbr_users) + if (cpuc->lbr_users && !vlbr_exclude_host()) __intel_pmu_lbr_enable(pmi); } @@ -552,7 +572,7 @@ void intel_pmu_lbr_disable_all(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - if (cpuc->lbr_users) + if (cpuc->lbr_users && !vlbr_exclude_host()) __intel_pmu_lbr_disable(); } @@ -694,7 +714,8 @@ void intel_pmu_lbr_read(void) * This could be smarter and actually check the event, * but this simple approach seems to work for now. 
*/ - if (!cpuc->lbr_users || cpuc->lbr_users == cpuc->lbr_pebs_users) + if (!cpuc->lbr_users || vlbr_exclude_host() || + cpuc->lbr_users == cpuc->lbr_pebs_users) return; if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) @@ -1365,5 +1386,5 @@ int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) EXPORT_SYMBOL_GPL(x86_perf_get_lbr); struct event_constraint vlbr_constraint = - FIXED_EVENT_CONSTRAINT(INTEL_FIXED_VLBR_EVENT, - (INTEL_PMC_IDX_FIXED_VLBR - INTEL_PMC_IDX_FIXED)); + __EVENT_CONSTRAINT(INTEL_FIXED_VLBR_EVENT, (1ULL << INTEL_PMC_IDX_FIXED_VLBR), + FIXED_EVENT_FLAGS, 1, 0, PERF_X86_EVENT_LBR_SELECT); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 77a6dd66bd9a..81475963df99 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -78,6 +78,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode) #define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */ #define PERF_X86_EVENT_PEBS_VIA_PT 0x0800 /* use PT buffer for PEBS */ #define PERF_X86_EVENT_PAIR 0x1000 /* Large Increment per Cycle */ +#define PERF_X86_EVENT_LBR_SELECT 0x2000 /* Save/Restore MSR_LBR_SELECT */ struct amd_nb { int nb_id; /* NorthBridge id */ @@ -237,6 +238,7 @@ struct cpu_hw_events { u64 br_sel; struct x86_perf_task_context *last_task_ctx; int last_log_id; + int lbr_select; /* * Intel host/guest exclude bits @@ -722,6 +724,7 @@ struct x86_perf_task_context { u64 lbr_from[MAX_LBR_ENTRIES]; u64 lbr_to[MAX_LBR_ENTRIES]; u64 lbr_info[MAX_LBR_ENTRIES]; + u64 lbr_sel; int tos; int valid_lbrs; int lbr_callstack_users; From 638d503130098e234b002942b33a4d886ef6f270 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 29 Jun 2020 10:08:31 +0530 Subject: [PATCH 246/502] arm64/panic: Unify all three existing notifier blocks Currently there are three different registered panic notifier blocks. 
This unifies all of them into a single one i.e., arm64_panic_block, hence reducing code duplication and required calling sequence during panic. This preserves the existing dump sequence. While here, just use device_initcall() directly instead of __initcall() which has been a legacy alias for the former. This replacement is a pure cleanup with no functional implications. Signed-off-by: Anshuman Khandual Acked-by: Mark Rutland Cc: Will Deacon Cc: Steve Capper Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Link: https://lore.kernel.org/r/1593405511-7625-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cpufeature.h | 1 + arch/arm64/include/asm/memory.h | 1 + arch/arm64/kernel/cpufeature.c | 15 +-------------- arch/arm64/kernel/setup.c | 24 ++++++++++++++---------- arch/arm64/mm/init.c | 18 +----------------- 5 files changed, 18 insertions(+), 41 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 5d1f4ae42799..e375529ca9fc 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -774,6 +774,7 @@ static inline unsigned int get_vmid_bits(u64 mmfr1) } u32 get_kvm_ipa_limit(void); +void dump_cpu_features(void); #endif /* __ASSEMBLY__ */ diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index a1871bb32bb1..2a88cb734d06 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -322,6 +322,7 @@ static inline void *phys_to_virt(phys_addr_t x) __is_lm_address(__addr) && pfn_valid(virt_to_pfn(__addr)); \ }) +void dump_mem_limit(void); #endif /* !ASSEMBLY */ /* diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 9f63053a63a9..9b79df930396 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -119,25 +119,12 @@ static inline void finalize_system_capabilities(void)
static_branch_enable(&arm64_const_caps_ready); } -static int dump_cpu_hwcaps(struct notifier_block *self, unsigned long v, void *p) +void dump_cpu_features(void) { /* file-wide pr_fmt adds "CPU features: " prefix */ pr_emerg("0x%*pb\n", ARM64_NCAPS, &cpu_hwcaps); - return 0; } -static struct notifier_block cpu_hwcaps_notifier = { - .notifier_call = dump_cpu_hwcaps -}; - -static int __init register_cpu_hwcaps_dumper(void) -{ - atomic_notifier_chain_register(&panic_notifier_list, - &cpu_hwcaps_notifier); - return 0; -} -__initcall(register_cpu_hwcaps_dumper); - DEFINE_STATIC_KEY_ARRAY_FALSE(cpu_hwcap_keys, ARM64_NCAPS); EXPORT_SYMBOL(cpu_hwcap_keys); diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 93b3844cf442..c793276ec7ad 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -400,11 +400,7 @@ static int __init topology_init(void) } subsys_initcall(topology_init); -/* - * Dump out kernel offset information on panic. - */ -static int dump_kernel_offset(struct notifier_block *self, unsigned long v, - void *p) +static void dump_kernel_offset(void) { const unsigned long offset = kaslr_offset(); @@ -415,17 +411,25 @@ static int dump_kernel_offset(struct notifier_block *self, unsigned long v, } else { pr_emerg("Kernel Offset: disabled\n"); } +} + +static int arm64_panic_block_dump(struct notifier_block *self, + unsigned long v, void *p) +{ + dump_kernel_offset(); + dump_cpu_features(); + dump_mem_limit(); return 0; } -static struct notifier_block kernel_offset_notifier = { - .notifier_call = dump_kernel_offset +static struct notifier_block arm64_panic_block = { + .notifier_call = arm64_panic_block_dump }; -static int __init register_kernel_offset_dumper(void) +static int __init register_arm64_panic_block(void) { atomic_notifier_chain_register(&panic_notifier_list, - &kernel_offset_notifier); + &arm64_panic_block); return 0; } -__initcall(register_kernel_offset_dumper); +device_initcall(register_arm64_panic_block); diff --git 
a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 1e93cfc7c47a..6c3eb424c613 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -563,27 +563,11 @@ void free_initmem(void) unmap_kernel_range((u64)__init_begin, (u64)(__init_end - __init_begin)); } -/* - * Dump out memory limit information on panic. - */ -static int dump_mem_limit(struct notifier_block *self, unsigned long v, void *p) +void dump_mem_limit(void) { if (memory_limit != PHYS_ADDR_MAX) { pr_emerg("Memory Limit: %llu MB\n", memory_limit >> 20); } else { pr_emerg("Memory Limit: none\n"); } - return 0; } - -static struct notifier_block mem_limit_notifier = { - .notifier_call = dump_mem_limit, -}; - -static int __init register_mem_limit_dumper(void) -{ - atomic_notifier_chain_register(&panic_notifier_list, - &mem_limit_notifier); - return 0; -} -__initcall(register_mem_limit_dumper); From 1d50e5d0c5052446cb85a3bf11fe8ba4e8d770ca Mon Sep 17 00:00:00 2001 From: Bhupesh Sharma Date: Thu, 14 May 2020 00:22:36 +0530 Subject: [PATCH 247/502] crash_core, vmcoreinfo: Append 'MAX_PHYSMEM_BITS' to vmcoreinfo Right now user-space tools like 'makedumpfile' and 'crash' need to rely on a best-guess method of determining the value of 'MAX_PHYSMEM_BITS' supported by the underlying kernel. This value is used in user-space code to calculate the bit-space required to store a section for SPARSEMEM (similar to the existing calculation method used in the kernel implementation): #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) Now, regressions have been reported in user-space utilities like 'makedumpfile' and 'crash' on arm64, with the recently added kernel support for 52-bit physical address space, as there is no clear method of determining this value in user-space (other than reading kernel CONFIG flags).
As per suggestion from makedumpfile maintainer (Kazu), it makes more sense to append 'MAX_PHYSMEM_BITS' to vmcoreinfo in the core code itself rather than in arch-specific code, so that the user-space code for other archs can also benefit from this addition to the vmcoreinfo and use it as a standard way of determining 'SECTIONS_SHIFT' value in user-land. A reference 'makedumpfile' implementation which reads the 'MAX_PHYSMEM_BITS' value from vmcoreinfo in a arch-independent fashion is available here: While at it also update vmcoreinfo documentation for 'MAX_PHYSMEM_BITS' variable being added to vmcoreinfo. 'MAX_PHYSMEM_BITS' defines the maximum supported physical address space memory. Signed-off-by: Bhupesh Sharma Tested-by: John Donnelly Acked-by: Dave Young Cc: Boris Petkov Cc: Ingo Molnar Cc: Thomas Gleixner Cc: James Morse Cc: Mark Rutland Cc: Will Deacon Cc: Michael Ellerman Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Dave Anderson Cc: Kazuhito Hagio Cc: x86@kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Cc: kexec@lists.infradead.org Link: https://lore.kernel.org/r/1589395957-24628-2-git-send-email-bhsharma@redhat.com Signed-off-by: Catalin Marinas --- Documentation/admin-guide/kdump/vmcoreinfo.rst | 5 +++++ kernel/crash_core.c | 1 + 2 files changed, 6 insertions(+) diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst index e4ee8b2db604..2a632020f809 100644 --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -93,6 +93,11 @@ It exists in the sparse memory mapping model, and it is also somewhat similar to the mem_map variable, both of them are used to translate an address. +MAX_PHYSMEM_BITS +---------------- + +Defines the maximum supported physical address space memory. 
+ page ---- diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 9f1557b98468..18175687133a 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -413,6 +413,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); VMCOREINFO_STRUCT_SIZE(mem_section); VMCOREINFO_OFFSET(mem_section, section_mem_map); + VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS); #endif VMCOREINFO_STRUCT_SIZE(page); VMCOREINFO_STRUCT_SIZE(pglist_data); From bbdbc11804ff0b4130e7550113b452e96a74d16e Mon Sep 17 00:00:00 2001 From: Bhupesh Sharma Date: Thu, 14 May 2020 00:22:37 +0530 Subject: [PATCH 248/502] arm64/crash_core: Export TCR_EL1.T1SZ in vmcoreinfo TCR_EL1.TxSZ, which controls the VA space size, is configured by a single kernel image to support either 48-bit or 52-bit VA space. If the ARMv8.2-LVA optional feature is present and we are running with a 64KB page size, then it is possible to use 52-bits of address space for both userspace and kernel addresses. However, any kernel binary that supports 52-bit must also be able to fall back to 48-bit at early boot time if the hardware feature is not present. Since TCR_EL1.T1SZ indicates the size of the memory region addressed by TTBR1_EL1, export the same in vmcoreinfo. User-space utilities like makedumpfile and crash-utility need to read this value from vmcoreinfo for determining if a virtual address lies in the linear map range. While at it also add documentation for TCR_EL1.T1SZ variable being added to vmcoreinfo. It indicates the size offset of the memory region addressed by TTBR1_EL1. 
Signed-off-by: Bhupesh Sharma Tested-by: John Donnelly Tested-by: Kamlakant Patel Tested-by: Amit Daniel Kachhap Reviewed-by: James Morse Reviewed-by: Amit Daniel Kachhap Cc: James Morse Cc: Mark Rutland Cc: Will Deacon Cc: Steve Capper Cc: Ard Biesheuvel Cc: Dave Anderson Cc: Kazuhito Hagio Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Cc: kexec@lists.infradead.org Link: https://lore.kernel.org/r/1589395957-24628-3-git-send-email-bhsharma@redhat.com [catalin.marinas@arm.com: removed vabits_actual from the commit log] Signed-off-by: Catalin Marinas --- Documentation/admin-guide/kdump/vmcoreinfo.rst | 11 +++++++++++ arch/arm64/include/asm/pgtable-hwdef.h | 1 + arch/arm64/kernel/crash_core.c | 10 ++++++++++ 3 files changed, 22 insertions(+) diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst index 2a632020f809..2baad0bfb09d 100644 --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -404,6 +404,17 @@ KERNELPACMASK The mask to extract the Pointer Authentication Code from a kernel virtual address. +TCR_EL1.T1SZ +------------ + +Indicates the size offset of the memory region addressed by TTBR1_EL1. +The region size is 2^(64-T1SZ) bytes. + +TTBR1_EL1 is the table base address register specified by ARMv8-A +architecture which is used to lookup the page-tables for the Virtual +addresses in the higher VA range (refer to ARMv8 ARM document for +more details). 
+ arm === diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 9c91a8f93a0e..9a757d724974 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -216,6 +216,7 @@ #define TCR_TxSZ(x) (TCR_T0SZ(x) | TCR_T1SZ(x)) #define TCR_TxSZ_WIDTH 6 #define TCR_T0SZ_MASK (((UL(1) << TCR_TxSZ_WIDTH) - 1) << TCR_T0SZ_OFFSET) +#define TCR_T1SZ_MASK (((UL(1) << TCR_TxSZ_WIDTH) - 1) << TCR_T1SZ_OFFSET) #define TCR_EPD0_SHIFT 7 #define TCR_EPD0_MASK (UL(1) << TCR_EPD0_SHIFT) diff --git a/arch/arm64/kernel/crash_core.c b/arch/arm64/kernel/crash_core.c index 1f646b07e3e9..314391a156ee 100644 --- a/arch/arm64/kernel/crash_core.c +++ b/arch/arm64/kernel/crash_core.c @@ -7,6 +7,14 @@ #include #include #include +#include + +static inline u64 get_tcr_el1_t1sz(void); + +static inline u64 get_tcr_el1_t1sz(void) +{ + return (read_sysreg(tcr_el1) & TCR_T1SZ_MASK) >> TCR_T1SZ_OFFSET; +} void arch_crash_save_vmcoreinfo(void) { @@ -16,6 +24,8 @@ void arch_crash_save_vmcoreinfo(void) kimage_voffset); vmcoreinfo_append_str("NUMBER(PHYS_OFFSET)=0x%llx\n", PHYS_OFFSET); + vmcoreinfo_append_str("NUMBER(TCR_EL1_T1SZ)=0x%llx\n", + get_tcr_el1_t1sz()); vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); vmcoreinfo_append_str("NUMBER(KERNELPACMASK)=0x%llx\n", system_supports_address_auth() ? From dd72078466ecd525f4d489e7b0093cd9b5044c8e Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 25 Jun 2020 14:15:07 +0100 Subject: [PATCH 249/502] arm64: Document sysctls for emulated deprecated instructions We have support for emulating a number of deprecated instructions in the kernel with individual Kconfig options enabling this support per instruction. In addition to the Kconfig options we also provide runtime control via sysctls but this is not currently mentioned in the Kconfig so not very discoverable for users. 
This is particularly important for SWP/SWPB since this is disabled by default at runtime and must be enabled via the sysctl, causing considerable frustration for users who have enabled the config option and are then confused to find that the instruction is still faulting. Add a reference to the sysctls in the help text for each of the config options, noting that SWP/SWPB is disabled by default, to improve the user experience. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20200625131507.32334-1-broonie@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 66dc41fd49f2..6c560caf9503 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1327,6 +1327,8 @@ config SWP_EMULATION ARMv8 obsoletes the use of A32 SWP/SWPB instructions such that they are always undefined. Say Y here to enable software emulation of these instructions for userspace using LDXR/STXR. + This feature can be controlled at runtime with the abi.swp + sysctl which is disabled by default. In some older versions of glibc [<=2.8] SWP is used during futex trylock() operations with the assumption that the code will not @@ -1353,7 +1355,8 @@ config CP15_BARRIER_EMULATION Say Y here to enable software emulation of these instructions for AArch32 userspace code. When this option is enabled, CP15 barrier usage is traced which can help - identify software that needs updating. + identify software that needs updating. This feature can be + controlled at runtime with the abi.cp15_barrier sysctl. If unsure, say Y @@ -1364,7 +1367,8 @@ config SETEND_EMULATION AArch32 EL0, and is deprecated in ARMv8. Say Y here to enable software emulation of the instruction - for AArch32 userspace code. + for AArch32 userspace code. This feature can be controlled + at runtime with the abi.setend sysctl. 
Note: All the cpus on the system must have mixed endian support at EL0 for this feature to be enabled. If a new CPU - which doesn't support mixed From 24840e76bf8a679d26d373a0edc44284bfd9dc18 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 1 Jul 2020 11:16:16 +0200 Subject: [PATCH 250/502] s390/smp: move smp_cpus_done() to header file Saves us a couple of bytes. Signed-off-by: Heiko Carstens --- arch/s390/include/asm/smp.h | 4 ++++ arch/s390/kernel/smp.c | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index 7326f110d48c..20b37b059e2b 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -54,6 +54,10 @@ static inline int smp_get_base_cpu(int cpu) return cpu - (cpu % (smp_cpu_mtid + 1)); } +static inline void smp_cpus_done(unsigned int max_cpus) +{ +} + extern int smp_rescan_cpus(void); extern void __noreturn cpu_die(void); extern void __cpu_die(unsigned int cpu); diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index e6be63ff162a..b4f2795a123d 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -1012,10 +1012,6 @@ void __init smp_prepare_boot_cpu(void) smp_cpu_set_polarization(0, POLARIZATION_UNKNOWN); } -void __init smp_cpus_done(unsigned int max_cpus) -{ -} - void __init smp_setup_processor_id(void) { pcpu_devices[0].address = stap(); From 8e1398f8987851bb266c1d8d911752a18e1d05b4 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 1 Jul 2020 11:17:52 +0200 Subject: [PATCH 251/502] s390/smp: add missing linebreak Signed-off-by: Heiko Carstens --- arch/s390/kernel/smp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index b4f2795a123d..f685a38f166d 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -1141,6 +1141,7 @@ static int smp_cpu_online(unsigned int cpu) return sysfs_create_group(&s->kobj, &cpu_online_attr_group); } + static int 
smp_cpu_pre_down(unsigned int cpu) { struct device *s = &per_cpu(cpu_device, cpu)->dev; From 0ef5d691aae0322cbab0807c184ba534536a4698 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 30 Jun 2020 10:42:40 +0200 Subject: [PATCH 252/502] s390/extmem: remove stale -ENOSPC comment and handling segment_load() will no longer return -ENOSPC. If a segment overlaps with storage, we now also return -EBUSY. Remove the stale comment from __segment_load() and the stale handling from segment_warning(). Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Andrew Morton Suggested-by: Gerald Schaefer Signed-off-by: David Hildenbrand Message-Id: <20200630084240.8283-1-david@redhat.com> Signed-off-by: Heiko Carstens --- arch/s390/mm/extmem.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c index 105c09282f8c..5060956b8e7d 100644 --- a/arch/s390/mm/extmem.c +++ b/arch/s390/mm/extmem.c @@ -401,8 +401,7 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long * -EIO : could not perform query or load diagnose * -ENOENT : no such segment * -EOPNOTSUPP: multi-part segment cannot be used with linux - * -ENOSPC : segment cannot be used (overlaps with storage) - * -EBUSY : segment can temporarily not be used (overlaps with dcss) + * -EBUSY : segment cannot be used (overlaps with dcss or storage) * -ERANGE : segment cannot be used (exceeds kernel mapping range) * -EPERM : segment is currently loaded with incompatible permissions * -ENOMEM : out of memory @@ -627,10 +626,6 @@ void segment_warning(int rc, char *seg_name) pr_err("DCSS %s has multiple page ranges and cannot be " "loaded or queried\n", seg_name); break; - case -ENOSPC: - pr_err("DCSS %s overlaps with used storage and cannot " - "be loaded\n", seg_name); - break; case -EBUSY: pr_err("%s needs used memory resources and cannot be " "loaded or queried\n", seg_name); From c6337c6e89a695819d94949a7170e1bd0d131e31 Mon Sep 17 00:00:00 2001 
From: Harald Freudenberger Date: Tue, 30 Jun 2020 09:42:23 +0200 Subject: [PATCH 253/502] s390/pkey: fix smatch warning inconsistent indenting Fix smatch warnings: pkey_api.c:1606 pkey_ccacipher_aes_attr_read() warn: inconsistent indenting Reported-by: kernel test robot Signed-off-by: Harald Freudenberger Signed-off-by: Heiko Carstens --- drivers/s390/crypto/pkey_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/s390/crypto/pkey_api.c b/drivers/s390/crypto/pkey_api.c index 74e63ec49068..d5880f52dc2b 100644 --- a/drivers/s390/crypto/pkey_api.c +++ b/drivers/s390/crypto/pkey_api.c @@ -1603,8 +1603,8 @@ static ssize_t pkey_ccacipher_aes_attr_read(enum pkey_key_size keybits, if (rc == 0) break; } - if (rc) - return rc; + if (rc) + return rc; if (is_xts) { keysize = CCACIPHERTOKENSIZE; From 47c07bffeb32aa2a8e798d8ce25fa693e1364e11 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Tue, 30 Jun 2020 09:54:50 +0200 Subject: [PATCH 254/502] s390/zcrypt: fix smatch warnings Fix these smatch warnings: zcrypt_api.c:986 _zcrypt_send_ep11_cprb() error: uninitialized symbol 'pref_weight'. zcrypt_api.c:1008 _zcrypt_send_ep11_cprb() error: uninitialized symbol 'weight'. zcrypt_api.c:676 zcrypt_rsa_modexpo() error: uninitialized symbol 'pref_weight'. zcrypt_api.c:694 zcrypt_rsa_modexpo() error: uninitialized symbol 'weight'. zcrypt_api.c:760 zcrypt_rsa_crt() error: uninitialized symbol 'pref_weight'. zcrypt_api.c:778 zcrypt_rsa_crt() error: uninitialized symbol 'weight'. zcrypt_api.c:824 _zcrypt_send_cprb() warn: always true condition '(tdom >= 0) => (0-u16max >= 0)' zcrypt_api.c:846 _zcrypt_send_cprb() error: uninitialized symbol 'pref_weight'. zcrypt_api.c:867 _zcrypt_send_cprb() error: uninitialized symbol 'weight'. zcrypt_api.c:1065 zcrypt_rng() error: uninitialized symbol 'pref_weight'. zcrypt_api.c:1079 zcrypt_rng() error: uninitialized symbol 'weight'. 
zcrypt_cex4.c:251 ep11_card_op_modes_show() warn: should '(1 << ep11_op_modes[i]->mode_bit)' be a 64 bit type? zcrypt_cex4.c:346 ep11_queue_op_modes_show() warn: should '(1 << ep11_op_modes[i]->mode_bit)' be a 64 bit type? Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Harald Freudenberger Signed-off-by: Heiko Carstens --- drivers/s390/crypto/zcrypt_api.c | 12 ++++++------ drivers/s390/crypto/zcrypt_cex4.c | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c index 56a405dce8bc..7775ff84f223 100644 --- a/drivers/s390/crypto/zcrypt_api.c +++ b/drivers/s390/crypto/zcrypt_api.c @@ -634,7 +634,7 @@ static long zcrypt_rsa_modexpo(struct ap_perms *perms, { struct zcrypt_card *zc, *pref_zc; struct zcrypt_queue *zq, *pref_zq; - unsigned int weight, pref_weight; + unsigned int weight = 0, pref_weight = 0; unsigned int func_code; int qid = 0, rc = -ENODEV; struct module *mod; @@ -718,7 +718,7 @@ static long zcrypt_rsa_crt(struct ap_perms *perms, { struct zcrypt_card *zc, *pref_zc; struct zcrypt_queue *zq, *pref_zq; - unsigned int weight, pref_weight; + unsigned int weight = 0, pref_weight = 0; unsigned int func_code; int qid = 0, rc = -ENODEV; struct module *mod; @@ -803,7 +803,7 @@ static long _zcrypt_send_cprb(struct ap_perms *perms, struct zcrypt_card *zc, *pref_zc; struct zcrypt_queue *zq, *pref_zq; struct ap_message ap_msg; - unsigned int weight, pref_weight; + unsigned int weight = 0, pref_weight = 0; unsigned int func_code; unsigned short *domain, tdom; int qid = 0, rc = -ENODEV; @@ -822,7 +822,7 @@ static long _zcrypt_send_cprb(struct ap_perms *perms, * domain but a control only domain, use the default domain as target. 
*/ tdom = *domain; - if (tdom >= 0 && tdom < AP_DOMAINS && + if (tdom < AP_DOMAINS && !ap_test_config_usage_domain(tdom) && ap_test_config_ctrl_domain(tdom) && ap_domain_index >= 0) @@ -931,7 +931,7 @@ static long _zcrypt_send_ep11_cprb(struct ap_perms *perms, struct zcrypt_queue *zq, *pref_zq; struct ep11_target_dev *targets; unsigned short target_num; - unsigned int weight, pref_weight; + unsigned int weight = 0, pref_weight = 0; unsigned int func_code; struct ap_message ap_msg; int qid = 0, rc = -ENODEV; @@ -1040,7 +1040,7 @@ static long zcrypt_rng(char *buffer) { struct zcrypt_card *zc, *pref_zc; struct zcrypt_queue *zq, *pref_zq; - unsigned int weight, pref_weight; + unsigned int weight = 0, pref_weight = 0; unsigned int func_code; struct ap_message ap_msg; unsigned int domain; diff --git a/drivers/s390/crypto/zcrypt_cex4.c b/drivers/s390/crypto/zcrypt_cex4.c index cdaa8348ad04..337ec71ddb58 100644 --- a/drivers/s390/crypto/zcrypt_cex4.c +++ b/drivers/s390/crypto/zcrypt_cex4.c @@ -250,7 +250,7 @@ static ssize_t ep11_card_op_modes_show(struct device *dev, ep11_get_card_info(ac->id, &ci, zc->online); for (i = 0; ep11_op_modes[i].mode_txt; i++) { - if (ci.op_mode & (1 << ep11_op_modes[i].mode_bit)) { + if (ci.op_mode & (1ULL << ep11_op_modes[i].mode_bit)) { if (n > 0) buf[n++] = ' '; n += scnprintf(buf + n, PAGE_SIZE - n, @@ -345,7 +345,7 @@ static ssize_t ep11_queue_op_modes_show(struct device *dev, &di); for (i = 0; ep11_op_modes[i].mode_txt; i++) { - if (di.op_mode & (1 << ep11_op_modes[i].mode_bit)) { + if (di.op_mode & (1ULL << ep11_op_modes[i].mode_bit)) { if (n > 0) buf[n++] = ' '; n += scnprintf(buf + n, PAGE_SIZE - n, From 74ecbef7b90800e368809642ecc671ba4a57ab09 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Thu, 30 Apr 2020 12:23:29 +0200 Subject: [PATCH 255/502] s390/zcrypt: code beautification and struct field renames Some beautifications related to the internal only used struct ap_message and related code. 
Instead of one int carrying only the special flag now a u32 flags field is used. At struct CPRBX the pointers to additional data are now marked with __user. This caused some changes needed on code, where these structs are also used within the zcrypt misc functions. The ica_rsa_* structs now use the generic types __u8, __u32, ... instead of char, unsigned int. zcrypt_msg6 and zcrypt_msg50 use min_t() instead of min(). Signed-off-by: Harald Freudenberger Signed-off-by: Heiko Carstens --- arch/s390/include/uapi/asm/zcrypt.h | 140 ++++++++++++------------- drivers/s390/crypto/ap_bus.h | 11 +- drivers/s390/crypto/ap_queue.c | 9 +- drivers/s390/crypto/zcrypt_ccamisc.c | 69 ++++++------ drivers/s390/crypto/zcrypt_cex2c.c | 15 ++- drivers/s390/crypto/zcrypt_error.h | 4 +- drivers/s390/crypto/zcrypt_msgtype50.c | 64 +++++------ drivers/s390/crypto/zcrypt_msgtype6.c | 112 ++++++++++---------- drivers/s390/crypto/zcrypt_msgtype6.h | 4 +- drivers/s390/crypto/zcrypt_queue.c | 8 +- 10 files changed, 217 insertions(+), 219 deletions(-) diff --git a/arch/s390/include/uapi/asm/zcrypt.h b/arch/s390/include/uapi/asm/zcrypt.h index 5a2177e96e88..22fd202856bc 100644 --- a/arch/s390/include/uapi/asm/zcrypt.h +++ b/arch/s390/include/uapi/asm/zcrypt.h @@ -36,12 +36,12 @@ * - length(n_modulus) = inputdatalength */ struct ica_rsa_modexpo { - char __user *inputdata; - unsigned int inputdatalength; - char __user *outputdata; - unsigned int outputdatalength; - char __user *b_key; - char __user *n_modulus; + __u8 __user *inputdata; + __u32 inputdatalength; + __u8 __user *outputdata; + __u32 outputdatalength; + __u8 __user *b_key; + __u8 __user *n_modulus; }; /** @@ -59,15 +59,15 @@ struct ica_rsa_modexpo { * - length(u_mult_inv) = inputdatalength/2 + 8 */ struct ica_rsa_modexpo_crt { - char __user *inputdata; - unsigned int inputdatalength; - char __user *outputdata; - unsigned int outputdatalength; - char __user *bp_key; - char __user *bq_key; - char __user *np_prime; - char __user *nq_prime; 
- char __user *u_mult_inv; + __u8 __user *inputdata; + __u32 inputdatalength; + __u8 __user *outputdata; + __u32 outputdatalength; + __u8 __user *bp_key; + __u8 __user *bq_key; + __u8 __user *np_prime; + __u8 __user *nq_prime; + __u8 __user *u_mult_inv; }; /** @@ -83,67 +83,67 @@ struct ica_rsa_modexpo_crt { * key block */ struct CPRBX { - unsigned short cprb_len; /* CPRB length 220 */ - unsigned char cprb_ver_id; /* CPRB version id. 0x02 */ - unsigned char pad_000[3]; /* Alignment pad bytes */ - unsigned char func_id[2]; /* function id 0x5432 */ - unsigned char cprb_flags[4]; /* Flags */ - unsigned int req_parml; /* request parameter buffer len */ - unsigned int req_datal; /* request data buffer */ - unsigned int rpl_msgbl; /* reply message block length */ - unsigned int rpld_parml; /* replied parameter block len */ - unsigned int rpl_datal; /* reply data block len */ - unsigned int rpld_datal; /* replied data block len */ - unsigned int req_extbl; /* request extension block len */ - unsigned char pad_001[4]; /* reserved */ - unsigned int rpld_extbl; /* replied extension block len */ - unsigned char padx000[16 - sizeof(char *)]; - unsigned char *req_parmb; /* request parm block 'address' */ - unsigned char padx001[16 - sizeof(char *)]; - unsigned char *req_datab; /* request data block 'address' */ - unsigned char padx002[16 - sizeof(char *)]; - unsigned char *rpl_parmb; /* reply parm block 'address' */ - unsigned char padx003[16 - sizeof(char *)]; - unsigned char *rpl_datab; /* reply data block 'address' */ - unsigned char padx004[16 - sizeof(char *)]; - unsigned char *req_extb; /* request extension block 'addr'*/ - unsigned char padx005[16 - sizeof(char *)]; - unsigned char *rpl_extb; /* reply extension block 'address'*/ - unsigned short ccp_rtcode; /* server return code */ - unsigned short ccp_rscode; /* server reason code */ - unsigned int mac_data_len; /* Mac Data Length */ - unsigned char logon_id[8]; /* Logon Identifier */ - unsigned char mac_value[8]; /* 
Mac Value */ - unsigned char mac_content_flgs;/* Mac content flag byte */ - unsigned char pad_002; /* Alignment */ - unsigned short domain; /* Domain */ - unsigned char usage_domain[4];/* Usage domain */ - unsigned char cntrl_domain[4];/* Control domain */ - unsigned char S390enf_mask[4];/* S/390 enforcement mask */ - unsigned char pad_004[36]; /* reserved */ + __u16 cprb_len; /* CPRB length 220 */ + __u8 cprb_ver_id; /* CPRB version id. 0x02 */ + __u8 pad_000[3]; /* Alignment pad bytes */ + __u8 func_id[2]; /* function id 0x5432 */ + __u8 cprb_flags[4]; /* Flags */ + __u32 req_parml; /* request parameter buffer len */ + __u32 req_datal; /* request data buffer */ + __u32 rpl_msgbl; /* reply message block length */ + __u32 rpld_parml; /* replied parameter block len */ + __u32 rpl_datal; /* reply data block len */ + __u32 rpld_datal; /* replied data block len */ + __u32 req_extbl; /* request extension block len */ + __u8 pad_001[4]; /* reserved */ + __u32 rpld_extbl; /* replied extension block len */ + __u8 padx000[16 - sizeof(__u8 *)]; + __u8 __user *req_parmb; /* request parm block 'address' */ + __u8 padx001[16 - sizeof(__u8 *)]; + __u8 __user *req_datab; /* request data block 'address' */ + __u8 padx002[16 - sizeof(__u8 *)]; + __u8 __user *rpl_parmb; /* reply parm block 'address' */ + __u8 padx003[16 - sizeof(__u8 *)]; + __u8 __user *rpl_datab; /* reply data block 'address' */ + __u8 padx004[16 - sizeof(__u8 *)]; + __u8 __user *req_extb; /* request extension block 'addr'*/ + __u8 padx005[16 - sizeof(__u8 *)]; + __u8 __user *rpl_extb; /* reply extension block 'address'*/ + __u16 ccp_rtcode; /* server return code */ + __u16 ccp_rscode; /* server reason code */ + __u32 mac_data_len; /* Mac Data Length */ + __u8 logon_id[8]; /* Logon Identifier */ + __u8 mac_value[8]; /* Mac Value */ + __u8 mac_content_flgs; /* Mac content flag byte */ + __u8 pad_002; /* Alignment */ + __u16 domain; /* Domain */ + __u8 usage_domain[4]; /* Usage domain */ + __u8 cntrl_domain[4]; /* 
Control domain */ + __u8 S390enf_mask[4]; /* S/390 enforcement mask */ + __u8 pad_004[36]; /* reserved */ } __attribute__((packed)); /** * xcRB */ struct ica_xcRB { - unsigned short agent_ID; - unsigned int user_defined; - unsigned short request_ID; - unsigned int request_control_blk_length; - unsigned char padding1[16 - sizeof(char *)]; - char __user *request_control_blk_addr; - unsigned int request_data_length; - char padding2[16 - sizeof(char *)]; - char __user *request_data_address; - unsigned int reply_control_blk_length; - char padding3[16 - sizeof(char *)]; - char __user *reply_control_blk_addr; - unsigned int reply_data_length; - char padding4[16 - sizeof(char *)]; - char __user *reply_data_addr; - unsigned short priority_window; - unsigned int status; + __u16 agent_ID; + __u32 user_defined; + __u16 request_ID; + __u32 request_control_blk_length; + __u8 _padding1[16 - sizeof(__u8 *)]; + __u8 __user *request_control_blk_addr; + __u32 request_data_length; + __u8 _padding2[16 - sizeof(__u8 *)]; + __u8 __user *request_data_address; + __u32 reply_control_blk_length; + __u8 _padding3[16 - sizeof(__u8 *)]; + __u8 __user *reply_control_blk_addr; + __u32 reply_data_length; + __u8 __padding4[16 - sizeof(__u8 *)]; + __u8 __user *reply_data_addr; + __u16 priority_window; + __u32 status; } __attribute__((packed)); /** diff --git a/drivers/s390/crypto/ap_bus.h b/drivers/s390/crypto/ap_bus.h index 053cc34d2ca2..69432e93643a 100644 --- a/drivers/s390/crypto/ap_bus.h +++ b/drivers/s390/crypto/ap_bus.h @@ -190,17 +190,18 @@ typedef enum ap_wait (ap_func_t)(struct ap_queue *queue); struct ap_message { struct list_head list; /* Request queueing. */ unsigned long long psmid; /* Message id. */ - void *message; /* Pointer to message buffer. */ - size_t length; /* Message length. */ + void *msg; /* Pointer to message buffer. */ + unsigned int len; /* Message length. 
*/ + u32 flags; /* Flags, see AP_MSG_FLAG_xxx */ int rc; /* Return code for this message */ - void *private; /* ap driver private pointer. */ - unsigned int special:1; /* Used for special commands. */ /* receive is called from tasklet context */ void (*receive)(struct ap_queue *, struct ap_message *, struct ap_message *); }; +#define AP_MSG_FLAG_SPECIAL (1 << 16) /* flag msg as 'special' with NQAP */ + /** * ap_init_message() - Initialize ap_message. * Initialize a message before using. Otherwise this might result in @@ -218,7 +219,7 @@ static inline void ap_init_message(struct ap_message *ap_msg) */ static inline void ap_release_message(struct ap_message *ap_msg) { - kzfree(ap_msg->message); + kzfree(ap_msg->msg); kzfree(ap_msg->private); } diff --git a/drivers/s390/crypto/ap_queue.c b/drivers/s390/crypto/ap_queue.c index 73b077dca3e6..d6cc384f294b 100644 --- a/drivers/s390/crypto/ap_queue.c +++ b/drivers/s390/crypto/ap_queue.c @@ -69,9 +69,9 @@ static int ap_queue_enable_interruption(struct ap_queue *aq, void *ind) */ static inline struct ap_queue_status __ap_send(ap_qid_t qid, unsigned long long psmid, void *msg, size_t length, - unsigned int special) + int special) { - if (special == 1) + if (special) qid |= 0x400000UL; return ap_nqap(qid, psmid, msg, length); } @@ -137,7 +137,7 @@ static struct ap_queue_status ap_sm_recv(struct ap_queue *aq) struct ap_message *ap_msg; status = ap_dqap(aq->qid, &aq->reply->psmid, - aq->reply->message, aq->reply->length); + aq->reply->msg, aq->reply->len); switch (status.response_code) { case AP_RESPONSE_NORMAL: aq->queue_count--; @@ -216,7 +216,8 @@ static enum ap_wait ap_sm_write(struct ap_queue *aq) /* Start the next request on the queue. 
*/ ap_msg = list_entry(aq->requestq.next, struct ap_message, list); status = __ap_send(aq->qid, ap_msg->psmid, - ap_msg->message, ap_msg->length, ap_msg->special); + ap_msg->msg, ap_msg->len, + ap_msg->flags & AP_MSG_FLAG_SPECIAL); switch (status.response_code) { case AP_RESPONSE_NORMAL: aq->queue_count++; diff --git a/drivers/s390/crypto/zcrypt_ccamisc.c b/drivers/s390/crypto/zcrypt_ccamisc.c index 1b835398feec..3f5b61351cde 100644 --- a/drivers/s390/crypto/zcrypt_ccamisc.c +++ b/drivers/s390/crypto/zcrypt_ccamisc.c @@ -205,9 +205,9 @@ static int alloc_and_prep_cprbmem(size_t paramblen, preqcblk->rpl_msgbl = cprbplusparamblen; if (paramblen) { preqcblk->req_parmb = - ((u8 *) preqcblk) + sizeof(struct CPRBX); + ((u8 __user *) preqcblk) + sizeof(struct CPRBX); preqcblk->rpl_parmb = - ((u8 *) prepcblk) + sizeof(struct CPRBX); + ((u8 __user *) prepcblk) + sizeof(struct CPRBX); } *pcprbmem = cprbmem; @@ -274,7 +274,7 @@ int cca_genseckey(u16 cardnr, u16 domain, { int i, rc, keysize; int seckeysize; - u8 *mem; + u8 *mem, *ptr; struct CPRBX *preqcblk, *prepcblk; struct ica_xcRB xcrb; struct kgreqparm { @@ -320,7 +320,7 @@ int cca_genseckey(u16 cardnr, u16 domain, preqcblk->domain = domain; /* fill request cprb param block with KG request */ - preqparm = (struct kgreqparm *) preqcblk->req_parmb; + preqparm = (struct kgreqparm __force *) preqcblk->req_parmb; memcpy(preqparm->subfunc_code, "KG", 2); preqparm->rule_array_len = sizeof(preqparm->rule_array_len); preqparm->lv1.len = sizeof(struct lv1); @@ -377,8 +377,9 @@ int cca_genseckey(u16 cardnr, u16 domain, } /* process response cprb param block */ - prepcblk->rpl_parmb = ((u8 *) prepcblk) + sizeof(struct CPRBX); - prepparm = (struct kgrepparm *) prepcblk->rpl_parmb; + ptr = ((u8 *) prepcblk) + sizeof(struct CPRBX); + prepcblk->rpl_parmb = (u8 __user *) ptr; + prepparm = (struct kgrepparm *) ptr; /* check length of the returned secure key token */ seckeysize = prepparm->lv3.keyblock.toklen @@ -415,7 +416,7 @@ int 
cca_clr2seckey(u16 cardnr, u16 domain, u32 keybitsize, const u8 *clrkey, u8 seckey[SECKEYBLOBSIZE]) { int rc, keysize, seckeysize; - u8 *mem; + u8 *mem, *ptr; struct CPRBX *preqcblk, *prepcblk; struct ica_xcRB xcrb; struct cmreqparm { @@ -460,7 +461,7 @@ int cca_clr2seckey(u16 cardnr, u16 domain, u32 keybitsize, preqcblk->domain = domain; /* fill request cprb param block with CM request */ - preqparm = (struct cmreqparm *) preqcblk->req_parmb; + preqparm = (struct cmreqparm __force *) preqcblk->req_parmb; memcpy(preqparm->subfunc_code, "CM", 2); memcpy(preqparm->rule_array, "AES ", 8); preqparm->rule_array_len = @@ -514,8 +515,9 @@ int cca_clr2seckey(u16 cardnr, u16 domain, u32 keybitsize, } /* process response cprb param block */ - prepcblk->rpl_parmb = ((u8 *) prepcblk) + sizeof(struct CPRBX); - prepparm = (struct cmrepparm *) prepcblk->rpl_parmb; + ptr = ((u8 *) prepcblk) + sizeof(struct CPRBX); + prepcblk->rpl_parmb = (u8 __user *) ptr; + prepparm = (struct cmrepparm *) ptr; /* check length of the returned secure key token */ seckeysize = prepparm->lv3.keyblock.toklen @@ -554,7 +556,7 @@ int cca_sec2protkey(u16 cardnr, u16 domain, u8 *protkey, u32 *protkeylen, u32 *protkeytype) { int rc; - u8 *mem; + u8 *mem, *ptr; struct CPRBX *preqcblk, *prepcblk; struct ica_xcRB xcrb; struct uskreqparm { @@ -605,7 +607,7 @@ int cca_sec2protkey(u16 cardnr, u16 domain, preqcblk->domain = domain; /* fill request cprb param block with USK request */ - preqparm = (struct uskreqparm *) preqcblk->req_parmb; + preqparm = (struct uskreqparm __force *) preqcblk->req_parmb; memcpy(preqparm->subfunc_code, "US", 2); preqparm->rule_array_len = sizeof(preqparm->rule_array_len); preqparm->lv1.len = sizeof(struct lv1); @@ -646,8 +648,9 @@ int cca_sec2protkey(u16 cardnr, u16 domain, } /* process response cprb param block */ - prepcblk->rpl_parmb = ((u8 *) prepcblk) + sizeof(struct CPRBX); - prepparm = (struct uskrepparm *) prepcblk->rpl_parmb; + ptr = ((u8 *) prepcblk) + sizeof(struct CPRBX); 
+ prepcblk->rpl_parmb = (u8 __user *) ptr; + prepparm = (struct uskrepparm *) ptr; /* check the returned keyblock */ if (prepparm->lv3.ckb.version != 0x01 && @@ -714,7 +717,7 @@ int cca_gencipherkey(u16 cardnr, u16 domain, u32 keybitsize, u32 keygenflags, u8 *keybuf, size_t *keybufsize) { int rc; - u8 *mem; + u8 *mem, *ptr; struct CPRBX *preqcblk, *prepcblk; struct ica_xcRB xcrb; struct gkreqparm { @@ -796,7 +799,7 @@ int cca_gencipherkey(u16 cardnr, u16 domain, u32 keybitsize, u32 keygenflags, preqcblk->req_parml = sizeof(struct gkreqparm); /* prepare request param block with GK request */ - preqparm = (struct gkreqparm *) preqcblk->req_parmb; + preqparm = (struct gkreqparm __force *) preqcblk->req_parmb; memcpy(preqparm->subfunc_code, "GK", 2); preqparm->rule_array_len = sizeof(uint16_t) + 2 * 8; memcpy(preqparm->rule_array, "AES OP ", 2*8); @@ -867,8 +870,9 @@ int cca_gencipherkey(u16 cardnr, u16 domain, u32 keybitsize, u32 keygenflags, } /* process response cprb param block */ - prepcblk->rpl_parmb = ((u8 *) prepcblk) + sizeof(struct CPRBX); - prepparm = (struct gkrepparm *) prepcblk->rpl_parmb; + ptr = ((u8 *) prepcblk) + sizeof(struct CPRBX); + prepcblk->rpl_parmb = (u8 __user *) ptr; + prepparm = (struct gkrepparm *) ptr; /* do some plausibility checks on the key block */ if (prepparm->kb.len < 120 + 5 * sizeof(uint16_t) || @@ -917,7 +921,7 @@ static int _ip_cprb_helper(u16 cardnr, u16 domain, int *key_token_size) { int rc, n; - u8 *mem; + u8 *mem, *ptr; struct CPRBX *preqcblk, *prepcblk; struct ica_xcRB xcrb; struct rule_array_block { @@ -974,7 +978,7 @@ static int _ip_cprb_helper(u16 cardnr, u16 domain, preqcblk->req_parml = 0; /* prepare request param block with IP request */ - preq_ra_block = (struct rule_array_block *) preqcblk->req_parmb; + preq_ra_block = (struct rule_array_block __force *) preqcblk->req_parmb; memcpy(preq_ra_block->subfunc_code, "IP", 2); preq_ra_block->rule_array_len = sizeof(uint16_t) + 2 * 8; memcpy(preq_ra_block->rule_array, 
rule_array_1, 8); @@ -987,7 +991,7 @@ static int _ip_cprb_helper(u16 cardnr, u16 domain, } /* prepare vud block */ - preq_vud_block = (struct vud_block *) + preq_vud_block = (struct vud_block __force *) (preqcblk->req_parmb + preqcblk->req_parml); n = complete ? 0 : (clr_key_bit_size + 7) / 8; preq_vud_block->len = sizeof(struct vud_block) + n; @@ -1001,7 +1005,7 @@ static int _ip_cprb_helper(u16 cardnr, u16 domain, preqcblk->req_parml += preq_vud_block->len; /* prepare key block */ - preq_key_block = (struct key_block *) + preq_key_block = (struct key_block __force *) (preqcblk->req_parmb + preqcblk->req_parml); n = *key_token_size; preq_key_block->len = sizeof(struct key_block) + n; @@ -1034,8 +1038,9 @@ static int _ip_cprb_helper(u16 cardnr, u16 domain, } /* process response cprb param block */ - prepcblk->rpl_parmb = ((u8 *) prepcblk) + sizeof(struct CPRBX); - prepparm = (struct iprepparm *) prepcblk->rpl_parmb; + ptr = ((u8 *) prepcblk) + sizeof(struct CPRBX); + prepcblk->rpl_parmb = (u8 __user *) ptr; + prepparm = (struct iprepparm *) ptr; /* do some plausibility checks on the key block */ if (prepparm->kb.len < 120 + 3 * sizeof(uint16_t) || @@ -1151,7 +1156,7 @@ int cca_cipher2protkey(u16 cardnr, u16 domain, const u8 *ckey, u8 *protkey, u32 *protkeylen, u32 *protkeytype) { int rc; - u8 *mem; + u8 *mem, *ptr; struct CPRBX *preqcblk, *prepcblk; struct ica_xcRB xcrb; struct aureqparm { @@ -1208,7 +1213,7 @@ int cca_cipher2protkey(u16 cardnr, u16 domain, const u8 *ckey, preqcblk->domain = domain; /* fill request cprb param block with AU request */ - preqparm = (struct aureqparm *) preqcblk->req_parmb; + preqparm = (struct aureqparm __force *) preqcblk->req_parmb; memcpy(preqparm->subfunc_code, "AU", 2); preqparm->rule_array_len = sizeof(preqparm->rule_array_len) @@ -1257,8 +1262,9 @@ int cca_cipher2protkey(u16 cardnr, u16 domain, const u8 *ckey, } /* process response cprb param block */ - prepcblk->rpl_parmb = ((u8 *) prepcblk) + sizeof(struct CPRBX); - prepparm 
= (struct aurepparm *) prepcblk->rpl_parmb; + ptr = ((u8 *) prepcblk) + sizeof(struct CPRBX); + prepcblk->rpl_parmb = (u8 __user *) ptr; + prepparm = (struct aurepparm *) ptr; /* check the returned keyblock */ if (prepparm->vud.ckb.version != 0x01 && @@ -1347,7 +1353,7 @@ int cca_query_crypto_facility(u16 cardnr, u16 domain, preqcblk->domain = domain; /* fill request cprb param block with FQ request */ - preqparm = (struct fqreqparm *) preqcblk->req_parmb; + preqparm = (struct fqreqparm __force *) preqcblk->req_parmb; memcpy(preqparm->subfunc_code, "FQ", 2); memcpy(preqparm->rule_array, keyword, sizeof(preqparm->rule_array)); preqparm->rule_array_len = @@ -1378,8 +1384,9 @@ int cca_query_crypto_facility(u16 cardnr, u16 domain, } /* process response cprb param block */ - prepcblk->rpl_parmb = ((u8 *) prepcblk) + sizeof(struct CPRBX); - prepparm = (struct fqrepparm *) prepcblk->rpl_parmb; + ptr = ((u8 *) prepcblk) + sizeof(struct CPRBX); + prepcblk->rpl_parmb = (u8 __user *) ptr; + prepparm = (struct fqrepparm *) ptr; ptr = prepparm->lvdata; /* check and possibly copy reply rule array */ diff --git a/drivers/s390/crypto/zcrypt_cex2c.c b/drivers/s390/crypto/zcrypt_cex2c.c index 266440168bb7..993addb726e0 100644 --- a/drivers/s390/crypto/zcrypt_cex2c.c +++ b/drivers/s390/crypto/zcrypt_cex2c.c @@ -87,24 +87,23 @@ static int zcrypt_cex2c_rng_supported(struct ap_queue *aq) int rc, i; ap_init_message(&ap_msg); - ap_msg.message = (void *) get_zeroed_page(GFP_KERNEL); - if (!ap_msg.message) + ap_msg.msg = (void *) get_zeroed_page(GFP_KERNEL); + if (!ap_msg.msg) return -ENOMEM; rng_type6CPRB_msgX(&ap_msg, 4, &domain); - msg = ap_msg.message; + msg = ap_msg.msg; msg->cprbx.domain = AP_QID_QUEUE(aq->qid); - rc = ap_send(aq->qid, 0x0102030405060708ULL, ap_msg.message, - ap_msg.length); + rc = ap_send(aq->qid, 0x0102030405060708ULL, ap_msg.msg, ap_msg.len); if (rc) goto out_free; /* Wait for the test message to complete. 
*/ for (i = 0; i < 2 * HZ; i++) { msleep(1000 / HZ); - rc = ap_recv(aq->qid, &psmid, ap_msg.message, 4096); + rc = ap_recv(aq->qid, &psmid, ap_msg.msg, 4096); if (rc == 0 && psmid == 0x0102030405060708ULL) break; } @@ -115,13 +114,13 @@ static int zcrypt_cex2c_rng_supported(struct ap_queue *aq) goto out_free; } - reply = ap_msg.message; + reply = ap_msg.msg; if (reply->cprbx.ccp_rtcode == 0 && reply->cprbx.ccp_rscode == 0) rc = 1; else rc = 0; out_free: - free_page((unsigned long) ap_msg.message); + free_page((unsigned long) ap_msg.msg); return rc; } diff --git a/drivers/s390/crypto/zcrypt_error.h b/drivers/s390/crypto/zcrypt_error.h index 4f4dd9d727c9..54a04f8c38ef 100644 --- a/drivers/s390/crypto/zcrypt_error.h +++ b/drivers/s390/crypto/zcrypt_error.h @@ -80,7 +80,7 @@ struct error_hdr { static inline int convert_error(struct zcrypt_queue *zq, struct ap_message *reply) { - struct error_hdr *ehdr = reply->message; + struct error_hdr *ehdr = reply->msg; int card = AP_QID_CARD(zq->queue->qid); int queue = AP_QID_QUEUE(zq->queue->qid); @@ -127,7 +127,7 @@ static inline int convert_error(struct zcrypt_queue *zq, struct { struct type86_hdr hdr; struct type86_fmt2_ext fmt2; - } __packed * head = reply->message; + } __packed * head = reply->msg; unsigned int apfs = *((u32 *)head->fmt2.apfs); ZCRYPT_DBF(DBF_ERR, diff --git a/drivers/s390/crypto/zcrypt_msgtype50.c b/drivers/s390/crypto/zcrypt_msgtype50.c index fc4295b3d801..7aedc338b445 100644 --- a/drivers/s390/crypto/zcrypt_msgtype50.c +++ b/drivers/s390/crypto/zcrypt_msgtype50.c @@ -207,10 +207,10 @@ static int ICAMEX_msg_to_type50MEX_msg(struct zcrypt_queue *zq, mod_len = mex->inputdatalength; if (mod_len <= 128) { - struct type50_meb1_msg *meb1 = ap_msg->message; + struct type50_meb1_msg *meb1 = ap_msg->msg; memset(meb1, 0, sizeof(*meb1)); - ap_msg->length = sizeof(*meb1); + ap_msg->len = sizeof(*meb1); meb1->header.msg_type_code = TYPE50_TYPE_CODE; meb1->header.msg_len = sizeof(*meb1); meb1->keyblock_type = 
TYPE50_MEB1_FMT; @@ -218,10 +218,10 @@ static int ICAMEX_msg_to_type50MEX_msg(struct zcrypt_queue *zq, exp = meb1->exponent + sizeof(meb1->exponent) - mod_len; inp = meb1->message + sizeof(meb1->message) - mod_len; } else if (mod_len <= 256) { - struct type50_meb2_msg *meb2 = ap_msg->message; + struct type50_meb2_msg *meb2 = ap_msg->msg; memset(meb2, 0, sizeof(*meb2)); - ap_msg->length = sizeof(*meb2); + ap_msg->len = sizeof(*meb2); meb2->header.msg_type_code = TYPE50_TYPE_CODE; meb2->header.msg_len = sizeof(*meb2); meb2->keyblock_type = TYPE50_MEB2_FMT; @@ -229,10 +229,10 @@ static int ICAMEX_msg_to_type50MEX_msg(struct zcrypt_queue *zq, exp = meb2->exponent + sizeof(meb2->exponent) - mod_len; inp = meb2->message + sizeof(meb2->message) - mod_len; } else if (mod_len <= 512) { - struct type50_meb3_msg *meb3 = ap_msg->message; + struct type50_meb3_msg *meb3 = ap_msg->msg; memset(meb3, 0, sizeof(*meb3)); - ap_msg->length = sizeof(*meb3); + ap_msg->len = sizeof(*meb3); meb3->header.msg_type_code = TYPE50_TYPE_CODE; meb3->header.msg_len = sizeof(*meb3); meb3->keyblock_type = TYPE50_MEB3_FMT; @@ -275,10 +275,10 @@ static int ICACRT_msg_to_type50CRT_msg(struct zcrypt_queue *zq, * 512 byte modulus (4k keys). 
*/ if (mod_len <= 128) { /* up to 1024 bit key size */ - struct type50_crb1_msg *crb1 = ap_msg->message; + struct type50_crb1_msg *crb1 = ap_msg->msg; memset(crb1, 0, sizeof(*crb1)); - ap_msg->length = sizeof(*crb1); + ap_msg->len = sizeof(*crb1); crb1->header.msg_type_code = TYPE50_TYPE_CODE; crb1->header.msg_len = sizeof(*crb1); crb1->keyblock_type = TYPE50_CRB1_FMT; @@ -289,10 +289,10 @@ static int ICACRT_msg_to_type50CRT_msg(struct zcrypt_queue *zq, u = crb1->u + sizeof(crb1->u) - short_len; inp = crb1->message + sizeof(crb1->message) - mod_len; } else if (mod_len <= 256) { /* up to 2048 bit key size */ - struct type50_crb2_msg *crb2 = ap_msg->message; + struct type50_crb2_msg *crb2 = ap_msg->msg; memset(crb2, 0, sizeof(*crb2)); - ap_msg->length = sizeof(*crb2); + ap_msg->len = sizeof(*crb2); crb2->header.msg_type_code = TYPE50_TYPE_CODE; crb2->header.msg_len = sizeof(*crb2); crb2->keyblock_type = TYPE50_CRB2_FMT; @@ -304,10 +304,10 @@ static int ICACRT_msg_to_type50CRT_msg(struct zcrypt_queue *zq, inp = crb2->message + sizeof(crb2->message) - mod_len; } else if ((mod_len <= 512) && /* up to 4096 bit key size */ (zq->zcard->max_mod_size == CEX3A_MAX_MOD_SIZE)) { - struct type50_crb3_msg *crb3 = ap_msg->message; + struct type50_crb3_msg *crb3 = ap_msg->msg; memset(crb3, 0, sizeof(*crb3)); - ap_msg->length = sizeof(*crb3); + ap_msg->len = sizeof(*crb3); crb3->header.msg_type_code = TYPE50_TYPE_CODE; crb3->header.msg_len = sizeof(*crb3); crb3->keyblock_type = TYPE50_CRB3_FMT; @@ -350,7 +350,7 @@ static int convert_type80(struct zcrypt_queue *zq, char __user *outputdata, unsigned int outputdatalength) { - struct type80_hdr *t80h = reply->message; + struct type80_hdr *t80h = reply->msg; unsigned char *data; if (t80h->len < sizeof(*t80h) + outputdatalength) { @@ -370,7 +370,7 @@ static int convert_type80(struct zcrypt_queue *zq, BUG_ON(t80h->len > CEX2A_MAX_RESPONSE_SIZE); else BUG_ON(t80h->len > CEX3A_MAX_RESPONSE_SIZE); - data = reply->message + t80h->len - 
outputdatalength; + data = reply->msg + t80h->len - outputdatalength; if (copy_to_user(outputdata, data, outputdatalength)) return -EFAULT; return 0; @@ -382,7 +382,7 @@ static int convert_response(struct zcrypt_queue *zq, unsigned int outputdatalength) { /* Response type byte is the second byte in the response. */ - unsigned char rtype = ((unsigned char *) reply->message)[1]; + unsigned char rtype = ((unsigned char *) reply->msg)[1]; switch (rtype) { case TYPE82_RSP_CODE: @@ -422,22 +422,20 @@ static void zcrypt_cex2a_receive(struct ap_queue *aq, .reply_code = REP82_ERROR_MACHINE_FAILURE, }; struct type80_hdr *t80h; - int length; + int len; /* Copy the reply message to the request message buffer. */ if (!reply) goto out; /* ap_msg->rc indicates the error */ - t80h = reply->message; + t80h = reply->msg; if (t80h->type == TYPE80_RSP_CODE) { if (aq->ap_dev.device_type == AP_DEVICE_TYPE_CEX2A) - length = min_t(int, - CEX2A_MAX_RESPONSE_SIZE, t80h->len); + len = min_t(int, CEX2A_MAX_RESPONSE_SIZE, t80h->len); else - length = min_t(int, - CEX3A_MAX_RESPONSE_SIZE, t80h->len); - memcpy(msg->message, reply->message, length); + len = min_t(int, CEX3A_MAX_RESPONSE_SIZE, t80h->len); + memcpy(msg->msg, reply->msg, len); } else - memcpy(msg->message, reply->message, sizeof(error_reply)); + memcpy(msg->msg, reply->msg, sizeof(error_reply)); out: complete((struct completion *) msg->private); } @@ -460,12 +458,10 @@ static long zcrypt_cex2a_modexpo(struct zcrypt_queue *zq, ap_init_message(&ap_msg); if (zq->zcard->user_space_type == ZCRYPT_CEX2A) - ap_msg.message = kmalloc(MSGTYPE50_CRB2_MAX_MSG_SIZE, - GFP_KERNEL); + ap_msg.msg = kmalloc(MSGTYPE50_CRB2_MAX_MSG_SIZE, GFP_KERNEL); else - ap_msg.message = kmalloc(MSGTYPE50_CRB3_MAX_MSG_SIZE, - GFP_KERNEL); - if (!ap_msg.message) + ap_msg.msg = kmalloc(MSGTYPE50_CRB3_MAX_MSG_SIZE, GFP_KERNEL); + if (!ap_msg.msg) return -ENOMEM; ap_msg.receive = zcrypt_cex2a_receive; ap_msg.psmid = (((unsigned long long) current->pid) << 32) + @@ 
-486,7 +482,7 @@ static long zcrypt_cex2a_modexpo(struct zcrypt_queue *zq, /* Signal pending. */ ap_cancel_message(zq->queue, &ap_msg); out_free: - kfree(ap_msg.message); + kfree(ap_msg.msg); return rc; } @@ -506,12 +502,10 @@ static long zcrypt_cex2a_modexpo_crt(struct zcrypt_queue *zq, ap_init_message(&ap_msg); if (zq->zcard->user_space_type == ZCRYPT_CEX2A) - ap_msg.message = kmalloc(MSGTYPE50_CRB2_MAX_MSG_SIZE, - GFP_KERNEL); + ap_msg.msg = kmalloc(MSGTYPE50_CRB2_MAX_MSG_SIZE, GFP_KERNEL); else - ap_msg.message = kmalloc(MSGTYPE50_CRB3_MAX_MSG_SIZE, - GFP_KERNEL); - if (!ap_msg.message) + ap_msg.msg = kmalloc(MSGTYPE50_CRB3_MAX_MSG_SIZE, GFP_KERNEL); + if (!ap_msg.msg) return -ENOMEM; ap_msg.receive = zcrypt_cex2a_receive; ap_msg.psmid = (((unsigned long long) current->pid) << 32) + @@ -532,7 +526,7 @@ static long zcrypt_cex2a_modexpo_crt(struct zcrypt_queue *zq, /* Signal pending. */ ap_cancel_message(zq->queue, &ap_msg); out_free: - kfree(ap_msg.message); + kfree(ap_msg.msg); return rc; } diff --git a/drivers/s390/crypto/zcrypt_msgtype6.c b/drivers/s390/crypto/zcrypt_msgtype6.c index fd1cbb2d6b3f..d77991c74c25 100644 --- a/drivers/s390/crypto/zcrypt_msgtype6.c +++ b/drivers/s390/crypto/zcrypt_msgtype6.c @@ -266,7 +266,7 @@ static int ICAMEX_msg_to_type6MEX_msgX(struct zcrypt_queue *zq, struct function_and_rules_block fr; unsigned short length; char text[0]; - } __packed * msg = ap_msg->message; + } __packed * msg = ap_msg->msg; int size; /* @@ -301,7 +301,7 @@ static int ICAMEX_msg_to_type6MEX_msgX(struct zcrypt_queue *zq, msg->cprbx.req_parml = size - sizeof(msg->hdr) - sizeof(msg->cprbx); - ap_msg->length = size; + ap_msg->len = size; return 0; } @@ -336,7 +336,7 @@ static int ICACRT_msg_to_type6CRT_msgX(struct zcrypt_queue *zq, struct function_and_rules_block fr; unsigned short length; char text[0]; - } __packed * msg = ap_msg->message; + } __packed * msg = ap_msg->msg; int size; /* @@ -370,7 +370,7 @@ static int ICACRT_msg_to_type6CRT_msgX(struct 
zcrypt_queue *zq, msg->fr = static_pkd_fnr; - ap_msg->length = size; + ap_msg->len = size; return 0; } @@ -400,11 +400,11 @@ static int XCRB_msg_to_type6CPRB_msgX(struct ap_message *ap_msg, struct { struct type6_hdr hdr; struct CPRBX cprbx; - } __packed * msg = ap_msg->message; + } __packed * msg = ap_msg->msg; int rcblen = CEIL4(xcRB->request_control_blk_length); int replylen, req_sumlen, resp_sumlen; - char *req_data = ap_msg->message + sizeof(struct type6_hdr) + rcblen; + char *req_data = ap_msg->msg + sizeof(struct type6_hdr) + rcblen; char *function_code; if (CEIL4(xcRB->request_control_blk_length) < @@ -412,10 +412,10 @@ static int XCRB_msg_to_type6CPRB_msgX(struct ap_message *ap_msg, return -EINVAL; /* overflow after alignment*/ /* length checks */ - ap_msg->length = sizeof(struct type6_hdr) + + ap_msg->len = sizeof(struct type6_hdr) + CEIL4(xcRB->request_control_blk_length) + xcRB->request_data_length; - if (ap_msg->length > MSGTYPE06_MAX_MSG_SIZE) + if (ap_msg->len > MSGTYPE06_MAX_MSG_SIZE) return -EINVAL; /* @@ -480,9 +480,7 @@ static int XCRB_msg_to_type6CPRB_msgX(struct ap_message *ap_msg, if (memcmp(function_code, "US", 2) == 0 || memcmp(function_code, "AU", 2) == 0) - ap_msg->special = 1; - else - ap_msg->special = 0; + ap_msg->flags |= AP_MSG_FLAG_SPECIAL; /* copy data block */ if (xcRB->request_data_length && @@ -512,7 +510,7 @@ static int xcrb_msg_to_type6_ep11cprb_msgx(struct ap_message *ap_msg, struct ep11_cprb cprbx; unsigned char pld_tag; /* fixed value 0x30 */ unsigned char pld_lenfmt; /* payload length format */ - } __packed * msg = ap_msg->message; + } __packed * msg = ap_msg->msg; struct pld_hdr { unsigned char func_tag; /* fixed value 0x4 */ @@ -527,7 +525,7 @@ static int xcrb_msg_to_type6_ep11cprb_msgx(struct ap_message *ap_msg, return -EINVAL; /* overflow after alignment*/ /* length checks */ - ap_msg->length = sizeof(struct type6_hdr) + xcRB->req_len; + ap_msg->len = sizeof(struct type6_hdr) + xcRB->req_len; if (CEIL4(xcRB->req_len) > 
MSGTYPE06_MAX_MSG_SIZE - (sizeof(struct type6_hdr))) return -EINVAL; @@ -569,7 +567,7 @@ static int xcrb_msg_to_type6_ep11cprb_msgx(struct ap_message *ap_msg, /* enable special processing based on the cprbs flags special bit */ if (msg->cprbx.flags & 0x20) - ap_msg->special = 1; + ap_msg->flags |= AP_MSG_FLAG_SPECIAL; return 0; } @@ -639,7 +637,7 @@ static int convert_type86_ica(struct zcrypt_queue *zq, 0x35, 0x9D, 0xD3, 0xD3, 0xA7, 0x9D, 0x5D, 0x41, 0x6F, 0x65, 0x1B, 0xCF, 0xA9, 0x87, 0x91, 0x09 }; - struct type86x_reply *msg = reply->message; + struct type86x_reply *msg = reply->msg; unsigned short service_rc, service_rs; unsigned int reply_len, pad_len; char *data; @@ -713,8 +711,8 @@ static int convert_type86_xcrb(struct zcrypt_queue *zq, struct ap_message *reply, struct ica_xcRB *xcRB) { - struct type86_fmt2_msg *msg = reply->message; - char *data = reply->message; + struct type86_fmt2_msg *msg = reply->msg; + char *data = reply->msg; /* Copy CPRB to user */ if (copy_to_user(xcRB->reply_control_blk_addr, @@ -744,8 +742,8 @@ static int convert_type86_ep11_xcrb(struct zcrypt_queue *zq, struct ap_message *reply, struct ep11_urb *xcRB) { - struct type86_fmt2_msg *msg = reply->message; - char *data = reply->message; + struct type86_fmt2_msg *msg = reply->msg; + char *data = reply->msg; if (xcRB->resp_len < msg->fmt2.count1) return -EINVAL; @@ -766,8 +764,8 @@ static int convert_type86_rng(struct zcrypt_queue *zq, struct type86_hdr hdr; struct type86_fmt2_ext fmt2; struct CPRBX cprbx; - } __packed * msg = reply->message; - char *data = reply->message; + } __packed * msg = reply->msg; + char *data = reply->msg; if (msg->cprbx.ccp_rtcode != 0 || msg->cprbx.ccp_rscode != 0) return -EINVAL; @@ -780,7 +778,7 @@ static int convert_response_ica(struct zcrypt_queue *zq, char __user *outputdata, unsigned int outputdatalength) { - struct type86x_reply *msg = reply->message; + struct type86x_reply *msg = reply->msg; switch (msg->hdr.type) { case TYPE82_RSP_CODE: @@ -820,7 
+818,7 @@ static int convert_response_xcrb(struct zcrypt_queue *zq, struct ap_message *reply, struct ica_xcRB *xcRB) { - struct type86x_reply *msg = reply->message; + struct type86x_reply *msg = reply->msg; switch (msg->hdr.type) { case TYPE82_RSP_CODE: @@ -853,7 +851,7 @@ static int convert_response_xcrb(struct zcrypt_queue *zq, static int convert_response_ep11_xcrb(struct zcrypt_queue *zq, struct ap_message *reply, struct ep11_urb *xcRB) { - struct type86_ep11_reply *msg = reply->message; + struct type86_ep11_reply *msg = reply->msg; switch (msg->hdr.type) { case TYPE82_RSP_CODE: @@ -883,7 +881,7 @@ static int convert_response_rng(struct zcrypt_queue *zq, struct ap_message *reply, char *data) { - struct type86x_reply *msg = reply->message; + struct type86x_reply *msg = reply->msg; switch (msg->hdr.type) { case TYPE82_RSP_CODE: @@ -928,32 +926,30 @@ static void zcrypt_msgtype6_receive(struct ap_queue *aq, struct response_type *resp_type = (struct response_type *) msg->private; struct type86x_reply *t86r; - int length; + int len; /* Copy the reply message to the request message buffer. 
*/ if (!reply) goto out; /* ap_msg->rc indicates the error */ - t86r = reply->message; + t86r = reply->msg; if (t86r->hdr.type == TYPE86_RSP_CODE && t86r->cprbx.cprb_ver_id == 0x02) { switch (resp_type->type) { case CEXXC_RESPONSE_TYPE_ICA: - length = sizeof(struct type86x_reply) - + t86r->length - 2; - length = min(CEXXC_MAX_ICA_RESPONSE_SIZE, length); - memcpy(msg->message, reply->message, length); + len = sizeof(struct type86x_reply) + t86r->length - 2; + len = min_t(int, CEXXC_MAX_ICA_RESPONSE_SIZE, len); + memcpy(msg->msg, reply->msg, len); break; case CEXXC_RESPONSE_TYPE_XCRB: - length = t86r->fmt2.offset2 + t86r->fmt2.count2; - length = min(MSGTYPE06_MAX_MSG_SIZE, length); - memcpy(msg->message, reply->message, length); + len = t86r->fmt2.offset2 + t86r->fmt2.count2; + len = min_t(int, MSGTYPE06_MAX_MSG_SIZE, len); + memcpy(msg->msg, reply->msg, len); break; default: - memcpy(msg->message, &error_reply, - sizeof(error_reply)); + memcpy(msg->msg, &error_reply, sizeof(error_reply)); } } else - memcpy(msg->message, reply->message, sizeof(error_reply)); + memcpy(msg->msg, reply->msg, sizeof(error_reply)); out: complete(&(resp_type->work)); } @@ -977,25 +973,25 @@ static void zcrypt_msgtype6_receive_ep11(struct ap_queue *aq, struct response_type *resp_type = (struct response_type *)msg->private; struct type86_ep11_reply *t86r; - int length; + int len; /* Copy the reply message to the request message buffer. 
*/ if (!reply) goto out; /* ap_msg->rc indicates the error */ - t86r = reply->message; + t86r = reply->msg; if (t86r->hdr.type == TYPE86_RSP_CODE && t86r->cprbx.cprb_ver_id == 0x04) { switch (resp_type->type) { case CEXXC_RESPONSE_TYPE_EP11: - length = t86r->fmt2.offset1 + t86r->fmt2.count1; - length = min(MSGTYPE06_MAX_MSG_SIZE, length); - memcpy(msg->message, reply->message, length); + len = t86r->fmt2.offset1 + t86r->fmt2.count1; + len = min_t(int, MSGTYPE06_MAX_MSG_SIZE, len); + memcpy(msg->msg, reply->msg, len); break; default: - memcpy(msg->message, &error_reply, sizeof(error_reply)); + memcpy(msg->msg, &error_reply, sizeof(error_reply)); } } else { - memcpy(msg->message, reply->message, sizeof(error_reply)); + memcpy(msg->msg, reply->msg, sizeof(error_reply)); } out: complete(&(resp_type->work)); @@ -1020,8 +1016,8 @@ static long zcrypt_msgtype6_modexpo(struct zcrypt_queue *zq, int rc; ap_init_message(&ap_msg); - ap_msg.message = (void *) get_zeroed_page(GFP_KERNEL); - if (!ap_msg.message) + ap_msg.msg = (void *) get_zeroed_page(GFP_KERNEL); + if (!ap_msg.msg) return -ENOMEM; ap_msg.receive = zcrypt_msgtype6_receive; ap_msg.psmid = (((unsigned long long) current->pid) << 32) + @@ -1043,7 +1039,7 @@ static long zcrypt_msgtype6_modexpo(struct zcrypt_queue *zq, /* Signal pending. 
*/ ap_cancel_message(zq->queue, &ap_msg); out_free: - free_page((unsigned long) ap_msg.message); + free_page((unsigned long) ap_msg.msg); return rc; } @@ -1064,8 +1060,8 @@ static long zcrypt_msgtype6_modexpo_crt(struct zcrypt_queue *zq, int rc; ap_init_message(&ap_msg); - ap_msg.message = (void *) get_zeroed_page(GFP_KERNEL); - if (!ap_msg.message) + ap_msg.msg = (void *) get_zeroed_page(GFP_KERNEL); + if (!ap_msg.msg) return -ENOMEM; ap_msg.receive = zcrypt_msgtype6_receive; ap_msg.psmid = (((unsigned long long) current->pid) << 32) + @@ -1088,7 +1084,7 @@ static long zcrypt_msgtype6_modexpo_crt(struct zcrypt_queue *zq, ap_cancel_message(zq->queue, &ap_msg); } out_free: - free_page((unsigned long) ap_msg.message); + free_page((unsigned long) ap_msg.msg); return rc; } @@ -1107,8 +1103,8 @@ unsigned int get_cprb_fc(struct ica_xcRB *xcRB, .type = CEXXC_RESPONSE_TYPE_XCRB, }; - ap_msg->message = kmalloc(MSGTYPE06_MAX_MSG_SIZE, GFP_KERNEL); - if (!ap_msg->message) + ap_msg->msg = kmalloc(MSGTYPE06_MAX_MSG_SIZE, GFP_KERNEL); + if (!ap_msg->msg) return -ENOMEM; ap_msg->receive = zcrypt_msgtype6_receive; ap_msg->psmid = (((unsigned long long) current->pid) << 32) + @@ -1162,8 +1158,8 @@ unsigned int get_ep11cprb_fc(struct ep11_urb *xcrb, .type = CEXXC_RESPONSE_TYPE_EP11, }; - ap_msg->message = kmalloc(MSGTYPE06_MAX_MSG_SIZE, GFP_KERNEL); - if (!ap_msg->message) + ap_msg->msg = kmalloc(MSGTYPE06_MAX_MSG_SIZE, GFP_KERNEL); + if (!ap_msg->msg) return -ENOMEM; ap_msg->receive = zcrypt_msgtype6_receive_ep11; ap_msg->psmid = (((unsigned long long) current->pid) << 32) + @@ -1193,7 +1189,7 @@ static long zcrypt_msgtype6_send_ep11_cprb(struct zcrypt_queue *zq, struct ep11_cprb cprbx; unsigned char pld_tag; /* fixed value 0x30 */ unsigned char pld_lenfmt; /* payload length format */ - } __packed * msg = ap_msg->message; + } __packed * msg = ap_msg->msg; struct pld_hdr { unsigned char func_tag; /* fixed value 0x4 */ unsigned char func_len; /* fixed value 0x4 */ @@ -1256,8 +1252,8 
@@ unsigned int get_rng_fc(struct ap_message *ap_msg, int *func_code, .type = CEXXC_RESPONSE_TYPE_XCRB, }; - ap_msg->message = kmalloc(MSGTYPE06_MAX_MSG_SIZE, GFP_KERNEL); - if (!ap_msg->message) + ap_msg->msg = kmalloc(MSGTYPE06_MAX_MSG_SIZE, GFP_KERNEL); + if (!ap_msg->msg) return -ENOMEM; ap_msg->receive = zcrypt_msgtype6_receive; ap_msg->psmid = (((unsigned long long) current->pid) << 32) + @@ -1290,7 +1286,7 @@ static long zcrypt_msgtype6_rng(struct zcrypt_queue *zq, char rule[8]; short int verb_length; short int key_length; - } __packed * msg = ap_msg->message; + } __packed * msg = ap_msg->msg; struct response_type *rtype = (struct response_type *)(ap_msg->private); int rc; diff --git a/drivers/s390/crypto/zcrypt_msgtype6.h b/drivers/s390/crypto/zcrypt_msgtype6.h index 41a0df5f070f..0de280a81dd4 100644 --- a/drivers/s390/crypto/zcrypt_msgtype6.h +++ b/drivers/s390/crypto/zcrypt_msgtype6.h @@ -127,7 +127,7 @@ static inline void rng_type6CPRB_msgX(struct ap_message *ap_msg, char rule[8]; short int verb_length; short int key_length; - } __packed * msg = ap_msg->message; + } __packed * msg = ap_msg->msg; static struct type6_hdr static_type6_hdrX = { .type = 0x06, .offset1 = 0x00000058, @@ -154,7 +154,7 @@ static inline void rng_type6CPRB_msgX(struct ap_message *ap_msg, memcpy(msg->rule, "RANDOM ", 8); msg->verb_length = 0x02; msg->key_length = 0x02; - ap_msg->length = sizeof(*msg); + ap_msg->len = sizeof(*msg); *domain = (unsigned short)msg->cprbx.domain; } diff --git a/drivers/s390/crypto/zcrypt_queue.c b/drivers/s390/crypto/zcrypt_queue.c index b7d9fa567880..8bae6ad159a7 100644 --- a/drivers/s390/crypto/zcrypt_queue.c +++ b/drivers/s390/crypto/zcrypt_queue.c @@ -107,10 +107,10 @@ struct zcrypt_queue *zcrypt_queue_alloc(size_t max_response_size) zq = kzalloc(sizeof(struct zcrypt_queue), GFP_KERNEL); if (!zq) return NULL; - zq->reply.message = kmalloc(max_response_size, GFP_KERNEL); - if (!zq->reply.message) + zq->reply.msg = kmalloc(max_response_size, 
GFP_KERNEL); + if (!zq->reply.msg) goto out_free; - zq->reply.length = max_response_size; + zq->reply.len = max_response_size; INIT_LIST_HEAD(&zq->list); kref_init(&zq->refcount); return zq; @@ -123,7 +123,7 @@ EXPORT_SYMBOL(zcrypt_queue_alloc); void zcrypt_queue_free(struct zcrypt_queue *zq) { - kfree(zq->reply.message); + kfree(zq->reply.msg); kfree(zq); } EXPORT_SYMBOL(zcrypt_queue_free); From 7e202acb5c4397b17e275c017f84e4df34314578 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Wed, 20 May 2020 16:07:19 +0200 Subject: [PATCH 256/502] s390/zcrypt: split ioctl function into smaller code units The zcrypt_unlocked_ioctl() function has become large. So split the 4 ioctls ICARSAMODEXPO, ICARSACRT, ZSECSENDCPRB and ZSENDEP11CPRB out into new static functions. This makes the code more readable and is a preparation step for further improvements needed on these ioctls. Signed-off-by: Harald Freudenberger Signed-off-by: Heiko Carstens --- drivers/s390/crypto/zcrypt_api.c | 182 +++++++++++++++++-------------- 1 file changed, 101 insertions(+), 81 deletions(-) diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c index 7775ff84f223..4dbbfd88262c 100644 --- a/drivers/s390/crypto/zcrypt_api.c +++ b/drivers/s390/crypto/zcrypt_api.c @@ -1298,6 +1298,99 @@ static int zcrypt_requestq_count(void) return requestq_count; } +static int icarsamodexpo_ioctl(struct ap_perms *perms, unsigned long arg) +{ + int rc; + struct ica_rsa_modexpo mex; + struct ica_rsa_modexpo __user *umex = (void __user *) arg; + + if (copy_from_user(&mex, umex, sizeof(mex))) + return -EFAULT; + do { + rc = zcrypt_rsa_modexpo(perms, &mex); + } while (rc == -EAGAIN); + /* on failure: retry once again after a requested rescan */ + if ((rc == -ENODEV) && (zcrypt_process_rescan())) + do { + rc = zcrypt_rsa_modexpo(perms, &mex); + } while (rc == -EAGAIN); + if (rc) { + ZCRYPT_DBF(DBF_DEBUG, "ioctl ICARSAMODEXPO rc=%d\n", rc); + return rc; + } + return
put_user(mex.outputdatalength, &umex->outputdatalength); +} + +static int icarsacrt_ioctl(struct ap_perms *perms, unsigned long arg) +{ + int rc; + struct ica_rsa_modexpo_crt crt; + struct ica_rsa_modexpo_crt __user *ucrt = (void __user *) arg; + + if (copy_from_user(&crt, ucrt, sizeof(crt))) + return -EFAULT; + do { + rc = zcrypt_rsa_crt(perms, &crt); + } while (rc == -EAGAIN); + /* on failure: retry once again after a requested rescan */ + if ((rc == -ENODEV) && (zcrypt_process_rescan())) + do { + rc = zcrypt_rsa_crt(perms, &crt); + } while (rc == -EAGAIN); + if (rc) { + ZCRYPT_DBF(DBF_DEBUG, "ioctl ICARSACRT rc=%d\n", rc); + return rc; + } + return put_user(crt.outputdatalength, &ucrt->outputdatalength); +} + +static int zsecsendcprb_ioctl(struct ap_perms *perms, unsigned long arg) +{ + int rc; + struct ica_xcRB xcRB; + struct ica_xcRB __user *uxcRB = (void __user *) arg; + + if (copy_from_user(&xcRB, uxcRB, sizeof(xcRB))) + return -EFAULT; + do { + rc = _zcrypt_send_cprb(perms, &xcRB); + } while (rc == -EAGAIN); + /* on failure: retry once again after a requested rescan */ + if ((rc == -ENODEV) && (zcrypt_process_rescan())) + do { + rc = _zcrypt_send_cprb(perms, &xcRB); + } while (rc == -EAGAIN); + if (rc) + ZCRYPT_DBF(DBF_DEBUG, "ioctl ZSENDCPRB rc=%d status=0x%x\n", + rc, xcRB.status); + if (copy_to_user(uxcRB, &xcRB, sizeof(xcRB))) + return -EFAULT; + return rc; +} + +static int zsendep11cprb_ioctl(struct ap_perms *perms, unsigned long arg) +{ + int rc; + struct ep11_urb xcrb; + struct ep11_urb __user *uxcrb = (void __user *)arg; + + if (copy_from_user(&xcrb, uxcrb, sizeof(xcrb))) + return -EFAULT; + do { + rc = _zcrypt_send_ep11_cprb(perms, &xcrb); + } while (rc == -EAGAIN); + /* on failure: retry once again after a requested rescan */ + if ((rc == -ENODEV) && (zcrypt_process_rescan())) + do { + rc = _zcrypt_send_ep11_cprb(perms, &xcrb); + } while (rc == -EAGAIN); + if (rc) + ZCRYPT_DBF(DBF_DEBUG, "ioctl ZSENDEP11CPRB rc=%d\n", rc); + if 
(copy_to_user(uxcrb, &xcrb, sizeof(xcrb))) + return -EFAULT; + return rc; +} + static long zcrypt_unlocked_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -1310,87 +1403,14 @@ static long zcrypt_unlocked_ioctl(struct file *filp, unsigned int cmd, return rc; switch (cmd) { - case ICARSAMODEXPO: { - struct ica_rsa_modexpo __user *umex = (void __user *) arg; - struct ica_rsa_modexpo mex; - - if (copy_from_user(&mex, umex, sizeof(mex))) - return -EFAULT; - do { - rc = zcrypt_rsa_modexpo(perms, &mex); - } while (rc == -EAGAIN); - /* on failure: retry once again after a requested rescan */ - if ((rc == -ENODEV) && (zcrypt_process_rescan())) - do { - rc = zcrypt_rsa_modexpo(perms, &mex); - } while (rc == -EAGAIN); - if (rc) { - ZCRYPT_DBF(DBF_DEBUG, "ioctl ICARSAMODEXPO rc=%d\n", rc); - return rc; - } - return put_user(mex.outputdatalength, &umex->outputdatalength); - } - case ICARSACRT: { - struct ica_rsa_modexpo_crt __user *ucrt = (void __user *) arg; - struct ica_rsa_modexpo_crt crt; - - if (copy_from_user(&crt, ucrt, sizeof(crt))) - return -EFAULT; - do { - rc = zcrypt_rsa_crt(perms, &crt); - } while (rc == -EAGAIN); - /* on failure: retry once again after a requested rescan */ - if ((rc == -ENODEV) && (zcrypt_process_rescan())) - do { - rc = zcrypt_rsa_crt(perms, &crt); - } while (rc == -EAGAIN); - if (rc) { - ZCRYPT_DBF(DBF_DEBUG, "ioctl ICARSACRT rc=%d\n", rc); - return rc; - } - return put_user(crt.outputdatalength, &ucrt->outputdatalength); - } - case ZSECSENDCPRB: { - struct ica_xcRB __user *uxcRB = (void __user *) arg; - struct ica_xcRB xcRB; - - if (copy_from_user(&xcRB, uxcRB, sizeof(xcRB))) - return -EFAULT; - do { - rc = _zcrypt_send_cprb(perms, &xcRB); - } while (rc == -EAGAIN); - /* on failure: retry once again after a requested rescan */ - if ((rc == -ENODEV) && (zcrypt_process_rescan())) - do { - rc = _zcrypt_send_cprb(perms, &xcRB); - } while (rc == -EAGAIN); - if (rc) - ZCRYPT_DBF(DBF_DEBUG, "ioctl ZSENDCPRB rc=%d status=0x%x\n", - 
rc, xcRB.status); - if (copy_to_user(uxcRB, &xcRB, sizeof(xcRB))) - return -EFAULT; - return rc; - } - case ZSENDEP11CPRB: { - struct ep11_urb __user *uxcrb = (void __user *)arg; - struct ep11_urb xcrb; - - if (copy_from_user(&xcrb, uxcrb, sizeof(xcrb))) - return -EFAULT; - do { - rc = _zcrypt_send_ep11_cprb(perms, &xcrb); - } while (rc == -EAGAIN); - /* on failure: retry once again after a requested rescan */ - if ((rc == -ENODEV) && (zcrypt_process_rescan())) - do { - rc = _zcrypt_send_ep11_cprb(perms, &xcrb); - } while (rc == -EAGAIN); - if (rc) - ZCRYPT_DBF(DBF_DEBUG, "ioctl ZSENDEP11CPRB rc=%d\n", rc); - if (copy_to_user(uxcrb, &xcrb, sizeof(xcrb))) - return -EFAULT; - return rc; - } + case ICARSAMODEXPO: + return icarsamodexpo_ioctl(perms, arg); + case ICARSACRT: + return icarsacrt_ioctl(perms, arg); + case ZSECSENDCPRB: + return zsecsendcprb_ioctl(perms, arg); + case ZSENDEP11CPRB: + return zsendep11cprb_ioctl(perms, arg); case ZCRYPT_DEVICE_STATUS: { struct zcrypt_device_status_ext *device_status; size_t total_size = MAX_ZDEV_ENTRIES_EXT From dc4b6ded3c17ebe1d7532943192b2308c031c43b Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Tue, 26 May 2020 10:49:33 +0200 Subject: [PATCH 257/502] s390/ap: rename and clarify ap state machine related stuff There is a state machine held for each ap queue device. The states and functions related to this were sometimes named with _sm_ and sometimes without. This patch clarifies and renames all the ap queue state machine related functions, enums and defines to have a _sm_ in the name. There is no functional change coming with this patch - it's only beautifying code.
Signed-off-by: Harald Freudenberger Signed-off-by: Heiko Carstens --- drivers/s390/crypto/ap_bus.c | 18 +-- drivers/s390/crypto/ap_bus.h | 58 +++++----- drivers/s390/crypto/ap_queue.c | 200 ++++++++++++++++----------------- 3 files changed, 138 insertions(+), 138 deletions(-) diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index e71ca4a719a5..64fa66788194 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -342,13 +342,13 @@ static int ap_query_queue(ap_qid_t qid, int *queue_depth, int *device_type, } } -void ap_wait(enum ap_wait wait) +void ap_wait(enum ap_sm_wait wait) { ktime_t hr_time; switch (wait) { - case AP_WAIT_AGAIN: - case AP_WAIT_INTERRUPT: + case AP_SM_WAIT_AGAIN: + case AP_SM_WAIT_INTERRUPT: if (ap_using_interrupts()) break; if (ap_poll_kthread) { @@ -356,7 +356,7 @@ void ap_wait(enum ap_wait wait) break; } fallthrough; - case AP_WAIT_TIMEOUT: + case AP_SM_WAIT_TIMEOUT: spin_lock_bh(&ap_poll_timer_lock); if (!hrtimer_is_queued(&ap_poll_timer)) { hr_time = poll_timeout; @@ -365,7 +365,7 @@ void ap_wait(enum ap_wait wait) } spin_unlock_bh(&ap_poll_timer_lock); break; - case AP_WAIT_NONE: + case AP_SM_WAIT_NONE: default: break; } @@ -382,7 +382,7 @@ void ap_request_timeout(struct timer_list *t) struct ap_queue *aq = from_timer(aq, t, timeout); spin_lock_bh(&aq->lock); - ap_wait(ap_sm_event(aq, AP_EVENT_TIMEOUT)); + ap_wait(ap_sm_event(aq, AP_SM_EVENT_TIMEOUT)); spin_unlock_bh(&aq->lock); } @@ -418,7 +418,7 @@ static void ap_tasklet_fn(unsigned long dummy) { int bkt; struct ap_queue *aq; - enum ap_wait wait = AP_WAIT_NONE; + enum ap_sm_wait wait = AP_SM_WAIT_NONE; /* Reset the indicator if interrupts are used. Thus new interrupts can * be received. 
Doing it in the beginning of the tasklet is therefor @@ -430,7 +430,7 @@ static void ap_tasklet_fn(unsigned long dummy) spin_lock_bh(&ap_queues_lock); hash_for_each(ap_queues, bkt, aq, hnode) { spin_lock_bh(&aq->lock); - wait = min(wait, ap_sm_event_loop(aq, AP_EVENT_POLL)); + wait = min(wait, ap_sm_event_loop(aq, AP_SM_EVENT_POLL)); spin_unlock_bh(&aq->lock); } spin_unlock_bh(&ap_queues_lock); @@ -1370,7 +1370,7 @@ static void _ap_scan_bus_adapter(int id) borked = 1; else { spin_lock_bh(&aq->lock); - borked = aq->state == AP_STATE_BORKED; + borked = aq->sm_state == AP_SM_STATE_BORKED; spin_unlock_bh(&aq->lock); } if (borked) { diff --git a/drivers/s390/crypto/ap_bus.h b/drivers/s390/crypto/ap_bus.h index 69432e93643a..1a1d5e3c8d45 100644 --- a/drivers/s390/crypto/ap_bus.h +++ b/drivers/s390/crypto/ap_bus.h @@ -83,39 +83,39 @@ static inline int ap_test_bit(unsigned int *ptr, unsigned int nr) #define AP_INTR_ENABLED 1 /* AP interrupt enabled */ /* - * AP device states + * AP queue state machine states */ -enum ap_state { - AP_STATE_RESET_START, - AP_STATE_RESET_WAIT, - AP_STATE_SETIRQ_WAIT, - AP_STATE_IDLE, - AP_STATE_WORKING, - AP_STATE_QUEUE_FULL, - AP_STATE_REMOVE, /* about to be removed from driver */ - AP_STATE_UNBOUND, /* momentary not bound to a driver */ - AP_STATE_BORKED, /* broken */ - NR_AP_STATES +enum ap_sm_state { + AP_SM_STATE_RESET_START, + AP_SM_STATE_RESET_WAIT, + AP_SM_STATE_SETIRQ_WAIT, + AP_SM_STATE_IDLE, + AP_SM_STATE_WORKING, + AP_SM_STATE_QUEUE_FULL, + AP_SM_STATE_REMOVE, /* about to be removed from driver */ + AP_SM_STATE_UNBOUND, /* momentary not bound to a driver */ + AP_SM_STATE_BORKED, /* broken */ + NR_AP_SM_STATES }; /* - * AP device events + * AP queue state machine events */ -enum ap_event { - AP_EVENT_POLL, - AP_EVENT_TIMEOUT, - NR_AP_EVENTS +enum ap_sm_event { + AP_SM_EVENT_POLL, + AP_SM_EVENT_TIMEOUT, + NR_AP_SM_EVENTS }; /* - * AP wait behaviour + * AP queue state wait behaviour */ -enum ap_wait { - AP_WAIT_AGAIN, /* retry 
immediately */ - AP_WAIT_TIMEOUT, /* wait for timeout */ - AP_WAIT_INTERRUPT, /* wait for thin interrupt (if available) */ - AP_WAIT_NONE, /* no wait */ - NR_AP_WAIT +enum ap_sm_wait { + AP_SM_WAIT_AGAIN, /* retry immediately */ + AP_SM_WAIT_TIMEOUT, /* wait for timeout */ + AP_SM_WAIT_INTERRUPT, /* wait for thin interrupt (if available) */ + AP_SM_WAIT_NONE, /* no wait */ + NR_AP_SM_WAIT }; struct ap_device; @@ -172,7 +172,7 @@ struct ap_queue { ap_qid_t qid; /* AP queue id. */ int interrupt; /* indicate if interrupts are enabled */ int queue_count; /* # messages currently on AP queue. */ - enum ap_state state; /* State of the AP device. */ + enum ap_sm_state sm_state; /* ap queue state machine state */ int pendingq_count; /* # requests on pendingq list. */ int requestq_count; /* # requests on requestq list. */ u64 total_request_count; /* # requests ever for this AP device.*/ @@ -185,7 +185,7 @@ struct ap_queue { #define to_ap_queue(x) container_of((x), struct ap_queue, ap_dev.device) -typedef enum ap_wait (ap_func_t)(struct ap_queue *queue); +typedef enum ap_sm_wait (ap_func_t)(struct ap_queue *queue); struct ap_message { struct list_head list; /* Request queueing. 
*/ @@ -231,15 +231,15 @@ static inline void ap_release_message(struct ap_message *ap_msg) int ap_send(ap_qid_t, unsigned long long, void *, size_t); int ap_recv(ap_qid_t, unsigned long long *, void *, size_t); -enum ap_wait ap_sm_event(struct ap_queue *aq, enum ap_event event); -enum ap_wait ap_sm_event_loop(struct ap_queue *aq, enum ap_event event); +enum ap_sm_wait ap_sm_event(struct ap_queue *aq, enum ap_sm_event event); +enum ap_sm_wait ap_sm_event_loop(struct ap_queue *aq, enum ap_sm_event event); void ap_queue_message(struct ap_queue *aq, struct ap_message *ap_msg); void ap_cancel_message(struct ap_queue *aq, struct ap_message *ap_msg); void ap_flush_queue(struct ap_queue *aq); void *ap_airq_ptr(void); -void ap_wait(enum ap_wait wait); +void ap_wait(enum ap_sm_wait wait); void ap_request_timeout(struct timer_list *t); void ap_bus_force_rescan(void); diff --git a/drivers/s390/crypto/ap_queue.c b/drivers/s390/crypto/ap_queue.c index d6cc384f294b..688ebebbf98c 100644 --- a/drivers/s390/crypto/ap_queue.c +++ b/drivers/s390/crypto/ap_queue.c @@ -119,9 +119,9 @@ EXPORT_SYMBOL(ap_recv); /* State machine definitions and helpers */ -static enum ap_wait ap_sm_nop(struct ap_queue *aq) +static enum ap_sm_wait ap_sm_nop(struct ap_queue *aq) { - return AP_WAIT_NONE; + return AP_SM_WAIT_NONE; } /** @@ -129,7 +129,7 @@ static enum ap_wait ap_sm_nop(struct ap_queue *aq) * not change the state of the device. * @aq: pointer to the AP queue * - * Returns AP_WAIT_NONE, AP_WAIT_AGAIN, or AP_WAIT_INTERRUPT + * Returns AP_SM_WAIT_NONE, AP_SM_WAIT_AGAIN, or AP_SM_WAIT_INTERRUPT */ static struct ap_queue_status ap_sm_recv(struct ap_queue *aq) { @@ -172,31 +172,31 @@ static struct ap_queue_status ap_sm_recv(struct ap_queue *aq) * ap_sm_read(): Receive pending reply messages from an AP queue. 
* @aq: pointer to the AP queue * - * Returns AP_WAIT_NONE, AP_WAIT_AGAIN, or AP_WAIT_INTERRUPT + * Returns AP_SM_WAIT_NONE, AP_SM_WAIT_AGAIN, or AP_SM_WAIT_INTERRUPT */ -static enum ap_wait ap_sm_read(struct ap_queue *aq) +static enum ap_sm_wait ap_sm_read(struct ap_queue *aq) { struct ap_queue_status status; if (!aq->reply) - return AP_WAIT_NONE; + return AP_SM_WAIT_NONE; status = ap_sm_recv(aq); switch (status.response_code) { case AP_RESPONSE_NORMAL: if (aq->queue_count > 0) { - aq->state = AP_STATE_WORKING; - return AP_WAIT_AGAIN; + aq->sm_state = AP_SM_STATE_WORKING; + return AP_SM_WAIT_AGAIN; } - aq->state = AP_STATE_IDLE; - return AP_WAIT_NONE; + aq->sm_state = AP_SM_STATE_IDLE; + return AP_SM_WAIT_NONE; case AP_RESPONSE_NO_PENDING_REPLY: if (aq->queue_count > 0) - return AP_WAIT_INTERRUPT; - aq->state = AP_STATE_IDLE; - return AP_WAIT_NONE; + return AP_SM_WAIT_INTERRUPT; + aq->sm_state = AP_SM_STATE_IDLE; + return AP_SM_WAIT_NONE; default: - aq->state = AP_STATE_BORKED; - return AP_WAIT_NONE; + aq->sm_state = AP_SM_STATE_BORKED; + return AP_SM_WAIT_NONE; } } @@ -204,15 +204,15 @@ static enum ap_wait ap_sm_read(struct ap_queue *aq) * ap_sm_write(): Send messages from the request queue to an AP queue. * @aq: pointer to the AP queue * - * Returns AP_WAIT_NONE, AP_WAIT_AGAIN, or AP_WAIT_INTERRUPT + * Returns AP_SM_WAIT_NONE, AP_SM_WAIT_AGAIN, or AP_SM_WAIT_INTERRUPT */ -static enum ap_wait ap_sm_write(struct ap_queue *aq) +static enum ap_sm_wait ap_sm_write(struct ap_queue *aq) { struct ap_queue_status status; struct ap_message *ap_msg; if (aq->requestq_count <= 0) - return AP_WAIT_NONE; + return AP_SM_WAIT_NONE; /* Start the next request on the queue. 
*/ ap_msg = list_entry(aq->requestq.next, struct ap_message, list); status = __ap_send(aq->qid, ap_msg->psmid, @@ -227,26 +227,26 @@ static enum ap_wait ap_sm_write(struct ap_queue *aq) aq->requestq_count--; aq->pendingq_count++; if (aq->queue_count < aq->card->queue_depth) { - aq->state = AP_STATE_WORKING; - return AP_WAIT_AGAIN; + aq->sm_state = AP_SM_STATE_WORKING; + return AP_SM_WAIT_AGAIN; } fallthrough; case AP_RESPONSE_Q_FULL: - aq->state = AP_STATE_QUEUE_FULL; - return AP_WAIT_INTERRUPT; + aq->sm_state = AP_SM_STATE_QUEUE_FULL; + return AP_SM_WAIT_INTERRUPT; case AP_RESPONSE_RESET_IN_PROGRESS: - aq->state = AP_STATE_RESET_WAIT; - return AP_WAIT_TIMEOUT; + aq->sm_state = AP_SM_STATE_RESET_WAIT; + return AP_SM_WAIT_TIMEOUT; case AP_RESPONSE_MESSAGE_TOO_BIG: case AP_RESPONSE_REQ_FAC_NOT_INST: list_del_init(&ap_msg->list); aq->requestq_count--; ap_msg->rc = -EINVAL; ap_msg->receive(aq, ap_msg, NULL); - return AP_WAIT_AGAIN; + return AP_SM_WAIT_AGAIN; default: - aq->state = AP_STATE_BORKED; - return AP_WAIT_NONE; + aq->sm_state = AP_SM_STATE_BORKED; + return AP_SM_WAIT_NONE; } } @@ -254,9 +254,9 @@ static enum ap_wait ap_sm_write(struct ap_queue *aq) * ap_sm_read_write(): Send and receive messages to/from an AP queue. * @aq: pointer to the AP queue * - * Returns AP_WAIT_NONE, AP_WAIT_AGAIN, or AP_WAIT_INTERRUPT + * Returns AP_SM_WAIT_NONE, AP_SM_WAIT_AGAIN, or AP_SM_WAIT_INTERRUPT */ -static enum ap_wait ap_sm_read_write(struct ap_queue *aq) +static enum ap_sm_wait ap_sm_read_write(struct ap_queue *aq) { return min(ap_sm_read(aq), ap_sm_write(aq)); } @@ -267,7 +267,7 @@ static enum ap_wait ap_sm_read_write(struct ap_queue *aq) * * Submit the Reset command to an AP queue. 
*/ -static enum ap_wait ap_sm_reset(struct ap_queue *aq) +static enum ap_sm_wait ap_sm_reset(struct ap_queue *aq) { struct ap_queue_status status; @@ -275,17 +275,17 @@ static enum ap_wait ap_sm_reset(struct ap_queue *aq) switch (status.response_code) { case AP_RESPONSE_NORMAL: case AP_RESPONSE_RESET_IN_PROGRESS: - aq->state = AP_STATE_RESET_WAIT; + aq->sm_state = AP_SM_STATE_RESET_WAIT; aq->interrupt = AP_INTR_DISABLED; - return AP_WAIT_TIMEOUT; + return AP_SM_WAIT_TIMEOUT; case AP_RESPONSE_BUSY: - return AP_WAIT_TIMEOUT; + return AP_SM_WAIT_TIMEOUT; case AP_RESPONSE_Q_NOT_AVAIL: case AP_RESPONSE_DECONFIGURED: case AP_RESPONSE_CHECKSTOPPED: default: - aq->state = AP_STATE_BORKED; - return AP_WAIT_NONE; + aq->sm_state = AP_SM_STATE_BORKED; + return AP_SM_WAIT_NONE; } } @@ -295,7 +295,7 @@ static enum ap_wait ap_sm_reset(struct ap_queue *aq) * * Returns AP_POLL_IMMEDIATELY, AP_POLL_AFTER_TIMEROUT or 0. */ -static enum ap_wait ap_sm_reset_wait(struct ap_queue *aq) +static enum ap_sm_wait ap_sm_reset_wait(struct ap_queue *aq) { struct ap_queue_status status; void *lsi_ptr; @@ -311,20 +311,20 @@ static enum ap_wait ap_sm_reset_wait(struct ap_queue *aq) case AP_RESPONSE_NORMAL: lsi_ptr = ap_airq_ptr(); if (lsi_ptr && ap_queue_enable_interruption(aq, lsi_ptr) == 0) - aq->state = AP_STATE_SETIRQ_WAIT; + aq->sm_state = AP_SM_STATE_SETIRQ_WAIT; else - aq->state = (aq->queue_count > 0) ? - AP_STATE_WORKING : AP_STATE_IDLE; - return AP_WAIT_AGAIN; + aq->sm_state = (aq->queue_count > 0) ? 
+ AP_SM_STATE_WORKING : AP_SM_STATE_IDLE; + return AP_SM_WAIT_AGAIN; case AP_RESPONSE_BUSY: case AP_RESPONSE_RESET_IN_PROGRESS: - return AP_WAIT_TIMEOUT; + return AP_SM_WAIT_TIMEOUT; case AP_RESPONSE_Q_NOT_AVAIL: case AP_RESPONSE_DECONFIGURED: case AP_RESPONSE_CHECKSTOPPED: default: - aq->state = AP_STATE_BORKED; - return AP_WAIT_NONE; + aq->sm_state = AP_SM_STATE_BORKED; + return AP_SM_WAIT_NONE; } } @@ -334,7 +334,7 @@ static enum ap_wait ap_sm_reset_wait(struct ap_queue *aq) * * Returns AP_POLL_IMMEDIATELY, AP_POLL_AFTER_TIMEROUT or 0. */ -static enum ap_wait ap_sm_setirq_wait(struct ap_queue *aq) +static enum ap_sm_wait ap_sm_setirq_wait(struct ap_queue *aq) { struct ap_queue_status status; @@ -348,75 +348,75 @@ static enum ap_wait ap_sm_setirq_wait(struct ap_queue *aq) if (status.irq_enabled == 1) { /* Irqs are now enabled */ aq->interrupt = AP_INTR_ENABLED; - aq->state = (aq->queue_count > 0) ? - AP_STATE_WORKING : AP_STATE_IDLE; + aq->sm_state = (aq->queue_count > 0) ? + AP_SM_STATE_WORKING : AP_SM_STATE_IDLE; } switch (status.response_code) { case AP_RESPONSE_NORMAL: if (aq->queue_count > 0) - return AP_WAIT_AGAIN; + return AP_SM_WAIT_AGAIN; fallthrough; case AP_RESPONSE_NO_PENDING_REPLY: - return AP_WAIT_TIMEOUT; + return AP_SM_WAIT_TIMEOUT; default: - aq->state = AP_STATE_BORKED; - return AP_WAIT_NONE; + aq->sm_state = AP_SM_STATE_BORKED; + return AP_SM_WAIT_NONE; } } /* * AP state machine jump table */ -static ap_func_t *ap_jumptable[NR_AP_STATES][NR_AP_EVENTS] = { - [AP_STATE_RESET_START] = { - [AP_EVENT_POLL] = ap_sm_reset, - [AP_EVENT_TIMEOUT] = ap_sm_nop, +static ap_func_t *ap_jumptable[NR_AP_SM_STATES][NR_AP_SM_EVENTS] = { + [AP_SM_STATE_RESET_START] = { + [AP_SM_EVENT_POLL] = ap_sm_reset, + [AP_SM_EVENT_TIMEOUT] = ap_sm_nop, }, - [AP_STATE_RESET_WAIT] = { - [AP_EVENT_POLL] = ap_sm_reset_wait, - [AP_EVENT_TIMEOUT] = ap_sm_nop, + [AP_SM_STATE_RESET_WAIT] = { + [AP_SM_EVENT_POLL] = ap_sm_reset_wait, + [AP_SM_EVENT_TIMEOUT] = ap_sm_nop, }, - 
[AP_STATE_SETIRQ_WAIT] = { - [AP_EVENT_POLL] = ap_sm_setirq_wait, - [AP_EVENT_TIMEOUT] = ap_sm_nop, + [AP_SM_STATE_SETIRQ_WAIT] = { + [AP_SM_EVENT_POLL] = ap_sm_setirq_wait, + [AP_SM_EVENT_TIMEOUT] = ap_sm_nop, }, - [AP_STATE_IDLE] = { - [AP_EVENT_POLL] = ap_sm_write, - [AP_EVENT_TIMEOUT] = ap_sm_nop, + [AP_SM_STATE_IDLE] = { + [AP_SM_EVENT_POLL] = ap_sm_write, + [AP_SM_EVENT_TIMEOUT] = ap_sm_nop, }, - [AP_STATE_WORKING] = { - [AP_EVENT_POLL] = ap_sm_read_write, - [AP_EVENT_TIMEOUT] = ap_sm_reset, + [AP_SM_STATE_WORKING] = { + [AP_SM_EVENT_POLL] = ap_sm_read_write, + [AP_SM_EVENT_TIMEOUT] = ap_sm_reset, }, - [AP_STATE_QUEUE_FULL] = { - [AP_EVENT_POLL] = ap_sm_read, - [AP_EVENT_TIMEOUT] = ap_sm_reset, + [AP_SM_STATE_QUEUE_FULL] = { + [AP_SM_EVENT_POLL] = ap_sm_read, + [AP_SM_EVENT_TIMEOUT] = ap_sm_reset, }, - [AP_STATE_REMOVE] = { - [AP_EVENT_POLL] = ap_sm_nop, - [AP_EVENT_TIMEOUT] = ap_sm_nop, + [AP_SM_STATE_REMOVE] = { + [AP_SM_EVENT_POLL] = ap_sm_nop, + [AP_SM_EVENT_TIMEOUT] = ap_sm_nop, }, - [AP_STATE_UNBOUND] = { - [AP_EVENT_POLL] = ap_sm_nop, - [AP_EVENT_TIMEOUT] = ap_sm_nop, + [AP_SM_STATE_UNBOUND] = { + [AP_SM_EVENT_POLL] = ap_sm_nop, + [AP_SM_EVENT_TIMEOUT] = ap_sm_nop, }, - [AP_STATE_BORKED] = { - [AP_EVENT_POLL] = ap_sm_nop, - [AP_EVENT_TIMEOUT] = ap_sm_nop, + [AP_SM_STATE_BORKED] = { + [AP_SM_EVENT_POLL] = ap_sm_nop, + [AP_SM_EVENT_TIMEOUT] = ap_sm_nop, }, }; -enum ap_wait ap_sm_event(struct ap_queue *aq, enum ap_event event) +enum ap_sm_wait ap_sm_event(struct ap_queue *aq, enum ap_sm_event event) { - return ap_jumptable[aq->state][event](aq); + return ap_jumptable[aq->sm_state][event](aq); } -enum ap_wait ap_sm_event_loop(struct ap_queue *aq, enum ap_event event) +enum ap_sm_wait ap_sm_event_loop(struct ap_queue *aq, enum ap_sm_event event) { - enum ap_wait wait; + enum ap_sm_wait wait; - while ((wait = ap_sm_event(aq, event)) == AP_WAIT_AGAIN) + while ((wait = ap_sm_event(aq, event)) == AP_SM_WAIT_AGAIN) ; return wait; } @@ -487,13 +487,13 @@ static 
ssize_t reset_show(struct device *dev, int rc = 0; spin_lock_bh(&aq->lock); - switch (aq->state) { - case AP_STATE_RESET_START: - case AP_STATE_RESET_WAIT: + switch (aq->sm_state) { + case AP_SM_STATE_RESET_START: + case AP_SM_STATE_RESET_WAIT: rc = scnprintf(buf, PAGE_SIZE, "Reset in progress.\n"); break; - case AP_STATE_WORKING: - case AP_STATE_QUEUE_FULL: + case AP_SM_STATE_WORKING: + case AP_SM_STATE_QUEUE_FULL: rc = scnprintf(buf, PAGE_SIZE, "Reset Timer armed.\n"); break; default: @@ -511,8 +511,8 @@ static ssize_t reset_store(struct device *dev, spin_lock_bh(&aq->lock); __ap_flush_queue(aq); - aq->state = AP_STATE_RESET_START; - ap_wait(ap_sm_event(aq, AP_EVENT_POLL)); + aq->sm_state = AP_SM_STATE_RESET_START; + ap_wait(ap_sm_event(aq, AP_SM_EVENT_POLL)); spin_unlock_bh(&aq->lock); AP_DBF(DBF_INFO, "reset queue=%02x.%04x triggered by user\n", @@ -530,7 +530,7 @@ static ssize_t interrupt_show(struct device *dev, int rc = 0; spin_lock_bh(&aq->lock); - if (aq->state == AP_STATE_SETIRQ_WAIT) + if (aq->sm_state == AP_SM_STATE_SETIRQ_WAIT) rc = scnprintf(buf, PAGE_SIZE, "Enable Interrupt pending.\n"); else if (aq->interrupt == AP_INTR_ENABLED) rc = scnprintf(buf, PAGE_SIZE, "Interrupts enabled.\n"); @@ -587,7 +587,7 @@ struct ap_queue *ap_queue_create(ap_qid_t qid, int device_type) aq->ap_dev.device.type = &ap_queue_type; aq->ap_dev.device_type = device_type; aq->qid = qid; - aq->state = AP_STATE_UNBOUND; + aq->sm_state = AP_SM_STATE_UNBOUND; aq->interrupt = AP_INTR_DISABLED; spin_lock_init(&aq->lock); INIT_LIST_HEAD(&aq->pendingq); @@ -602,7 +602,7 @@ void ap_queue_init_reply(struct ap_queue *aq, struct ap_message *reply) aq->reply = reply; spin_lock_bh(&aq->lock); - ap_wait(ap_sm_event(aq, AP_EVENT_POLL)); + ap_wait(ap_sm_event(aq, AP_SM_EVENT_POLL)); spin_unlock_bh(&aq->lock); } EXPORT_SYMBOL(ap_queue_init_reply); @@ -626,7 +626,7 @@ void ap_queue_message(struct ap_queue *aq, struct ap_message *ap_msg) aq->total_request_count++; 
atomic64_inc(&aq->card->total_request_count); /* Send/receive as many request from the queue as possible. */ - ap_wait(ap_sm_event_loop(aq, AP_EVENT_POLL)); + ap_wait(ap_sm_event_loop(aq, AP_SM_EVENT_POLL)); spin_unlock_bh(&aq->lock); } EXPORT_SYMBOL(ap_queue_message); @@ -699,7 +699,7 @@ void ap_queue_prepare_remove(struct ap_queue *aq) /* flush queue */ __ap_flush_queue(aq); /* set REMOVE state to prevent new messages are queued in */ - aq->state = AP_STATE_REMOVE; + aq->sm_state = AP_SM_STATE_REMOVE; spin_unlock_bh(&aq->lock); del_timer_sync(&aq->timeout); } @@ -708,22 +708,22 @@ void ap_queue_remove(struct ap_queue *aq) { /* * all messages have been flushed and the state is - * AP_STATE_REMOVE. Now reset with zero which also + * AP_SM_STATE_REMOVE. Now reset with zero which also * clears the irq registration and move the state - * to AP_STATE_UNBOUND to signal that this queue + * to AP_SM_STATE_UNBOUND to signal that this queue * is not used by any driver currently. */ spin_lock_bh(&aq->lock); ap_zapq(aq->qid); - aq->state = AP_STATE_UNBOUND; + aq->sm_state = AP_SM_STATE_UNBOUND; spin_unlock_bh(&aq->lock); } void ap_queue_init_state(struct ap_queue *aq) { spin_lock_bh(&aq->lock); - aq->state = AP_STATE_RESET_START; - ap_wait(ap_sm_event(aq, AP_EVENT_POLL)); + aq->sm_state = AP_SM_STATE_RESET_START; + ap_wait(ap_sm_event(aq, AP_SM_EVENT_POLL)); spin_unlock_bh(&aq->lock); } EXPORT_SYMBOL(ap_queue_init_state); From a303e88743f6514995c31fe611011935ea7f040c Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Fri, 12 Jun 2020 10:13:23 +0200 Subject: [PATCH 258/502] s390/zcrypt: provide cex4 cca sysfs attributes for cex3 This patch introduces the sysfs attributes serialnr and mkvps for cex2c and cex3c cards. These sysfs attributes are available for cex4c and higher since commit 7c4e91c0959b ("s390/zcrypt: new sysfs attributes serialnr and mkvps") and this patch now provides the same for the older cex2 and cex3 cards.
Signed-off-by: Harald Freudenberger Signed-off-by: Heiko Carstens --- drivers/s390/crypto/zcrypt_cex2c.c | 114 +++++++++++++++++++++++++++++ drivers/s390/crypto/zcrypt_cex4.c | 26 +++++-- 2 files changed, 132 insertions(+), 8 deletions(-) diff --git a/drivers/s390/crypto/zcrypt_cex2c.c b/drivers/s390/crypto/zcrypt_cex2c.c index 993addb726e0..f00127a78bab 100644 --- a/drivers/s390/crypto/zcrypt_cex2c.c +++ b/drivers/s390/crypto/zcrypt_cex2c.c @@ -25,6 +25,7 @@ #include "zcrypt_msgtype6.h" #include "zcrypt_cex2c.h" #include "zcrypt_cca_key.h" +#include "zcrypt_ccamisc.h" #define CEX2C_MIN_MOD_SIZE 16 /* 128 bits */ #define CEX2C_MAX_MOD_SIZE 256 /* 2048 bits */ @@ -58,6 +59,91 @@ static struct ap_device_id zcrypt_cex2c_queue_ids[] = { MODULE_DEVICE_TABLE(ap, zcrypt_cex2c_queue_ids); +/* + * CCA card additional device attributes + */ +static ssize_t cca_serialnr_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct cca_info ci; + struct ap_card *ac = to_ap_card(dev); + struct zcrypt_card *zc = ac->private; + + memset(&ci, 0, sizeof(ci)); + + if (ap_domain_index >= 0) + cca_get_info(ac->id, ap_domain_index, &ci, zc->online); + + return scnprintf(buf, PAGE_SIZE, "%s\n", ci.serial); +} + +static struct device_attribute dev_attr_cca_serialnr = + __ATTR(serialnr, 0444, cca_serialnr_show, NULL); + +static struct attribute *cca_card_attrs[] = { + &dev_attr_cca_serialnr.attr, + NULL, +}; + +static const struct attribute_group cca_card_attr_grp = { + .attrs = cca_card_attrs, +}; + + /* + * CCA queue additional device attributes + */ +static ssize_t cca_mkvps_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int n = 0; + struct cca_info ci; + struct zcrypt_queue *zq = to_ap_queue(dev)->private; + static const char * const cao_state[] = { "invalid", "valid" }; + static const char * const new_state[] = { "empty", "partial", "full" }; + + memset(&ci, 0, sizeof(ci)); + + cca_get_info(AP_QID_CARD(zq->queue->qid), + 
AP_QID_QUEUE(zq->queue->qid), + &ci, zq->online); + + if (ci.new_mk_state >= '1' && ci.new_mk_state <= '3') + n = scnprintf(buf, PAGE_SIZE, "AES NEW: %s 0x%016llx\n", + new_state[ci.new_mk_state - '1'], ci.new_mkvp); + else + n = scnprintf(buf, PAGE_SIZE, "AES NEW: - -\n"); + + if (ci.cur_mk_state >= '1' && ci.cur_mk_state <= '2') + n += scnprintf(buf + n, PAGE_SIZE - n, + "AES CUR: %s 0x%016llx\n", + cao_state[ci.cur_mk_state - '1'], ci.cur_mkvp); + else + n += scnprintf(buf + n, PAGE_SIZE - n, "AES CUR: - -\n"); + + if (ci.old_mk_state >= '1' && ci.old_mk_state <= '2') + n += scnprintf(buf + n, PAGE_SIZE - n, + "AES OLD: %s 0x%016llx\n", + cao_state[ci.old_mk_state - '1'], ci.old_mkvp); + else + n += scnprintf(buf + n, PAGE_SIZE - n, "AES OLD: - -\n"); + + return n; +} + +static struct device_attribute dev_attr_cca_mkvps = + __ATTR(mkvps, 0444, cca_mkvps_show, NULL); + +static struct attribute *cca_queue_attrs[] = { + &dev_attr_cca_mkvps.attr, + NULL, +}; + +static const struct attribute_group cca_queue_attr_grp = { + .attrs = cca_queue_attrs, +}; + /** * Large random number detection function. Its sends a message to a CEX2C/CEX3C * card to find out if large random numbers are supported. 
@@ -178,6 +264,17 @@ static int zcrypt_cex2c_card_probe(struct ap_device *ap_dev) if (rc) { ac->private = NULL; zcrypt_card_free(zc); + return rc; + } + + if (ap_test_bit(&ac->functions, AP_FUNC_COPRO)) { + rc = sysfs_create_group(&ap_dev->device.kobj, + &cca_card_attr_grp); + if (rc) { + zcrypt_card_unregister(zc); + ac->private = NULL; + zcrypt_card_free(zc); + } } return rc; @@ -189,8 +286,11 @@ static int zcrypt_cex2c_card_probe(struct ap_device *ap_dev) */ static void zcrypt_cex2c_card_remove(struct ap_device *ap_dev) { + struct ap_card *ac = to_ap_card(&ap_dev->device); struct zcrypt_card *zc = to_ap_card(&ap_dev->device)->private; + if (ap_test_bit(&ac->functions, AP_FUNC_COPRO)) + sysfs_remove_group(&ap_dev->device.kobj, &cca_card_attr_grp); if (zc) zcrypt_card_unregister(zc); } @@ -239,7 +339,19 @@ static int zcrypt_cex2c_queue_probe(struct ap_device *ap_dev) if (rc) { aq->private = NULL; zcrypt_queue_free(zq); + return rc; } + + if (ap_test_bit(&aq->card->functions, AP_FUNC_COPRO)) { + rc = sysfs_create_group(&ap_dev->device.kobj, + &cca_queue_attr_grp); + if (rc) { + zcrypt_queue_unregister(zq); + aq->private = NULL; + zcrypt_queue_free(zq); + } + } + return rc; } @@ -252,6 +364,8 @@ static void zcrypt_cex2c_queue_remove(struct ap_device *ap_dev) struct ap_queue *aq = to_ap_queue(&ap_dev->device); struct zcrypt_queue *zq = aq->private; + if (ap_test_bit(&aq->card->functions, AP_FUNC_COPRO)) + sysfs_remove_group(&ap_dev->device.kobj, &cca_queue_attr_grp); if (zq) zcrypt_queue_unregister(zq); } diff --git a/drivers/s390/crypto/zcrypt_cex4.c b/drivers/s390/crypto/zcrypt_cex4.c index 337ec71ddb58..dc20d983e468 100644 --- a/drivers/s390/crypto/zcrypt_cex4.c +++ b/drivers/s390/crypto/zcrypt_cex4.c @@ -529,22 +529,27 @@ static int zcrypt_cex4_card_probe(struct ap_device *ap_dev) if (rc) { ac->private = NULL; zcrypt_card_free(zc); - goto out; + return rc; } if (ap_test_bit(&ac->functions, AP_FUNC_COPRO)) { rc = sysfs_create_group(&ap_dev->device.kobj, 
&cca_card_attr_grp); - if (rc) + if (rc) { zcrypt_card_unregister(zc); + ac->private = NULL; + zcrypt_card_free(zc); + } } else if (ap_test_bit(&ac->functions, AP_FUNC_EP11)) { rc = sysfs_create_group(&ap_dev->device.kobj, &ep11_card_attr_grp); - if (rc) + if (rc) { zcrypt_card_unregister(zc); + ac->private = NULL; + zcrypt_card_free(zc); + } } -out: return rc; } @@ -617,22 +622,27 @@ static int zcrypt_cex4_queue_probe(struct ap_device *ap_dev) if (rc) { aq->private = NULL; zcrypt_queue_free(zq); - goto out; + return rc; } if (ap_test_bit(&aq->card->functions, AP_FUNC_COPRO)) { rc = sysfs_create_group(&ap_dev->device.kobj, &cca_queue_attr_grp); - if (rc) + if (rc) { zcrypt_queue_unregister(zq); + aq->private = NULL; + zcrypt_queue_free(zq); + } } else if (ap_test_bit(&aq->card->functions, AP_FUNC_EP11)) { rc = sysfs_create_group(&ap_dev->device.kobj, &ep11_queue_attr_grp); - if (rc) + if (rc) { zcrypt_queue_unregister(zq); + aq->private = NULL; + zcrypt_queue_free(zq); + } } -out: return rc; } From bc67f10ad1d76a30e01c539c0043417fa34648d7 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 3 Jul 2020 09:21:34 +0530 Subject: [PATCH 259/502] arm64/cpufeature: Add remaining feature bits in ID_AA64MMFR0 register Enable ECV, FGT, EXS feature bits in ID_AA64MMFR0 register as per ARM DDI 0487F.a specification.
Suggested-by: Will Deacon Signed-off-by: Anshuman Khandual Reviewed-by: Suzuki K Poulose Cc: Will Deacon Cc: Mark Rutland Cc: Suzuki K Poulose Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Link: https://lore.kernel.org/r/1593748297-1965-2-git-send-email-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/sysreg.h | 3 +++ arch/arm64/kernel/cpufeature.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 463175f80341..2e36dfde2570 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -706,6 +706,9 @@ #define ID_AA64ZFR0_SVEVER_SVE2 0x1 /* id_aa64mmfr0 */ +#define ID_AA64MMFR0_ECV_SHIFT 60 +#define ID_AA64MMFR0_FGT_SHIFT 56 +#define ID_AA64MMFR0_EXS_SHIFT 44 #define ID_AA64MMFR0_TGRAN4_2_SHIFT 40 #define ID_AA64MMFR0_TGRAN64_2_SHIFT 36 #define ID_AA64MMFR0_TGRAN16_2_SHIFT 32 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 9f63053a63a9..7a84f5f31527 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -269,6 +269,9 @@ static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = { }; static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_ECV_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_FGT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EXS_SHIFT, 4, 0), /* * Page size not being supported at Stage-2 is not fatal. You * just give up KVM if PAGE_SIZE isn't supported there. 
Go fix From 853772ba8023c25b1caae56b6426ca76dae1eaff Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 3 Jul 2020 09:21:35 +0530 Subject: [PATCH 260/502] arm64/cpufeature: Add remaining feature bits in ID_AA64MMFR1 register Enable ETS, TWED, XNX and SPECSEI features bits in ID_AA64MMFR1 register as per ARM DDI 0487F.a specification. Suggested-by: Will Deacon Signed-off-by: Anshuman Khandual Reviewed-by: Suzuki K Poulose Cc: Will Deacon Cc: Mark Rutland Cc: Suzuki K Poulose Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Link: https://lore.kernel.org/r/1593748297-1965-3-git-send-email-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/sysreg.h | 4 ++++ arch/arm64/kernel/cpufeature.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 2e36dfde2570..889fa7729719 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -737,6 +737,10 @@ #endif /* id_aa64mmfr1 */ +#define ID_AA64MMFR1_ETS_SHIFT 36 +#define ID_AA64MMFR1_TWED_SHIFT 32 +#define ID_AA64MMFR1_XNX_SHIFT 28 +#define ID_AA64MMFR1_SPECSEI_SHIFT 24 #define ID_AA64MMFR1_PAN_SHIFT 20 #define ID_AA64MMFR1_LOR_SHIFT 16 #define ID_AA64MMFR1_HPD_SHIFT 12 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 7a84f5f31527..764793c4a188 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -315,6 +315,10 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = { }; static const struct arm64_ftr_bits ftr_id_aa64mmfr1[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_ETS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_TWED_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_XNX_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_AA64MMFR1_SPECSEI_SHIFT, 4, 0), 
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_PAN_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_LOR_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_HPD_SHIFT, 4, 0), From 356fdfbe8761da55c4100bd543259f349fc1ca3a Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 3 Jul 2020 09:21:36 +0530 Subject: [PATCH 261/502] arm64/cpufeature: Add remaining feature bits in ID_AA64MMFR2 register Enable EVT, BBM, TTL, IDS, ST, NV and CCIDX features bits in ID_AA64MMFR2 register as per ARM DDI 0487F.a specification. Suggested-by: Will Deacon Signed-off-by: Anshuman Khandual Reviewed-by: Suzuki K Poulose Cc: Will Deacon Cc: Mark Rutland Cc: Suzuki K Poulose Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Link: https://lore.kernel.org/r/1593748297-1965-4-git-send-email-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/sysreg.h | 7 +++++++ arch/arm64/kernel/cpufeature.c | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 889fa7729719..9ee324936ea2 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -753,8 +753,15 @@ /* id_aa64mmfr2 */ #define ID_AA64MMFR2_E0PD_SHIFT 60 +#define ID_AA64MMFR2_EVT_SHIFT 56 +#define ID_AA64MMFR2_BBM_SHIFT 52 +#define ID_AA64MMFR2_TTL_SHIFT 48 #define ID_AA64MMFR2_FWB_SHIFT 40 +#define ID_AA64MMFR2_IDS_SHIFT 36 #define ID_AA64MMFR2_AT_SHIFT 32 +#define ID_AA64MMFR2_ST_SHIFT 28 +#define ID_AA64MMFR2_NV_SHIFT 24 +#define ID_AA64MMFR2_CCIDX_SHIFT 20 #define ID_AA64MMFR2_LVA_SHIFT 16 #define ID_AA64MMFR2_IESB_SHIFT 12 #define ID_AA64MMFR2_LSM_SHIFT 8 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 764793c4a188..93797d9bb931 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -330,8 +330,15 @@ static const struct 
arm64_ftr_bits ftr_id_aa64mmfr1[] = { static const struct arm64_ftr_bits ftr_id_aa64mmfr2[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_E0PD_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EVT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_BBM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_TTL_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_FWB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_IDS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_AT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_ST_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_NV_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_CCIDX_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_LVA_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_IESB_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_LSM_SHIFT, 4, 0), From 8d3154afc10dd474265b62752cd169f66f40ae0d Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 3 Jul 2020 09:21:37 +0530 Subject: [PATCH 262/502] arm64/cpufeature: Replace all open bits shift encodings with macros There are many open bits shift encodings for various CPU ID registers that are scattered across cpufeature. This replaces them with register specific sensible macro definitions. This should not have any functional change. 
Signed-off-by: Anshuman Khandual Reviewed-by: Suzuki K Poulose Cc: Will Deacon Cc: Marc Zyngier Cc: Mark Rutland Cc: James Morse Cc: Suzuki K Poulose Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Link: https://lore.kernel.org/r/1593748297-1965-5-git-send-email-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/sysreg.h | 28 +++++++++++++++++ arch/arm64/kernel/cpufeature.c | 53 +++++++++++++++++---------------- 2 files changed, 55 insertions(+), 26 deletions(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 9ee324936ea2..b74c727c3bcd 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -769,6 +769,7 @@ #define ID_AA64MMFR2_CNP_SHIFT 0 /* id_aa64dfr0 */ +#define ID_AA64DFR0_DOUBLELOCK_SHIFT 36 #define ID_AA64DFR0_PMSVER_SHIFT 32 #define ID_AA64DFR0_CTX_CMPS_SHIFT 28 #define ID_AA64DFR0_WRPS_SHIFT 20 @@ -821,18 +822,40 @@ #define ID_ISAR6_DP_SHIFT 4 #define ID_ISAR6_JSCVT_SHIFT 0 +#define ID_MMFR0_INNERSHR_SHIFT 28 +#define ID_MMFR0_FCSE_SHIFT 24 +#define ID_MMFR0_AUXREG_SHIFT 20 +#define ID_MMFR0_TCM_SHIFT 16 +#define ID_MMFR0_SHARELVL_SHIFT 12 +#define ID_MMFR0_OUTERSHR_SHIFT 8 +#define ID_MMFR0_PMSA_SHIFT 4 +#define ID_MMFR0_VMSA_SHIFT 0 + #define ID_MMFR4_EVT_SHIFT 28 #define ID_MMFR4_CCIDX_SHIFT 24 #define ID_MMFR4_LSM_SHIFT 20 #define ID_MMFR4_HPDS_SHIFT 16 #define ID_MMFR4_CNP_SHIFT 12 #define ID_MMFR4_XNX_SHIFT 8 +#define ID_MMFR4_AC2_SHIFT 4 #define ID_MMFR4_SPECSEI_SHIFT 0 #define ID_MMFR5_ETS_SHIFT 0 #define ID_PFR0_DIT_SHIFT 24 #define ID_PFR0_CSV2_SHIFT 16 +#define ID_PFR0_STATE3_SHIFT 12 +#define ID_PFR0_STATE2_SHIFT 8 +#define ID_PFR0_STATE1_SHIFT 4 +#define ID_PFR0_STATE0_SHIFT 0 + +#define ID_DFR0_PERFMON_SHIFT 24 +#define ID_DFR0_MPROFDBG_SHIFT 20 +#define ID_DFR0_MMAPTRC_SHIFT 16 +#define ID_DFR0_COPTRC_SHIFT 12 +#define ID_DFR0_MMAPDBG_SHIFT 8 +#define ID_DFR0_COPSDBG_SHIFT 4 +#define ID_DFR0_COPDBG_SHIFT 0 #define 
ID_PFR2_SSBS_SHIFT 4 #define ID_PFR2_CSV3_SHIFT 0 @@ -875,6 +898,11 @@ #define ID_AA64MMFR0_TGRAN_SUPPORTED ID_AA64MMFR0_TGRAN64_SUPPORTED #endif +#define MVFR2_FPMISC_SHIFT 4 +#define MVFR2_SIMDMISC_SHIFT 0 + +#define DCZID_DZP_SHIFT 4 +#define DCZID_BS_SHIFT 0 /* * The ZCR_ELx_LEN_* definitions intentionally include bits [8:4] which diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 93797d9bb931..19146bd338b4 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -359,7 +359,7 @@ static const struct arm64_ftr_bits ftr_ctr[] = { * make use of *minLine. * If we have differing I-cache policies, report it as the weakest - VIPT. */ - ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_EXACT, 14, 2, ICACHE_POLICY_VIPT), /* L1Ip */ + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_EXACT, CTR_L1IP_SHIFT, 2, ICACHE_POLICY_VIPT), /* L1Ip */ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_IMINLINE_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -370,19 +370,19 @@ struct arm64_ftr_reg arm64_ftr_reg_ctrel0 = { }; static const struct arm64_ftr_bits ftr_id_mmfr0[] = { - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 28, 4, 0xf), /* InnerShr */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 24, 4, 0), /* FCSE */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, 20, 4, 0), /* AuxReg */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 16, 4, 0), /* TCM */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 12, 4, 0), /* ShareLvl */ - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 8, 4, 0xf), /* OuterShr */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0), /* PMSA */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0), /* VMSA */ + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_INNERSHR_SHIFT, 4, 0xf), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_FCSE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, 
FTR_NONSTRICT, FTR_LOWER_SAFE, ID_MMFR0_AUXREG_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_TCM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_SHARELVL_SHIFT, 4, 0), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_OUTERSHR_SHIFT, 4, 0xf), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_PMSA_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_VMSA_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64dfr0[] = { - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 36, 4, 0), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_DOUBLELOCK_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64DFR0_PMSVER_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_CTX_CMPS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_WRPS_SHIFT, 4, 0), @@ -398,14 +398,14 @@ static const struct arm64_ftr_bits ftr_id_aa64dfr0[] = { }; static const struct arm64_ftr_bits ftr_mvfr2[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0), /* FPMisc */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0), /* SIMDMisc */ + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR2_FPMISC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR2_SIMDMISC_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_dczid[] = { - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, 4, 1, 1), /* DZP */ - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0), /* BS */ + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, DCZID_DZP_SHIFT, 1, 1), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, DCZID_BS_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -437,7 +437,8 @@ static const struct arm64_ftr_bits ftr_id_mmfr4[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 
ID_MMFR4_HPDS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_CNP_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_XNX_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0), /* ac2 */ + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_AC2_SHIFT, 4, 0), + /* * SpecSEI = 1 indicates that the PE might generate an SError on an * external abort on speculative read. It is safe to assume that an @@ -479,10 +480,10 @@ static const struct arm64_ftr_bits ftr_id_isar6[] = { static const struct arm64_ftr_bits ftr_id_pfr0[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_DIT_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR0_CSV2_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 12, 4, 0), /* State3 */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 8, 4, 0), /* State2 */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0), /* State1 */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0), /* State0 */ + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_STATE3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_STATE2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_STATE1_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_STATE0_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -506,13 +507,13 @@ static const struct arm64_ftr_bits ftr_id_pfr2[] = { static const struct arm64_ftr_bits ftr_id_dfr0[] = { /* [31:28] TraceFilt */ - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 24, 4, 0xf), /* PerfMon */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 20, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 16, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 12, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 8, 4, 0), - 
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_PERFMON_SHIFT, 4, 0xf), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_MPROFDBG_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_MMAPTRC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_COPTRC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_MMAPDBG_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_COPSDBG_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_COPDBG_SHIFT, 4, 0), ARM64_FTR_END, }; From 2a379716f3d76aebc5574155de247b547a0214cc Mon Sep 17 00:00:00 2001 From: Bhupesh Sharma Date: Tue, 7 Apr 2020 04:01:40 +0530 Subject: [PATCH 263/502] arm64/defconfig: Enable CONFIG_KEXEC_FILE kexec_file_load() syscall interface is now supported for arm64 architecture as well via commits: 3751e728cef2 ("arm64: kexec_file: add crash dump support") and 3ddd9992a590 ("arm64: enable KEXEC_FILE config"). This patch enables config KEXEC_FILE by default in the arm64 defconfig, so that user-space tools like kexec-tools can use the same as the default interface for kexec/kdump on arm64.
Cc: AKASHI Takahiro Cc: James Morse Cc: Mark Rutland Cc: Will Deacon Cc: kexec@lists.infradead.org Signed-off-by: Bhupesh Sharma Acked-by: Mark Rutland Link: https://lore.kernel.org/r/1586212300-30797-1-git-send-email-bhsharma@redhat.com Signed-off-by: Catalin Marinas --- arch/arm64/configs/defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 883e8bace3ed..1a33697a8492 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -66,6 +66,7 @@ CONFIG_SCHED_SMT=y CONFIG_NUMA=y CONFIG_SECCOMP=y CONFIG_KEXEC=y +CONFIG_KEXEC_FILE=y CONFIG_CRASH_DUMP=y CONFIG_XEN=y CONFIG_COMPAT=y From a1634a542f74309f843742fa849208bb26e279e4 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 30 Jun 2020 16:24:28 +1000 Subject: [PATCH 264/502] arm64/mm: Redefine CONT_{PTE, PMD}_SHIFT Currently, the value of CONT_{PTE, PMD}_SHIFT is off from standard {PAGE, PMD}_SHIFT. In turn, we have to consider adding {PAGE, PMD}_SHIFT when using CONT_{PTE, PMD}_SHIFT in the function hugetlbpage_init(). It's a bit confusing. This redefines CONT_{PTE, PMD}_SHIFT with {PAGE, PMD}_SHIFT included so that the latter values needn't be added when using the former ones in function hugetlbpage_init(). Note that the values of CONT_{PTES, PMDS} are unchanged.
*/ #ifdef CONFIG_ARM64_64K_PAGES -#define CONT_PTE_SHIFT 5 -#define CONT_PMD_SHIFT 5 +#define CONT_PTE_SHIFT (5 + PAGE_SHIFT) +#define CONT_PMD_SHIFT (5 + PMD_SHIFT) #elif defined(CONFIG_ARM64_16K_PAGES) -#define CONT_PTE_SHIFT 7 -#define CONT_PMD_SHIFT 5 +#define CONT_PTE_SHIFT (7 + PAGE_SHIFT) +#define CONT_PMD_SHIFT (5 + PMD_SHIFT) #else -#define CONT_PTE_SHIFT 4 -#define CONT_PMD_SHIFT 4 +#define CONT_PTE_SHIFT (4 + PAGE_SHIFT) +#define CONT_PMD_SHIFT (4 + PMD_SHIFT) #endif -#define CONT_PTES (1 << CONT_PTE_SHIFT) +#define CONT_PTES (1 << (CONT_PTE_SHIFT - PAGE_SHIFT)) #define CONT_PTE_SIZE (CONT_PTES * PAGE_SIZE) #define CONT_PTE_MASK (~(CONT_PTE_SIZE - 1)) -#define CONT_PMDS (1 << CONT_PMD_SHIFT) +#define CONT_PMDS (1 << (CONT_PMD_SHIFT - PMD_SHIFT)) #define CONT_PMD_SIZE (CONT_PMDS * PMD_SIZE) #define CONT_PMD_MASK (~(CONT_PMD_SIZE - 1)) /* the the numerical offset of the PTE within a range of CONT_PTES */ diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 0a52ce46f020..c79084739096 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -457,9 +457,9 @@ static int __init hugetlbpage_init(void) #ifdef CONFIG_ARM64_4K_PAGES hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); #endif - hugetlb_add_hstate((CONT_PMD_SHIFT + PMD_SHIFT) - PAGE_SHIFT); + hugetlb_add_hstate(CONT_PMD_SHIFT - PAGE_SHIFT); hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); - hugetlb_add_hstate((CONT_PTE_SHIFT + PAGE_SHIFT) - PAGE_SHIFT); + hugetlb_add_hstate(CONT_PTE_SHIFT - PAGE_SHIFT); return 0; } From 4c6e277c4cc4a6b3b2b9c66a7b014787ae757cc1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 1 Jul 2020 11:29:10 -0600 Subject: [PATCH 265/502] io_uring: abstract out task work running Provide a helper to run task_work instead of checking and running manually in a bunch of different spots. While doing so, also move the task run state setting where we run the task work. Then we can move it out of the callback helpers. 
This also helps ensure we only do this once per task_work list run, not per task_work item. Suggested-by: Oleg Nesterov Signed-off-by: Jens Axboe --- fs/io_uring.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7426e4f23f9b..65a6978e1795 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1714,7 +1714,6 @@ static void __io_req_task_submit(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - __set_current_state(TASK_RUNNING); if (!__io_sq_thread_acquire_mm(ctx)) { mutex_lock(&ctx->uring_lock); __io_queue_sqe(req, NULL, NULL); @@ -1899,6 +1898,17 @@ static int io_put_kbuf(struct io_kiocb *req) return cflags; } +static inline bool io_run_task_work(void) +{ + if (current->task_works) { + __set_current_state(TASK_RUNNING); + task_work_run(); + return true; + } + + return false; +} + static void io_iopoll_queue(struct list_head *again) { struct io_kiocb *req; @@ -2079,8 +2089,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, */ if (!(++iters & 7)) { mutex_unlock(&ctx->uring_lock); - if (current->task_works) - task_work_run(); + io_run_task_work(); mutex_lock(&ctx->uring_lock); } @@ -2176,8 +2185,6 @@ static void io_rw_resubmit(struct callback_head *cb) struct io_ring_ctx *ctx = req->ctx; int err; - __set_current_state(TASK_RUNNING); - err = io_sq_thread_acquire_mm(ctx, req); if (io_resubmit_prep(req, err)) { @@ -6361,8 +6368,7 @@ static int io_sq_thread(void *data) if (!list_empty(&ctx->poll_list) || need_resched() || (!time_after(jiffies, timeout) && ret != -EBUSY && !percpu_ref_is_dying(&ctx->refs))) { - if (current->task_works) - task_work_run(); + io_run_task_work(); cond_resched(); continue; } @@ -6394,8 +6400,7 @@ static int io_sq_thread(void *data) finish_wait(&ctx->sqo_wait, &wait); break; } - if (current->task_works) { - task_work_run(); + if (io_run_task_work()) { finish_wait(&ctx->sqo_wait, &wait); continue; } @@ -6420,8 +6425,7 @@ 
static int io_sq_thread(void *data) timeout = jiffies + ctx->sq_thread_idle; } - if (current->task_works) - task_work_run(); + io_run_task_work(); io_sq_thread_drop_mm(ctx); revert_creds(old_cred); @@ -6486,9 +6490,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, do { if (io_cqring_events(ctx, false) >= min_events) return 0; - if (!current->task_works) + if (!io_run_task_work()) break; - task_work_run(); } while (1); if (sig) { @@ -6510,8 +6513,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE); /* make sure we run task_work before checking for signals */ - if (current->task_works) - task_work_run(); + if (io_run_task_work()) + continue; if (signal_pending(current)) { if (current->jobctl & JOBCTL_TASK_WORK) { spin_lock_irq(&current->sighand->siglock); @@ -7953,8 +7956,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, int submitted = 0; struct fd f; - if (current->task_works) - task_work_run(); + io_run_task_work(); if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP)) return -EINVAL; From c2c4c83c58cbca23527fee93b49738a5a84272a1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 1 Jul 2020 15:37:11 -0600 Subject: [PATCH 266/502] io_uring: use new io_req_task_work_add() helper throughout Since we now have that in the 5.9 branch, convert the existing users of task_work_add() to use this new helper.
Signed-off-by: Jens Axboe --- fs/io_uring.c | 77 +++++++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 40 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 65a6978e1795..2b849984bae5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1689,6 +1689,29 @@ static struct io_kiocb *io_req_find_next(struct io_kiocb *req) return __io_req_find_next(req); } +static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb) +{ + struct task_struct *tsk = req->task; + struct io_ring_ctx *ctx = req->ctx; + int ret, notify = TWA_RESUME; + + /* + * SQPOLL kernel thread doesn't need notification, just a wakeup. + * If we're not using an eventfd, then TWA_RESUME is always fine, + * as we won't have dependencies between request completions for + * other kernel wait conditions. + */ + if (ctx->flags & IORING_SETUP_SQPOLL) + notify = 0; + else if (ctx->cq_ev_fd) + notify = TWA_SIGNAL; + + ret = task_work_add(tsk, cb, notify); + if (!ret) + wake_up_process(tsk); + return ret; +} + static void __io_req_task_cancel(struct io_kiocb *req, int error) { struct io_ring_ctx *ctx = req->ctx; @@ -1732,18 +1755,19 @@ static void io_req_task_submit(struct callback_head *cb) static void io_req_task_queue(struct io_kiocb *req) { - struct task_struct *tsk = req->task; int ret; init_task_work(&req->task_work, io_req_task_submit); - ret = task_work_add(tsk, &req->task_work, true); + ret = io_req_task_work_add(req, &req->task_work); if (unlikely(ret)) { + struct task_struct *tsk; + init_task_work(&req->task_work, io_req_task_cancel); tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, true); + task_work_add(tsk, &req->task_work, 0); + wake_up_process(tsk); } - wake_up_process(tsk); } static void io_queue_next(struct io_kiocb *req) @@ -2197,19 +2221,15 @@ static void io_rw_resubmit(struct callback_head *cb) static bool io_rw_reissue(struct io_kiocb *req, long res) { #ifdef CONFIG_BLOCK - struct task_struct *tsk; int ret; 
if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker()) return false; - tsk = req->task; init_task_work(&req->task_work, io_rw_resubmit); - ret = task_work_add(tsk, &req->task_work, true); - if (!ret) { - wake_up_process(tsk); + ret = io_req_task_work_add(req, &req->task_work); + if (!ret) return true; - } #endif return false; } @@ -2909,7 +2929,6 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, struct io_kiocb *req = wait->private; struct io_async_rw *rw = &req->io->rw; struct wait_page_key *key = arg; - struct task_struct *tsk; int ret; wpq = container_of(wait, struct wait_page_queue, wait); @@ -2923,15 +2942,16 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, init_task_work(&rw->task_work, io_async_buf_retry); /* submit ref gets dropped, acquire a new one */ refcount_inc(&req->refs); - tsk = req->task; - ret = task_work_add(tsk, &rw->task_work, true); + ret = io_req_task_work_add(req, &rw->task_work); if (unlikely(ret)) { + struct task_struct *tsk; + /* queue just for cancelation */ init_task_work(&rw->task_work, io_async_buf_cancel); tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &rw->task_work, true); + task_work_add(tsk, &rw->task_work, 0); + wake_up_process(tsk); } - wake_up_process(tsk); return 1; } @@ -4424,33 +4444,9 @@ struct io_poll_table { int error; }; -static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb) -{ - struct task_struct *tsk = req->task; - struct io_ring_ctx *ctx = req->ctx; - int ret, notify = TWA_RESUME; - - /* - * SQPOLL kernel thread doesn't need notification, just a wakeup. - * If we're not using an eventfd, then TWA_RESUME is always fine, - * as we won't have dependencies between request completions for - * other kernel wait conditions. 
- */ - if (ctx->flags & IORING_SETUP_SQPOLL) - notify = 0; - else if (ctx->cq_ev_fd) - notify = TWA_SIGNAL; - - ret = task_work_add(tsk, cb, notify); - if (!ret) - wake_up_process(tsk); - return ret; -} - static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, __poll_t mask, task_work_func_t func) { - struct task_struct *tsk; int ret; /* for instances that support it check for an event match first: */ @@ -4461,7 +4457,6 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, list_del_init(&poll->wait.entry); - tsk = req->task; req->result = mask; init_task_work(&req->task_work, func); /* @@ -4472,6 +4467,8 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, */ ret = io_req_task_work_add(req, &req->task_work); if (unlikely(ret)) { + struct task_struct *tsk; + WRITE_ONCE(poll->canceled, true); tsk = io_wq_get_task(req->ctx->io_wq); task_work_add(tsk, &req->task_work, 0); From 6df1db6b542436c6d429caa66e1045862fa36155 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 3 Jul 2020 22:15:06 +0300 Subject: [PATCH 267/502] io_uring: fix mis-refcounting linked timeouts io_prep_linked_timeout() sets REQ_F_LINK_TIMEOUT altering refcounting of the following linked request. After that someone should call io_queue_linked_timeout(), otherwise a submission reference of the linked timeout won't be ever dropped. That's what happens in io_steal_work() if io-wq decides to postpone linked request with io_wqe_enqueue(). io_queue_linked_timeout() can also be potentially called twice without synchronisation during re-submission, e.g. io_rw_resubmit(). There are the rules, whoever did io_prep_linked_timeout() must also call io_queue_linked_timeout(). To not do it twice, io_prep_linked_timeout() will return non NULL only for the first call. That's controlled by REQ_F_LINK_TIMEOUT flag. Also kill REQ_F_QUEUE_TIMEOUT. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 33 +++++++-------------------------- 1 file changed, 7 insertions(+), 26 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2b849984bae5..cf1b3d4ac241 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -538,7 +538,6 @@ enum { REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, REQ_F_NO_FILE_TABLE_BIT, - REQ_F_QUEUE_TIMEOUT_BIT, REQ_F_WORK_INITIALIZED_BIT, REQ_F_TASK_PINNED_BIT, @@ -586,8 +585,6 @@ enum { REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), /* doesn't need file table for this request */ REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT), - /* needs to queue linked timeout */ - REQ_F_QUEUE_TIMEOUT = BIT(REQ_F_QUEUE_TIMEOUT_BIT), /* io_wq_work is initialized */ REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT), /* req->task is refcounted */ @@ -1842,7 +1839,7 @@ static void io_put_req(struct io_kiocb *req) static struct io_wq_work *io_steal_work(struct io_kiocb *req) { - struct io_kiocb *timeout, *nxt = NULL; + struct io_kiocb *nxt; /* * A ref is owned by io-wq in which context we're. So, if that's the @@ -1853,13 +1850,7 @@ static struct io_wq_work *io_steal_work(struct io_kiocb *req) return NULL; nxt = io_req_find_next(req); - if (!nxt) - return NULL; - - timeout = io_prep_linked_timeout(nxt); - if (timeout) - nxt->flags |= REQ_F_QUEUE_TIMEOUT; - return &nxt->work; + return nxt ? 
&nxt->work : NULL; } /* @@ -5702,24 +5693,15 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } -static void io_arm_async_linked_timeout(struct io_kiocb *req) -{ - struct io_kiocb *link; - - /* link head's timeout is queued in io_queue_async_work() */ - if (!(req->flags & REQ_F_QUEUE_TIMEOUT)) - return; - - link = list_first_entry(&req->link_list, struct io_kiocb, link_list); - io_queue_linked_timeout(link); -} - static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct io_kiocb *timeout; int ret = 0; - io_arm_async_linked_timeout(req); + timeout = io_prep_linked_timeout(req); + if (timeout) + io_queue_linked_timeout(timeout); /* if NO_CANCEL is set, we must still run the work */ if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) == @@ -5893,8 +5875,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) if (!(req->flags & REQ_F_LINK_HEAD)) return NULL; - /* for polled retry, if flag is set, we already went through here */ - if (req->flags & REQ_F_POLLED) + if (req->flags & REQ_F_LINK_TIMEOUT) return NULL; nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, From 652532ad459524d32c6bf1522e0b88d83b084d1a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 3 Jul 2020 22:15:07 +0300 Subject: [PATCH 268/502] io_uring: keep queue_sqe()'s fail path separately A preparation path, extracts error path into a separate block. It looks saner than calling req_set_fail_links() after io_put_req_find_next(), even though it has been working well.
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index cf1b3d4ac241..7147e87a24b5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5937,22 +5937,21 @@ punt: goto exit; } + if (unlikely(ret)) { err: + /* un-prep timeout, so it'll be killed as any other linked */ + req->flags &= ~REQ_F_LINK_TIMEOUT; + req_set_fail_links(req); + io_put_req(req); + io_req_complete(req, ret); + goto exit; + } + /* drop submission reference */ nxt = io_put_req_find_next(req); + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); - if (linked_timeout) { - if (!ret) - io_queue_linked_timeout(linked_timeout); - else - io_put_req(linked_timeout); - } - - /* and drop final reference, if we failed */ - if (ret) { - req_set_fail_links(req); - io_req_complete(req, ret); - } if (nxt) { req = nxt; From 8b3656af2a37dc538d21e144a5a94bacae05e9f1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 3 Jul 2020 22:15:08 +0300 Subject: [PATCH 269/502] io_uring: fix lost cqe->flags Don't forget to fill cqe->flags properly in io_submit_flush_completions() Fixes: a1d7c393c4711 ("io_uring: enable READ/WRITE to use deferred completions") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7147e87a24b5..9464f9470bbc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1416,7 +1416,7 @@ static void io_submit_flush_completions(struct io_comp_state *cs) req = list_first_entry(&cs->list, struct io_kiocb, list); list_del(&req->list); - io_cqring_fill_event(req, req->result); + __io_cqring_fill_event(req, req->result, req->cflags); if (!(req->flags & REQ_F_LINK_HEAD)) { req->flags |= REQ_F_COMP_LOCKED; io_put_req(req); @@ -1441,6 +1441,7 @@ static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags, 
io_put_req(req); } else { req->result = res; + req->cflags = cflags; list_add_tail(&req->list, &cs->list); if (++cs->nr >= 32) io_submit_flush_completions(cs); From 3aadc23e6054353ca056bf14e87250c79efbd7ed Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 6 Jul 2020 17:59:29 +0300 Subject: [PATCH 270/502] io_uring: don't delay iopoll'ed req completion ->iopoll() may have completed current request, but instead of reaping it, io_do_iopoll() just continues with the next request in the list. As a result it can leave just polled and completed request in the list up until next syscall. Even outer loop in io_iopoll_getevents() doesn't help the situation. E.g. poll_list: req0 -> req1 If req0->iopoll() completed both requests, and @min<=1, then @req0 will be left behind. Check whether a req was completed after ->iopoll(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 9464f9470bbc..60f1a81c6c35 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2015,6 +2015,10 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, if (ret < 0) break; + /* iopoll may have completed current req */ + if (READ_ONCE(req->iopoll_completed)) + list_move_tail(&req->list, &done); + if (ret && spin) spin = false; ret = 0; From eba0a4dd2aa5c47ca5b0c56ffb6d6665e047ff72 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 6 Jul 2020 17:59:30 +0300 Subject: [PATCH 271/502] io_uring: fix stopping iopoll'ing too early Nobody adjusts *nr_events (number of completed requests) before calling io_iopoll_getevents(), so the passed @min shouldn't be adjusted as well. Otherwise it can return less than initially asked @min without hitting need_resched().
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 60f1a81c6c35..332008f346e3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2044,7 +2044,7 @@ static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events, ret = io_do_iopoll(ctx, nr_events, min); if (ret < 0) return ret; - if (!min || *nr_events >= min) + if (*nr_events >= min) return 0; } @@ -2087,8 +2087,6 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, */ mutex_lock(&ctx->uring_lock); do { - int tmin = 0; - /* * Don't enter poll loop if we already have events pending. * If we do, we can potentially be spinning for commands that @@ -2113,10 +2111,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, mutex_lock(&ctx->uring_lock); } - if (*nr_events < min) - tmin = min - *nr_events; - - ret = io_iopoll_getevents(ctx, nr_events, tmin); + ret = io_iopoll_getevents(ctx, nr_events, min); if (ret <= 0) break; ret = 0; From 3fcee5a6d5414df8ff4ee22f2477bde76d34527c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 6 Jul 2020 17:59:31 +0300 Subject: [PATCH 272/502] io_uring: briefly loose locks while reaping events It's not nice to hold @uring_lock for too long io_iopoll_reap_events(). For instance, the lock is needed to publish requests to @poll_list, and that locks out tasks doing that for no good reason. Loose it occasionally. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 332008f346e3..6e3169834bf7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2069,8 +2069,13 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx) /* * Ensure we allow local-to-the-cpu processing to take place, * in this case we need to ensure that we reap all events. + * Also let task_work, etc. 
to progress by releasing the mutex */ - cond_resched(); + if (need_resched()) { + mutex_unlock(&ctx->uring_lock); + cond_resched(); + mutex_lock(&ctx->uring_lock); + } } mutex_unlock(&ctx->uring_lock); } From b037b09b9058d84882fa2c4db3806433e2b0f912 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2020 10:02:58 -0700 Subject: [PATCH 273/502] x86/entry: Rename idtentry_enter/exit_cond_rcu() to idtentry_enter/exit() They were originally called _cond_rcu because they were special versions with conditional RCU handling. Now they're the standard entry and exit path, so the _cond_rcu part is just confusing. Drop it. Also change the signature to make them more extensible and more foolproof. No functional change -- it's pure refactoring. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/247fc67685263e0b673e1d7f808182d28ff80359.1593795633.git.luto@kernel.org --- arch/x86/entry/common.c | 50 ++++++++++++++++++--------------- arch/x86/include/asm/idtentry.h | 28 ++++++++++-------- arch/x86/kernel/kvm.c | 6 ++-- arch/x86/kernel/traps.c | 6 ++-- arch/x86/mm/fault.c | 6 ++-- 5 files changed, 53 insertions(+), 43 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index e83b3f14897c..0521546022cb 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -559,8 +559,7 @@ SYSCALL_DEFINE0(ni_syscall) } /** - * idtentry_enter_cond_rcu - Handle state tracking on idtentry with conditional - * RCU handling + * idtentry_enter - Handle state tracking on ordinary idtentries * @regs: Pointer to pt_regs of interrupted context * * Invokes: @@ -572,6 +571,9 @@ SYSCALL_DEFINE0(ni_syscall) * - The hardirq tracer to keep the state consistent as low level ASM * entry disabled interrupts. * + * As a precondition, this requires that the entry came from user mode, + * idle, or a kernel context in which RCU is watching. 
+ * * For kernel mode entries RCU handling is done conditional. If RCU is * watching then the only RCU requirement is to check whether the tick has * to be restarted. If RCU is not watching then rcu_irq_enter() has to be @@ -585,18 +587,21 @@ SYSCALL_DEFINE0(ni_syscall) * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit * would not be possible. * - * Returns: True if RCU has been adjusted on a kernel entry - * False otherwise + * Returns: An opaque object that must be passed to idtentry_exit() * - * The return value must be fed into the rcu_exit argument of - * idtentry_exit_cond_rcu(). + * The return value must be fed into the state argument of + * idtentry_exit(). */ -bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs) +idtentry_state_t noinstr idtentry_enter(struct pt_regs *regs) { + idtentry_state_t ret = { + .exit_rcu = false, + }; + if (user_mode(regs)) { check_user_regs(regs); enter_from_user_mode(); - return false; + return ret; } /* @@ -634,7 +639,8 @@ bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs) trace_hardirqs_off_finish(); instrumentation_end(); - return true; + ret.exit_rcu = true; + return ret; } /* @@ -649,7 +655,7 @@ bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs) trace_hardirqs_off(); instrumentation_end(); - return false; + return ret; } static void idtentry_exit_cond_resched(struct pt_regs *regs, bool may_sched) @@ -667,10 +673,9 @@ static void idtentry_exit_cond_resched(struct pt_regs *regs, bool may_sched) } /** - * idtentry_exit_cond_rcu - Handle return from exception with conditional RCU - * handling + * idtentry_exit - Handle return from exception that used idtentry_enter() * @regs: Pointer to pt_regs (exception entry regs) - * @rcu_exit: Invoke rcu_irq_exit() if true + * @state: Return value from matching call to idtentry_enter() * * Depending on the return target (kernel/user) this runs the necessary * preemption and work checks if possible and reguired and returns to @@ -679,10 
+684,10 @@ static void idtentry_exit_cond_resched(struct pt_regs *regs, bool may_sched) * This is the last action before returning to the low level ASM code which * just needs to return to the appropriate context. * - * Counterpart to idtentry_enter_cond_rcu(). The return value of the entry - * function must be fed into the @rcu_exit argument. + * Counterpart to idtentry_enter(). The return value of the entry + * function must be fed into the @state argument. */ -void noinstr idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit) +void noinstr idtentry_exit(struct pt_regs *regs, idtentry_state_t state) { lockdep_assert_irqs_disabled(); @@ -695,7 +700,7 @@ void noinstr idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit) * carefully and needs the same ordering of lockdep/tracing * and RCU as the return to user mode path. */ - if (rcu_exit) { + if (state.exit_rcu) { instrumentation_begin(); /* Tell the tracer that IRET will enable interrupts */ trace_hardirqs_on_prepare(); @@ -714,7 +719,7 @@ void noinstr idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit) * IRQ flags state is correct already. Just tell RCU if it * was not watching on entry. 
*/ - if (rcu_exit) + if (state.exit_rcu) rcu_irq_exit(); } } @@ -800,9 +805,10 @@ static void __xen_pv_evtchn_do_upcall(void) __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs) { struct pt_regs *old_regs; - bool inhcall, rcu_exit; + bool inhcall; + idtentry_state_t state; - rcu_exit = idtentry_enter_cond_rcu(regs); + state = idtentry_enter(regs); old_regs = set_irq_regs(regs); instrumentation_begin(); @@ -812,13 +818,13 @@ __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs) set_irq_regs(old_regs); inhcall = get_and_clear_inhcall(); - if (inhcall && !WARN_ON_ONCE(rcu_exit)) { + if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) { instrumentation_begin(); idtentry_exit_cond_resched(regs, true); instrumentation_end(); restore_inhcall(inhcall); } else { - idtentry_exit_cond_rcu(regs, rcu_exit); + idtentry_exit(regs, state); } } #endif /* CONFIG_XEN_PV */ diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index eeac6dc2adaa..7227225cf45d 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -13,8 +13,12 @@ void idtentry_enter_user(struct pt_regs *regs); void idtentry_exit_user(struct pt_regs *regs); -bool idtentry_enter_cond_rcu(struct pt_regs *regs); -void idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit); +typedef struct idtentry_state { + bool exit_rcu; +} idtentry_state_t; + +idtentry_state_t idtentry_enter(struct pt_regs *regs); +void idtentry_exit(struct pt_regs *regs, idtentry_state_t state); /** * DECLARE_IDTENTRY - Declare functions for simple IDT entry points @@ -54,12 +58,12 @@ static __always_inline void __##func(struct pt_regs *regs); \ \ __visible noinstr void func(struct pt_regs *regs) \ { \ - bool rcu_exit = idtentry_enter_cond_rcu(regs); \ + idtentry_state_t state = idtentry_enter(regs); \ \ instrumentation_begin(); \ __##func (regs); \ instrumentation_end(); \ - idtentry_exit_cond_rcu(regs, rcu_exit); \ + idtentry_exit(regs, state); \ } \ \ static 
__always_inline void __##func(struct pt_regs *regs) @@ -101,12 +105,12 @@ static __always_inline void __##func(struct pt_regs *regs, \ __visible noinstr void func(struct pt_regs *regs, \ unsigned long error_code) \ { \ - bool rcu_exit = idtentry_enter_cond_rcu(regs); \ + idtentry_state_t state = idtentry_enter(regs); \ \ instrumentation_begin(); \ __##func (regs, error_code); \ instrumentation_end(); \ - idtentry_exit_cond_rcu(regs, rcu_exit); \ + idtentry_exit(regs, state); \ } \ \ static __always_inline void __##func(struct pt_regs *regs, \ @@ -199,7 +203,7 @@ static __always_inline void __##func(struct pt_regs *regs, u8 vector); \ __visible noinstr void func(struct pt_regs *regs, \ unsigned long error_code) \ { \ - bool rcu_exit = idtentry_enter_cond_rcu(regs); \ + idtentry_state_t state = idtentry_enter(regs); \ \ instrumentation_begin(); \ irq_enter_rcu(); \ @@ -207,7 +211,7 @@ __visible noinstr void func(struct pt_regs *regs, \ __##func (regs, (u8)error_code); \ irq_exit_rcu(); \ instrumentation_end(); \ - idtentry_exit_cond_rcu(regs, rcu_exit); \ + idtentry_exit(regs, state); \ } \ \ static __always_inline void __##func(struct pt_regs *regs, u8 vector) @@ -241,7 +245,7 @@ static void __##func(struct pt_regs *regs); \ \ __visible noinstr void func(struct pt_regs *regs) \ { \ - bool rcu_exit = idtentry_enter_cond_rcu(regs); \ + idtentry_state_t state = idtentry_enter(regs); \ \ instrumentation_begin(); \ irq_enter_rcu(); \ @@ -249,7 +253,7 @@ __visible noinstr void func(struct pt_regs *regs) \ run_on_irqstack_cond(__##func, regs, regs); \ irq_exit_rcu(); \ instrumentation_end(); \ - idtentry_exit_cond_rcu(regs, rcu_exit); \ + idtentry_exit(regs, state); \ } \ \ static noinline void __##func(struct pt_regs *regs) @@ -270,7 +274,7 @@ static __always_inline void __##func(struct pt_regs *regs); \ \ __visible noinstr void func(struct pt_regs *regs) \ { \ - bool rcu_exit = idtentry_enter_cond_rcu(regs); \ + idtentry_state_t state = idtentry_enter(regs); \ \ 
instrumentation_begin(); \ __irq_enter_raw(); \ @@ -278,7 +282,7 @@ __visible noinstr void func(struct pt_regs *regs) \ __##func (regs); \ __irq_exit_raw(); \ instrumentation_end(); \ - idtentry_exit_cond_rcu(regs, rcu_exit); \ + idtentry_exit(regs, state); \ } \ \ static __always_inline void __##func(struct pt_regs *regs) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index df63786e7bfa..3f78482d9496 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -233,7 +233,7 @@ EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags); noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) { u32 reason = kvm_read_and_reset_apf_flags(); - bool rcu_exit; + idtentry_state_t state; switch (reason) { case KVM_PV_REASON_PAGE_NOT_PRESENT: @@ -243,7 +243,7 @@ noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) return false; } - rcu_exit = idtentry_enter_cond_rcu(regs); + state = idtentry_enter(regs); instrumentation_begin(); /* @@ -264,7 +264,7 @@ noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) } instrumentation_end(); - idtentry_exit_cond_rcu(regs, rcu_exit); + idtentry_exit(regs, state); return true; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b038695f36c5..4627f826fb57 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -245,7 +245,7 @@ static noinstr bool handle_bug(struct pt_regs *regs) DEFINE_IDTENTRY_RAW(exc_invalid_op) { - bool rcu_exit; + idtentry_state_t state; /* * We use UD2 as a short encoding for 'CALL __WARN', as such @@ -255,11 +255,11 @@ DEFINE_IDTENTRY_RAW(exc_invalid_op) if (!user_mode(regs) && handle_bug(regs)) return; - rcu_exit = idtentry_enter_cond_rcu(regs); + state = idtentry_enter(regs); instrumentation_begin(); handle_invalid_op(regs); instrumentation_end(); - idtentry_exit_cond_rcu(regs, rcu_exit); + idtentry_exit(regs, state); } DEFINE_IDTENTRY(exc_coproc_segment_overrun) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 
1ead568c0101..5e41949453cc 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1377,7 +1377,7 @@ handle_page_fault(struct pt_regs *regs, unsigned long error_code, DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) { unsigned long address = read_cr2(); - bool rcu_exit; + idtentry_state_t state; prefetchw(&current->mm->mmap_lock); @@ -1412,11 +1412,11 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) * code reenabled RCU to avoid subsequent wreckage which helps * debugability. */ - rcu_exit = idtentry_enter_cond_rcu(regs); + state = idtentry_enter(regs); instrumentation_begin(); handle_page_fault(regs, error_code, address); instrumentation_end(); - idtentry_exit_cond_rcu(regs, rcu_exit); + idtentry_exit(regs, state); } From 552ae76face5584085845646c5f57e10c1a4ebdc Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 22 Dec 2018 12:00:10 +0000 Subject: [PATCH 274/502] arm64: Detect the ARMv8.4 TTL feature In order to reduce the cost of TLB invalidation, the ARMv8.4 TTL feature allows TLBs to be issued with a level allowing for quicker invalidation. Let's detect the feature for now. Further patches will implement its actual usage.
Reviewed-by: Suzuki K Poulose Reviewed-by: Catalin Marinas Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/cpucaps.h | 3 ++- arch/arm64/include/asm/sysreg.h | 1 + arch/arm64/kernel/cpufeature.c | 11 +++++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index d7b3bb0cb180..d44ba903d11d 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -62,7 +62,8 @@ #define ARM64_HAS_GENERIC_AUTH 52 #define ARM64_HAS_32BIT_EL1 53 #define ARM64_BTI 54 +#define ARM64_HAS_ARMv8_4_TTL 55 -#define ARM64_NCAPS 55 +#define ARM64_NCAPS 56 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 463175f80341..8c209aa17273 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -746,6 +746,7 @@ /* id_aa64mmfr2 */ #define ID_AA64MMFR2_E0PD_SHIFT 60 +#define ID_AA64MMFR2_TTL_SHIFT 48 #define ID_AA64MMFR2_FWB_SHIFT 40 #define ID_AA64MMFR2_AT_SHIFT 32 #define ID_AA64MMFR2_LVA_SHIFT 16 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 9f63053a63a9..e877f56ff1ab 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -323,6 +323,7 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr1[] = { static const struct arm64_ftr_bits ftr_id_aa64mmfr2[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_E0PD_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_TTL_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_FWB_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_AT_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_LVA_SHIFT, 4, 0), @@ -1882,6 +1883,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_cpuid_feature, .cpu_enable = cpu_has_fwb, }, + {
+ .desc = "ARMv8.4 Translation Table Level", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_HAS_ARMv8_4_TTL, + .sys_reg = SYS_ID_AA64MMFR2_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64MMFR2_TTL_SHIFT, + .min_field_value = 1, + .matches = has_cpuid_feature, + }, #ifdef CONFIG_ARM64_HW_AFDBM { /* From 6fcfdf6d72898d1c5118d7dd3d3d38690e2f6a64 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 28 Dec 2018 09:11:50 +0000 Subject: [PATCH 275/502] arm64: Document SW reserved PTE/PMD bits in Stage-2 descriptors Advertise bits [58:55] as reserved for SW in the S2 descriptors. Reviewed-by: Andrew Scull Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/pgtable-hwdef.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 9c91a8f93a0e..de0b603955f4 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -178,10 +178,12 @@ #define PTE_S2_RDONLY (_AT(pteval_t, 1) << 6) /* HAP[2:1] */ #define PTE_S2_RDWR (_AT(pteval_t, 3) << 6) /* HAP[2:1] */ #define PTE_S2_XN (_AT(pteval_t, 2) << 53) /* XN[1:0] */ +#define PTE_S2_SW_RESVD (_AT(pteval_t, 15) << 55) /* Reserved for SW */ #define PMD_S2_RDONLY (_AT(pmdval_t, 1) << 6) /* HAP[2:1] */ #define PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */ #define PMD_S2_XN (_AT(pmdval_t, 2) << 53) /* XN[1:0] */ +#define PMD_S2_SW_RESVD (_AT(pmdval_t, 15) << 55) /* Reserved for SW */ #define PUD_S2_RDONLY (_AT(pudval_t, 1) << 6) /* HAP[2:1] */ #define PUD_S2_RDWR (_AT(pudval_t, 3) << 6) /* HAP[2:1] */ From c10bc62ae4d2135c9db40e96a8e994164faee531 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 2 Jan 2019 10:21:29 +0000 Subject: [PATCH 276/502] arm64: Add level-hinted TLB invalidation helper Add a level-hinted TLB invalidation helper that only gets used if ARMv8.4-TTL gets detected. 
Reviewed-by: Alexandru Elisei Reviewed-by: Catalin Marinas Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/stage2_pgtable.h | 9 +++++ arch/arm64/include/asm/tlbflush.h | 45 +++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h index b767904f28b1..996bf98f0cab 100644 --- a/arch/arm64/include/asm/stage2_pgtable.h +++ b/arch/arm64/include/asm/stage2_pgtable.h @@ -256,4 +256,13 @@ stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) return (boundary - 1 < end - 1) ? boundary : end; } +/* + * Level values for the ARMv8.4-TTL extension, mapping PUD/PMD/PTE and + * the architectural page-table level. + */ +#define S2_NO_LEVEL_HINT 0 +#define S2_PUD_LEVEL 1 +#define S2_PMD_LEVEL 2 +#define S2_PTE_LEVEL 3 + #endif /* __ARM64_S2_PGTABLE_H_ */ diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index bc3949064725..3353f26302de 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -10,6 +10,7 @@ #ifndef __ASSEMBLY__ +#include #include #include #include @@ -59,6 +60,50 @@ __ta; \ }) +/* + * Level-based TLBI operations. + * + * When ARMv8.4-TTL exists, TLBI operations take an additional hint for + * the level at which the invalidation must take place. If the level is + * wrong, no invalidation may take place. In the case where the level + * cannot be easily determined, a 0 value for the level parameter will + * perform a non-hinted invalidation. + * + * For Stage-2 invalidation, use the level values provided to that effect + * in asm/stage2_pgtable.h. 
+ */ +#define TLBI_TTL_MASK GENMASK_ULL(47, 44) +#define TLBI_TTL_TG_4K 1 +#define TLBI_TTL_TG_16K 2 +#define TLBI_TTL_TG_64K 3 + +#define __tlbi_level(op, addr, level) \ + do { \ + u64 arg = addr; \ + \ + if (cpus_have_const_cap(ARM64_HAS_ARMv8_4_TTL) && \ + level) { \ + u64 ttl = level & 3; \ + \ + switch (PAGE_SIZE) { \ + case SZ_4K: \ + ttl |= TLBI_TTL_TG_4K << 2; \ + break; \ + case SZ_16K: \ + ttl |= TLBI_TTL_TG_16K << 2; \ + break; \ + case SZ_64K: \ + ttl |= TLBI_TTL_TG_64K << 2; \ + break; \ + } \ + \ + arg &= ~TLBI_TTL_MASK; \ + arg |= FIELD_PREP(TLBI_TTL_MASK, ttl); \ + } \ + \ + __tlbi(op, arg); \ + } while(0) + /* * TLB Invalidation * ================ From 7af928851508fb25207806f57e287272dd498981 Mon Sep 17 00:00:00 2001 From: Andrew Scull Date: Thu, 18 Jun 2020 15:55:11 +0100 Subject: [PATCH 277/502] smccc: Make constants available to assembly Move constants out of the C-only section of the header next to the other constants that are available to assembly. Signed-off-by: Andrew Scull Reviewed-by: Sudeep Holla Acked-by: Will Deacon Link: https://lore.kernel.org/r/20200618145511.69203-1-ascull@google.com Signed-off-by: Catalin Marinas --- include/linux/arm-smccc.h | 44 +++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 56d6a5c6e353..efcbde731f03 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -81,6 +81,28 @@ ARM_SMCCC_SMC_32, \ 0, 0x7fff) +/* Paravirtualised time calls (defined by ARM DEN0057A) */ +#define ARM_SMCCC_HV_PV_TIME_FEATURES \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_STANDARD_HYP, \ + 0x20) + +#define ARM_SMCCC_HV_PV_TIME_ST \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_STANDARD_HYP, \ + 0x21) + +/* + * Return codes defined in ARM DEN 0070A + * ARM DEN 0070A is now merged/consolidated into ARM DEN 0028 C + */ +#define 
SMCCC_RET_SUCCESS 0 +#define SMCCC_RET_NOT_SUPPORTED -1 +#define SMCCC_RET_NOT_REQUIRED -2 +#define SMCCC_RET_INVALID_PARAMETER -3 + #ifndef __ASSEMBLY__ #include @@ -331,15 +353,6 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1, */ #define arm_smccc_1_1_hvc(...) __arm_smccc_1_1(SMCCC_HVC_INST, __VA_ARGS__) -/* - * Return codes defined in ARM DEN 0070A - * ARM DEN 0070A is now merged/consolidated into ARM DEN 0028 C - */ -#define SMCCC_RET_SUCCESS 0 -#define SMCCC_RET_NOT_SUPPORTED -1 -#define SMCCC_RET_NOT_REQUIRED -2 -#define SMCCC_RET_INVALID_PARAMETER -3 - /* * Like arm_smccc_1_1* but always returns SMCCC_RET_NOT_SUPPORTED. * Used when the SMCCC conduit is not defined. The empty asm statement @@ -385,18 +398,5 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1, method; \ }) -/* Paravirtualised time calls (defined by ARM DEN0057A) */ -#define ARM_SMCCC_HV_PV_TIME_FEATURES \ - ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ - ARM_SMCCC_SMC_64, \ - ARM_SMCCC_OWNER_STANDARD_HYP, \ - 0x20) - -#define ARM_SMCCC_HV_PV_TIME_ST \ - ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ - ARM_SMCCC_SMC_64, \ - ARM_SMCCC_OWNER_STANDARD_HYP, \ - 0x21) - #endif /*__ASSEMBLY__*/ #endif /*__LINUX_ARM_SMCCC_H*/ From e735b98a5fe08c0f50f9fdc3e3a844e3638e6649 Mon Sep 17 00:00:00 2001 From: Zhenyu Ye Date: Thu, 25 Jun 2020 16:03:11 +0800 Subject: [PATCH 278/502] arm64: Add tlbi_user_level TLB invalidation helper Add a level-hinted parameter to __tlbi_user, which only gets used if ARMv8.4-TTL gets detected. ARMv8.4-TTL provides the TTL field in tlbi instruction to indicate the level of translation table walk holding the leaf entry for the address that is being invalidated. This patch set the default level value of flush_tlb_range() to 0, which will be updated in future patches. And set the ttl value of flush_tlb_page_nosync() to 3 because it is only called to flush a single pte page. 
Signed-off-by: Zhenyu Ye Link: https://lore.kernel.org/r/20200625080314.230-4-yezhenyu2@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/tlbflush.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 3353f26302de..e1d07612e147 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -104,6 +104,11 @@ __tlbi(op, arg); \ } while(0) +#define __tlbi_user_level(op, arg, level) do { \ + if (arm64_kernel_unmapped_at_el0()) \ + __tlbi_level(op, (arg | USER_ASID_FLAG), level); \ +} while (0) + /* * TLB Invalidation * ================ @@ -205,8 +210,9 @@ static inline void flush_tlb_page_nosync(struct vm_area_struct *vma, unsigned long addr = __TLBI_VADDR(uaddr, ASID(vma->vm_mm)); dsb(ishst); - __tlbi(vale1is, addr); - __tlbi_user(vale1is, addr); + /* This function is only called on a small page */ + __tlbi_level(vale1is, addr, 3); + __tlbi_user_level(vale1is, addr, 3); } static inline void flush_tlb_page(struct vm_area_struct *vma, @@ -246,11 +252,11 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, dsb(ishst); for (addr = start; addr < end; addr += stride) { if (last_level) { - __tlbi(vale1is, addr); - __tlbi_user(vale1is, addr); + __tlbi_level(vale1is, addr, 0); + __tlbi_user_level(vale1is, addr, 0); } else { - __tlbi(vae1is, addr); - __tlbi_user(vae1is, addr); + __tlbi_level(vae1is, addr, 0); + __tlbi_user_level(vae1is, addr, 0); } } dsb(ish); From 2631ed00b0498810f8d5c2163c6b5270d893687b Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Thu, 25 Jun 2020 16:03:12 +0800 Subject: [PATCH 279/502] tlb: mmu_gather: add tlb_flush_*_range APIs tlb_flush_{pte|pmd|pud|p4d}_range() adjust the tlb->start and tlb->end, then set corresponding cleared_*. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Zhenyu Ye Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20200625080314.230-5-yezhenyu2@huawei.com Signed-off-by: Catalin Marinas --- include/asm-generic/tlb.h | 55 ++++++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 15 deletions(-) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 3f1649a8cf55..ef75ec86f865 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -512,6 +512,38 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm } #endif +/* + * tlb_flush_{pte|pmd|pud|p4d}_range() adjust the tlb->start and tlb->end, + * and set corresponding cleared_*. + */ +static inline void tlb_flush_pte_range(struct mmu_gather *tlb, + unsigned long address, unsigned long size) +{ + __tlb_adjust_range(tlb, address, size); + tlb->cleared_ptes = 1; +} + +static inline void tlb_flush_pmd_range(struct mmu_gather *tlb, + unsigned long address, unsigned long size) +{ + __tlb_adjust_range(tlb, address, size); + tlb->cleared_pmds = 1; +} + +static inline void tlb_flush_pud_range(struct mmu_gather *tlb, + unsigned long address, unsigned long size) +{ + __tlb_adjust_range(tlb, address, size); + tlb->cleared_puds = 1; +} + +static inline void tlb_flush_p4d_range(struct mmu_gather *tlb, + unsigned long address, unsigned long size) +{ + __tlb_adjust_range(tlb, address, size); + tlb->cleared_p4ds = 1; +} + #ifndef __tlb_remove_tlb_entry #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) #endif @@ -525,19 +557,17 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm */ #define tlb_remove_tlb_entry(tlb, ptep, address) \ do { \ - __tlb_adjust_range(tlb, address, PAGE_SIZE); \ - tlb->cleared_ptes = 1; \ + tlb_flush_pte_range(tlb, address, PAGE_SIZE); \ __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ do { \ unsigned long _sz = 
huge_page_size(h); \ - __tlb_adjust_range(tlb, address, _sz); \ if (_sz == PMD_SIZE) \ - tlb->cleared_pmds = 1; \ + tlb_flush_pmd_range(tlb, address, _sz); \ else if (_sz == PUD_SIZE) \ - tlb->cleared_puds = 1; \ + tlb_flush_pud_range(tlb, address, _sz); \ __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) @@ -551,8 +581,7 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm #define tlb_remove_pmd_tlb_entry(tlb, pmdp, address) \ do { \ - __tlb_adjust_range(tlb, address, HPAGE_PMD_SIZE); \ - tlb->cleared_pmds = 1; \ + tlb_flush_pmd_range(tlb, address, HPAGE_PMD_SIZE); \ __tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \ } while (0) @@ -566,8 +595,7 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm #define tlb_remove_pud_tlb_entry(tlb, pudp, address) \ do { \ - __tlb_adjust_range(tlb, address, HPAGE_PUD_SIZE); \ - tlb->cleared_puds = 1; \ + tlb_flush_pud_range(tlb, address, HPAGE_PUD_SIZE); \ __tlb_remove_pud_tlb_entry(tlb, pudp, address); \ } while (0) @@ -592,9 +620,8 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm #ifndef pte_free_tlb #define pte_free_tlb(tlb, ptep, address) \ do { \ - __tlb_adjust_range(tlb, address, PAGE_SIZE); \ + tlb_flush_pmd_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ - tlb->cleared_pmds = 1; \ __pte_free_tlb(tlb, ptep, address); \ } while (0) #endif @@ -602,9 +629,8 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm #ifndef pmd_free_tlb #define pmd_free_tlb(tlb, pmdp, address) \ do { \ - __tlb_adjust_range(tlb, address, PAGE_SIZE); \ + tlb_flush_pud_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ - tlb->cleared_puds = 1; \ __pmd_free_tlb(tlb, pmdp, address); \ } while (0) #endif @@ -612,9 +638,8 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm #ifndef pud_free_tlb #define pud_free_tlb(tlb, pudp, address) \ do { \ - __tlb_adjust_range(tlb, 
address, PAGE_SIZE); \ + tlb_flush_p4d_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ - tlb->cleared_p4ds = 1; \ __pud_free_tlb(tlb, pudp, address); \ } while (0) #endif From c4ab2cbc1d8768eb505708a58c54c277dfe4a93d Mon Sep 17 00:00:00 2001 From: Zhenyu Ye Date: Thu, 25 Jun 2020 16:03:13 +0800 Subject: [PATCH 280/502] arm64: tlb: Set the TTL field in flush_tlb_range This patch uses the cleared_* in struct mmu_gather to set the TTL field in flush_tlb_range(). Signed-off-by: Zhenyu Ye Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20200625080314.230-6-yezhenyu2@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/tlb.h | 29 ++++++++++++++++++++++++++++- arch/arm64/include/asm/tlbflush.h | 14 ++++++++------ 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index b76df828e6b7..61c97d3b58c7 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -21,11 +21,37 @@ static void tlb_flush(struct mmu_gather *tlb); #include +/* + * get the tlbi levels in arm64. Default value is 0 if more than one + * of cleared_* is set or neither is set. + * Arm64 doesn't support p4ds now. 
+ */ +static inline int tlb_get_level(struct mmu_gather *tlb) +{ + if (tlb->cleared_ptes && !(tlb->cleared_pmds || + tlb->cleared_puds || + tlb->cleared_p4ds)) + return 3; + + if (tlb->cleared_pmds && !(tlb->cleared_ptes || + tlb->cleared_puds || + tlb->cleared_p4ds)) + return 2; + + if (tlb->cleared_puds && !(tlb->cleared_ptes || + tlb->cleared_pmds || + tlb->cleared_p4ds)) + return 1; + + return 0; +} + static inline void tlb_flush(struct mmu_gather *tlb) { struct vm_area_struct vma = TLB_FLUSH_VMA(tlb->mm, 0); bool last_level = !tlb->freed_tables; unsigned long stride = tlb_get_unmap_size(tlb); + int tlb_level = tlb_get_level(tlb); /* * If we're tearing down the address space then we only care about @@ -38,7 +64,8 @@ static inline void tlb_flush(struct mmu_gather *tlb) return; } - __flush_tlb_range(&vma, tlb->start, tlb->end, stride, last_level); + __flush_tlb_range(&vma, tlb->start, tlb->end, stride, + last_level, tlb_level); } static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index e1d07612e147..3505f6fbfca3 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -230,7 +230,8 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, static inline void __flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, - unsigned long stride, bool last_level) + unsigned long stride, bool last_level, + int tlb_level) { unsigned long asid = ASID(vma->vm_mm); unsigned long addr; @@ -252,11 +253,11 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, dsb(ishst); for (addr = start; addr < end; addr += stride) { if (last_level) { - __tlbi_level(vale1is, addr, 0); - __tlbi_user_level(vale1is, addr, 0); + __tlbi_level(vale1is, addr, tlb_level); + __tlbi_user_level(vale1is, addr, tlb_level); } else { - __tlbi_level(vae1is, addr, 0); - __tlbi_user_level(vae1is, addr, 0); + __tlbi_level(vae1is, 
addr, tlb_level); + __tlbi_user_level(vae1is, addr, tlb_level); } } dsb(ish); @@ -268,8 +269,9 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, /* * We cannot use leaf-only invalidation here, since we may be invalidating * table entries as part of collapsing hugepages or moving page tables. + * Set the tlb_level to 0 because we can not get enough information here. */ - __flush_tlb_range(vma, start, end, PAGE_SIZE, false); + __flush_tlb_range(vma, start, end, PAGE_SIZE, false, 0); } static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end) From a7ac1cfa4c0510217e74c2ba807ead549f80d82c Mon Sep 17 00:00:00 2001 From: Zhenyu Ye Date: Thu, 25 Jun 2020 16:03:14 +0800 Subject: [PATCH 281/502] arm64: tlb: Set the TTL field in flush_*_tlb_range This patch implement flush_{pmd|pud}_tlb_range() in arm64 by calling __flush_tlb_range() with the corresponding stride and tlb_level values. Signed-off-by: Zhenyu Ye Link: https://lore.kernel.org/r/20200625080314.230-7-yezhenyu2@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 758e2d1577d0..d5d3fbe73953 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -40,6 +40,16 @@ extern void __pmd_error(const char *file, int line, unsigned long val); extern void __pud_error(const char *file, int line, unsigned long val); extern void __pgd_error(const char *file, int line, unsigned long val); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE + +/* Set stride and tlb_level in flush_*_tlb_range */ +#define flush_pmd_tlb_range(vma, addr, end) \ + __flush_tlb_range(vma, addr, end, PMD_SIZE, false, 2) +#define flush_pud_tlb_range(vma, addr, end) \ + __flush_tlb_range(vma, addr, end, PUD_SIZE, false, 1) +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + /* * ZERO_PAGE is a global shared page 
that is always zero: used * for zero-mapped memory areas etc.. From 34e36d81a0ef76047fa12a0f8e0dce4369b435cf Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 7 Jul 2020 11:26:14 +0100 Subject: [PATCH 282/502] arm64: Shift the __tlbi_level() indentation left This is for consistency with the other __tlbi macros in this file. No functional change. Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/tlbflush.h | 43 +++++++++++++++---------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 3505f6fbfca3..39aed2efd21b 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -77,32 +77,31 @@ #define TLBI_TTL_TG_16K 2 #define TLBI_TTL_TG_64K 3 -#define __tlbi_level(op, addr, level) \ - do { \ - u64 arg = addr; \ +#define __tlbi_level(op, addr, level) do { \ + u64 arg = addr; \ \ - if (cpus_have_const_cap(ARM64_HAS_ARMv8_4_TTL) && \ - level) { \ - u64 ttl = level & 3; \ + if (cpus_have_const_cap(ARM64_HAS_ARMv8_4_TTL) && \ + level) { \ + u64 ttl = level & 3; \ \ - switch (PAGE_SIZE) { \ - case SZ_4K: \ - ttl |= TLBI_TTL_TG_4K << 2; \ - break; \ - case SZ_16K: \ - ttl |= TLBI_TTL_TG_16K << 2; \ - break; \ - case SZ_64K: \ - ttl |= TLBI_TTL_TG_64K << 2; \ - break; \ - } \ - \ - arg &= ~TLBI_TTL_MASK; \ - arg |= FIELD_PREP(TLBI_TTL_MASK, ttl); \ + switch (PAGE_SIZE) { \ + case SZ_4K: \ + ttl |= TLBI_TTL_TG_4K << 2; \ + break; \ + case SZ_16K: \ + ttl |= TLBI_TTL_TG_16K << 2; \ + break; \ + case SZ_64K: \ + ttl |= TLBI_TTL_TG_64K << 2; \ + break; \ } \ \ - __tlbi(op, arg); \ - } while(0) + arg &= ~TLBI_TTL_MASK; \ + arg |= FIELD_PREP(TLBI_TTL_MASK, ttl); \ + } \ + \ + __tlbi(op, arg); \ +} while(0) #define __tlbi_user_level(op, arg, level) do { \ if (arm64_kernel_unmapped_at_el0()) \ From c6c83d757a13a5df51428a6fe133c9193810507b Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 7 Jul 2020 19:53:13 +0530 Subject: [PATCH 283/502] 
arm64/cpufeature: Validate feature bits spacing in arm64_ftr_regs[] arm64_feature_bits for a register in arm64_ftr_regs[] are in a descending order as per their shift values. Validate that these features bits are defined correctly and do not overlap with each other. This check protects against any inadvertent erroneous changes to the register definitions. Signed-off-by: Anshuman Khandual Reviewed-by: Suzuki K Poulose Cc: Will Deacon Cc: Suzuki K Poulose Cc: Mark Brown Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Link: https://lore.kernel.org/r/1594131793-9498-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/cpufeature.c | 47 +++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 19146bd338b4..d9b51cb9cb8c 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -712,11 +712,52 @@ static s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, static void __init sort_ftr_regs(void) { - int i; + unsigned int i; - /* Check that the array is sorted so that we can do the binary search */ - for (i = 1; i < ARRAY_SIZE(arm64_ftr_regs); i++) + for (i = 0; i < ARRAY_SIZE(arm64_ftr_regs); i++) { + const struct arm64_ftr_reg *ftr_reg = arm64_ftr_regs[i].reg; + const struct arm64_ftr_bits *ftr_bits = ftr_reg->ftr_bits; + unsigned int j = 0; + + /* + * Features here must be sorted in descending order with respect + * to their shift values and should not overlap with each other. + */ + for (; ftr_bits->width != 0; ftr_bits++, j++) { + unsigned int width = ftr_reg->ftr_bits[j].width; + unsigned int shift = ftr_reg->ftr_bits[j].shift; + unsigned int prev_shift; + + WARN((shift + width) > 64, + "%s has invalid feature at shift %d\n", + ftr_reg->name, shift); + + /* + * Skip the first feature. There is nothing to + * compare against for now. 
+ */ + if (j == 0) + continue; + + prev_shift = ftr_reg->ftr_bits[j - 1].shift; + WARN((shift + width) > prev_shift, + "%s has feature overlap at shift %d\n", + ftr_reg->name, shift); + } + + /* + * Skip the first register. There is nothing to + * compare against for now. + */ + if (i == 0) + continue; + /* + * Registers here must be sorted in ascending order with respect + * to sys_id for subsequent binary search in get_arm64_ftr_reg() + * to work correctly. + */ BUG_ON(arm64_ftr_regs[i].sys_id < arm64_ftr_regs[i - 1].sys_id); + } } /* From 9dedd56301564acdbb1dd37cf09250a4c7b783c9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 7 Jul 2020 16:36:20 +0300 Subject: [PATCH 284/502] io_uring: partially inline io_iopoll_getevents() io_iopoll_reap_events() doesn't care about the returned value of io_iopoll_getevents() and does the same checks for list emptiness and need_resched(). Just use io_do_iopoll(). io_sq_thread() doesn't check the return value either. It also passes min=0, so there will never be a second iteration inside io_iopoll_getevents(). Inline it there too.
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6e3169834bf7..104af675f6fb 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2064,7 +2064,7 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx) while (!list_empty(&ctx->poll_list)) { unsigned int nr_events = 0; - io_iopoll_getevents(ctx, &nr_events, 1); + io_do_iopoll(ctx, &nr_events, 1); /* * Ensure we allow local-to-the-cpu processing to take place, @@ -6318,8 +6318,8 @@ static int io_sq_thread(void *data) unsigned nr_events = 0; mutex_lock(&ctx->uring_lock); - if (!list_empty(&ctx->poll_list)) - io_iopoll_getevents(ctx, &nr_events, 0); + if (!list_empty(&ctx->poll_list) && !need_resched()) + io_do_iopoll(ctx, &nr_events, 0); else timeout = jiffies + ctx->sq_thread_idle; mutex_unlock(&ctx->uring_lock); From 7668b92a69b8201e2dd16a47a08efb93e909f419 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 7 Jul 2020 16:36:21 +0300 Subject: [PATCH 285/502] io_uring: remove nr_events arg from iopoll_check() Nobody checks io_iopoll_check()'s output parameter @nr_events. Remove the parameter and declare it further down the stack. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 104af675f6fb..38bf42320f56 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2080,9 +2080,9 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } -static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, - long min) +static int io_iopoll_check(struct io_ring_ctx *ctx, long min) { + unsigned int nr_events = 0; int iters = 0, ret = 0; /* @@ -2116,11 +2116,11 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, mutex_lock(&ctx->uring_lock); } - ret = io_iopoll_getevents(ctx, nr_events, min); + ret = io_iopoll_getevents(ctx, &nr_events, min); if (ret <= 0) break; ret = 0; - } while (min && !*nr_events && !need_resched()); + } while (min && !nr_events && !need_resched()); mutex_unlock(&ctx->uring_lock); return ret; @@ -7977,8 +7977,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, goto out; } if (flags & IORING_ENTER_GETEVENTS) { - unsigned nr_events = 0; - min_complete = min(min_complete, ctx->cq_entries); /* @@ -7989,7 +7987,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ if (ctx->flags & IORING_SETUP_IOPOLL && !(ctx->flags & IORING_SETUP_SQPOLL)) { - ret = io_iopoll_check(ctx, &nr_events, min_complete); + ret = io_iopoll_check(ctx, min_complete); } else { ret = io_cqring_wait(ctx, min_complete, sig, sigsz); } From b2edc0a77fac19bbdef63cedb2ea34aec1a9a499 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 7 Jul 2020 16:36:22 +0300 Subject: [PATCH 286/502] io_uring: don't burn CPU for iopoll on exit First of all don't spin in io_ring_ctx_wait_and_kill() on iopoll. Requests won't complete faster because of that, but only lengthen io_uring_release(). 
The same goes for offloaded cleanup in io_ring_exit_work() -- it already has waiting loop, don't do blocking active spinning. For that, pass min=0 into io_iopoll_[try_]reap_events(), so it won't actively spin. Leave the function if io_do_iopoll() there can't complete a request to sleep in io_ring_exit_work(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 38bf42320f56..4c9a494c9f9f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2055,7 +2055,7 @@ static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events, * We can't just wait for polled events to come to us, we have to actively * find and complete them. */ -static void io_iopoll_reap_events(struct io_ring_ctx *ctx) +static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_IOPOLL)) return; @@ -2064,8 +2064,11 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx) while (!list_empty(&ctx->poll_list)) { unsigned int nr_events = 0; - io_do_iopoll(ctx, &nr_events, 1); + io_do_iopoll(ctx, &nr_events, 0); + /* let it sleep and repeat later if can't complete a request */ + if (nr_events == 0) + break; /* * Ensure we allow local-to-the-cpu processing to take place, * in this case we need to ensure that we reap all events. 
@@ -7648,7 +7651,6 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) ctx->sqo_mm = NULL; } - io_iopoll_reap_events(ctx); io_sqe_buffer_unregister(ctx); io_sqe_files_unregister(ctx); io_eventfd_unregister(ctx); @@ -7715,11 +7717,8 @@ static int io_remove_personalities(int id, void *p, void *data) static void io_ring_exit_work(struct work_struct *work) { - struct io_ring_ctx *ctx; - - ctx = container_of(work, struct io_ring_ctx, exit_work); - if (ctx->rings) - io_cqring_overflow_flush(ctx, true); + struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, + exit_work); /* * If we're doing polled IO and end up having requests being @@ -7727,11 +7726,11 @@ static void io_ring_exit_work(struct work_struct *work) * we're waiting for refs to drop. We need to reap these manually, * as nobody else will be looking for them. */ - while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)) { - io_iopoll_reap_events(ctx); + do { if (ctx->rings) io_cqring_overflow_flush(ctx, true); - } + io_iopoll_try_reap_events(ctx); + } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); io_ring_ctx_free(ctx); } @@ -7747,10 +7746,10 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) if (ctx->io_wq) io_wq_cancel_all(ctx->io_wq); - io_iopoll_reap_events(ctx); /* if we failed setting up the ctx, we might not have any rings */ if (ctx->rings) io_cqring_overflow_flush(ctx, true); + io_iopoll_try_reap_events(ctx); idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); INIT_WORK(&ctx->exit_work, io_ring_exit_work); queue_work(system_wq, &ctx->exit_work); From bd657aa3dd8514e62486ce7f90b5e484c18d684d Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:07 -0700 Subject: [PATCH 287/502] x86/cpufeatures: Add Architectural LBRs feature bit CPUID.(EAX=07H, ECX=0):EDX[19] indicates whether an Intel CPU supports Architectural LBRs. The "X86_FEATURE_..., word 18" is already mirrored from CPUID "0x00000007:0 (EDX)". 
Add X86_FEATURE_ARCH_LBR under the "word 18" section. The feature will appear as "arch_lbr" in /proc/cpuinfo. The Architectural Last Branch Records (LBR) feature enables recording of software path history by logging taken branches and other control flows. The feature will be supported in the perf_events subsystem. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/1593780569-62993-2-git-send-email-kan.liang@linux.intel.com --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 02dabc9e77b0..72ba4c59ad05 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -366,6 +366,7 @@ #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ +#define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ #define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */ From 9f354a726cb1d4eb00a0784a27eaa0a3283cff71 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:08 -0700 Subject: [PATCH 288/502] perf/x86/intel/lbr: Add a function pointer for LBR reset The method to reset Architectural LBRs is different from previous model-specific LBR. Perf has to implement a different function. A function pointer is introduced for LBR reset. The enum of LBR_FORMAT_* is also moved to perf_event.h. Perf should initialize the corresponding functions at boot time, and avoid checking lbr_format at run time. The current 64-bit LBR reset function is set as default. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-3-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 7 +++++++ arch/x86/events/intel/lbr.c | 20 +++----------------- arch/x86/events/perf_event.h | 17 +++++++++++++++++ 3 files changed, 27 insertions(+), 17 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 582ddff9a359..fe49e99e4fbf 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3978,6 +3978,8 @@ static __initconst const struct x86_pmu core_pmu = { .cpu_dead = intel_pmu_cpu_dead, .check_period = intel_pmu_check_period, + + .lbr_reset = intel_pmu_lbr_reset_64, }; static __initconst const struct x86_pmu intel_pmu = { @@ -4023,6 +4025,8 @@ static __initconst const struct x86_pmu intel_pmu = { .check_period = intel_pmu_check_period, .aux_output_match = intel_pmu_aux_output_match, + + .lbr_reset = intel_pmu_lbr_reset_64, }; static __init void intel_clovertown_quirk(void) @@ -4649,6 +4653,9 @@ __init int intel_pmu_init(void) x86_pmu.intel_cap.capabilities = capabilities; } + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) + x86_pmu.lbr_reset = intel_pmu_lbr_reset_32; + intel_ds_init(); x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index d03de7539957..7af27a766002 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -8,17 +8,6 @@ #include "../perf_event.h" -enum { - LBR_FORMAT_32 = 0x00, - LBR_FORMAT_LIP = 0x01, - LBR_FORMAT_EIP = 0x02, - LBR_FORMAT_EIP_FLAGS = 0x03, - LBR_FORMAT_EIP_FLAGS2 = 0x04, - LBR_FORMAT_INFO = 0x05, - LBR_FORMAT_TIME = 0x06, - LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME, -}; - static const enum { LBR_EIP_FLAGS = 1, LBR_TSX = 2, @@ -194,7 +183,7 @@ static void __intel_pmu_lbr_disable(void) wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); } -static void 
intel_pmu_lbr_reset_32(void) +void intel_pmu_lbr_reset_32(void) { int i; @@ -202,7 +191,7 @@ static void intel_pmu_lbr_reset_32(void) wrmsrl(x86_pmu.lbr_from + i, 0); } -static void intel_pmu_lbr_reset_64(void) +void intel_pmu_lbr_reset_64(void) { int i; @@ -221,10 +210,7 @@ void intel_pmu_lbr_reset(void) if (!x86_pmu.lbr_nr) return; - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) - intel_pmu_lbr_reset_32(); - else - intel_pmu_lbr_reset_64(); + x86_pmu.lbr_reset(); cpuc->last_task_ctx = NULL; cpuc->last_log_id = 0; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 81475963df99..5c1ad4360715 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -179,6 +179,17 @@ struct intel_excl_cntrs { struct x86_perf_task_context; #define MAX_LBR_ENTRIES 32 +enum { + LBR_FORMAT_32 = 0x00, + LBR_FORMAT_LIP = 0x01, + LBR_FORMAT_EIP = 0x02, + LBR_FORMAT_EIP_FLAGS = 0x03, + LBR_FORMAT_EIP_FLAGS2 = 0x04, + LBR_FORMAT_INFO = 0x05, + LBR_FORMAT_TIME = 0x06, + LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME, +}; + enum { X86_PERF_KFREE_SHARED = 0, X86_PERF_KFREE_EXCL = 1, @@ -682,6 +693,8 @@ struct x86_pmu { bool lbr_double_abort; /* duplicated lbr aborts */ bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */ + void (*lbr_reset)(void); + /* * Intel PT/LBR/BTS are exclusive */ @@ -1058,6 +1071,10 @@ u64 lbr_from_signext_quirk_wr(u64 val); void intel_pmu_lbr_reset(void); +void intel_pmu_lbr_reset_32(void); + +void intel_pmu_lbr_reset_64(void); + void intel_pmu_lbr_add(struct perf_event *event); void intel_pmu_lbr_del(struct perf_event *event); From c301b1d80ed5b806834fe0f739f028f65fb4fb16 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:09 -0700 Subject: [PATCH 289/502] perf/x86/intel/lbr: Add a function pointer for LBR read The method to read Architectural LBRs is different from previous model-specific LBR. Perf has to implement a different function. A function pointer for LBR read is introduced. 
Perf should initialize the corresponding function at boot time, and avoid checking lbr_format at run time. The current 64-bit LBR read function is set as default. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-4-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 6 +++++- arch/x86/events/intel/lbr.c | 9 +++------ arch/x86/events/perf_event.h | 5 +++++ 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index fe49e99e4fbf..6414b4799ce7 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3980,6 +3980,7 @@ static __initconst const struct x86_pmu core_pmu = { .check_period = intel_pmu_check_period, .lbr_reset = intel_pmu_lbr_reset_64, + .lbr_read = intel_pmu_lbr_read_64, }; static __initconst const struct x86_pmu intel_pmu = { @@ -4027,6 +4028,7 @@ static __initconst const struct x86_pmu intel_pmu = { .aux_output_match = intel_pmu_aux_output_match, .lbr_reset = intel_pmu_lbr_reset_64, + .lbr_read = intel_pmu_lbr_read_64, }; static __init void intel_clovertown_quirk(void) @@ -4653,8 +4655,10 @@ __init int intel_pmu_init(void) x86_pmu.intel_cap.capabilities = capabilities; } - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) { x86_pmu.lbr_reset = intel_pmu_lbr_reset_32; + x86_pmu.lbr_read = intel_pmu_lbr_read_32; + } intel_ds_init(); diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 7af27a766002..b8943f45ca69 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -562,7 +562,7 @@ void intel_pmu_lbr_disable_all(void) __intel_pmu_lbr_disable(); } -static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) +void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) { unsigned long mask = x86_pmu.lbr_nr - 1; u64 tos = intel_pmu_lbr_tos(); @@ -599,7 +599,7 @@ static void 
intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) * is the same as the linear address, allowing us to merge the LIP and EIP * LBR formats. */ -static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) +void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) { bool need_info = false, call_stack = false; unsigned long mask = x86_pmu.lbr_nr - 1; @@ -704,10 +704,7 @@ void intel_pmu_lbr_read(void) cpuc->lbr_users == cpuc->lbr_pebs_users) return; - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) - intel_pmu_lbr_read_32(cpuc); - else - intel_pmu_lbr_read_64(cpuc); + x86_pmu.lbr_read(cpuc); intel_pmu_lbr_filter(cpuc); } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 5c1ad4360715..312d27f269e6 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -694,6 +694,7 @@ struct x86_pmu { bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */ void (*lbr_reset)(void); + void (*lbr_read)(struct cpu_hw_events *cpuc); /* * Intel PT/LBR/BTS are exclusive @@ -1085,6 +1086,10 @@ void intel_pmu_lbr_disable_all(void); void intel_pmu_lbr_read(void); +void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc); + +void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc); + void intel_pmu_lbr_init_core(void); void intel_pmu_lbr_init_nhm(void); From 799571bf38fc2b4b744fa448184b5915739b10fd Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:10 -0700 Subject: [PATCH 290/502] perf/x86/intel/lbr: Add the function pointers for LBR save and restore The MSRs of Architectural LBR are different from previous model-specific LBR. Perf has to implement different functions to save and restore them. The function pointers for LBR save and restore are introduced. Perf should initialize the corresponding functions at boot time. The generic optimizations, e.g. avoiding restore LBR if no one else touched them, still apply for Architectural LBRs. The related codes are not moved to model-specific functions. 
Current model-specific LBR functions are set as default. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-5-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 4 ++ arch/x86/events/intel/lbr.c | 83 ++++++++++++++++++++++-------------- arch/x86/events/perf_event.h | 6 +++ 3 files changed, 61 insertions(+), 32 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 6414b4799ce7..50cb3c69d6a4 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3981,6 +3981,8 @@ static __initconst const struct x86_pmu core_pmu = { .lbr_reset = intel_pmu_lbr_reset_64, .lbr_read = intel_pmu_lbr_read_64, + .lbr_save = intel_pmu_lbr_save, + .lbr_restore = intel_pmu_lbr_restore, }; static __initconst const struct x86_pmu intel_pmu = { @@ -4029,6 +4031,8 @@ static __initconst const struct x86_pmu intel_pmu = { .lbr_reset = intel_pmu_lbr_reset_64, .lbr_read = intel_pmu_lbr_read_64, + .lbr_save = intel_pmu_lbr_save, + .lbr_restore = intel_pmu_lbr_restore, }; static __init void intel_clovertown_quirk(void) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index b8943f45ca69..b2b8dc973057 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -323,11 +323,41 @@ static inline u64 rdlbr_to(unsigned int idx) return val; } +void intel_pmu_lbr_restore(void *ctx) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx = ctx; + int i; + unsigned lbr_idx, mask; + u64 tos = task_ctx->tos; + + mask = x86_pmu.lbr_nr - 1; + for (i = 0; i < task_ctx->valid_lbrs; i++) { + lbr_idx = (tos - i) & mask; + wrlbr_from(lbr_idx, task_ctx->lbr_from[i]); + wrlbr_to (lbr_idx, task_ctx->lbr_to[i]); + + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) + wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); + } + + for (; i < x86_pmu.lbr_nr; i++) { + lbr_idx = (tos - i) & 
mask; + wrlbr_from(lbr_idx, 0); + wrlbr_to(lbr_idx, 0); + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) + wrmsrl(MSR_LBR_INFO_0 + lbr_idx, 0); + } + + wrmsrl(x86_pmu.lbr_tos, tos); + + if (cpuc->lbr_select) + wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); +} + static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int i; - unsigned lbr_idx, mask; u64 tos; if (task_ctx->lbr_callstack_users == 0 || @@ -349,43 +379,19 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) return; } - mask = x86_pmu.lbr_nr - 1; - for (i = 0; i < task_ctx->valid_lbrs; i++) { - lbr_idx = (tos - i) & mask; - wrlbr_from(lbr_idx, task_ctx->lbr_from[i]); - wrlbr_to (lbr_idx, task_ctx->lbr_to[i]); + x86_pmu.lbr_restore(task_ctx); - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); - } - - for (; i < x86_pmu.lbr_nr; i++) { - lbr_idx = (tos - i) & mask; - wrlbr_from(lbr_idx, 0); - wrlbr_to(lbr_idx, 0); - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + lbr_idx, 0); - } - - wrmsrl(x86_pmu.lbr_tos, tos); task_ctx->lbr_stack_state = LBR_NONE; - - if (cpuc->lbr_select) - wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } -static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) +void intel_pmu_lbr_save(void *ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx = ctx; unsigned lbr_idx, mask; u64 tos, from; int i; - if (task_ctx->lbr_callstack_users == 0) { - task_ctx->lbr_stack_state = LBR_NONE; - return; - } - mask = x86_pmu.lbr_nr - 1; tos = intel_pmu_lbr_tos(); for (i = 0; i < x86_pmu.lbr_nr; i++) { @@ -400,13 +406,26 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) } task_ctx->valid_lbrs = i; task_ctx->tos = tos; + + if (cpuc->lbr_select) + rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); +} + +static void 
__intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (task_ctx->lbr_callstack_users == 0) { + task_ctx->lbr_stack_state = LBR_NONE; + return; + } + + x86_pmu.lbr_save(task_ctx); + task_ctx->lbr_stack_state = LBR_VALID; cpuc->last_task_ctx = task_ctx; cpuc->last_log_id = ++task_ctx->log_id; - - if (cpuc->lbr_select) - rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 312d27f269e6..6d11813582c0 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -695,6 +695,8 @@ struct x86_pmu { void (*lbr_reset)(void); void (*lbr_read)(struct cpu_hw_events *cpuc); + void (*lbr_save)(void *ctx); + void (*lbr_restore)(void *ctx); /* * Intel PT/LBR/BTS are exclusive @@ -1090,6 +1092,10 @@ void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc); void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc); +void intel_pmu_lbr_save(void *ctx); + +void intel_pmu_lbr_restore(void *ctx); + void intel_pmu_lbr_init_core(void); void intel_pmu_lbr_init_nhm(void); From 530bfff6480307d210734222a54d56af7f908957 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:11 -0700 Subject: [PATCH 291/502] perf/x86/intel/lbr: Factor out a new struct for generic optimization To reduce the overhead of a context switch with LBR enabled, some generic optimizations were introduced, e.g. avoiding restore LBR if no one else touched them. The generic optimizations can also be used by Architecture LBR later. Currently, the fields for the generic optimizations are part of structure x86_perf_task_context, which will be deprecated by Architecture LBR. A new structure should be introduced for the common fields of generic optimization, which can be shared between Architecture LBR and model-specific LBR. 
Both 'valid_lbrs' and 'tos' are also used by the generic optimizations, but they are not moved into the new structure, because Architecture LBR is stack-like. The 'valid_lbrs' which records the index of the valid LBR is not required anymore. The TOS MSR will be removed. LBR registers may be cleared in the deep Cstate. If so, the generic optimizations should not be applied. Perf has to unconditionally restore the LBR registers. A generic function is required to detect the reset due to the deep Cstate. lbr_is_reset_in_cstate() is introduced. Currently, for the model-specific LBR, the TOS MSR is used to detect the reset. There will be another method introduced for Architecture LBR later. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-6-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/lbr.c | 38 ++++++++++++++++++++---------------- arch/x86/events/perf_event.h | 10 +++++++--- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index b2b8dc973057..bba9939635b6 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -355,33 +355,37 @@ void intel_pmu_lbr_restore(void *ctx) wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } +static __always_inline bool +lbr_is_reset_in_cstate(struct x86_perf_task_context *task_ctx) +{ + return !rdlbr_from(task_ctx->tos); +} + static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - u64 tos; - if (task_ctx->lbr_callstack_users == 0 || - task_ctx->lbr_stack_state == LBR_NONE) { + if (task_ctx->opt.lbr_callstack_users == 0 || + task_ctx->opt.lbr_stack_state == LBR_NONE) { intel_pmu_lbr_reset(); return; } - tos = task_ctx->tos; /* * Does not restore the LBR registers, if * - No one else touched them, and - * - Did not enter C6 + * - Was not cleared in Cstate */ if ((task_ctx == 
cpuc->last_task_ctx) && - (task_ctx->log_id == cpuc->last_log_id) && - rdlbr_from(tos)) { - task_ctx->lbr_stack_state = LBR_NONE; + (task_ctx->opt.log_id == cpuc->last_log_id) && + !lbr_is_reset_in_cstate(task_ctx)) { + task_ctx->opt.lbr_stack_state = LBR_NONE; return; } x86_pmu.lbr_restore(task_ctx); - task_ctx->lbr_stack_state = LBR_NONE; + task_ctx->opt.lbr_stack_state = LBR_NONE; } void intel_pmu_lbr_save(void *ctx) @@ -415,17 +419,17 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - if (task_ctx->lbr_callstack_users == 0) { - task_ctx->lbr_stack_state = LBR_NONE; + if (task_ctx->opt.lbr_callstack_users == 0) { + task_ctx->opt.lbr_stack_state = LBR_NONE; return; } x86_pmu.lbr_save(task_ctx); - task_ctx->lbr_stack_state = LBR_VALID; + task_ctx->opt.lbr_stack_state = LBR_VALID; cpuc->last_task_ctx = task_ctx; - cpuc->last_log_id = ++task_ctx->log_id; + cpuc->last_log_id = ++task_ctx->opt.log_id; } void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, @@ -447,8 +451,8 @@ void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, if (!prev_ctx_data || !next_ctx_data) return; - swap(prev_ctx_data->lbr_callstack_users, - next_ctx_data->lbr_callstack_users); + swap(prev_ctx_data->opt.lbr_callstack_users, + next_ctx_data->opt.lbr_callstack_users); } void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) @@ -503,7 +507,7 @@ void intel_pmu_lbr_add(struct perf_event *event) if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) { task_ctx = event->ctx->task_ctx_data; - task_ctx->lbr_callstack_users++; + task_ctx->opt.lbr_callstack_users++; } /* @@ -543,7 +547,7 @@ void intel_pmu_lbr_del(struct perf_event *event) if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) { task_ctx = event->ctx->task_ctx_data; - task_ctx->lbr_callstack_users--; + task_ctx->opt.lbr_callstack_users--; } if (event->hw.flags & 
PERF_X86_EVENT_LBR_SELECT) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 6d11813582c0..96d73cd8b7a1 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -736,6 +736,12 @@ struct x86_pmu { int (*aux_output_match) (struct perf_event *event); }; +struct x86_perf_task_context_opt { + int lbr_callstack_users; + int lbr_stack_state; + int log_id; +}; + struct x86_perf_task_context { u64 lbr_from[MAX_LBR_ENTRIES]; u64 lbr_to[MAX_LBR_ENTRIES]; @@ -743,9 +749,7 @@ struct x86_perf_task_context { u64 lbr_sel; int tos; int valid_lbrs; - int lbr_callstack_users; - int lbr_stack_state; - int log_id; + struct x86_perf_task_context_opt opt; }; #define x86_add_quirk(func_) \ From f42be8651a7a9d5cb165e5d176fc0b09621b4f4d Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:12 -0700 Subject: [PATCH 292/502] perf/x86/intel/lbr: Use dynamic data structure for task_ctx The type of task_ctx is hardcoded as struct x86_perf_task_context, which doesn't apply for Architecture LBR. For example, Architecture LBR doesn't have the TOS MSR. The number of LBR entries is variable. A new struct will be introduced for Architecture LBR. Perf has to determine the type of task_ctx at run time. The type of task_ctx pointer is changed to 'void *', which will be determined at run time. The generic LBR optimization can be shared between Architecture LBR and model-specific LBR. Both need to access the structure for the generic LBR optimization. A helper task_context_opt() is introduced to retrieve the pointer of the structure at run time. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-7-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/lbr.c | 59 ++++++++++++++++-------------------- arch/x86/events/perf_event.h | 7 ++++- 2 files changed, 32 insertions(+), 34 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index bba9939635b6..e62baa996474 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -355,18 +355,17 @@ void intel_pmu_lbr_restore(void *ctx) wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } -static __always_inline bool -lbr_is_reset_in_cstate(struct x86_perf_task_context *task_ctx) +static __always_inline bool lbr_is_reset_in_cstate(void *ctx) { - return !rdlbr_from(task_ctx->tos); + return !rdlbr_from(((struct x86_perf_task_context *)ctx)->tos); } -static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) +static void __intel_pmu_lbr_restore(void *ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - if (task_ctx->opt.lbr_callstack_users == 0 || - task_ctx->opt.lbr_stack_state == LBR_NONE) { + if (task_context_opt(ctx)->lbr_callstack_users == 0 || + task_context_opt(ctx)->lbr_stack_state == LBR_NONE) { intel_pmu_lbr_reset(); return; } @@ -376,16 +375,16 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) * - No one else touched them, and * - Was not cleared in Cstate */ - if ((task_ctx == cpuc->last_task_ctx) && - (task_ctx->opt.log_id == cpuc->last_log_id) && - !lbr_is_reset_in_cstate(task_ctx)) { - task_ctx->opt.lbr_stack_state = LBR_NONE; + if ((ctx == cpuc->last_task_ctx) && + (task_context_opt(ctx)->log_id == cpuc->last_log_id) && + !lbr_is_reset_in_cstate(ctx)) { + task_context_opt(ctx)->lbr_stack_state = LBR_NONE; return; } - x86_pmu.lbr_restore(task_ctx); + x86_pmu.lbr_restore(ctx); - task_ctx->opt.lbr_stack_state = LBR_NONE; + task_context_opt(ctx)->lbr_stack_state = LBR_NONE; } void 
intel_pmu_lbr_save(void *ctx) @@ -415,27 +414,27 @@ void intel_pmu_lbr_save(void *ctx) rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } -static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) +static void __intel_pmu_lbr_save(void *ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - if (task_ctx->opt.lbr_callstack_users == 0) { - task_ctx->opt.lbr_stack_state = LBR_NONE; + if (task_context_opt(ctx)->lbr_callstack_users == 0) { + task_context_opt(ctx)->lbr_stack_state = LBR_NONE; return; } - x86_pmu.lbr_save(task_ctx); + x86_pmu.lbr_save(ctx); - task_ctx->opt.lbr_stack_state = LBR_VALID; + task_context_opt(ctx)->lbr_stack_state = LBR_VALID; - cpuc->last_task_ctx = task_ctx; - cpuc->last_log_id = ++task_ctx->opt.log_id; + cpuc->last_task_ctx = ctx; + cpuc->last_log_id = ++task_context_opt(ctx)->log_id; } void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, struct perf_event_context *next) { - struct x86_perf_task_context *prev_ctx_data, *next_ctx_data; + void *prev_ctx_data, *next_ctx_data; swap(prev->task_ctx_data, next->task_ctx_data); @@ -451,14 +450,14 @@ void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, if (!prev_ctx_data || !next_ctx_data) return; - swap(prev_ctx_data->opt.lbr_callstack_users, - next_ctx_data->opt.lbr_callstack_users); + swap(task_context_opt(prev_ctx_data)->lbr_callstack_users, + task_context_opt(next_ctx_data)->lbr_callstack_users); } void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; + void *task_ctx; if (!cpuc->lbr_users) return; @@ -495,7 +494,6 @@ static inline bool branch_user_callstack(unsigned br_sel) void intel_pmu_lbr_add(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; if (!x86_pmu.lbr_nr) return; @@ -505,10 +503,8 @@ void intel_pmu_lbr_add(struct perf_event *event) 
cpuc->br_sel = event->hw.branch_reg.reg; - if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) { - task_ctx = event->ctx->task_ctx_data; - task_ctx->opt.lbr_callstack_users++; - } + if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) + task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users++; /* * Request pmu::sched_task() callback, which will fire inside the @@ -539,16 +535,13 @@ void intel_pmu_lbr_add(struct perf_event *event) void intel_pmu_lbr_del(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; if (!x86_pmu.lbr_nr) return; if (branch_user_callstack(cpuc->br_sel) && - event->ctx->task_ctx_data) { - task_ctx = event->ctx->task_ctx_data; - task_ctx->opt.lbr_callstack_users--; - } + event->ctx->task_ctx_data) + task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users--; if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT) cpuc->lbr_select = 0; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 96d73cd8b7a1..7dbf1480b0a2 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -247,7 +247,7 @@ struct cpu_hw_events { struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; struct er_account *lbr_sel; u64 br_sel; - struct x86_perf_task_context *last_task_ctx; + void *last_task_ctx; int last_log_id; int lbr_select; @@ -800,6 +800,11 @@ static struct perf_pmu_events_ht_attr event_attr_##v = { \ struct pmu *x86_get_pmu(void); extern struct x86_pmu x86_pmu __read_mostly; +static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx) +{ + return &((struct x86_perf_task_context *)ctx)->opt; +} + static inline bool x86_pmu_has_lbr_callstack(void) { return x86_pmu.lbr_sel_map && From d6a162a41bfd2ff9ea4cbb338d3df6a3f9b7e89f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:13 -0700 Subject: [PATCH 293/502] x86/msr-index: Add bunch of MSRs for Arch LBR Add 
Arch LBR related MSRs and the new LBR INFO bits in MSR-index. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-8-git-send-email-kan.liang@linux.intel.com --- arch/x86/include/asm/msr-index.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index e8370e64a155..bdc07fc6e517 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -158,7 +158,23 @@ #define LBR_INFO_MISPRED BIT_ULL(63) #define LBR_INFO_IN_TX BIT_ULL(62) #define LBR_INFO_ABORT BIT_ULL(61) +#define LBR_INFO_CYC_CNT_VALID BIT_ULL(60) #define LBR_INFO_CYCLES 0xffff +#define LBR_INFO_BR_TYPE_OFFSET 56 +#define LBR_INFO_BR_TYPE (0xfull << LBR_INFO_BR_TYPE_OFFSET) + +#define MSR_ARCH_LBR_CTL 0x000014ce +#define ARCH_LBR_CTL_LBREN BIT(0) +#define ARCH_LBR_CTL_CPL_OFFSET 1 +#define ARCH_LBR_CTL_CPL (0x3ull << ARCH_LBR_CTL_CPL_OFFSET) +#define ARCH_LBR_CTL_STACK_OFFSET 3 +#define ARCH_LBR_CTL_STACK (0x1ull << ARCH_LBR_CTL_STACK_OFFSET) +#define ARCH_LBR_CTL_FILTER_OFFSET 16 +#define ARCH_LBR_CTL_FILTER (0x7full << ARCH_LBR_CTL_FILTER_OFFSET) +#define MSR_ARCH_LBR_DEPTH 0x000014cf +#define MSR_ARCH_LBR_FROM_0 0x00001500 +#define MSR_ARCH_LBR_TO_0 0x00001600 +#define MSR_ARCH_LBR_INFO_0 0x00001200 #define MSR_IA32_PEBS_ENABLE 0x000003f1 #define MSR_PEBS_DATA_CFG 0x000003f2 From af6cf129706b2f79e12f97e62d977e7f653cdfd1 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:14 -0700 Subject: [PATCH 294/502] perf/x86: Expose CPUID enumeration bits for arch LBR The LBR capabilities of Architecture LBR are retrieved from the CPUID enumeration once at boot time. The capabilities have to be saved for future usage. Several new fields are added into structure x86_pmu to indicate the capabilities. The fields will be used in the following patches. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-9-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/perf_event.h | 13 ++++++++++ arch/x86/include/asm/perf_event.h | 40 +++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 7dbf1480b0a2..cc8117764c08 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -693,6 +693,19 @@ struct x86_pmu { bool lbr_double_abort; /* duplicated lbr aborts */ bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */ + /* + * Intel Architectural LBR CPUID Enumeration + */ + unsigned int lbr_depth_mask:8; + unsigned int lbr_deep_c_reset:1; + unsigned int lbr_lip:1; + unsigned int lbr_cpl:1; + unsigned int lbr_filter:1; + unsigned int lbr_call_stack:1; + unsigned int lbr_mispred:1; + unsigned int lbr_timed_lbr:1; + unsigned int lbr_br_type:1; + void (*lbr_reset)(void); void (*lbr_read)(struct cpu_hw_events *cpuc); void (*lbr_save)(void *ctx); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 2df707311d17..9ffce7d31c4c 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -142,6 +142,46 @@ union cpuid10_edx { unsigned int full; }; +/* + * Intel Architectural LBR CPUID detection/enumeration details: + */ +union cpuid28_eax { + struct { + /* Supported LBR depth values */ + unsigned int lbr_depth_mask:8; + unsigned int reserved:22; + /* Deep C-state Reset */ + unsigned int lbr_deep_c_reset:1; + /* IP values contain LIP */ + unsigned int lbr_lip:1; + } split; + unsigned int full; +}; + +union cpuid28_ebx { + struct { + /* CPL Filtering Supported */ + unsigned int lbr_cpl:1; + /* Branch Filtering Supported */ + unsigned int lbr_filter:1; + /* Call-stack Mode Supported */ + unsigned int lbr_call_stack:1; + } split; + unsigned int full; +}; + +union cpuid28_ecx { + struct { + /* 
Mispredict Bit Supported */ + unsigned int lbr_mispred:1; + /* Timed LBRs Supported */ + unsigned int lbr_timed_lbr:1; + /* Branch Type Field Supported */ + unsigned int lbr_br_type:1; + } split; + unsigned int full; +}; + struct x86_pmu_capability { int version; int num_counters_gp; From 49d8184f2036ff5b8d1eea3d61bac8b23420eca7 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:15 -0700 Subject: [PATCH 295/502] perf/x86/intel/lbr: Support LBR_CTL An IA32_LBR_CTL is introduced for Architecture LBR to enable and config LBR registers to replace the previous LBR_SELECT. All the related members in struct cpu_hw_events and struct x86_pmu have to be renamed. Some new macros are added to reflect the layout of LBR_CTL. The mapping from PERF_SAMPLE_BRANCH_* to the corresponding bits in LBR_CTL MSR is saved in lbr_ctl_map now, which is not a const value. The value relies on the CPUID enumeration. For the previous model-specific LBR, most of the bits in LBR_SELECT operate in the suppressed mode. For the bits in LBR_CTL, the polarity is inverted. For the previous model-specific LBR format 5 (LBR_FORMAT_INFO), if the NO_CYCLES and NO_FLAGS type are set, the flag LBR_NO_INFO will be set to avoid the unnecessary LBR_INFO MSR read. Although Architecture LBR also has a dedicated LBR_INFO MSR, perf doesn't need to check and set the flag LBR_NO_INFO. For Architecture LBR, XSAVES instruction will be used as the default way to read the LBR MSRs all together. The overhead which the flag tries to avoid doesn't exist anymore. Dropping the flag can save the extra check for the flag in the lbr_read() later, and make the code cleaner. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-10-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/lbr.c | 43 ++++++++++++++++++++++++++++++++++++ arch/x86/events/perf_event.h | 15 ++++++++++--- 2 files changed, 55 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index e62baa996474..77425624752c 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -132,6 +132,44 @@ enum { X86_BR_IRQ |\ X86_BR_INT) +/* + * Intel LBR_CTL bits + * + * Hardware branch filter for Arch LBR + */ +#define ARCH_LBR_KERNEL_BIT 1 /* capture at ring0 */ +#define ARCH_LBR_USER_BIT 2 /* capture at ring > 0 */ +#define ARCH_LBR_CALL_STACK_BIT 3 /* enable call stack */ +#define ARCH_LBR_JCC_BIT 16 /* capture conditional branches */ +#define ARCH_LBR_REL_JMP_BIT 17 /* capture relative jumps */ +#define ARCH_LBR_IND_JMP_BIT 18 /* capture indirect jumps */ +#define ARCH_LBR_REL_CALL_BIT 19 /* capture relative calls */ +#define ARCH_LBR_IND_CALL_BIT 20 /* capture indirect calls */ +#define ARCH_LBR_RETURN_BIT 21 /* capture near returns */ +#define ARCH_LBR_OTHER_BRANCH_BIT 22 /* capture other branches */ + +#define ARCH_LBR_KERNEL (1ULL << ARCH_LBR_KERNEL_BIT) +#define ARCH_LBR_USER (1ULL << ARCH_LBR_USER_BIT) +#define ARCH_LBR_CALL_STACK (1ULL << ARCH_LBR_CALL_STACK_BIT) +#define ARCH_LBR_JCC (1ULL << ARCH_LBR_JCC_BIT) +#define ARCH_LBR_REL_JMP (1ULL << ARCH_LBR_REL_JMP_BIT) +#define ARCH_LBR_IND_JMP (1ULL << ARCH_LBR_IND_JMP_BIT) +#define ARCH_LBR_REL_CALL (1ULL << ARCH_LBR_REL_CALL_BIT) +#define ARCH_LBR_IND_CALL (1ULL << ARCH_LBR_IND_CALL_BIT) +#define ARCH_LBR_RETURN (1ULL << ARCH_LBR_RETURN_BIT) +#define ARCH_LBR_OTHER_BRANCH (1ULL << ARCH_LBR_OTHER_BRANCH_BIT) + +#define ARCH_LBR_ANY \ + (ARCH_LBR_JCC |\ + ARCH_LBR_REL_JMP |\ + ARCH_LBR_IND_JMP |\ + ARCH_LBR_REL_CALL |\ + ARCH_LBR_IND_CALL |\ + ARCH_LBR_RETURN |\ + ARCH_LBR_OTHER_BRANCH) 
+ +#define ARCH_LBR_CTL_MASK 0x7f000e + static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); /* @@ -820,6 +858,11 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) reg = &event->hw.branch_reg; reg->idx = EXTRA_REG_LBR; + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) { + reg->config = mask; + return 0; + } + /* * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate * in suppress mode. So LBR_SELECT should be set to diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index cc8117764c08..ba89e563b2aa 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -245,7 +245,10 @@ struct cpu_hw_events { int lbr_pebs_users; struct perf_branch_stack lbr_stack; struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; - struct er_account *lbr_sel; + union { + struct er_account *lbr_sel; + struct er_account *lbr_ctl; + }; u64 br_sel; void *last_task_ctx; int last_log_id; @@ -688,8 +691,14 @@ struct x86_pmu { */ unsigned int lbr_tos, lbr_from, lbr_to, lbr_nr; /* LBR base regs and size */ - u64 lbr_sel_mask; /* LBR_SELECT valid bits */ - const int *lbr_sel_map; /* lbr_select mappings */ + union { + u64 lbr_sel_mask; /* LBR_SELECT valid bits */ + u64 lbr_ctl_mask; /* LBR_CTL valid bits */ + }; + union { + const int *lbr_sel_map; /* lbr_select mappings */ + int *lbr_ctl_map; /* LBR_CTL mappings */ + }; bool lbr_double_abort; /* duplicated lbr aborts */ bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */ From 5624986dc61b81a77fb6136bc232593483d1c254 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:16 -0700 Subject: [PATCH 296/502] perf/x86/intel/lbr: Unify the stored format of LBR information Current LBR information in the structure x86_perf_task_context is stored in a different format from the PEBS LBR record and Architecture LBR, which prevents the sharing of the common codes. Use the format of the PEBS LBR record as a unified format. Use a generic name lbr_entry to replace pebs_lbr_entry. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-11-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/ds.c | 6 +++--- arch/x86/events/intel/lbr.c | 20 ++++++++++---------- arch/x86/events/perf_event.h | 6 ++---- arch/x86/include/asm/perf_event.h | 6 +----- 4 files changed, 16 insertions(+), 22 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index dc43cc124e09..86848c57b55e 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -954,7 +954,7 @@ static void adaptive_pebs_record_size_update(void) if (pebs_data_cfg & PEBS_DATACFG_XMMS) sz += sizeof(struct pebs_xmm); if (pebs_data_cfg & PEBS_DATACFG_LBRS) - sz += x86_pmu.lbr_nr * sizeof(struct pebs_lbr_entry); + sz += x86_pmu.lbr_nr * sizeof(struct lbr_entry); cpuc->pebs_record_size = sz; } @@ -1595,10 +1595,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } if (format_size & PEBS_DATACFG_LBRS) { - struct pebs_lbr *lbr = next_record; + struct lbr_entry *lbr = next_record; int num_lbr = ((format_size >> PEBS_DATACFG_LBR_SHIFT) & 0xff) + 1; - next_record = next_record + num_lbr*sizeof(struct pebs_lbr_entry); + next_record = next_record + num_lbr * sizeof(struct lbr_entry); if (has_branch_stack(event)) { intel_pmu_store_pebs_lbrs(lbr); diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 77425624752c..b8baaf15c5f4 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -372,11 +372,11 @@ void intel_pmu_lbr_restore(void *ctx) mask = x86_pmu.lbr_nr - 1; for (i = 0; i < task_ctx->valid_lbrs; i++) { lbr_idx = (tos - i) & mask; - wrlbr_from(lbr_idx, task_ctx->lbr_from[i]); - wrlbr_to (lbr_idx, task_ctx->lbr_to[i]); + wrlbr_from(lbr_idx, task_ctx->lbr[i].from); + wrlbr_to(lbr_idx, task_ctx->lbr[i].to); if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); + 
wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr[i].info); } for (; i < x86_pmu.lbr_nr; i++) { @@ -440,10 +440,10 @@ void intel_pmu_lbr_save(void *ctx) from = rdlbr_from(lbr_idx); if (!from) break; - task_ctx->lbr_from[i] = from; - task_ctx->lbr_to[i] = rdlbr_to(lbr_idx); + task_ctx->lbr[i].from = from; + task_ctx->lbr[i].to = rdlbr_to(lbr_idx); if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); + rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr[i].info); } task_ctx->valid_lbrs = i; task_ctx->tos = tos; @@ -1179,7 +1179,7 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) } } -void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr) +void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int i; @@ -1193,11 +1193,11 @@ void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr) cpuc->lbr_stack.hw_idx = intel_pmu_lbr_tos(); for (i = 0; i < x86_pmu.lbr_nr; i++) { - u64 info = lbr->lbr[i].info; + u64 info = lbr[i].info; struct perf_branch_entry *e = &cpuc->lbr_entries[i]; - e->from = lbr->lbr[i].from; - e->to = lbr->lbr[i].to; + e->from = lbr[i].from; + e->to = lbr[i].to; e->mispred = !!(info & LBR_INFO_MISPRED); e->predicted = !(info & LBR_INFO_MISPRED); e->in_tx = !!(info & LBR_INFO_IN_TX); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index ba89e563b2aa..aaa426d3d66e 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -765,13 +765,11 @@ struct x86_perf_task_context_opt { }; struct x86_perf_task_context { - u64 lbr_from[MAX_LBR_ENTRIES]; - u64 lbr_to[MAX_LBR_ENTRIES]; - u64 lbr_info[MAX_LBR_ENTRIES]; u64 lbr_sel; int tos; int valid_lbrs; struct x86_perf_task_context_opt opt; + struct lbr_entry lbr[MAX_LBR_ENTRIES]; }; #define x86_add_quirk(func_) \ @@ -1092,7 +1090,7 @@ void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in); void intel_pmu_auto_reload_read(struct perf_event *event); 
-void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr); +void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr); void intel_ds_init(void); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 9ffce7d31c4c..2e29558c9c6b 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -282,14 +282,10 @@ struct pebs_xmm { u64 xmm[16*2]; /* two entries for each register */ }; -struct pebs_lbr_entry { +struct lbr_entry { u64 from, to, info; }; -struct pebs_lbr { - struct pebs_lbr_entry lbr[0]; /* Variable length */ -}; - /* * IBS cpuid feature detection */ From 020d91e5f32da4f4b929b3a6e680135fd526107c Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:17 -0700 Subject: [PATCH 297/502] perf/x86/intel/lbr: Mark the {rd,wr}lbr_{to,from} wrappers __always_inline The {rd,wr}lbr_{to,from} wrappers are invoked in hot paths, e.g. context switch and NMI handler. They should be always inline to achieve better performance. However, the CONFIG_OPTIMIZE_INLINING allows the compiler to uninline functions marked 'inline'. Mark the {rd,wr}lbr_{to,from} wrappers as __always_inline to force inline the wrappers. 
Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-12-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/lbr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index b8baaf15c5f4..21f4f071f2c0 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -332,18 +332,18 @@ static u64 lbr_from_signext_quirk_rd(u64 val) return val; } -static inline void wrlbr_from(unsigned int idx, u64 val) +static __always_inline void wrlbr_from(unsigned int idx, u64 val) { val = lbr_from_signext_quirk_wr(val); wrmsrl(x86_pmu.lbr_from + idx, val); } -static inline void wrlbr_to(unsigned int idx, u64 val) +static __always_inline void wrlbr_to(unsigned int idx, u64 val) { wrmsrl(x86_pmu.lbr_to + idx, val); } -static inline u64 rdlbr_from(unsigned int idx) +static __always_inline u64 rdlbr_from(unsigned int idx) { u64 val; @@ -352,7 +352,7 @@ static inline u64 rdlbr_from(unsigned int idx) return lbr_from_signext_quirk_rd(val); } -static inline u64 rdlbr_to(unsigned int idx) +static __always_inline u64 rdlbr_to(unsigned int idx) { u64 val; From fda1f99f34a8f0975086bcfef34da865009995c1 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:18 -0700 Subject: [PATCH 298/502] perf/x86/intel/lbr: Factor out rdlbr_all() and wrlbr_all() The previous model-specific LBR and Architecture LBR (legacy way) use a similar method to save/restore the LBR information, which directly accesses the LBR registers. The codes which read/write a set of LBR registers can be shared between them. Factor out two functions which are used to read/write a set of LBR registers. Add lbr_info into structure x86_pmu, and use it to replace the hardcoded LBR INFO MSR, because the LBR INFO MSR address of the previous model-specific LBR is different from Architecture LBR. 
The MSR address should be assigned at boot time. For now, only Sky Lake and later platforms have the LBR INFO MSR. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-13-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/lbr.c | 66 +++++++++++++++++++++++++++--------- arch/x86/events/perf_event.h | 2 +- 2 files changed, 51 insertions(+), 17 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 21f4f071f2c0..d3d129c7d7ef 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -237,7 +237,7 @@ void intel_pmu_lbr_reset_64(void) wrmsrl(x86_pmu.lbr_from + i, 0); wrmsrl(x86_pmu.lbr_to + i, 0); if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + i, 0); + wrmsrl(x86_pmu.lbr_info + i, 0); } } @@ -343,6 +343,11 @@ static __always_inline void wrlbr_to(unsigned int idx, u64 val) wrmsrl(x86_pmu.lbr_to + idx, val); } +static __always_inline void wrlbr_info(unsigned int idx, u64 val) +{ + wrmsrl(x86_pmu.lbr_info + idx, val); +} + static __always_inline u64 rdlbr_from(unsigned int idx) { u64 val; @@ -361,8 +366,44 @@ static __always_inline u64 rdlbr_to(unsigned int idx) return val; } +static __always_inline u64 rdlbr_info(unsigned int idx) +{ + u64 val; + + rdmsrl(x86_pmu.lbr_info + idx, val); + + return val; +} + +static inline void +wrlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info) +{ + wrlbr_from(idx, lbr->from); + wrlbr_to(idx, lbr->to); + if (need_info) + wrlbr_info(idx, lbr->info); +} + +static inline bool +rdlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info) +{ + u64 from = rdlbr_from(idx); + + /* Don't read invalid entry */ + if (!from) + return false; + + lbr->from = from; + lbr->to = rdlbr_to(idx); + if (need_info) + lbr->info = rdlbr_info(idx); + + return true; +} + void intel_pmu_lbr_restore(void *ctx) { + bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO; struct 
cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct x86_perf_task_context *task_ctx = ctx; int i; @@ -372,11 +413,7 @@ void intel_pmu_lbr_restore(void *ctx) mask = x86_pmu.lbr_nr - 1; for (i = 0; i < task_ctx->valid_lbrs; i++) { lbr_idx = (tos - i) & mask; - wrlbr_from(lbr_idx, task_ctx->lbr[i].from); - wrlbr_to(lbr_idx, task_ctx->lbr[i].to); - - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr[i].info); + wrlbr_all(&task_ctx->lbr[i], lbr_idx, need_info); } for (; i < x86_pmu.lbr_nr; i++) { @@ -384,7 +421,7 @@ void intel_pmu_lbr_restore(void *ctx) wrlbr_from(lbr_idx, 0); wrlbr_to(lbr_idx, 0); if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + lbr_idx, 0); + wrlbr_info(lbr_idx, 0); } wrmsrl(x86_pmu.lbr_tos, tos); @@ -427,23 +464,19 @@ static void __intel_pmu_lbr_restore(void *ctx) void intel_pmu_lbr_save(void *ctx) { + bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct x86_perf_task_context *task_ctx = ctx; unsigned lbr_idx, mask; - u64 tos, from; + u64 tos; int i; mask = x86_pmu.lbr_nr - 1; tos = intel_pmu_lbr_tos(); for (i = 0; i < x86_pmu.lbr_nr; i++) { lbr_idx = (tos - i) & mask; - from = rdlbr_from(lbr_idx); - if (!from) + if (!rdlbr_all(&task_ctx->lbr[i], lbr_idx, need_info)) break; - task_ctx->lbr[i].from = from; - task_ctx->lbr[i].to = rdlbr_to(lbr_idx); - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr[i].info); } task_ctx->valid_lbrs = i; task_ctx->tos = tos; @@ -689,7 +722,7 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) if (lbr_format == LBR_FORMAT_INFO && need_info) { u64 info; - rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info); + info = rdlbr_info(lbr_idx); mis = !!(info & LBR_INFO_MISPRED); pred = !mis; in_tx = !!(info & LBR_INFO_IN_TX); @@ -1336,6 +1369,7 @@ __init void intel_pmu_lbr_init_skl(void) x86_pmu.lbr_tos = MSR_LBR_TOS; 
x86_pmu.lbr_from = MSR_LBR_NHM_FROM; x86_pmu.lbr_to = MSR_LBR_NHM_TO; + x86_pmu.lbr_info = MSR_LBR_INFO_0; x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = hsw_lbr_sel_map; @@ -1421,7 +1455,7 @@ int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) lbr->nr = x86_pmu.lbr_nr; lbr->from = x86_pmu.lbr_from; lbr->to = x86_pmu.lbr_to; - lbr->info = (lbr_fmt == LBR_FORMAT_INFO) ? MSR_LBR_INFO_0 : 0; + lbr->info = (lbr_fmt == LBR_FORMAT_INFO) ? x86_pmu.lbr_info : 0; return 0; } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index aaa426d3d66e..20e35cb1705d 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -690,7 +690,7 @@ struct x86_pmu { * Intel LBR */ unsigned int lbr_tos, lbr_from, lbr_to, - lbr_nr; /* LBR base regs and size */ + lbr_info, lbr_nr; /* LBR base regs and size */ union { u64 lbr_sel_mask; /* LBR_SELECT valid bits */ u64 lbr_ctl_mask; /* LBR_CTL valid bits */ From 631618a0dca31dc23dcce38cf345c6139bd8a1e9 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:19 -0700 Subject: [PATCH 299/502] perf/x86/intel/lbr: Factor out intel_pmu_store_lbr The way to store the LBR information from a PEBS LBR record can be reused in Architecture LBR, because - The LBR information is stored like a stack. Entry 0 is always the youngest branch. - The layout of the LBR INFO MSR is similar. The LBR information may be retrieved from either the LBR registers (non-PEBS event) or a buffer (PEBS event). Extend rdlbr_*() to support both methods. Explicitly check the invalid entry (0s), which can avoid unnecessary MSR access if using a non-PEBS event. For a PEBS event, the check should slightly improve the performance as well. The invalid entries are cut. The intel_pmu_lbr_filter() doesn't need to check and filter them out. Cannot share the function with current model-specific LBR read, because the direction of the LBR growth is opposite. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-14-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/lbr.c | 82 +++++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 26 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index d3d129c7d7ef..0d7a85903964 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -348,28 +348,37 @@ static __always_inline void wrlbr_info(unsigned int idx, u64 val) wrmsrl(x86_pmu.lbr_info + idx, val); } -static __always_inline u64 rdlbr_from(unsigned int idx) +static __always_inline u64 rdlbr_from(unsigned int idx, struct lbr_entry *lbr) { u64 val; + if (lbr) + return lbr->from; + rdmsrl(x86_pmu.lbr_from + idx, val); return lbr_from_signext_quirk_rd(val); } -static __always_inline u64 rdlbr_to(unsigned int idx) +static __always_inline u64 rdlbr_to(unsigned int idx, struct lbr_entry *lbr) { u64 val; + if (lbr) + return lbr->to; + rdmsrl(x86_pmu.lbr_to + idx, val); return val; } -static __always_inline u64 rdlbr_info(unsigned int idx) +static __always_inline u64 rdlbr_info(unsigned int idx, struct lbr_entry *lbr) { u64 val; + if (lbr) + return lbr->info; + rdmsrl(x86_pmu.lbr_info + idx, val); return val; @@ -387,16 +396,16 @@ wrlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info) static inline bool rdlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info) { - u64 from = rdlbr_from(idx); + u64 from = rdlbr_from(idx, NULL); /* Don't read invalid entry */ if (!from) return false; lbr->from = from; - lbr->to = rdlbr_to(idx); + lbr->to = rdlbr_to(idx, NULL); if (need_info) - lbr->info = rdlbr_info(idx); + lbr->info = rdlbr_info(idx, NULL); return true; } @@ -432,7 +441,7 @@ void intel_pmu_lbr_restore(void *ctx) static __always_inline bool lbr_is_reset_in_cstate(void *ctx) { - return !rdlbr_from(((struct x86_perf_task_context *)ctx)->tos); + return !rdlbr_from(((struct 
x86_perf_task_context *)ctx)->tos, NULL); } static void __intel_pmu_lbr_restore(void *ctx) @@ -709,8 +718,8 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) u16 cycles = 0; int lbr_flags = lbr_desc[lbr_format]; - from = rdlbr_from(lbr_idx); - to = rdlbr_to(lbr_idx); + from = rdlbr_from(lbr_idx, NULL); + to = rdlbr_to(lbr_idx, NULL); /* * Read LBR call stack entries @@ -722,7 +731,7 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) if (lbr_format == LBR_FORMAT_INFO && need_info) { u64 info; - info = rdlbr_info(lbr_idx); + info = rdlbr_info(lbr_idx, NULL); mis = !!(info & LBR_INFO_MISPRED); pred = !mis; in_tx = !!(info & LBR_INFO_IN_TX); @@ -777,6 +786,42 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) cpuc->lbr_stack.hw_idx = tos; } +static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, + struct lbr_entry *entries) +{ + struct perf_branch_entry *e; + struct lbr_entry *lbr; + u64 from, to, info; + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + lbr = entries ? &entries[i] : NULL; + e = &cpuc->lbr_entries[i]; + + from = rdlbr_from(i, lbr); + /* + * Read LBR entries until invalid entry (0s) is detected. 
+ */ + if (!from) + break; + + to = rdlbr_to(i, lbr); + info = rdlbr_info(i, lbr); + + e->from = from; + e->to = to; + e->mispred = !!(info & LBR_INFO_MISPRED); + e->predicted = !(info & LBR_INFO_MISPRED); + e->in_tx = !!(info & LBR_INFO_IN_TX); + e->abort = !!(info & LBR_INFO_ABORT); + e->cycles = info & LBR_INFO_CYCLES; + e->type = 0; + e->reserved = 0; + } + + cpuc->lbr_stack.nr = i; +} + void intel_pmu_lbr_read(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1215,9 +1260,6 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int i; - - cpuc->lbr_stack.nr = x86_pmu.lbr_nr; /* Cannot get TOS for large PEBS */ if (cpuc->n_pebs == cpuc->n_large_pebs) @@ -1225,19 +1267,7 @@ void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr) else cpuc->lbr_stack.hw_idx = intel_pmu_lbr_tos(); - for (i = 0; i < x86_pmu.lbr_nr; i++) { - u64 info = lbr[i].info; - struct perf_branch_entry *e = &cpuc->lbr_entries[i]; - - e->from = lbr[i].from; - e->to = lbr[i].to; - e->mispred = !!(info & LBR_INFO_MISPRED); - e->predicted = !(info & LBR_INFO_MISPRED); - e->in_tx = !!(info & LBR_INFO_IN_TX); - e->abort = !!(info & LBR_INFO_ABORT); - e->cycles = info & LBR_INFO_CYCLES; - e->reserved = 0; - } + intel_pmu_store_lbr(cpuc, lbr); intel_pmu_lbr_filter(cpuc); } From 47125db27e47e9d44c878bf8925aa057824bb0d5 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:20 -0700 Subject: [PATCH 300/502] perf/x86/intel/lbr: Support Architectural LBR Last Branch Records (LBR) enables recording of software path history by logging taken branches and other control flows within architectural registers now. Intel CPUs have had model-specific LBR for quite some time, but this evolves them into an architectural feature now. 
The main improvements of Architectural LBR implemented include: - Linux kernel can support the LBR features without knowing the model number of the current CPU. - Architectural LBR capabilities can be enumerated by CPUID. The lbr_ctl_map is based on the CPUID Enumeration. - The possible LBR depth can be retrieved from CPUID enumeration. The max value is written to the new MSR_ARCH_LBR_DEPTH as the number of LBR entries. - A new IA32_LBR_CTL MSR is introduced to enable and configure LBRs, which replaces the IA32_DEBUGCTL[bit 0] and the LBR_SELECT MSR. - Each LBR record or entry is still comprised of three MSRs, IA32_LBR_x_FROM_IP, IA32_LBR_x_TO_IP and IA32_LBR_x_INFO. But they become the architectural MSRs. - Architectural LBR is stack-like now. Entry 0 is always the youngest branch, entry 1 the next youngest... The TOS MSR has been removed. The way to enable/disable Architectural LBR is similar to the previous model-specific LBR. __intel_pmu_lbr_enable/disable() can be reused, but some modifications are required, which include: - MSR_ARCH_LBR_CTL is used to enable and configure the Architectural LBR. - When checking the value of the IA32_DEBUGCTL MSR, ignore the DEBUGCTLMSR_LBR (bit 0) for Architectural LBR, which has no meaning and always returns 0. - The FREEZE_LBRS_ON_PMI has to be explicitly set/cleared, because MSR_IA32_DEBUGCTLMSR is not touched in __intel_pmu_lbr_disable() for Architectural LBR. - Only MSR_ARCH_LBR_CTL is cleared in __intel_pmu_lbr_disable() for Architectural LBR. Some Architectural LBR dedicated functions are implemented to reset/read/save/restore LBR. - For reset, writing to the ARCH_LBR_DEPTH MSR clears all Arch LBR entries, which is a lot faster and can improve the context switch latency. - For read, the branch type information can be retrieved from the MSR_ARCH_LBR_INFO_*. But it's not fully compatible due to OTHER_BRANCH type. The software decoding is still required for the OTHER_BRANCH case.
LBR records are stored in the age order as well. Reuse intel_pmu_store_lbr(). Check the CPUID enumeration before accessing the corresponding bits in LBR_INFO. - For save/restore, applying the fast reset (writing ARCH_LBR_DEPTH). Reading 'lbr_from' of entry 0 instead of the TOS MSR to check if the LBR registers are reset in the deep C-state. If 'the deep C-state reset' bit is not set in CPUID enumeration, ignoring the check. XSAVE support for Architectural LBR will be implemented later. The number of LBR entries cannot be hardcoded anymore, which should be retrieved from CPUID enumeration. A new structure x86_perf_task_context_arch_lbr is introduced for Architectural LBR. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-15-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 3 + arch/x86/events/intel/lbr.c | 251 +++++++++++++++++++++++++++++++++-- arch/x86/events/perf_event.h | 10 ++ 3 files changed, 253 insertions(+), 11 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 50cb3c69d6a4..50963472ee85 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4664,6 +4664,9 @@ __init int intel_pmu_init(void) x86_pmu.lbr_read = intel_pmu_lbr_read_32; } + if (boot_cpu_has(X86_FEATURE_ARCH_LBR)) + intel_pmu_arch_lbr_init(); + intel_ds_init(); x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 0d7a85903964..e4e249a78451 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -172,6 +172,14 @@ enum { static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); +static __always_inline bool is_lbr_call_stack_bit_set(u64 config) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + return !!(config & ARCH_LBR_CALL_STACK); + + return !!(config & LBR_CALL_STACK); +} + /* * We only support LBR implementations that 
have FREEZE_LBRS_ON_PMI * otherwise it becomes near impossible to get a reliable stack. @@ -195,27 +203,40 @@ static void __intel_pmu_lbr_enable(bool pmi) */ if (cpuc->lbr_sel) lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask; - if (!pmi && cpuc->lbr_sel) + if (!static_cpu_has(X86_FEATURE_ARCH_LBR) && !pmi && cpuc->lbr_sel) wrmsrl(MSR_LBR_SELECT, lbr_select); rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); orig_debugctl = debugctl; - debugctl |= DEBUGCTLMSR_LBR; + + if (!static_cpu_has(X86_FEATURE_ARCH_LBR)) + debugctl |= DEBUGCTLMSR_LBR; /* * LBR callstack does not work well with FREEZE_LBRS_ON_PMI. * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions * may cause superfluous increase/decrease of LBR_TOS. */ - if (!(lbr_select & LBR_CALL_STACK)) + if (is_lbr_call_stack_bit_set(lbr_select)) + debugctl &= ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; + else debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; + if (orig_debugctl != debugctl) wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + wrmsrl(MSR_ARCH_LBR_CTL, lbr_select | ARCH_LBR_CTL_LBREN); } static void __intel_pmu_lbr_disable(void) { u64 debugctl; + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) { + wrmsrl(MSR_ARCH_LBR_CTL, 0); + return; + } + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); @@ -241,6 +262,12 @@ void intel_pmu_lbr_reset_64(void) } } +static void intel_pmu_arch_lbr_reset(void) +{ + /* Write to ARCH_LBR_DEPTH MSR, all LBR entries are reset to 0 */ + wrmsrl(MSR_ARCH_LBR_DEPTH, x86_pmu.lbr_nr); +} + void intel_pmu_lbr_reset(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -439,8 +466,28 @@ void intel_pmu_lbr_restore(void *ctx) wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } +static void intel_pmu_arch_lbr_restore(void *ctx) +{ + struct x86_perf_task_context_arch_lbr *task_ctx = ctx; + struct lbr_entry *entries = task_ctx->entries; + int i; + + /* Fast reset 
the LBRs before restore if the call stack is not full. */ + if (!entries[x86_pmu.lbr_nr - 1].from) + intel_pmu_arch_lbr_reset(); + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + if (!entries[i].from) + break; + wrlbr_all(&entries[i], i, true); + } +} + static __always_inline bool lbr_is_reset_in_cstate(void *ctx) { + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + return x86_pmu.lbr_deep_c_reset && !rdlbr_from(0, NULL); + return !rdlbr_from(((struct x86_perf_task_context *)ctx)->tos, NULL); } @@ -494,6 +541,22 @@ void intel_pmu_lbr_save(void *ctx) rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } +static void intel_pmu_arch_lbr_save(void *ctx) +{ + struct x86_perf_task_context_arch_lbr *task_ctx = ctx; + struct lbr_entry *entries = task_ctx->entries; + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + if (!rdlbr_all(&entries[i], i, true)) + break; + } + + /* LBR call stack is not full. Reset is required in restore. */ + if (i < x86_pmu.lbr_nr) + entries[x86_pmu.lbr_nr - 1].from = 0; +} + static void __intel_pmu_lbr_save(void *ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -786,6 +849,39 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) cpuc->lbr_stack.hw_idx = tos; } +static __always_inline int get_lbr_br_type(u64 info) +{ + if (!static_cpu_has(X86_FEATURE_ARCH_LBR) || !x86_pmu.lbr_br_type) + return 0; + + return (info & LBR_INFO_BR_TYPE) >> LBR_INFO_BR_TYPE_OFFSET; +} + +static __always_inline bool get_lbr_mispred(u64 info) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !x86_pmu.lbr_mispred) + return 0; + + return !!(info & LBR_INFO_MISPRED); +} + +static __always_inline bool get_lbr_predicted(u64 info) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !x86_pmu.lbr_mispred) + return 0; + + return !(info & LBR_INFO_MISPRED); +} + +static __always_inline bool get_lbr_cycles(u64 info) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && + !(x86_pmu.lbr_timed_lbr && info & LBR_INFO_CYC_CNT_VALID)) + return 0; + + return info & LBR_INFO_CYCLES; +} + 
static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, struct lbr_entry *entries) { @@ -810,18 +906,23 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, e->from = from; e->to = to; - e->mispred = !!(info & LBR_INFO_MISPRED); - e->predicted = !(info & LBR_INFO_MISPRED); + e->mispred = get_lbr_mispred(info); + e->predicted = get_lbr_predicted(info); e->in_tx = !!(info & LBR_INFO_IN_TX); e->abort = !!(info & LBR_INFO_ABORT); - e->cycles = info & LBR_INFO_CYCLES; - e->type = 0; + e->cycles = get_lbr_cycles(info); + e->type = get_lbr_br_type(info); e->reserved = 0; } cpuc->lbr_stack.nr = i; } +static void intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc) +{ + intel_pmu_store_lbr(cpuc, NULL); +} + void intel_pmu_lbr_read(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1197,6 +1298,27 @@ common_branch_type(int type) return PERF_BR_UNKNOWN; } +enum { + ARCH_LBR_BR_TYPE_JCC = 0, + ARCH_LBR_BR_TYPE_NEAR_IND_JMP = 1, + ARCH_LBR_BR_TYPE_NEAR_REL_JMP = 2, + ARCH_LBR_BR_TYPE_NEAR_IND_CALL = 3, + ARCH_LBR_BR_TYPE_NEAR_REL_CALL = 4, + ARCH_LBR_BR_TYPE_NEAR_RET = 5, + ARCH_LBR_BR_TYPE_KNOWN_MAX = ARCH_LBR_BR_TYPE_NEAR_RET, + + ARCH_LBR_BR_TYPE_MAP_MAX = 16, +}; + +static const int arch_lbr_br_type_map[ARCH_LBR_BR_TYPE_MAP_MAX] = { + [ARCH_LBR_BR_TYPE_JCC] = X86_BR_JCC, + [ARCH_LBR_BR_TYPE_NEAR_IND_JMP] = X86_BR_IND_JMP, + [ARCH_LBR_BR_TYPE_NEAR_REL_JMP] = X86_BR_JMP, + [ARCH_LBR_BR_TYPE_NEAR_IND_CALL] = X86_BR_IND_CALL, + [ARCH_LBR_BR_TYPE_NEAR_REL_CALL] = X86_BR_CALL, + [ARCH_LBR_BR_TYPE_NEAR_RET] = X86_BR_RET, +}; + /* * implement actual branch filter based on user demand. 
* Hardware may not exactly satisfy that request, thus @@ -1209,7 +1331,7 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) { u64 from, to; int br_sel = cpuc->br_sel; - int i, j, type; + int i, j, type, to_plm; bool compress = false; /* if sampling all branches, then nothing to filter */ @@ -1221,8 +1343,19 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) from = cpuc->lbr_entries[i].from; to = cpuc->lbr_entries[i].to; + type = cpuc->lbr_entries[i].type; - type = branch_type(from, to, cpuc->lbr_entries[i].abort); + /* + * Parse the branch type recorded in LBR_x_INFO MSR. + * Doesn't support OTHER_BRANCH decoding for now. + * OTHER_BRANCH branch type still rely on software decoding. + */ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && + type <= ARCH_LBR_BR_TYPE_KNOWN_MAX) { + to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; + type = arch_lbr_br_type_map[type] | to_plm; + } else + type = branch_type(from, to, cpuc->lbr_entries[i].abort); if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) { if (cpuc->lbr_entries[i].in_tx) type |= X86_BR_IN_TX; @@ -1261,8 +1394,9 @@ void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - /* Cannot get TOS for large PEBS */ - if (cpuc->n_pebs == cpuc->n_large_pebs) + /* Cannot get TOS for large PEBS and Arch LBR */ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) || + (cpuc->n_pebs == cpuc->n_large_pebs)) cpuc->lbr_stack.hw_idx = -1ULL; else cpuc->lbr_stack.hw_idx = intel_pmu_lbr_tos(); @@ -1324,6 +1458,26 @@ static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, }; +static int arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = ARCH_LBR_ANY, + [PERF_SAMPLE_BRANCH_USER_SHIFT] = ARCH_LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = ARCH_LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = ARCH_LBR_RETURN | + ARCH_LBR_OTHER_BRANCH, + 
[PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = ARCH_LBR_REL_CALL | + ARCH_LBR_IND_CALL | + ARCH_LBR_OTHER_BRANCH, + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = ARCH_LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = ARCH_LBR_JCC, + [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = ARCH_LBR_REL_CALL | + ARCH_LBR_IND_CALL | + ARCH_LBR_RETURN | + ARCH_LBR_CALL_STACK, + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = ARCH_LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_CALL_SHIFT] = ARCH_LBR_REL_CALL, +}; + /* core */ void __init intel_pmu_lbr_init_core(void) { @@ -1471,6 +1625,81 @@ void intel_pmu_lbr_init_knl(void) x86_pmu.intel_cap.lbr_format = LBR_FORMAT_EIP_FLAGS; } +void __init intel_pmu_arch_lbr_init(void) +{ + union cpuid28_eax eax; + union cpuid28_ebx ebx; + union cpuid28_ecx ecx; + unsigned int unused_edx; + u64 lbr_nr; + + /* Arch LBR Capabilities */ + cpuid(28, &eax.full, &ebx.full, &ecx.full, &unused_edx); + + lbr_nr = fls(eax.split.lbr_depth_mask) * 8; + if (!lbr_nr) + goto clear_arch_lbr; + + /* Apply the max depth of Arch LBR */ + if (wrmsrl_safe(MSR_ARCH_LBR_DEPTH, lbr_nr)) + goto clear_arch_lbr; + + x86_pmu.lbr_depth_mask = eax.split.lbr_depth_mask; + x86_pmu.lbr_deep_c_reset = eax.split.lbr_deep_c_reset; + x86_pmu.lbr_lip = eax.split.lbr_lip; + x86_pmu.lbr_cpl = ebx.split.lbr_cpl; + x86_pmu.lbr_filter = ebx.split.lbr_filter; + x86_pmu.lbr_call_stack = ebx.split.lbr_call_stack; + x86_pmu.lbr_mispred = ecx.split.lbr_mispred; + x86_pmu.lbr_timed_lbr = ecx.split.lbr_timed_lbr; + x86_pmu.lbr_br_type = ecx.split.lbr_br_type; + x86_pmu.lbr_nr = lbr_nr; + + x86_get_pmu()->task_ctx_size = sizeof(struct x86_perf_task_context_arch_lbr) + + lbr_nr * sizeof(struct lbr_entry); + + x86_pmu.lbr_from = MSR_ARCH_LBR_FROM_0; + x86_pmu.lbr_to = MSR_ARCH_LBR_TO_0; + x86_pmu.lbr_info = MSR_ARCH_LBR_INFO_0; + + /* LBR callstack requires both CPL and Branch Filtering support */ + if (!x86_pmu.lbr_cpl || + !x86_pmu.lbr_filter || + !x86_pmu.lbr_call_stack) + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = 
LBR_NOT_SUPP; + + if (!x86_pmu.lbr_cpl) { + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_NOT_SUPP; + } else if (!x86_pmu.lbr_filter) { + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_NOT_SUPP; + } + + x86_pmu.lbr_ctl_mask = ARCH_LBR_CTL_MASK; + x86_pmu.lbr_ctl_map = arch_lbr_ctl_map; + + if (!x86_pmu.lbr_cpl && !x86_pmu.lbr_filter) + x86_pmu.lbr_ctl_map = NULL; + + x86_pmu.lbr_reset = intel_pmu_arch_lbr_reset; + x86_pmu.lbr_read = intel_pmu_arch_lbr_read; + x86_pmu.lbr_save = intel_pmu_arch_lbr_save; + x86_pmu.lbr_restore = intel_pmu_arch_lbr_restore; + + pr_cont("Architectural LBR, "); + + return; + +clear_arch_lbr: + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_ARCH_LBR); +} + /** * x86_perf_get_lbr - get the LBR records information * diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 20e35cb1705d..3f7c329374bb 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -772,6 +772,11 @@ struct x86_perf_task_context { struct lbr_entry lbr[MAX_LBR_ENTRIES]; }; +struct x86_perf_task_context_arch_lbr { + struct x86_perf_task_context_opt opt; + struct lbr_entry entries[]; +}; + #define x86_add_quirk(func_) \ do { \ static struct x86_pmu_quirk __quirk __initdata = { \ @@ -822,6 +827,9 @@ extern struct x86_pmu x86_pmu __read_mostly; static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx) { + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + return &((struct x86_perf_task_context_arch_lbr *)ctx)->opt; + return 
&((struct x86_perf_task_context *)ctx)->opt; } @@ -1141,6 +1149,8 @@ void intel_pmu_lbr_init_skl(void); void intel_pmu_lbr_init_knl(void); +void intel_pmu_arch_lbr_init(void); + void intel_pmu_pebs_data_source_nhm(void); void intel_pmu_pebs_data_source_skl(bool pmem); From ff9ff926889dd8026b4ba55266a010c27f68604f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:21 -0700 Subject: [PATCH 301/502] perf/core: Factor out functions to allocate/free the task_ctx_data The method to allocate/free the task_ctx_data is going to be changed in the following patch. Currently, the task_ctx_data is allocated/freed in several different places. To avoid repeatedly modifying the same codes in several different places, alloc_task_ctx_data() and free_task_ctx_data() are factored out to allocate/free the task_ctx_data. The modification only needs to be applied once. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-16-git-send-email-kan.liang@linux.intel.com --- kernel/events/core.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 9b8f92500833..75090403f942 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1238,12 +1238,22 @@ static void get_ctx(struct perf_event_context *ctx) refcount_inc(&ctx->refcount); } +static void *alloc_task_ctx_data(struct pmu *pmu) +{ + return kzalloc(pmu->task_ctx_size, GFP_KERNEL); +} + +static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data) +{ + kfree(task_ctx_data); +} + static void free_ctx(struct rcu_head *head) { struct perf_event_context *ctx; ctx = container_of(head, struct perf_event_context, rcu_head); - kfree(ctx->task_ctx_data); + free_task_ctx_data(ctx->pmu, ctx->task_ctx_data); kfree(ctx); } @@ -4471,7 +4481,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, goto errout; if (event->attach_state & PERF_ATTACH_TASK_DATA) { - 
task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL); + task_ctx_data = alloc_task_ctx_data(pmu); if (!task_ctx_data) { err = -ENOMEM; goto errout; @@ -4529,11 +4539,11 @@ retry: } } - kfree(task_ctx_data); + free_task_ctx_data(pmu, task_ctx_data); return ctx; errout: - kfree(task_ctx_data); + free_task_ctx_data(pmu, task_ctx_data); return ERR_PTR(err); } @@ -12497,8 +12507,7 @@ inherit_event(struct perf_event *parent_event, !child_ctx->task_ctx_data) { struct pmu *pmu = child_event->pmu; - child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size, - GFP_KERNEL); + child_ctx->task_ctx_data = alloc_task_ctx_data(pmu); if (!child_ctx->task_ctx_data) { free_event(child_event); return ERR_PTR(-ENOMEM); From 217c2a633ebb36f1cc6d249f4ef2e4a809d46818 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:22 -0700 Subject: [PATCH 302/502] perf/core: Use kmem_cache to allocate the PMU specific data Currently, the PMU specific data task_ctx_data is allocated by the function kzalloc() in the perf generic code. When there is no specific alignment requirement for the task_ctx_data, the method works well for now. However, there will be a problem once a specific alignment requirement is introduced in future features, e.g., the Architecture LBR XSAVE feature requires 64-byte alignment. If the specific alignment requirement is not fulfilled, the XSAVE family of instructions will fail to save/restore the xstate to/from the task_ctx_data. The function kzalloc() itself only guarantees a natural alignment. 
A new method to allocate the task_ctx_data has to be introduced, which has to meet the requirements below: - must be a generic method that can be used by different architectures, because the allocation of the task_ctx_data is implemented in the perf generic code; - must be an alignment-guaranteeing method (The alignment requirement is not changed after the boot); - must be able to allocate/free a buffer (smaller than a page size) dynamically; - should not cause extra CPU overhead or space overhead. Several options were considered as below: - One option is to allocate a larger buffer for task_ctx_data. E.g., ptr = kmalloc(size + alignment, GFP_KERNEL); ptr &= ~(alignment - 1); This option causes space overhead. - Another option is to allocate the task_ctx_data in the PMU specific code. To do so, several function pointers have to be added. As a result, both the generic structure and the PMU specific structure will become bigger. Besides, extra function calls are added when allocating/freeing the buffer. This option will increase both the space overhead and CPU overhead. - The third option is to use a kmem_cache to allocate a buffer for the task_ctx_data. The kmem_cache can be created with a specific alignment requirement by the PMU at boot time. A new pointer for kmem_cache has to be added in the generic struct pmu, which would be used to dynamically allocate a buffer for the task_ctx_data at run time. Although the new pointer is added to the struct pmu, the existing variable task_ctx_size is not required anymore. The size of the generic structure is kept the same. The third option, which meets all the aforementioned requirements, is used to replace kzalloc() for the PMU specific data allocation. A later patch will remove the kzalloc() method and the related variables.
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-17-git-send-email-kan.liang@linux.intel.com --- include/linux/perf_event.h | 5 +++++ kernel/events/core.c | 8 +++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 46fe5cfb5163..09915ae06d28 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -424,6 +424,11 @@ struct pmu { */ size_t task_ctx_size; + /* + * Kmem cache of PMU specific data + */ + struct kmem_cache *task_ctx_cache; + /* * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data) * can be synchronized using this function. See Intel LBR callstack support diff --git a/kernel/events/core.c b/kernel/events/core.c index 75090403f942..30d9b3182369 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1240,12 +1240,18 @@ static void get_ctx(struct perf_event_context *ctx) static void *alloc_task_ctx_data(struct pmu *pmu) { + if (pmu->task_ctx_cache) + return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL); + return kzalloc(pmu->task_ctx_size, GFP_KERNEL); } static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data) { - kfree(task_ctx_data); + if (pmu->task_ctx_cache && task_ctx_data) + kmem_cache_free(pmu->task_ctx_cache, task_ctx_data); + else + kfree(task_ctx_data); } static void free_ctx(struct rcu_head *head) From 33cad284497cf40f55ad6029c06011de3538ebed Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:23 -0700 Subject: [PATCH 303/502] perf/x86/intel/lbr: Create kmem_cache for the LBR context data A new kmem_cache method is introduced to allocate the PMU specific data task_ctx_data, which requires the PMU specific code to create a kmem_cache. Currently, the task_ctx_data is only used by the Intel LBR call stack feature, which is introduced since Haswell. The kmem_cache should be only created for Haswell and later platforms. 
There is no alignment requirement for the existing platforms. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-18-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/lbr.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index e4e249a78451..e784c1d485ca 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1531,9 +1531,17 @@ void __init intel_pmu_lbr_init_snb(void) */ } +static inline struct kmem_cache * +create_lbr_kmem_cache(size_t size, size_t align) +{ + return kmem_cache_create("x86_lbr", size, align, 0, NULL); +} + /* haswell */ void intel_pmu_lbr_init_hsw(void) { + size_t size = sizeof(struct x86_perf_task_context); + x86_pmu.lbr_nr = 16; x86_pmu.lbr_tos = MSR_LBR_TOS; x86_pmu.lbr_from = MSR_LBR_NHM_FROM; @@ -1542,6 +1550,8 @@ void intel_pmu_lbr_init_hsw(void) x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = hsw_lbr_sel_map; + x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0); + if (lbr_from_signext_quirk_needed()) static_branch_enable(&lbr_from_quirk_key); } @@ -1549,6 +1559,8 @@ void intel_pmu_lbr_init_hsw(void) /* skylake */ __init void intel_pmu_lbr_init_skl(void) { + size_t size = sizeof(struct x86_perf_task_context); + x86_pmu.lbr_nr = 32; x86_pmu.lbr_tos = MSR_LBR_TOS; x86_pmu.lbr_from = MSR_LBR_NHM_FROM; @@ -1558,6 +1570,8 @@ __init void intel_pmu_lbr_init_skl(void) x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = hsw_lbr_sel_map; + x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0); + /* * SW branch filter usage: * - support syscall, sysret capture. 
@@ -1631,6 +1645,7 @@ void __init intel_pmu_arch_lbr_init(void) union cpuid28_ebx ebx; union cpuid28_ecx ecx; unsigned int unused_edx; + size_t size; u64 lbr_nr; /* Arch LBR Capabilities */ @@ -1655,8 +1670,10 @@ void __init intel_pmu_arch_lbr_init(void) x86_pmu.lbr_br_type = ecx.split.lbr_br_type; x86_pmu.lbr_nr = lbr_nr; - x86_get_pmu()->task_ctx_size = sizeof(struct x86_perf_task_context_arch_lbr) + - lbr_nr * sizeof(struct lbr_entry); + size = sizeof(struct x86_perf_task_context_arch_lbr) + + lbr_nr * sizeof(struct lbr_entry); + x86_get_pmu()->task_ctx_size = size; + x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0); x86_pmu.lbr_from = MSR_ARCH_LBR_FROM_0; x86_pmu.lbr_to = MSR_ARCH_LBR_TO_0; From 5a09928d339f3cf0973991ddc3a2798825c84c99 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:24 -0700 Subject: [PATCH 304/502] perf/x86: Remove task_ctx_size A new kmem_cache method has replaced the kzalloc() to allocate the PMU specific data. The task_ctx_size is not required anymore. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-19-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/core.c | 1 - arch/x86/events/intel/lbr.c | 1 - include/linux/perf_event.h | 4 ---- kernel/events/core.c | 4 +--- 4 files changed, 1 insertion(+), 9 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index d740c861724c..6b1228ae007d 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2371,7 +2371,6 @@ static struct pmu pmu = { .event_idx = x86_pmu_event_idx, .sched_task = x86_pmu_sched_task, - .task_ctx_size = sizeof(struct x86_perf_task_context), .swap_task_ctx = x86_pmu_swap_task_ctx, .check_period = x86_pmu_check_period, diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index e784c1d485ca..3ad528996d1c 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1672,7 +1672,6 @@ void __init intel_pmu_arch_lbr_init(void) size = sizeof(struct x86_perf_task_context_arch_lbr) + lbr_nr * sizeof(struct lbr_entry); - x86_get_pmu()->task_ctx_size = size; x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0); x86_pmu.lbr_from = MSR_ARCH_LBR_FROM_0; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 09915ae06d28..3b22db08b6fb 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -419,10 +419,6 @@ struct pmu { */ void (*sched_task) (struct perf_event_context *ctx, bool sched_in); - /* - * PMU specific data size - */ - size_t task_ctx_size; /* * Kmem cache of PMU specific data diff --git a/kernel/events/core.c b/kernel/events/core.c index 30d9b3182369..7c436d705fbd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1243,15 +1243,13 @@ static void *alloc_task_ctx_data(struct pmu *pmu) if (pmu->task_ctx_cache) return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL); - return kzalloc(pmu->task_ctx_size, GFP_KERNEL); + return NULL; } static void 
free_task_ctx_data(struct pmu *pmu, void *task_ctx_data) { if (pmu->task_ctx_cache && task_ctx_data) kmem_cache_free(pmu->task_ctx_cache, task_ctx_data); - else - kfree(task_ctx_data); } static void free_ctx(struct rcu_head *head) From a063bf249b9f8d8004f282031781322c1b527d13 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:25 -0700 Subject: [PATCH 305/502] x86/fpu: Use proper mask to replace full instruction mask When saving xstate to a kernel/user XSAVE area with the XSAVE family of instructions, the current code applies the 'full' instruction mask (-1), which tries to XSAVE all possible features. This method relies on hardware to trim 'all possible' down to what is enabled in the hardware. The code works well for now. However, there will be a problem, if some features are enabled in hardware, but are not suitable to be saved into all kernel XSAVE buffers, like task->fpu, due to performance consideration. One such example is the Last Branch Records (LBR) state. The LBR state only contains valuable information when LBR is explicitly enabled by the perf subsystem, and the size of an LBR state is large (808 bytes for now). To avoid both CPU overhead and space overhead at each context switch, the LBR state should not be saved into task->fpu like other state components. It should be saved/restored on demand when LBR is enabled in the perf subsystem. Current copy_xregs_to_* will trigger a buffer overflow for such cases. Three sites use the '-1' instruction mask which must be updated. Two are saving/restoring the xstate to/from a kernel-allocated XSAVE buffer and can use 'xfeatures_mask_all', which will save/restore all of the features present in a normal task FPU buffer. The last one saves the register state directly to a user buffer. It could also use 'xfeatures_mask_all'. Just as it was with the '-1' argument, any supervisor states in the mask will be filtered out by the hardware and not saved to the buffer. 
But, to be more explicit about what is expected to be saved, use xfeatures_mask_user() for the instruction mask. KVM includes the header file fpu/internal.h. To avoid 'undefined xfeatures_mask_all' compiling issue, move copy_fpregs_to_fpstate() to fpu/core.c and export it, because: - The xfeatures_mask_all is indirectly used via copy_fpregs_to_fpstate() by KVM. The function which is directly used by other modules should be exported. - The copy_fpregs_to_fpstate() is a function, while xfeatures_mask_all is a variable for the "internal" FPU state. It's safer to export a function than a variable, which may be implicitly changed by others. - The copy_fpregs_to_fpstate() is a big function with many checks. The removal of the inline keyword should not impact the performance. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/1593780569-62993-20-git-send-email-kan.liang@linux.intel.com --- arch/x86/include/asm/fpu/internal.h | 47 +++++------------------------ arch/x86/kernel/fpu/core.c | 39 ++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 40 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 42159f45bf9c..d3724dc8c5d2 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -274,7 +274,7 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) */ static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate) { - u64 mask = -1; + u64 mask = xfeatures_mask_all; u32 lmask = mask; u32 hmask = mask >> 32; int err; @@ -320,7 +320,7 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) */ static inline void copy_xregs_to_kernel(struct xregs_state *xstate) { - u64 mask = -1; + u64 mask = xfeatures_mask_all; u32 lmask = mask; u32 hmask = mask >> 32; int err; @@ -356,6 +356,9 @@ static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask) */ static inline 
int copy_xregs_to_user(struct xregs_state __user *buf) { + u64 mask = xfeatures_mask_user(); + u32 lmask = mask; + u32 hmask = mask >> 32; int err; /* @@ -367,7 +370,7 @@ static inline int copy_xregs_to_user(struct xregs_state __user *buf) return -EFAULT; stac(); - XSTATE_OP(XSAVE, buf, -1, -1, err); + XSTATE_OP(XSAVE, buf, lmask, hmask, err); clac(); return err; @@ -408,43 +411,7 @@ static inline int copy_kernel_to_xregs_err(struct xregs_state *xstate, u64 mask) return err; } -/* - * These must be called with preempt disabled. Returns - * 'true' if the FPU state is still intact and we can - * keep registers active. - * - * The legacy FNSAVE instruction cleared all FPU state - * unconditionally, so registers are essentially destroyed. - * Modern FPU state can be kept in registers, if there are - * no pending FP exceptions. - */ -static inline int copy_fpregs_to_fpstate(struct fpu *fpu) -{ - if (likely(use_xsave())) { - copy_xregs_to_kernel(&fpu->state.xsave); - - /* - * AVX512 state is tracked here because its use is - * known to slow the max clock speed of the core. - */ - if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512) - fpu->avx512_timestamp = jiffies; - return 1; - } - - if (likely(use_fxsr())) { - copy_fxregs_to_kernel(fpu); - return 1; - } - - /* - * Legacy FPU register saving, FNSAVE always clears FPU registers, - * so we have to mark them inactive: - */ - asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave)); - - return 0; -} +extern int copy_fpregs_to_fpstate(struct fpu *fpu); static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask) { diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 06c818967bb6..1bb7532f5f34 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -82,6 +82,45 @@ bool irq_fpu_usable(void) } EXPORT_SYMBOL(irq_fpu_usable); +/* + * These must be called with preempt disabled. 
Returns + * 'true' if the FPU state is still intact and we can + * keep registers active. + * + * The legacy FNSAVE instruction cleared all FPU state + * unconditionally, so registers are essentially destroyed. + * Modern FPU state can be kept in registers, if there are + * no pending FP exceptions. + */ +int copy_fpregs_to_fpstate(struct fpu *fpu) +{ + if (likely(use_xsave())) { + copy_xregs_to_kernel(&fpu->state.xsave); + + /* + * AVX512 state is tracked here because its use is + * known to slow the max clock speed of the core. + */ + if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512) + fpu->avx512_timestamp = jiffies; + return 1; + } + + if (likely(use_fxsr())) { + copy_fxregs_to_kernel(fpu); + return 1; + } + + /* + * Legacy FPU register saving, FNSAVE always clears FPU registers, + * so we have to mark them inactive: + */ + asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave)); + + return 0; +} +EXPORT_SYMBOL(copy_fpregs_to_fpstate); + void kernel_fpu_begin(void) { preempt_disable(); From f0dccc9da4c0fda049e99326f85db8c242fd781f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:26 -0700 Subject: [PATCH 306/502] x86/fpu/xstate: Support dynamic supervisor feature for LBR Last Branch Records (LBR) registers are used to log taken branches and other control flows. In perf with call stack mode, LBR information is used to reconstruct a call stack. To get the complete call stack, perf has to save/restore all LBR registers during a context switch. Due to the large number of the LBR registers, e.g., the current platform has 96 LBR registers, this process causes a high CPU overhead. To reduce the CPU overhead during a context switch, an LBR state component that contains all the LBR related registers is introduced in hardware. All LBR registers can be saved/restored together using one XSAVES/XRSTORS instruction. 
However, the kernel should not save/restore the LBR state component at each context switch, like other state components, because of the following unique features of LBR: - The LBR state component only contains valuable information when LBR is enabled in the perf subsystem, but for most of the time, LBR is disabled. - The size of the LBR state component is huge. For the current platform, it's 808 bytes. If the kernel saves/restores the LBR state at each context switch, for most of the time, it is just a waste of space and cycles. To efficiently support the LBR state component, it is desirable to: - only context-switch the LBR when the LBR feature is enabled in perf. - only allocate an LBR-specific XSAVE buffer on demand. (Besides the LBR state, a legacy region and an XSAVE header have to be included in the buffer as well. There is a total of (808+576) byte overhead for the LBR-specific XSAVE buffer. The overhead only happens when perf is actively using LBRs. There is still a space saving, on average, when it replaces the constant 808 bytes of overhead for every task, all the time on the systems that support architectural LBR.) - be able to use XSAVES/XRSTORS for accessing LBR at run time. However, the IA32_XSS should not be adjusted at run time. (The XCR0 | IA32_XSS are used to determine the requested-feature bitmap (RFBM) of XSAVES.) A solution, called dynamic supervisor feature, is introduced to address this issue, which - does not allocate a buffer in each task->fpu; - does not save/restore a state component at each context switch; - sets the bit corresponding to the dynamic supervisor feature in IA32_XSS at boot time, and avoids setting it at run time. - dynamically allocates a specific buffer for a state component on demand, e.g. only allocates an LBR-specific XSAVE buffer when LBR is enabled in perf. (Note: The buffer has to include the LBR state component, a legacy region and an XSAVE header space.)
(Implemented in a later patch) - saves/restores a state component on demand, e.g. manually invokes the XSAVES/XRSTORS instruction to save/restore the LBR state to/from the buffer when perf is active and a call stack is required. (Implemented in a later patch) A new mask XFEATURE_MASK_DYNAMIC and a helper xfeatures_mask_dynamic() are introduced to indicate the dynamic supervisor feature. For the systems which support the Architecture LBR, LBR is the only dynamic supervisor feature for now. For the previous systems, there is no dynamic supervisor feature available. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/1593780569-62993-21-git-send-email-kan.liang@linux.intel.com --- arch/x86/include/asm/fpu/types.h | 7 +++++++ arch/x86/include/asm/fpu/xstate.h | 30 ++++++++++++++++++++++++++++++ arch/x86/kernel/fpu/xstate.c | 15 ++++++++++----- 3 files changed, 47 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index f098f6cab94b..132e9cc26d60 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -114,6 +114,12 @@ enum xfeature { XFEATURE_Hi16_ZMM, XFEATURE_PT_UNIMPLEMENTED_SO_FAR, XFEATURE_PKRU, + XFEATURE_RSRVD_COMP_10, + XFEATURE_RSRVD_COMP_11, + XFEATURE_RSRVD_COMP_12, + XFEATURE_RSRVD_COMP_13, + XFEATURE_RSRVD_COMP_14, + XFEATURE_LBR, XFEATURE_MAX, }; @@ -128,6 +134,7 @@ enum xfeature { #define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM) #define XFEATURE_MASK_PT (1 << XFEATURE_PT_UNIMPLEMENTED_SO_FAR) #define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU) +#define XFEATURE_MASK_LBR (1 << XFEATURE_LBR) #define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE) #define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK \ diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 422d8369012a..040c4d49bfcb 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ 
b/arch/x86/include/asm/fpu/xstate.h @@ -35,6 +35,27 @@ /* All currently supported supervisor features */ #define XFEATURE_MASK_SUPERVISOR_SUPPORTED (0) +/* + * A supervisor state component may not always contain valuable information, + * and its size may be huge. Saving/restoring such supervisor state components + * at each context switch can cause high CPU and space overhead, which should + * be avoided. Such supervisor state components should only be saved/restored + * on demand. The on-demand dynamic supervisor features are set in this mask. + * + * Unlike the existing supported supervisor features, a dynamic supervisor + * feature does not allocate a buffer in task->fpu, and the corresponding + * supervisor state component cannot be saved/restored at each context switch. + * + * To support a dynamic supervisor feature, a developer should follow the + * dos and don'ts as below: + * - Do dynamically allocate a buffer for the supervisor state component. + * - Do manually invoke the XSAVES/XRSTORS instruction to save/restore the + * state component to/from the buffer. + * - Don't set the bit corresponding to the dynamic supervisor feature in + * IA32_XSS at run time, since it has been set at boot time. + */ +#define XFEATURE_MASK_DYNAMIC (XFEATURE_MASK_LBR) + /* * Unsupported supervisor features. When a supervisor feature in this mask is * supported in the future, move it to the supported supervisor feature mask. @@ -43,6 +64,7 @@ /* All supervisor states including supported and unsupported states. 
*/ #define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \ + XFEATURE_MASK_DYNAMIC | \ XFEATURE_MASK_SUPERVISOR_UNSUPPORTED) #ifdef CONFIG_X86_64 @@ -63,6 +85,14 @@ static inline u64 xfeatures_mask_user(void) return xfeatures_mask_all & XFEATURE_MASK_USER_SUPPORTED; } +static inline u64 xfeatures_mask_dynamic(void) +{ + if (!boot_cpu_has(X86_FEATURE_ARCH_LBR)) + return XFEATURE_MASK_DYNAMIC & ~XFEATURE_MASK_LBR; + + return XFEATURE_MASK_DYNAMIC; +} + extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void __init update_regset_xstate_info(unsigned int size, diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index bda2e5eaca0e..dcf062442b18 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -233,8 +233,10 @@ void fpu__init_cpu_xstate(void) /* * MSR_IA32_XSS sets supervisor states managed by XSAVES. */ - if (boot_cpu_has(X86_FEATURE_XSAVES)) - wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); + if (boot_cpu_has(X86_FEATURE_XSAVES)) { + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | + xfeatures_mask_dynamic()); + } } static bool xfeature_enabled(enum xfeature xfeature) @@ -598,7 +600,8 @@ static void check_xstate_against_struct(int nr) */ if ((nr < XFEATURE_YMM) || (nr >= XFEATURE_MAX) || - (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR)) { + (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) || + ((nr >= XFEATURE_RSRVD_COMP_10) && (nr <= XFEATURE_LBR))) { WARN_ONCE(1, "no structure for xstate: %d\n", nr); XSTATE_WARN_ON(1); } @@ -847,8 +850,10 @@ void fpu__resume_cpu(void) * Restore IA32_XSS. The same CPUID bit enumerates support * of XSAVES and MSR_IA32_XSS. 
*/ - if (boot_cpu_has(X86_FEATURE_XSAVES)) - wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); + if (boot_cpu_has(X86_FEATURE_XSAVES)) { + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | + xfeatures_mask_dynamic()); + } } /* From 50f408d96d4d1a945d2c50c5fd8ed400883edf0e Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:27 -0700 Subject: [PATCH 307/502] x86/fpu/xstate: Add helpers for LBR dynamic supervisor feature The perf subsystem will only need to save/restore the LBR state. However, the existing helpers save all supported supervisor states to a kernel buffer, which will be unnecessary. Two helpers are introduced to only save/restore requested dynamic supervisor states. The supervisor features in XFEATURE_MASK_SUPERVISOR_SUPPORTED and XFEATURE_MASK_SUPERVISOR_UNSUPPORTED mask cannot be saved/restored using these helpers. The helpers will be used in the following patch. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/1593780569-62993-22-git-send-email-kan.liang@linux.intel.com --- arch/x86/include/asm/fpu/xstate.h | 3 ++ arch/x86/kernel/fpu/xstate.c | 72 +++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 040c4d49bfcb..c029fce627cf 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -106,6 +106,9 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); void copy_supervisor_to_kernel(struct xregs_state *xsave); +void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask); +void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask); + /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ int 
validate_user_xstate_header(const struct xstate_header *hdr); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index dcf062442b18..b0c22b7dae0a 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1361,6 +1361,78 @@ void copy_supervisor_to_kernel(struct xregs_state *xstate) } } +/** + * copy_dynamic_supervisor_to_kernel() - Save dynamic supervisor states to + * an xsave area + * @xstate: A pointer to an xsave area + * @mask: Represent the dynamic supervisor features saved into the xsave area + * + * Only the dynamic supervisor states sets in the mask are saved into the xsave + * area (See the comment in XFEATURE_MASK_DYNAMIC for the details of dynamic + * supervisor feature). Besides the dynamic supervisor states, the legacy + * region and XSAVE header are also saved into the xsave area. The supervisor + * features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and + * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not saved. + * + * The xsave area must be 64-bytes aligned. + */ +void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask) +{ + u64 dynamic_mask = xfeatures_mask_dynamic() & mask; + u32 lmask, hmask; + int err; + + if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES))) + return; + + if (WARN_ON_FPU(!dynamic_mask)) + return; + + lmask = dynamic_mask; + hmask = dynamic_mask >> 32; + + XSTATE_OP(XSAVES, xstate, lmask, hmask, err); + + /* Should never fault when copying to a kernel buffer */ + WARN_ON_FPU(err); +} + +/** + * copy_kernel_to_dynamic_supervisor() - Restore dynamic supervisor states from + * an xsave area + * @xstate: A pointer to an xsave area + * @mask: Represent the dynamic supervisor features restored from the xsave area + * + * Only the dynamic supervisor states sets in the mask are restored from the + * xsave area (See the comment in XFEATURE_MASK_DYNAMIC for the details of + * dynamic supervisor feature). 
Besides the dynamic supervisor states, the + * legacy region and XSAVE header are also restored from the xsave area. The + * supervisor features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and + * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not restored. + * + * The xsave area must be 64-bytes aligned. + */ +void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask) +{ + u64 dynamic_mask = xfeatures_mask_dynamic() & mask; + u32 lmask, hmask; + int err; + + if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES))) + return; + + if (WARN_ON_FPU(!dynamic_mask)) + return; + + lmask = dynamic_mask; + hmask = dynamic_mask >> 32; + + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); + + /* Should never fault when copying from a kernel buffer */ + WARN_ON_FPU(err); +} + #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * Report the amount of time elapsed in millisecond since last AVX512 From ce711ea3cab9ad325d849792d442848e553095b8 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:28 -0700 Subject: [PATCH 308/502] perf/x86/intel/lbr: Support XSAVES/XRSTORS for LBR context switch In the LBR call stack mode, LBR information is used to reconstruct a call stack. To get the complete call stack, perf has to save/restore all LBR registers during a context switch. Due to a large number of the LBR registers, this process causes a high CPU overhead. To reduce the CPU overhead during a context switch, use the XSAVES/XRSTORS instructions. Every XSAVE area must follow a canonical format: the legacy region, an XSAVE header and the extended region. Although the LBR information is only kept in the extended region, a space for the legacy region and XSAVE header is still required. Add a new dedicated structure for LBR XSAVES support. Before enabling XSAVES support, the size of the LBR state has to be sanity checked, because: - the size of the software structure is calculated from the max number of the LBR depth, which is enumerated by the CPUID leaf for Arch LBR. 
The size of the LBR state is enumerated by the CPUID leaf for XSAVE support of Arch LBR. If the values from the two CPUID leaves are not consistent, it may trigger a buffer overflow. For example, a hypervisor may unintentionally set inconsistent values for the two emulated CPUID leaves. - unlike other state components, the size of an LBR state depends on the max number of LBRs, which may vary from generation to generation. Expose the function xfeature_size() for the sanity check. The LBR XSAVES support will be disabled if the size of the LBR state enumerated by CPUID doesn't match the size of the software structure. The XSAVE instruction requires 64-byte alignment for state buffers. A new macro is added to reflect the alignment requirement. A 64-byte aligned kmem_cache is created for architecture LBR. Currently, the structure for each state component is maintained in fpu/types.h. The structure for the new LBR state component should be maintained in the same place. Move structure lbr_entry to fpu/types.h as well for broader sharing. Add dedicated lbr_save/lbr_restore functions for LBR XSAVES support, which invoke the corresponding xstate helpers to XSAVES/XRSTORS LBR information at the context switch when the call stack mode is enabled. Since the XSAVES/XRSTORS instructions will be eventually invoked, the dedicated functions are named with the '_xsaves'/'_xrstors' postfix.
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/1593780569-62993-23-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/lbr.c | 79 +++++++++++++++++++++++++++++-- arch/x86/events/perf_event.h | 21 ++++++++ arch/x86/include/asm/fpu/types.h | 20 ++++++++ arch/x86/include/asm/fpu/xstate.h | 3 ++ arch/x86/include/asm/perf_event.h | 4 -- arch/x86/kernel/fpu/xstate.c | 2 +- 6 files changed, 119 insertions(+), 10 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 3ad528996d1c..cb1a0495339b 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -483,6 +483,17 @@ static void intel_pmu_arch_lbr_restore(void *ctx) } } +/* + * Restore the Architecture LBR state from the xsave area in the perf + * context data for the task via the XRSTORS instruction. + */ +static void intel_pmu_arch_lbr_xrstors(void *ctx) +{ + struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx; + + copy_kernel_to_dynamic_supervisor(&task_ctx->xsave, XFEATURE_MASK_LBR); +} + static __always_inline bool lbr_is_reset_in_cstate(void *ctx) { if (static_cpu_has(X86_FEATURE_ARCH_LBR)) @@ -557,6 +568,17 @@ static void intel_pmu_arch_lbr_save(void *ctx) entries[x86_pmu.lbr_nr - 1].from = 0; } +/* + * Save the Architecture LBR state to the xsave area in the perf + * context data for the task via the XSAVES instruction. + */ +static void intel_pmu_arch_lbr_xsaves(void *ctx) +{ + struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx; + + copy_dynamic_supervisor_to_kernel(&task_ctx->xsave, XFEATURE_MASK_LBR); +} + static void __intel_pmu_lbr_save(void *ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1639,12 +1661,40 @@ void intel_pmu_lbr_init_knl(void) x86_pmu.intel_cap.lbr_format = LBR_FORMAT_EIP_FLAGS; } +/* + * LBR state size is variable based on the max number of registers. 
+ * This calculates the expected state size, which should match + * what the hardware enumerates for the size of XFEATURE_LBR. + */ +static inline unsigned int get_lbr_state_size(void) +{ + return sizeof(struct arch_lbr_state) + + x86_pmu.lbr_nr * sizeof(struct lbr_entry); +} + +static bool is_arch_lbr_xsave_available(void) +{ + if (!boot_cpu_has(X86_FEATURE_XSAVES)) + return false; + + /* + * Check the LBR state with the corresponding software structure. + * Disable LBR XSAVES support if the size doesn't match. + */ + if (WARN_ON(xfeature_size(XFEATURE_LBR) != get_lbr_state_size())) + return false; + + return true; +} + void __init intel_pmu_arch_lbr_init(void) { + struct pmu *pmu = x86_get_pmu(); union cpuid28_eax eax; union cpuid28_ebx ebx; union cpuid28_ecx ecx; unsigned int unused_edx; + bool arch_lbr_xsave; size_t size; u64 lbr_nr; @@ -1670,9 +1720,22 @@ void __init intel_pmu_arch_lbr_init(void) x86_pmu.lbr_br_type = ecx.split.lbr_br_type; x86_pmu.lbr_nr = lbr_nr; - size = sizeof(struct x86_perf_task_context_arch_lbr) + - lbr_nr * sizeof(struct lbr_entry); - x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0); + + arch_lbr_xsave = is_arch_lbr_xsave_available(); + if (arch_lbr_xsave) { + size = sizeof(struct x86_perf_task_context_arch_lbr_xsave) + + get_lbr_state_size(); + pmu->task_ctx_cache = create_lbr_kmem_cache(size, + XSAVE_ALIGNMENT); + } + + if (!pmu->task_ctx_cache) { + arch_lbr_xsave = false; + + size = sizeof(struct x86_perf_task_context_arch_lbr) + + lbr_nr * sizeof(struct lbr_entry); + pmu->task_ctx_cache = create_lbr_kmem_cache(size, 0); + } x86_pmu.lbr_from = MSR_ARCH_LBR_FROM_0; x86_pmu.lbr_to = MSR_ARCH_LBR_TO_0; @@ -1705,8 +1768,14 @@ void __init intel_pmu_arch_lbr_init(void) x86_pmu.lbr_reset = intel_pmu_arch_lbr_reset; x86_pmu.lbr_read = intel_pmu_arch_lbr_read; - x86_pmu.lbr_save = intel_pmu_arch_lbr_save; - x86_pmu.lbr_restore = intel_pmu_arch_lbr_restore; + if (arch_lbr_xsave) { + x86_pmu.lbr_save = intel_pmu_arch_lbr_xsaves; 
+ x86_pmu.lbr_restore = intel_pmu_arch_lbr_xrstors; + pr_cont("XSAVE "); + } else { + x86_pmu.lbr_save = intel_pmu_arch_lbr_save; + x86_pmu.lbr_restore = intel_pmu_arch_lbr_restore; + } pr_cont("Architectural LBR, "); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 3f7c329374bb..d5e351c1f3c1 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -777,6 +777,27 @@ struct x86_perf_task_context_arch_lbr { struct lbr_entry entries[]; }; +/* + * Add padding to guarantee the 64-byte alignment of the state buffer. + * + * The structure is dynamically allocated. The size of the LBR state may vary + * based on the number of LBR registers. + * + * Do not put anything after the LBR state. + */ +struct x86_perf_task_context_arch_lbr_xsave { + struct x86_perf_task_context_opt opt; + + union { + struct xregs_state xsave; + struct { + struct fxregs_state i387; + struct xstate_header header; + struct arch_lbr_state lbr; + } __attribute__ ((packed, aligned (XSAVE_ALIGNMENT))); + }; +}; + #define x86_add_quirk(func_) \ do { \ static struct x86_pmu_quirk __quirk __initdata = { \ diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 132e9cc26d60..c87364ea6446 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -236,6 +236,26 @@ struct pkru_state { u32 pad; } __packed; +/* + * State component 15: Architectural LBR configuration state. + * The size of Arch LBR state depends on the number of LBRs (lbr_depth). 
+ */ + +struct lbr_entry { + u64 from; + u64 to; + u64 info; +}; + +struct arch_lbr_state { + u64 lbr_ctl; + u64 lbr_depth; + u64 ler_from; + u64 ler_to; + u64 ler_info; + struct lbr_entry entries[]; +} __packed; + struct xstate_header { u64 xfeatures; u64 xcomp_bv; diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index c029fce627cf..1559554af931 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -21,6 +21,8 @@ #define XSAVE_YMM_SIZE 256 #define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) +#define XSAVE_ALIGNMENT 64 + /* All currently supported user features */ #define XFEATURE_MASK_USER_SUPPORTED (XFEATURE_MASK_FP | \ XFEATURE_MASK_SSE | \ @@ -101,6 +103,7 @@ extern void __init update_regset_xstate_info(unsigned int size, void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); const void *get_xsave_field_ptr(int xfeature_nr); int using_compacted_format(void); +int xfeature_size(int xfeature_nr); int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 2e29558c9c6b..0c1b13720525 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -282,10 +282,6 @@ struct pebs_xmm { u64 xmm[16*2]; /* two entries for each register */ }; -struct lbr_entry { - u64 from, to, info; -}; - /* * IBS cpuid feature detection */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b0c22b7dae0a..10cf8789c378 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -488,7 +488,7 @@ static int xfeature_uncompacted_offset(int xfeature_nr) return ebx; } -static int xfeature_size(int xfeature_nr) +int 
xfeature_size(int xfeature_nr) { u32 eax, ebx, ecx, edx; From c085fb8774671e83f6199a8e838fbc0e57094029 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:29 -0700 Subject: [PATCH 309/502] perf/x86/intel/lbr: Support XSAVES for arch LBR read Reading LBR registers in a perf NMI handler for a non-PEBS event causes a high overhead because the number of LBR registers is huge. To reduce the overhead, the XSAVES instruction should be used to replace the LBR registers' reading method. The XSAVES buffer used for LBR read has to be per-CPU because the NMI handler invokes lbr_read(). The existing task_ctx_data buffer cannot be used because it is per-task and is only allocated for the LBR call stack mode. A new lbr_xsave pointer is introduced in the cpu_hw_events as an XSAVES buffer for LBR read. The XSAVES buffer should be allocated only when LBR is used by a non-PEBS event on the CPU because the total size of the lbr_xsave is not small (~1.4KB). The XSAVES buffer is allocated when a non-PEBS event is added, but it is lazily released in x86_release_hardware() when perf releases the entire PMU hardware resource, because perf may frequently schedule the event, e.g. under frequent context switches. The lazy release method reduces the overhead of frequently allocating/freeing the buffer. If the lbr_xsave fails to be allocated, fall back to the normal Arch LBR lbr_read().
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/1593780569-62993-24-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/core.c | 1 + arch/x86/events/intel/lbr.c | 40 +++++++++++++++++++++++++++++++++++- arch/x86/events/perf_event.h | 7 +++++++ 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 6b1228ae007d..1cbf57dc2ac8 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -358,6 +358,7 @@ void x86_release_hardware(void) if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) { release_pmc_hardware(); release_ds_buffers(); + release_lbr_buffers(); mutex_unlock(&pmc_reserve_mutex); } } diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index cb1a0495339b..63f58bdf556c 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -658,6 +658,7 @@ static inline bool branch_user_callstack(unsigned br_sel) void intel_pmu_lbr_add(struct perf_event *event) { + struct kmem_cache *kmem_cache = event->pmu->task_ctx_cache; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); if (!x86_pmu.lbr_nr) @@ -695,6 +696,29 @@ void intel_pmu_lbr_add(struct perf_event *event) perf_sched_cb_inc(event->ctx->pmu); if (!cpuc->lbr_users++ && !event->total_time_running) intel_pmu_lbr_reset(); + + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && + kmem_cache && !cpuc->lbr_xsave && + (cpuc->lbr_users != cpuc->lbr_pebs_users)) + cpuc->lbr_xsave = kmem_cache_alloc(kmem_cache, GFP_KERNEL); +} + +void release_lbr_buffers(void) +{ + struct kmem_cache *kmem_cache = x86_get_pmu()->task_ctx_cache; + struct cpu_hw_events *cpuc; + int cpu; + + if (!static_cpu_has(X86_FEATURE_ARCH_LBR)) + return; + + for_each_possible_cpu(cpu) { + cpuc = per_cpu_ptr(&cpu_hw_events, cpu); + if (kmem_cache && cpuc->lbr_xsave) { + kmem_cache_free(kmem_cache, cpuc->lbr_xsave); + cpuc->lbr_xsave = NULL; + } + } } void 
intel_pmu_lbr_del(struct perf_event *event) @@ -945,6 +969,19 @@ static void intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc) intel_pmu_store_lbr(cpuc, NULL); } +static void intel_pmu_arch_lbr_read_xsave(struct cpu_hw_events *cpuc) +{ + struct x86_perf_task_context_arch_lbr_xsave *xsave = cpuc->lbr_xsave; + + if (!xsave) { + intel_pmu_store_lbr(cpuc, NULL); + return; + } + copy_dynamic_supervisor_to_kernel(&xsave->xsave, XFEATURE_MASK_LBR); + + intel_pmu_store_lbr(cpuc, xsave->lbr.entries); +} + void intel_pmu_lbr_read(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1767,14 +1804,15 @@ void __init intel_pmu_arch_lbr_init(void) x86_pmu.lbr_ctl_map = NULL; x86_pmu.lbr_reset = intel_pmu_arch_lbr_reset; - x86_pmu.lbr_read = intel_pmu_arch_lbr_read; if (arch_lbr_xsave) { x86_pmu.lbr_save = intel_pmu_arch_lbr_xsaves; x86_pmu.lbr_restore = intel_pmu_arch_lbr_xrstors; + x86_pmu.lbr_read = intel_pmu_arch_lbr_read_xsave; pr_cont("XSAVE "); } else { x86_pmu.lbr_save = intel_pmu_arch_lbr_save; x86_pmu.lbr_restore = intel_pmu_arch_lbr_restore; + x86_pmu.lbr_read = intel_pmu_arch_lbr_read; } pr_cont("Architectural LBR, "); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index d5e351c1f3c1..7b68ab5f19e7 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -253,6 +253,7 @@ struct cpu_hw_events { void *last_task_ctx; int last_log_id; int lbr_select; + void *lbr_xsave; /* * Intel host/guest exclude bits @@ -1066,6 +1067,8 @@ void release_ds_buffers(void); void reserve_ds_buffers(void); +void release_lbr_buffers(void); + extern struct event_constraint bts_constraint; extern struct event_constraint vlbr_constraint; @@ -1207,6 +1210,10 @@ static inline void release_ds_buffers(void) { } +static inline void release_lbr_buffers(void) +{ +} + static inline int intel_pmu_init(void) { return 0; From aa340845ae6f019e0a12321a1741c14679bb0664 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 8 Jul 2020 21:47:11 
+0300 Subject: [PATCH 310/502] io_uring: fix a use after free in io_async_task_func() The "apoll" variable is freed and then used on the next line. We need to move the free down a few lines. Fixes: 0be0b0e33b0b ("io_uring: simplify io_async_task_func()") Signed-off-by: Dan Carpenter Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 4c9a494c9f9f..14168fbc7d79 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4655,12 +4655,13 @@ static void io_async_task_func(struct callback_head *cb) /* restore ->work in case we need to retry again */ if (req->flags & REQ_F_WORK_INITIALIZED) memcpy(&req->work, &apoll->work, sizeof(req->work)); - kfree(apoll); if (!READ_ONCE(apoll->poll.canceled)) __io_req_task_submit(req); else __io_req_task_cancel(req, -ECANCELED); + + kfree(apoll); } static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync, From 5acbbc8ed3a9aef71c6eb5f19ba24f7321200220 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Jul 2020 15:15:26 -0600 Subject: [PATCH 311/502] io_uring: only call kfree() for a non-zero pointer It's safe to call kfree() with a NULL pointer, but it's also pointless. Most of the time we don't have any data to free, and at millions of requests per second, the redundant function call adds noticeable overhead (about 1.3% of the runtime). 
Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 14168fbc7d79..51ff88330f9a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1519,7 +1519,8 @@ static void io_dismantle_req(struct io_kiocb *req) if (req->flags & REQ_F_NEED_CLEANUP) io_cleanup_req(req); - kfree(req->io); + if (req->io) + kfree(req->io); if (req->file) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); __io_put_req_task(req); From 2bc9930e78fe0cb3e7b7e3169de0a40baee38d29 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 Jul 2020 09:43:27 -0600 Subject: [PATCH 312/502] io_uring: get rid of __req_need_defer() We just have one caller of this, req_need_defer(), just inline the code in there instead. Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 51ff88330f9a..7f2a2cb5c056 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1069,18 +1069,14 @@ err: return NULL; } -static inline bool __req_need_defer(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - - return req->sequence != ctx->cached_cq_tail - + atomic_read(&ctx->cached_cq_overflow); -} - static inline bool req_need_defer(struct io_kiocb *req) { - if (unlikely(req->flags & REQ_F_IO_DRAIN)) - return __req_need_defer(req); + if (unlikely(req->flags & REQ_F_IO_DRAIN)) { + struct io_ring_ctx *ctx = req->ctx; + + return req->sequence != ctx->cached_cq_tail + + atomic_read(&ctx->cached_cq_overflow); + } return false; } From 4349f30ecb8068d146a1e57bb12f46e745323b4c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 Jul 2020 15:07:01 -0600 Subject: [PATCH 313/502] io_uring: remove dead 'ctx' argument and move forward declaration We don't use 'ctx' at all in io_sq_thread_drop_mm(), it just works on the mm of the current task. Drop the argument. 
Move io_file_put_work() to where we have the other forward declarations of functions. Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7f2a2cb5c056..3ce02a1613cc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -902,6 +902,7 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_comp_state *cs); +static void io_file_put_work(struct work_struct *work); static ssize_t io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, struct iov_iter *iter, @@ -942,7 +943,7 @@ static void __io_put_req_task(struct io_kiocb *req) put_task_struct(req->task); } -static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) +static void io_sq_thread_drop_mm(void) { struct mm_struct *mm = current->mm; @@ -977,8 +978,6 @@ static inline void req_set_fail_links(struct io_kiocb *req) req->flags |= REQ_F_FAIL_LINK; } -static void io_file_put_work(struct work_struct *work); - /* * Note: must call io_req_init_async() for the first time you * touch any members of io_wq_work. @@ -6339,7 +6338,7 @@ static int io_sq_thread(void *data) * adding ourselves to the waitqueue, as the unuse/drop * may sleep. */ - io_sq_thread_drop_mm(ctx); + io_sq_thread_drop_mm(); /* * We're polling. If we're within the defined idle @@ -6410,7 +6409,7 @@ static int io_sq_thread(void *data) io_run_task_work(); - io_sq_thread_drop_mm(ctx); + io_sq_thread_drop_mm(); revert_creds(old_cred); kthread_parkme(); From 248591f5d257a19c1cba9ab9da3536bfbc2f0149 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 24 Jun 2020 13:32:46 +0200 Subject: [PATCH 314/502] kcsan: Make KCSAN compatible with new IRQ state tracking The new IRQ state tracking code does not honor lockdep_off(), and as such we should again permit tracing by using non-raw functions in core.c. 
Update the lockdep_off() comment in report.c, to reflect the fact there is still a potential risk of deadlock due to using printk() from scheduler code. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200624113246.GA170324@elver.google.com --- kernel/kcsan/core.c | 5 ++--- kernel/kcsan/report.c | 9 +++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 15f67949d11e..732623c30359 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -397,8 +397,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) } if (!kcsan_interrupt_watcher) - /* Use raw to avoid lockdep recursion via IRQ flags tracing. */ - raw_local_irq_save(irq_flags); + local_irq_save(irq_flags); watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write); if (watchpoint == NULL) { @@ -539,7 +538,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) kcsan_counter_dec(KCSAN_COUNTER_USED_WATCHPOINTS); out_unlock: if (!kcsan_interrupt_watcher) - raw_local_irq_restore(irq_flags); + local_irq_restore(irq_flags); out: user_access_restore(ua_flags); } diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index ac5f8345bae9..6b2fb1a6d8cd 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -606,10 +606,11 @@ void kcsan_report(const volatile void *ptr, size_t size, int access_type, goto out; /* - * With TRACE_IRQFLAGS, lockdep's IRQ trace state becomes corrupted if - * we do not turn off lockdep here; this could happen due to recursion - * into lockdep via KCSAN if we detect a race in utilities used by - * lockdep. + * Because we may generate reports when we're in scheduler code, the use + * of printk() could deadlock. Until such time that all printing code + * called in print_report() is scheduler-safe, accept the risk, and just + * get our message out. 
As such, also disable lockdep to hide the + * warning, and avoid disabling lockdep for the rest of the kernel. */ lockdep_off(); From 48017e5481ce85ba52c4cff082cad5f021c4b413 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 May 2020 22:40:58 +0200 Subject: [PATCH 315/502] sparc64: Fix asm/percpu.h build error In order to break a header dependency between lockdep and task_struct, I need per-cpu stuff from lockdep. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Acked-by: David S. Miller Link: https://lkml.kernel.org/r/20200623083721.277992771@infradead.org --- arch/sparc/include/asm/percpu_64.h | 2 ++ arch/sparc/include/asm/trap_block.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/sparc/include/asm/percpu_64.h b/arch/sparc/include/asm/percpu_64.h index 32ef6f05cc56..a8786a4b90b6 100644 --- a/arch/sparc/include/asm/percpu_64.h +++ b/arch/sparc/include/asm/percpu_64.h @@ -4,7 +4,9 @@ #include +#ifndef BUILD_VDSO register unsigned long __local_per_cpu_offset asm("g5"); +#endif #ifdef CONFIG_SMP diff --git a/arch/sparc/include/asm/trap_block.h b/arch/sparc/include/asm/trap_block.h index 0f6d0c4f6683..ace0d48e837e 100644 --- a/arch/sparc/include/asm/trap_block.h +++ b/arch/sparc/include/asm/trap_block.h @@ -2,6 +2,8 @@ #ifndef _SPARC_TRAP_BLOCK_H #define _SPARC_TRAP_BLOCK_H +#include + #include #include From 859d069ee1ddd87862e1d6a356a82ed417dbeb67 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 May 2020 15:00:57 +0200 Subject: [PATCH 316/502] lockdep: Prepare for NMI IRQ state tracking There is no reason not to always, accurately, track IRQ state. This change also makes IRQ state tracking ignore lockdep_off(). 
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200623083721.155449112@infradead.org --- kernel/locking/lockdep.c | 46 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 29a8de4c50b9..d595623c4b34 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -395,7 +395,7 @@ void lockdep_init_task(struct task_struct *task) static __always_inline void lockdep_recursion_finish(void) { - if (WARN_ON_ONCE(--current->lockdep_recursion)) + if (WARN_ON_ONCE((--current->lockdep_recursion) & LOCKDEP_RECURSION_MASK)) current->lockdep_recursion = 0; } @@ -3646,7 +3646,16 @@ static void __trace_hardirqs_on_caller(void) */ void lockdep_hardirqs_on_prepare(unsigned long ip) { - if (unlikely(!debug_locks || current->lockdep_recursion)) + if (unlikely(!debug_locks)) + return; + + /* + * NMIs do not (and cannot) track lock dependencies, nothing to do. + */ + if (unlikely(in_nmi())) + return; + + if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) return; if (unlikely(current->hardirqs_enabled)) { @@ -3692,7 +3701,27 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) { struct task_struct *curr = current; - if (unlikely(!debug_locks || curr->lockdep_recursion)) + if (unlikely(!debug_locks)) + return; + + /* + * NMIs can happen in the middle of local_irq_{en,dis}able() where the + * tracking state and hardware state are out of sync. + * + * NMIs must save lockdep_hardirqs_enabled() to restore IRQ state from, + * and not rely on hardware state like normal interrupts. + */ + if (unlikely(in_nmi())) { + /* + * Skip: + * - recursion check, because NMI can hit lockdep; + * - hardware state check, because above; + * - chain_key check, see lockdep_hardirqs_on_prepare(). 
+ */ + goto skip_checks; + } + + if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) return; if (curr->hardirqs_enabled) { @@ -3720,6 +3749,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) DEBUG_LOCKS_WARN_ON(current->hardirq_chain_key != current->curr_chain_key); +skip_checks: /* we'll do an OFF -> ON transition: */ curr->hardirqs_enabled = 1; curr->hardirq_enable_ip = ip; @@ -3735,7 +3765,15 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) { struct task_struct *curr = current; - if (unlikely(!debug_locks || curr->lockdep_recursion)) + if (unlikely(!debug_locks)) + return; + + /* + * Matching lockdep_hardirqs_on(), allow NMIs in the middle of lockdep; + * they will restore the software state. This ensures the software + * state is consistent inside NMIs as well. + */ + if (unlikely(!in_nmi() && (current->lockdep_recursion & LOCKDEP_RECURSION_MASK))) return; /* From d6bdceb6c2276276c0392b926ccd2e5991d5cb9a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 May 2020 22:41:01 +0200 Subject: [PATCH 317/502] powerpc64: Break asm/percpu.h vs spinlock_types.h dependency In order to use asm/percpu.h in lockdep.h, we need to make sure asm/percpu.h does not itself depend on lockdep. The below seems to make that so and builds powerpc64-defconfig + PROVE_LOCKING.
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar https://lkml.kernel.org/r/20200623083721.336906073@infradead.org --- arch/powerpc/include/asm/dtl.h | 52 ++++++++++++++++++++++++++ arch/powerpc/include/asm/lppaca.h | 44 ---------------------- arch/powerpc/include/asm/paca.h | 2 +- arch/powerpc/kernel/time.c | 2 + arch/powerpc/kvm/book3s_hv.c | 1 + arch/powerpc/platforms/pseries/dtl.c | 1 + arch/powerpc/platforms/pseries/lpar.c | 1 + arch/powerpc/platforms/pseries/setup.c | 1 + arch/powerpc/platforms/pseries/svm.c | 1 + 9 files changed, 60 insertions(+), 45 deletions(-) create mode 100644 arch/powerpc/include/asm/dtl.h diff --git a/arch/powerpc/include/asm/dtl.h b/arch/powerpc/include/asm/dtl.h new file mode 100644 index 000000000000..1625888f27ef --- /dev/null +++ b/arch/powerpc/include/asm/dtl.h @@ -0,0 +1,52 @@ +#ifndef _ASM_POWERPC_DTL_H +#define _ASM_POWERPC_DTL_H + +#include +#include + +/* + * Layout of entries in the hypervisor's dispatch trace log buffer. + */ +struct dtl_entry { + u8 dispatch_reason; + u8 preempt_reason; + __be16 processor_id; + __be32 enqueue_to_dispatch_time; + __be32 ready_to_enqueue_time; + __be32 waiting_to_ready_time; + __be64 timebase; + __be64 fault_addr; + __be64 srr0; + __be64 srr1; +}; + +#define DISPATCH_LOG_BYTES 4096 /* bytes per cpu */ +#define N_DISPATCH_LOG (DISPATCH_LOG_BYTES / sizeof(struct dtl_entry)) + +/* + * Dispatch trace log event enable mask: + * 0x1: voluntary virtual processor waits + * 0x2: time-slice preempts + * 0x4: virtual partition memory page faults + */ +#define DTL_LOG_CEDE 0x1 +#define DTL_LOG_PREEMPT 0x2 +#define DTL_LOG_FAULT 0x4 +#define DTL_LOG_ALL (DTL_LOG_CEDE | DTL_LOG_PREEMPT | DTL_LOG_FAULT) + +extern struct kmem_cache *dtl_cache; +extern rwlock_t dtl_access_lock; + +/* + * When CONFIG_VIRT_CPU_ACCOUNTING_NATIVE = y, the cpu accounting code controls + * reading from the dispatch trace log. 
If other code wants to consume + * DTL entries, it can set this pointer to a function that will get + * called once for each DTL entry that gets processed. + */ +extern void (*dtl_consumer)(struct dtl_entry *entry, u64 index); + +extern void register_dtl_buffer(int cpu); +extern void alloc_dtl_buffers(unsigned long *time_limit); +extern long hcall_vphn(unsigned long cpu, u64 flags, __be32 *associativity); + +#endif /* _ASM_POWERPC_DTL_H */ diff --git a/arch/powerpc/include/asm/lppaca.h b/arch/powerpc/include/asm/lppaca.h index 3b4b305796ae..c390ec377bae 100644 --- a/arch/powerpc/include/asm/lppaca.h +++ b/arch/powerpc/include/asm/lppaca.h @@ -42,7 +42,6 @@ */ #include #include -#include #include #include #include @@ -146,49 +145,6 @@ struct slb_shadow { } save_area[SLB_NUM_BOLTED]; } ____cacheline_aligned; -/* - * Layout of entries in the hypervisor's dispatch trace log buffer. - */ -struct dtl_entry { - u8 dispatch_reason; - u8 preempt_reason; - __be16 processor_id; - __be32 enqueue_to_dispatch_time; - __be32 ready_to_enqueue_time; - __be32 waiting_to_ready_time; - __be64 timebase; - __be64 fault_addr; - __be64 srr0; - __be64 srr1; -}; - -#define DISPATCH_LOG_BYTES 4096 /* bytes per cpu */ -#define N_DISPATCH_LOG (DISPATCH_LOG_BYTES / sizeof(struct dtl_entry)) - -/* - * Dispatch trace log event enable mask: - * 0x1: voluntary virtual processor waits - * 0x2: time-slice preempts - * 0x4: virtual partition memory page faults - */ -#define DTL_LOG_CEDE 0x1 -#define DTL_LOG_PREEMPT 0x2 -#define DTL_LOG_FAULT 0x4 -#define DTL_LOG_ALL (DTL_LOG_CEDE | DTL_LOG_PREEMPT | DTL_LOG_FAULT) - -extern struct kmem_cache *dtl_cache; -extern rwlock_t dtl_access_lock; - -/* - * When CONFIG_VIRT_CPU_ACCOUNTING_NATIVE = y, the cpu accounting code controls - * reading from the dispatch trace log. If other code wants to consume - * DTL entries, it can set this pointer to a function that will get - * called once for each DTL entry that gets processed. 
- */ -extern void (*dtl_consumer)(struct dtl_entry *entry, u64 index); - -extern void register_dtl_buffer(int cpu); -extern void alloc_dtl_buffers(unsigned long *time_limit); extern long hcall_vphn(unsigned long cpu, u64 flags, __be32 *associativity); #endif /* CONFIG_PPC_BOOK3S */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 45a839a7c6cf..84b2564cf5a4 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -29,7 +29,6 @@ #include #include #include -#include #include @@ -53,6 +52,7 @@ extern unsigned int debug_smp_processor_id(void); /* from linux/smp.h */ #define get_slb_shadow() (get_paca()->slb_shadow_ptr) struct task_struct; +struct rtas_args; /* * Defines the layout of the paca. diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 6fcae436ae51..f85539ebb513 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -183,6 +183,8 @@ static inline unsigned long read_spurr(unsigned long tb) #ifdef CONFIG_PPC_SPLPAR +#include + /* * Scan the dispatch trace log and count up the stolen time. * Should be called with interrupts disabled. 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 6bf66649ab92..ebb04f331ad3 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -74,6 +74,7 @@ #include #include #include +#include #include "book3s.h" diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index eab8aa293743..982f069e4c31 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index fd26f3d21d7b..f71ff2c94efe 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -40,6 +40,7 @@ #include #include #include +#include #include "pseries.h" diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 2db8469e475f..27094c872fd6 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -70,6 +70,7 @@ #include #include #include +#include #include "pseries.h" #include "../../../../drivers/pci/pci.h" diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c index 40c0637203d5..e6d7a344d9f2 100644 --- a/arch/powerpc/platforms/pseries/svm.c +++ b/arch/powerpc/platforms/pseries/svm.c @@ -11,6 +11,7 @@ #include #include #include +#include static int __init init_svm(void) { From ba1f2b2eaa2a529dba722507c55ff3d761d325dd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 May 2020 15:50:29 +0200 Subject: [PATCH 318/502] x86/entry: Fix NMI vs IRQ state tracking While the nmi_enter() users did trace_hardirqs_{off_prepare,on_finish}() there was no matching lockdep_hardirqs_*() calls to complete the picture. Introduce idtentry_{enter,exit}_nmi() to enable proper IRQ state tracking across the NMIs. 
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200623083721.216740948@infradead.org --- arch/x86/entry/common.c | 42 +++++++++++++++++++++++++++++---- arch/x86/include/asm/idtentry.h | 3 +++ arch/x86/kernel/nmi.c | 9 ++++--- arch/x86/kernel/traps.c | 17 +++++-------- include/linux/hardirq.h | 28 +++++++++++++++------- 5 files changed, 70 insertions(+), 29 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 0521546022cb..63c607dd6c52 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -592,7 +592,7 @@ SYSCALL_DEFINE0(ni_syscall) * The return value must be fed into the state argument of * idtentry_exit(). */ -idtentry_state_t noinstr idtentry_enter(struct pt_regs *regs) +noinstr idtentry_state_t idtentry_enter(struct pt_regs *regs) { idtentry_state_t ret = { .exit_rcu = false, @@ -687,7 +687,7 @@ static void idtentry_exit_cond_resched(struct pt_regs *regs, bool may_sched) * Counterpart to idtentry_enter(). The return value of the entry * function must be fed into the @state argument. */ -void noinstr idtentry_exit(struct pt_regs *regs, idtentry_state_t state) +noinstr void idtentry_exit(struct pt_regs *regs, idtentry_state_t state) { lockdep_assert_irqs_disabled(); @@ -731,7 +731,7 @@ void noinstr idtentry_exit(struct pt_regs *regs, idtentry_state_t state) * Invokes enter_from_user_mode() to establish the proper context for * NOHZ_FULL. Otherwise scheduling on exit would not be possible. */ -void noinstr idtentry_enter_user(struct pt_regs *regs) +noinstr void idtentry_enter_user(struct pt_regs *regs) { check_user_regs(regs); enter_from_user_mode(); @@ -749,13 +749,47 @@ void noinstr idtentry_enter_user(struct pt_regs *regs) * * Counterpart to idtentry_enter_user(). 
*/ -void noinstr idtentry_exit_user(struct pt_regs *regs) +noinstr void idtentry_exit_user(struct pt_regs *regs) { lockdep_assert_irqs_disabled(); prepare_exit_to_usermode(regs); } +noinstr bool idtentry_enter_nmi(struct pt_regs *regs) +{ + bool irq_state = lockdep_hardirqs_enabled(current); + + __nmi_enter(); + lockdep_hardirqs_off(CALLER_ADDR0); + lockdep_hardirq_enter(); + rcu_nmi_enter(); + + instrumentation_begin(); + trace_hardirqs_off_finish(); + ftrace_nmi_enter(); + instrumentation_end(); + + return irq_state; +} + +noinstr void idtentry_exit_nmi(struct pt_regs *regs, bool restore) +{ + instrumentation_begin(); + ftrace_nmi_exit(); + if (restore) { + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + } + instrumentation_end(); + + rcu_nmi_exit(); + lockdep_hardirq_exit(); + if (restore) + lockdep_hardirqs_on(CALLER_ADDR0); + __nmi_exit(); +} + #ifdef CONFIG_XEN_PV #ifndef CONFIG_PREEMPTION /* diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 7227225cf45d..2b0497486525 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -20,6 +20,9 @@ typedef struct idtentry_state { idtentry_state_t idtentry_enter(struct pt_regs *regs); void idtentry_exit(struct pt_regs *regs, idtentry_state_t state); +bool idtentry_enter_nmi(struct pt_regs *regs); +void idtentry_exit_nmi(struct pt_regs *regs, bool irq_state); + /** * DECLARE_IDTENTRY - Declare functions for simple IDT entry points * No error code pushed by hardware diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index d7c5e44b26f7..4fc9954a9560 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -330,7 +330,6 @@ static noinstr void default_do_nmi(struct pt_regs *regs) __this_cpu_write(last_nmi_rip, regs->ip); instrumentation_begin(); - trace_hardirqs_off_finish(); handled = nmi_handle(NMI_LOCAL, regs); __this_cpu_add(nmi_stats.normal, handled); @@ -417,8 +416,6 @@ static noinstr void default_do_nmi(struct 
pt_regs *regs) unknown_nmi_error(reason, regs); out: - if (regs->flags & X86_EFLAGS_IF) - trace_hardirqs_on_prepare(); instrumentation_end(); } @@ -478,6 +475,8 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7); DEFINE_IDTENTRY_RAW(exc_nmi) { + bool irq_state; + if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id())) return; @@ -491,14 +490,14 @@ nmi_restart: this_cpu_write(nmi_dr7, local_db_save()); - nmi_enter(); + irq_state = idtentry_enter_nmi(regs); inc_irq_stat(__nmi_count); if (!ignore_nmis) default_do_nmi(regs); - nmi_exit(); + idtentry_exit_nmi(regs, irq_state); local_db_restore(this_cpu_read(nmi_dr7)); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4627f826fb57..cdd73829e637 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -403,7 +403,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault) } #endif - nmi_enter(); + idtentry_enter_nmi(regs); instrumentation_begin(); notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); @@ -649,15 +649,12 @@ DEFINE_IDTENTRY_RAW(exc_int3) instrumentation_end(); idtentry_exit_user(regs); } else { - nmi_enter(); + bool irq_state = idtentry_enter_nmi(regs); instrumentation_begin(); - trace_hardirqs_off_finish(); if (!do_int3(regs)) die("int3", regs, 0); - if (regs->flags & X86_EFLAGS_IF) - trace_hardirqs_on_prepare(); instrumentation_end(); - nmi_exit(); + idtentry_exit_nmi(regs, irq_state); } } @@ -865,9 +862,8 @@ out: static __always_inline void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6) { - nmi_enter(); + bool irq_state = idtentry_enter_nmi(regs); instrumentation_begin(); - trace_hardirqs_off_finish(); /* * If something gets miswired and we end up here for a user mode @@ -884,10 +880,8 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, handle_debug(regs, dr6, false); - if (regs->flags & X86_EFLAGS_IF) - trace_hardirqs_on_prepare(); instrumentation_end(); - nmi_exit(); + idtentry_exit_nmi(regs, irq_state); } static __always_inline void 
exc_debug_user(struct pt_regs *regs, @@ -903,6 +897,7 @@ static __always_inline void exc_debug_user(struct pt_regs *regs, instrumentation_begin(); handle_debug(regs, dr6, true); + instrumentation_end(); idtentry_exit_user(regs); } diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 03c9fece7d43..754f67ac4326 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -111,32 +111,42 @@ extern void rcu_nmi_exit(void); /* * nmi_enter() can nest up to 15 times; see NMI_BITS. */ -#define nmi_enter() \ +#define __nmi_enter() \ do { \ + lockdep_off(); \ arch_nmi_enter(); \ printk_nmi_enter(); \ - lockdep_off(); \ BUG_ON(in_nmi() == NMI_MASK); \ __preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \ - rcu_nmi_enter(); \ + } while (0) + +#define nmi_enter() \ + do { \ + __nmi_enter(); \ lockdep_hardirq_enter(); \ + rcu_nmi_enter(); \ instrumentation_begin(); \ ftrace_nmi_enter(); \ instrumentation_end(); \ } while (0) +#define __nmi_exit() \ + do { \ + BUG_ON(!in_nmi()); \ + __preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \ + printk_nmi_exit(); \ + arch_nmi_exit(); \ + lockdep_on(); \ + } while (0) + #define nmi_exit() \ do { \ instrumentation_begin(); \ ftrace_nmi_exit(); \ instrumentation_end(); \ - lockdep_hardirq_exit(); \ rcu_nmi_exit(); \ - BUG_ON(!in_nmi()); \ - __preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \ - lockdep_on(); \ - printk_nmi_exit(); \ - arch_nmi_exit(); \ + lockdep_hardirq_exit(); \ + __nmi_exit(); \ } while (0) #endif /* LINUX_HARDIRQ_H */ From 28e5bfd81c8de77504703adf24ceff9301e3c7be Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 May 2020 22:41:05 +0200 Subject: [PATCH 319/502] s390: Break cyclic percpu include In order to use asm/percpu.h in irqflags.h, we need to make sure asm/percpu.h does not itself depend on irqflags.h. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200623083721.396143816@infradead.org --- arch/s390/include/asm/smp.h | 1 +
arch/s390/include/asm/thread_info.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index 7326f110d48c..f48a43b63d9e 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -10,6 +10,7 @@ #include #include +#include #define raw_smp_processor_id() (S390_lowcore.cpu_nr) diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index e582fbe59e20..13a04fcf7762 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -24,7 +24,6 @@ #ifndef __ASSEMBLY__ #include #include -#include #define STACK_INIT_OFFSET \ (THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs)) From a6342915881a687b07847b7c57628de07a256525 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jun 2020 17:21:58 +0200 Subject: [PATCH 320/502] arm: Break cyclic percpu include In order to use asm/percpu.h in irqflags.h, we need to make sure asm/percpu.h does not itself depend on irqflags.h. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Acked-by: Will Deacon Link: https://lkml.kernel.org/r/20200623083721.454517573@infradead.org --- arch/arm/include/asm/percpu.h | 2 ++ arch/arm/include/asm/thread_info.h | 5 ----- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/arm/include/asm/percpu.h b/arch/arm/include/asm/percpu.h index f44f448537f2..e2fcb3cfd3de 100644 --- a/arch/arm/include/asm/percpu.h +++ b/arch/arm/include/asm/percpu.h @@ -5,6 +5,8 @@ #ifndef _ASM_ARM_PERCPU_H_ #define _ASM_ARM_PERCPU_H_ +register unsigned long current_stack_pointer asm ("sp"); + /* * Same as asm-generic/percpu.h, except that we store the per cpu offset * in the TPIDRPRW.
TPIDRPRW only exists on V6K and V7 diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index 3609a6980c34..536b6b979f63 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -75,11 +75,6 @@ struct thread_info { .addr_limit = KERNEL_DS, \ } -/* - * how to get the current stack pointer in C - */ -register unsigned long current_stack_pointer asm ("sp"); - /* * how to get the thread information struct from C */ From a21ee6055c30ce68c4e201c6496f0ed2a1936230 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2020 12:22:41 +0200 Subject: [PATCH 321/502] lockdep: Change hardirq{s_enabled,_context} to per-cpu variables Currently all IRQ-tracking state is in task_struct, this means that task_struct needs to be defined before we use it. Especially for lockdep_assert_irq*() this can lead to header-hell. Move the hardirq state into per-cpu variables to avoid the task_struct dependency. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200623083721.512673481@infradead.org --- include/linux/irqflags.h | 19 ++++++++++++------- include/linux/lockdep.h | 34 ++++++++++++++++++---------------- include/linux/sched.h | 2 -- kernel/fork.c | 4 +--- kernel/locking/lockdep.c | 30 +++++++++++++++--------------- kernel/softirq.c | 6 ++++++ 6 files changed, 52 insertions(+), 43 deletions(-) diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 6384d2813ded..255444fe4609 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -14,6 +14,7 @@ #include #include +#include /* Currently lockdep_softirqs_on/off is used only by lockdep */ #ifdef CONFIG_PROVE_LOCKING @@ -31,18 +32,22 @@ #endif #ifdef CONFIG_TRACE_IRQFLAGS + +DECLARE_PER_CPU(int, hardirqs_enabled); +DECLARE_PER_CPU(int, hardirq_context); + extern void trace_hardirqs_on_prepare(void); extern void trace_hardirqs_off_finish(void); extern void trace_hardirqs_on(void); extern void 
trace_hardirqs_off(void); -# define lockdep_hardirq_context(p) ((p)->hardirq_context) +# define lockdep_hardirq_context(p) (this_cpu_read(hardirq_context)) # define lockdep_softirq_context(p) ((p)->softirq_context) -# define lockdep_hardirqs_enabled(p) ((p)->hardirqs_enabled) +# define lockdep_hardirqs_enabled(p) (this_cpu_read(hardirqs_enabled)) # define lockdep_softirqs_enabled(p) ((p)->softirqs_enabled) -# define lockdep_hardirq_enter() \ -do { \ - if (!current->hardirq_context++) \ - current->hardirq_threaded = 0; \ +# define lockdep_hardirq_enter() \ +do { \ + if (this_cpu_inc_return(hardirq_context) == 1) \ + current->hardirq_threaded = 0; \ } while (0) # define lockdep_hardirq_threaded() \ do { \ @@ -50,7 +55,7 @@ do { \ } while (0) # define lockdep_hardirq_exit() \ do { \ - current->hardirq_context--; \ + this_cpu_dec(hardirq_context); \ } while (0) # define lockdep_softirq_enter() \ do { \ diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 3b73cf84f77d..be6cb17a8879 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -11,6 +11,7 @@ #define __LINUX_LOCKDEP_H #include +#include struct task_struct; @@ -529,28 +530,29 @@ do { \ lock_release(&(lock)->dep_map, _THIS_IP_); \ } while (0) -#define lockdep_assert_irqs_enabled() do { \ - WARN_ONCE(debug_locks && !current->lockdep_recursion && \ - !current->hardirqs_enabled, \ - "IRQs not enabled as expected\n"); \ - } while (0) +DECLARE_PER_CPU(int, hardirqs_enabled); +DECLARE_PER_CPU(int, hardirq_context); -#define lockdep_assert_irqs_disabled() do { \ - WARN_ONCE(debug_locks && !current->lockdep_recursion && \ - current->hardirqs_enabled, \ - "IRQs not disabled as expected\n"); \ - } while (0) +#define lockdep_assert_irqs_enabled() \ +do { \ + WARN_ON_ONCE(debug_locks && !this_cpu_read(hardirqs_enabled)); \ +} while (0) -#define lockdep_assert_in_irq() do { \ - WARN_ONCE(debug_locks && !current->lockdep_recursion && \ - !current->hardirq_context, \ - "Not in hardirq as 
expected\n"); \ - } while (0) +#define lockdep_assert_irqs_disabled() \ +do { \ + WARN_ON_ONCE(debug_locks && this_cpu_read(hardirqs_enabled)); \ +} while (0) + +#define lockdep_assert_in_irq() \ +do { \ + WARN_ON_ONCE(debug_locks && !this_cpu_read(hardirq_context)); \ +} while (0) #else # define might_lock(lock) do { } while (0) # define might_lock_read(lock) do { } while (0) # define might_lock_nested(lock, subclass) do { } while (0) + # define lockdep_assert_irqs_enabled() do { } while (0) # define lockdep_assert_irqs_disabled() do { } while (0) # define lockdep_assert_in_irq() do { } while (0) @@ -560,7 +562,7 @@ do { \ # define lockdep_assert_RT_in_threaded_ctx() do { \ WARN_ONCE(debug_locks && !current->lockdep_recursion && \ - current->hardirq_context && \ + lockdep_hardirq_context(current) && \ !(current->hardirq_threaded || current->irq_config), \ "Not in threaded context on PREEMPT_RT as expected\n"); \ } while (0) diff --git a/include/linux/sched.h b/include/linux/sched.h index 692e327d7455..3903a9500926 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -990,8 +990,6 @@ struct task_struct { unsigned long hardirq_disable_ip; unsigned int hardirq_enable_event; unsigned int hardirq_disable_event; - int hardirqs_enabled; - int hardirq_context; u64 hardirq_chain_key; unsigned long softirq_disable_ip; unsigned long softirq_enable_ip; diff --git a/kernel/fork.c b/kernel/fork.c index efc5493203ae..70d9d0a4de2a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1954,8 +1954,8 @@ static __latent_entropy struct task_struct *copy_process( rt_mutex_init_task(p); + lockdep_assert_irqs_enabled(); #ifdef CONFIG_PROVE_LOCKING - DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif retval = -EAGAIN; @@ -2036,7 +2036,6 @@ static __latent_entropy struct task_struct *copy_process( #endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; - p->hardirqs_enabled = 0; p->hardirq_enable_ip = 0; p->hardirq_enable_event = 0; 
p->hardirq_disable_ip = _THIS_IP_; @@ -2046,7 +2045,6 @@ static __latent_entropy struct task_struct *copy_process( p->softirq_enable_event = 0; p->softirq_disable_ip = 0; p->softirq_disable_event = 0; - p->hardirq_context = 0; p->softirq_context = 0; #endif diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index d595623c4b34..ab4ffbe0e9e9 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2062,9 +2062,9 @@ print_bad_irq_dependency(struct task_struct *curr, pr_warn("-----------------------------------------------------\n"); pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), - curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, + lockdep_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, - curr->hardirqs_enabled, + lockdep_hardirqs_enabled(curr), curr->softirqs_enabled); print_lock(next); @@ -3658,7 +3658,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) return; - if (unlikely(current->hardirqs_enabled)) { + if (unlikely(lockdep_hardirqs_enabled(current))) { /* * Neither irq nor preemption are disabled here * so this is racy by nature but losing one hit @@ -3686,7 +3686,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) * Can't allow enabling interrupts while in an interrupt handler, * that's general bad form and such. Recursion, limited stack etc.. 
*/ - if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) + if (DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context(current))) return; current->hardirq_chain_key = current->curr_chain_key; @@ -3724,7 +3724,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) return; - if (curr->hardirqs_enabled) { + if (lockdep_hardirqs_enabled(curr)) { /* * Neither irq nor preemption are disabled here * so this is racy by nature but losing one hit @@ -3751,7 +3751,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) skip_checks: /* we'll do an OFF -> ON transition: */ - curr->hardirqs_enabled = 1; + this_cpu_write(hardirqs_enabled, 1); curr->hardirq_enable_ip = ip; curr->hardirq_enable_event = ++curr->irq_events; debug_atomic_inc(hardirqs_on_events); @@ -3783,11 +3783,11 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (curr->hardirqs_enabled) { + if (lockdep_hardirqs_enabled(curr)) { /* * We have done an ON -> OFF transition: */ - curr->hardirqs_enabled = 0; + this_cpu_write(hardirqs_enabled, 0); curr->hardirq_disable_ip = ip; curr->hardirq_disable_event = ++curr->irq_events; debug_atomic_inc(hardirqs_off_events); @@ -3832,7 +3832,7 @@ void lockdep_softirqs_on(unsigned long ip) * usage bit for all held locks, if hardirqs are * enabled too: */ - if (curr->hardirqs_enabled) + if (lockdep_hardirqs_enabled(curr)) mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); lockdep_recursion_finish(); } @@ -3881,7 +3881,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) */ if (!hlock->trylock) { if (hlock->read) { - if (curr->hardirq_context) + if (lockdep_hardirq_context(curr)) if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ_READ)) return 0; @@ -3890,7 +3890,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) LOCK_USED_IN_SOFTIRQ_READ)) return 0; } else { - if (curr->hardirq_context) + if (lockdep_hardirq_context(curr)) if 
(!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) return 0; if (curr->softirq_context) @@ -3928,7 +3928,7 @@ lock_used: static inline unsigned int task_irq_context(struct task_struct *task) { - return LOCK_CHAIN_HARDIRQ_CONTEXT * !!task->hardirq_context + + return LOCK_CHAIN_HARDIRQ_CONTEXT * !!lockdep_hardirq_context(task) + LOCK_CHAIN_SOFTIRQ_CONTEXT * !!task->softirq_context; } @@ -4021,7 +4021,7 @@ static inline short task_wait_context(struct task_struct *curr) * Set appropriate wait type for the context; for IRQs we have to take * into account force_irqthread as that is implied by PREEMPT_RT. */ - if (curr->hardirq_context) { + if (lockdep_hardirq_context(curr)) { /* * Check if force_irqthreads will run us threaded. */ @@ -4864,11 +4864,11 @@ static void check_flags(unsigned long flags) return; if (irqs_disabled_flags(flags)) { - if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) { + if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled(current))) { printk("possible reason: unannotated irqs-off.\n"); } } else { - if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) { + if (DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled(current))) { printk("possible reason: unannotated irqs-on.\n"); } } diff --git a/kernel/softirq.c b/kernel/softirq.c index c4201b7f42b1..342c53feaa7a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -107,6 +107,12 @@ static bool ksoftirqd_running(unsigned long pending) * where hardirqs are disabled legitimately: */ #ifdef CONFIG_TRACE_IRQFLAGS + +DEFINE_PER_CPU(int, hardirqs_enabled); +DEFINE_PER_CPU(int, hardirq_context); +EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled); +EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context); + void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) { unsigned long flags; From f9ad4a5f3f20bee022b1bdde94e5ece6dc0b0edc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 May 2020 13:03:26 +0200 Subject: [PATCH 322/502] lockdep: Remove lockdep_hardirq{s_enabled,_context}() argument Now that the macros use per-cpu 
data, we no longer need the argument. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200623083721.571835311@infradead.org --- arch/x86/entry/common.c | 2 +- include/linux/irqflags.h | 8 ++++---- include/linux/lockdep.h | 2 +- kernel/locking/lockdep.c | 30 +++++++++++++++--------------- kernel/softirq.c | 2 +- tools/include/linux/irqflags.h | 4 ++-- 6 files changed, 24 insertions(+), 24 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 63c607dd6c52..4ea640363f5d 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -758,7 +758,7 @@ noinstr void idtentry_exit_user(struct pt_regs *regs) noinstr bool idtentry_enter_nmi(struct pt_regs *regs) { - bool irq_state = lockdep_hardirqs_enabled(current); + bool irq_state = lockdep_hardirqs_enabled(); __nmi_enter(); lockdep_hardirqs_off(CALLER_ADDR0); diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 255444fe4609..5811ee8a5cd8 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -40,9 +40,9 @@ DECLARE_PER_CPU(int, hardirq_context); extern void trace_hardirqs_off_finish(void); extern void trace_hardirqs_on(void); extern void trace_hardirqs_off(void); -# define lockdep_hardirq_context(p) (this_cpu_read(hardirq_context)) +# define lockdep_hardirq_context() (this_cpu_read(hardirq_context)) # define lockdep_softirq_context(p) ((p)->softirq_context) -# define lockdep_hardirqs_enabled(p) (this_cpu_read(hardirqs_enabled)) +# define lockdep_hardirqs_enabled() (this_cpu_read(hardirqs_enabled)) # define lockdep_softirqs_enabled(p) ((p)->softirqs_enabled) # define lockdep_hardirq_enter() \ do { \ @@ -109,9 +109,9 @@ do { \ # define trace_hardirqs_off_finish() do { } while (0) # define trace_hardirqs_on() do { } while (0) # define trace_hardirqs_off() do { } while (0) -# define lockdep_hardirq_context(p) 0 +# define lockdep_hardirq_context() 0 # define lockdep_softirq_context(p) 0 -# define 
lockdep_hardirqs_enabled(p) 0 +# define lockdep_hardirqs_enabled() 0 # define lockdep_softirqs_enabled(p) 0 # define lockdep_hardirq_enter() do { } while (0) # define lockdep_hardirq_threaded() do { } while (0) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index be6cb17a8879..fd04b9e96091 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -562,7 +562,7 @@ do { \ # define lockdep_assert_RT_in_threaded_ctx() do { \ WARN_ONCE(debug_locks && !current->lockdep_recursion && \ - lockdep_hardirq_context(current) && \ + lockdep_hardirq_context() && \ !(current->hardirq_threaded || current->irq_config), \ "Not in threaded context on PREEMPT_RT as expected\n"); \ } while (0) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index ab4ffbe0e9e9..c9ea05edce25 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2062,9 +2062,9 @@ print_bad_irq_dependency(struct task_struct *curr, pr_warn("-----------------------------------------------------\n"); pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), - lockdep_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, + lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT, curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, - lockdep_hardirqs_enabled(curr), + lockdep_hardirqs_enabled(), curr->softirqs_enabled); print_lock(next); @@ -3331,9 +3331,9 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", curr->comm, task_pid_nr(curr), - lockdep_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, + lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT, lockdep_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, - lockdep_hardirqs_enabled(curr), + lockdep_hardirqs_enabled(), lockdep_softirqs_enabled(curr)); print_lock(this); @@ -3658,7 +3658,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) if 
(unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) return; - if (unlikely(lockdep_hardirqs_enabled(current))) { + if (unlikely(lockdep_hardirqs_enabled())) { /* * Neither irq nor preemption are disabled here * so this is racy by nature but losing one hit @@ -3686,7 +3686,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) * Can't allow enabling interrupts while in an interrupt handler, * that's general bad form and such. Recursion, limited stack etc.. */ - if (DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context(current))) + if (DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context())) return; current->hardirq_chain_key = current->curr_chain_key; @@ -3724,7 +3724,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) return; - if (lockdep_hardirqs_enabled(curr)) { + if (lockdep_hardirqs_enabled()) { /* * Neither irq nor preemption are disabled here * so this is racy by nature but losing one hit @@ -3783,7 +3783,7 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (lockdep_hardirqs_enabled(curr)) { + if (lockdep_hardirqs_enabled()) { /* * We have done an ON -> OFF transition: */ @@ -3832,7 +3832,7 @@ void lockdep_softirqs_on(unsigned long ip) * usage bit for all held locks, if hardirqs are * enabled too: */ - if (lockdep_hardirqs_enabled(curr)) + if (lockdep_hardirqs_enabled()) mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); lockdep_recursion_finish(); } @@ -3881,7 +3881,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) */ if (!hlock->trylock) { if (hlock->read) { - if (lockdep_hardirq_context(curr)) + if (lockdep_hardirq_context()) if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ_READ)) return 0; @@ -3890,7 +3890,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) LOCK_USED_IN_SOFTIRQ_READ)) return 0; } else { - if (lockdep_hardirq_context(curr)) + if (lockdep_hardirq_context()) if 
(!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) return 0; if (curr->softirq_context) @@ -3928,7 +3928,7 @@ lock_used: static inline unsigned int task_irq_context(struct task_struct *task) { - return LOCK_CHAIN_HARDIRQ_CONTEXT * !!lockdep_hardirq_context(task) + + return LOCK_CHAIN_HARDIRQ_CONTEXT * !!lockdep_hardirq_context() + LOCK_CHAIN_SOFTIRQ_CONTEXT * !!task->softirq_context; } @@ -4021,7 +4021,7 @@ static inline short task_wait_context(struct task_struct *curr) * Set appropriate wait type for the context; for IRQs we have to take * into account force_irqthread as that is implied by PREEMPT_RT. */ - if (lockdep_hardirq_context(curr)) { + if (lockdep_hardirq_context()) { /* * Check if force_irqthreads will run us threaded. */ @@ -4864,11 +4864,11 @@ static void check_flags(unsigned long flags) return; if (irqs_disabled_flags(flags)) { - if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled(current))) { + if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled())) { printk("possible reason: unannotated irqs-off.\n"); } } else { - if (DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled(current))) { + if (DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled())) { printk("possible reason: unannotated irqs-on.\n"); } } diff --git a/kernel/softirq.c b/kernel/softirq.c index 342c53feaa7a..5e9aaa648a74 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -230,7 +230,7 @@ static inline bool lockdep_softirq_start(void) { bool in_hardirq = false; - if (lockdep_hardirq_context(current)) { + if (lockdep_hardirq_context()) { in_hardirq = true; lockdep_hardirq_exit(); } diff --git a/tools/include/linux/irqflags.h b/tools/include/linux/irqflags.h index 67e01bbadbfe..501262aee8ff 100644 --- a/tools/include/linux/irqflags.h +++ b/tools/include/linux/irqflags.h @@ -2,9 +2,9 @@ #ifndef _LIBLOCKDEP_LINUX_TRACE_IRQFLAGS_H_ #define _LIBLOCKDEP_LINUX_TRACE_IRQFLAGS_H_ -# define lockdep_hardirq_context(p) 0 +# define lockdep_hardirq_context() 0 # define lockdep_softirq_context(p) 0 -# define 
lockdep_hardirqs_enabled(p) 0 +# define lockdep_hardirqs_enabled() 0 # define lockdep_softirqs_enabled(p) 0 # define lockdep_hardirq_enter() do { } while (0) # define lockdep_hardirq_exit() do { } while (0) From 776499058167d9f41c8eb468e21fe2d241c0b8e6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 1 Jul 2020 16:18:29 +0200 Subject: [PATCH 323/502] mm/memblock: expose only minimal interface to add/walk physmem "physmem" in the memblock allocator is somewhat weird: it's not actually used for allocation, it's simply information collected during boot, which describes the unmodified physical memory map at boot time, without any standby/hotplugged memory. It's only used on s390 and is currently the only reason s390 keeps using CONFIG_ARCH_KEEP_MEMBLOCK. Physmem isn't numa aware and current users don't specify any flags. Let's hide it from the user, exposing only for_each_physmem_range(), and simplify. The interface for physmem is now really minimalistic: - memblock_physmem_add() to add ranges - for_each_physmem_range() / __next_physmem_range() to walk physmem ranges Don't place it into an __init section and don't discard it without CONFIG_ARCH_KEEP_MEMBLOCK. As we're reusing __next_mem_range(), remove the __meminit annotation to avoid section mismatch warnings once CONFIG_ARCH_KEEP_MEMBLOCK is no longer used with CONFIG_HAVE_MEMBLOCK_PHYS_MAP. While fixing up the documentation, sneak in some related cleanups. We can stop setting CONFIG_ARCH_KEEP_MEMBLOCK for s390 next. 
Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Mike Rapoport Cc: Andrew Morton Signed-off-by: David Hildenbrand Reviewed-by: Mike Rapoport Message-Id: <20200701141830.18749-2-david@redhat.com> Signed-off-by: Heiko Carstens --- arch/s390/kernel/crash_dump.c | 6 ++-- include/linux/memblock.h | 28 ++++++++++++++--- mm/memblock.c | 57 ++++++++++++++++++----------------- 3 files changed, 55 insertions(+), 36 deletions(-) diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index f96a5857bbfd..c42ce348103c 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -549,8 +549,7 @@ static int get_mem_chunk_cnt(void) int cnt = 0; u64 idx; - for_each_mem_range(idx, &memblock.physmem, &oldmem_type, NUMA_NO_NODE, - MEMBLOCK_NONE, NULL, NULL, NULL) + for_each_physmem_range(idx, &oldmem_type, NULL, NULL) cnt++; return cnt; } @@ -563,8 +562,7 @@ static void loads_init(Elf64_Phdr *phdr, u64 loads_offset) phys_addr_t start, end; u64 idx; - for_each_mem_range(idx, &memblock.physmem, &oldmem_type, NUMA_NO_NODE, - MEMBLOCK_NONE, &start, &end, NULL) { + for_each_physmem_range(idx, &oldmem_type, &start, &end) { phdr->p_filesz = end - start; phdr->p_type = PT_LOAD; phdr->p_offset = start; diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 017fae833d4a..9d925db0d355 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -77,16 +77,12 @@ struct memblock_type { * @current_limit: physical address of the current allocation limit * @memory: usable memory regions * @reserved: reserved memory regions - * @physmem: all physical memory */ struct memblock { bool bottom_up; /* is bottom up direction? 
*/ phys_addr_t current_limit; struct memblock_type memory; struct memblock_type reserved; -#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP - struct memblock_type physmem; -#endif }; extern struct memblock memblock; @@ -145,6 +141,30 @@ void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start, void __memblock_free_late(phys_addr_t base, phys_addr_t size); +#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP +static inline void __next_physmem_range(u64 *idx, struct memblock_type *type, + phys_addr_t *out_start, + phys_addr_t *out_end) +{ + extern struct memblock_type physmem; + + __next_mem_range(idx, NUMA_NO_NODE, MEMBLOCK_NONE, &physmem, type, + out_start, out_end, NULL); +} + +/** + * for_each_physmem_range - iterate through physmem areas not included in type. + * @i: u64 used as loop variable + * @type: ptr to memblock_type which excludes from the iteration, can be %NULL + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL + */ +#define for_each_physmem_range(i, type, p_start, p_end) \ + for (i = 0, __next_physmem_range(&i, type, p_start, p_end); \ + i != (u64)ULLONG_MAX; \ + __next_physmem_range(&i, type, p_start, p_end)) +#endif /* CONFIG_HAVE_MEMBLOCK_PHYS_MAP */ + /** * for_each_mem_range - iterate through memblock areas from type_a and not * included in type_b. Or just type_a if type_b is NULL. diff --git a/mm/memblock.c b/mm/memblock.c index 39aceafc57f6..45f198750be9 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -44,19 +44,20 @@ * in the system, for instance when the memory is restricted with * ``mem=`` command line parameter * * ``reserved`` - describes the regions that were allocated - * * ``physmap`` - describes the actual physical memory regardless of - * the possible restrictions; the ``physmap`` type is only available - * on some architectures. 
+ * * ``physmem`` - describes the actual physical memory available during + * boot regardless of the possible restrictions and memory hot(un)plug; + * the ``physmem`` type is only available on some architectures. * * Each region is represented by :c:type:`struct memblock_region` that * defines the region extents, its attributes and NUMA node id on NUMA * systems. Every memory type is described by the :c:type:`struct * memblock_type` which contains an array of memory regions along with - * the allocator metadata. The memory types are nicely wrapped with - * :c:type:`struct memblock`. This structure is statically initialzed - * at build time. The region arrays for the "memory" and "reserved" - * types are initially sized to %INIT_MEMBLOCK_REGIONS and for the - * "physmap" type to %INIT_PHYSMEM_REGIONS. + * the allocator metadata. The "memory" and "reserved" types are nicely + * wrapped with :c:type:`struct memblock`. This structure is statically + * initialized at build time. The region arrays are initially sized to + * %INIT_MEMBLOCK_REGIONS for "memory" and %INIT_MEMBLOCK_RESERVED_REGIONS + * for "reserved". The region array for "physmem" is initially sized to + * %INIT_PHYSMEM_REGIONS. * The memblock_allow_resize() enables automatic resizing of the region * arrays during addition of new regions. This feature should be used * with care so that memory allocated for the region array will not @@ -87,8 +88,8 @@ * function frees all the memory to the buddy page allocator. * * Unless an architecture enables %CONFIG_ARCH_KEEP_MEMBLOCK, the - * memblock data structures will be discarded after the system - * initialization completes. + * memblock data structures (except "physmem") will be discarded after the + * system initialization completes. 
*/ #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -104,7 +105,7 @@ unsigned long long max_possible_pfn; static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock; #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP -static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock; +static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS]; #endif struct memblock memblock __initdata_memblock = { @@ -118,17 +119,19 @@ struct memblock memblock __initdata_memblock = { .reserved.max = INIT_MEMBLOCK_RESERVED_REGIONS, .reserved.name = "reserved", -#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP - .physmem.regions = memblock_physmem_init_regions, - .physmem.cnt = 1, /* empty dummy entry */ - .physmem.max = INIT_PHYSMEM_REGIONS, - .physmem.name = "physmem", -#endif - .bottom_up = false, .current_limit = MEMBLOCK_ALLOC_ANYWHERE, }; +#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP +struct memblock_type physmem = { + .regions = memblock_physmem_init_regions, + .cnt = 1, /* empty dummy entry */ + .max = INIT_PHYSMEM_REGIONS, + .name = "physmem", +}; +#endif + int memblock_debug __initdata_memblock; static bool system_has_some_mirror __initdata_memblock = false; static int memblock_can_resize __initdata_memblock; @@ -838,7 +841,7 @@ int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size) memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, &base, &end, (void *)_RET_IP_); - return memblock_add_range(&memblock.physmem, base, size, MAX_NUMNODES, 0); + return memblock_add_range(&physmem, base, size, MAX_NUMNODES, 0); } #endif @@ -1019,12 +1022,10 @@ static bool should_skip_region(struct memblock_region *m, int nid, int flags) * As both region arrays are sorted, the function advances the two indices * in lockstep and returns each intersection. 
*/ -void __init_memblock __next_mem_range(u64 *idx, int nid, - enum memblock_flags flags, - struct memblock_type *type_a, - struct memblock_type *type_b, - phys_addr_t *out_start, - phys_addr_t *out_end, int *out_nid) +void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags, + struct memblock_type *type_a, + struct memblock_type *type_b, phys_addr_t *out_start, + phys_addr_t *out_end, int *out_nid) { int idx_a = *idx & 0xffffffff; int idx_b = *idx >> 32; @@ -1924,7 +1925,7 @@ void __init_memblock __memblock_dump_all(void) memblock_dump(&memblock.memory); memblock_dump(&memblock.reserved); #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP - memblock_dump(&memblock.physmem); + memblock_dump(&physmem); #endif } @@ -2064,8 +2065,8 @@ static int __init memblock_init_debugfs(void) debugfs_create_file("reserved", 0444, root, &memblock.reserved, &memblock_debug_fops); #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP - debugfs_create_file("physmem", 0444, root, - &memblock.physmem, &memblock_debug_fops); + debugfs_create_file("physmem", 0444, root, &physmem, + &memblock_debug_fops); #endif return 0; From fa49066fc326b78e7141d68387179f8968e0e1f0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 1 Jul 2020 16:18:30 +0200 Subject: [PATCH 324/502] s390/mm: don't set ARCH_KEEP_MEMBLOCK Commit 50be63450728 ("s390/mm: Convert bootmem to memblock") mentions "The original bootmem allocator is getting replaced by memblock. To cover the needs of the s390 kdump implementation the physical memory list is used." As we can now reference "physmem" managed in the memblock allocator after init even without ARCH_KEEP_MEMBLOCK, and s390x does no longer need other memblock metadata after boot (esp., the zcore memmap device that used it got removed), we can stop setting ARCH_KEEP_MEMBLOCK. With this change, we no longer create memblocks for standby/hotplugged memory (added via add_memory()) and free up memblock metadata (except physmem) after boot. 
Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Philipp Rudo Cc: Mike Rapoport Cc: Andrew Morton Signed-off-by: David Hildenbrand Message-Id: <20200701141830.18749-3-david@redhat.com> Signed-off-by: Heiko Carstens --- arch/s390/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index c7d7ede6300c..7697a1f8e819 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -102,7 +102,6 @@ config S390 select ARCH_INLINE_WRITE_UNLOCK_BH select ARCH_INLINE_WRITE_UNLOCK_IRQ select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE - select ARCH_KEEP_MEMBLOCK select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_NUMA_BALANCING From c8337c47deb9338417c61e7a6ba7de690eb1d300 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Wed, 1 Jul 2020 12:40:39 +0200 Subject: [PATCH 325/502] s390/ap: rework crypto config info and default domain code Rework of the QCI crypto info and how it is used. This is only an internal rework and does not affect how the ap bus acts with ap card and queue devices and domain handling. Tested on z15, z14, z12 (QCI support) and z196 (no QCI support). Signed-off-by: Harald Freudenberger Signed-off-by: Heiko Carstens --- drivers/s390/crypto/ap_bus.c | 305 ++++++++++++++++++----------------- 1 file changed, 155 insertions(+), 150 deletions(-) diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index 64fa66788194..f218a0b67ed5 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -73,8 +73,7 @@ EXPORT_SYMBOL(ap_perms); DEFINE_MUTEX(ap_perms_mutex); EXPORT_SYMBOL(ap_perms_mutex); -static struct ap_config_info *ap_configuration; -static bool initialised; +static struct ap_config_info *ap_qci_info; /* * AP bus related debug feature things. 
@@ -105,8 +104,10 @@ static struct hrtimer ap_poll_timer; */ static unsigned long long poll_timeout = 250000; -/* Maximum domain id */ -static int ap_max_domain_id; +/* Maximum domain id, if not given via qci */ +static int ap_max_domain_id = 15; +/* Maximum adapter id, if not given via qci */ +static int ap_max_adapter_id = 63; static struct bus_type ap_bus_type; @@ -154,12 +155,12 @@ static int ap_interrupts_available(void) } /** - * ap_configuration_available(): Test if AP configuration - * information is available. + * ap_qci_available(): Test if AP configuration + * information can be queried via QCI subfunction. * - * Returns 1 if AP configuration information is available. + * Returns 1 if subfunction PQAP(QCI) is available. */ -static int ap_configuration_available(void) +static int ap_qci_available(void) { return test_facility(12); } @@ -182,22 +183,22 @@ static int ap_apft_available(void) */ static inline int ap_qact_available(void) { - if (ap_configuration) - return ap_configuration->qact; + if (ap_qci_info) + return ap_qci_info->qact; return 0; } /* - * ap_query_configuration(): Fetch cryptographic config info + * ap_fetch_qci_info(): Fetch cryptographic config info * * Returns the ap configuration info fetched via PQAP(QCI). * On success 0 is returned, on failure a negative errno * is returned, e.g. if the PQAP(QCI) instruction is not * available, the return value will be -EOPNOTSUPP. */ -static inline int ap_query_configuration(struct ap_config_info *info) +static inline int ap_fetch_qci_info(struct ap_config_info *info) { - if (!ap_configuration_available()) + if (!ap_qci_available()) return -EOPNOTSUPP; if (!info) return -EINVAL; @@ -205,20 +206,39 @@ static inline int ap_query_configuration(struct ap_config_info *info) } /** - * ap_init_configuration(): Allocate and query configuration array. - */ -static void ap_init_configuration(void) -{ - if (!ap_configuration_available()) - return; + * ap_init_qci_info(): Allocate and query qci config info. 
+ * Does also update the static variables ap_max_domain_id + * and ap_max_adapter_id if this info is available. - ap_configuration = kzalloc(sizeof(*ap_configuration), GFP_KERNEL); - if (!ap_configuration) + */ +static void __init ap_init_qci_info(void) +{ + if (!ap_qci_available()) { + AP_DBF(DBF_INFO, "%s QCI not supported\n", __func__); return; - if (ap_query_configuration(ap_configuration) != 0) { - kfree(ap_configuration); - ap_configuration = NULL; + } + + ap_qci_info = kzalloc(sizeof(*ap_qci_info), GFP_KERNEL); + if (!ap_qci_info) return; + if (ap_fetch_qci_info(ap_qci_info) != 0) { + kfree(ap_qci_info); + ap_qci_info = NULL; + return; + } + AP_DBF(DBF_INFO, "%s successful fetched initial qci info\n", __func__); + + if (ap_qci_info->apxa) { + if (ap_qci_info->Na) { + ap_max_adapter_id = ap_qci_info->Na; + AP_DBF(DBF_INFO, "%s new ap_max_adapter_id is %d\n", + __func__, ap_max_adapter_id); + } + if (ap_qci_info->Nd) { + ap_max_domain_id = ap_qci_info->Nd; + AP_DBF(DBF_INFO, "%s new ap_max_domain_id is %d\n", + __func__, ap_max_domain_id); + } } } @@ -233,7 +253,6 @@ static inline int ap_test_config(unsigned int *field, unsigned int nr) /* * ap_test_config_card_id(): Test, whether an AP card ID is configured. - * @id AP card ID * * Returns 0 if the card is not configured * 1 if the card is configured or @@ -241,16 +260,16 @@ static inline int ap_test_config(unsigned int *field, unsigned int nr) */ static inline int ap_test_config_card_id(unsigned int id) { - if (!ap_configuration) /* QCI not supported */ - /* only ids 0...3F may be probed */ - return id < 0x40 ? 1 : 0; - return ap_test_config(ap_configuration->apm, id); + if (id > ap_max_adapter_id) + return 0; + if (ap_qci_info) + return ap_test_config(ap_qci_info->apm, id); + return 1; } /* * ap_test_config_usage_domain(): Test, whether an AP usage domain * is configured. 
- * @domain AP usage domain ID * * Returns 0 if the usage domain is not configured * 1 if the usage domain is configured or @@ -258,9 +277,11 @@ static inline int ap_test_config_card_id(unsigned int id) */ int ap_test_config_usage_domain(unsigned int domain) { - if (!ap_configuration) /* QCI not supported */ - return domain < 16; - return ap_test_config(ap_configuration->aqm, domain); + if (domain > ap_max_domain_id) + return 0; + if (ap_qci_info) + return ap_test_config(ap_qci_info->aqm, domain); + return 1; } EXPORT_SYMBOL(ap_test_config_usage_domain); @@ -274,43 +295,44 @@ EXPORT_SYMBOL(ap_test_config_usage_domain); */ int ap_test_config_ctrl_domain(unsigned int domain) { - if (!ap_configuration) /* QCI not supported */ + if (!ap_qci_info || domain > ap_max_domain_id) return 0; - return ap_test_config(ap_configuration->adm, domain); + return ap_test_config(ap_qci_info->adm, domain); } EXPORT_SYMBOL(ap_test_config_ctrl_domain); -/** - * ap_query_queue(): Check if an AP queue is available. - * @qid: The AP queue number - * @queue_depth: Pointer to queue depth value - * @device_type: Pointer to device type value - * @facilities: Pointer to facility indicator +/* + * ap_queue_info(): Check and get AP queue info. + * Returns true if TAPQ succeeded and the info is filled or + * false otherwise. 
*/ -static int ap_query_queue(ap_qid_t qid, int *queue_depth, int *device_type, - unsigned int *facilities) +static bool ap_queue_info(ap_qid_t qid, int *q_type, + unsigned int *q_fac, int *q_depth) { struct ap_queue_status status; - unsigned long info; - int nd; + unsigned long info = 0; - if (!ap_test_config_card_id(AP_QID_CARD(qid))) - return -ENODEV; + /* make sure we don't run into a specifiation exception */ + if (AP_QID_CARD(qid) > ap_max_adapter_id || + AP_QID_QUEUE(qid) > ap_max_domain_id) + return false; + /* call TAPQ on this APQN */ status = ap_test_queue(qid, ap_apft_available(), &info); switch (status.response_code) { case AP_RESPONSE_NORMAL: - *queue_depth = (int)(info & 0xff); - *device_type = (int)((info >> 24) & 0xff); - *facilities = (unsigned int)(info >> 32); - /* Update maximum domain id */ - nd = (info >> 16) & 0xff; - /* if N bit is available, z13 and newer */ - if ((info & (1UL << 57)) && nd > 0) - ap_max_domain_id = nd; - else /* older machine types */ - ap_max_domain_id = 15; - switch (*device_type) { + case AP_RESPONSE_RESET_IN_PROGRESS: + /* + * According to the architecture in all these cases the + * info should be filled. All bits 0 is not possible as + * there is at least one of the mode bits set. + */ + if (WARN_ON_ONCE(!info)) + return false; + *q_type = (int)((info >> 24) & 0xff); + *q_fac = (unsigned int)(info >> 32); + *q_depth = (int)(info & 0xff); + switch (*q_type) { /* For CEX2 and CEX3 the available functions * are not reflected by the facilities bits. * Instead it is coded into the type. 
So here @@ -318,27 +340,21 @@ static int ap_query_queue(ap_qid_t qid, int *queue_depth, int *device_type, */ case AP_DEVICE_TYPE_CEX2A: case AP_DEVICE_TYPE_CEX3A: - *facilities |= 0x08000000; + *q_fac |= 0x08000000; break; case AP_DEVICE_TYPE_CEX2C: case AP_DEVICE_TYPE_CEX3C: - *facilities |= 0x10000000; + *q_fac |= 0x10000000; break; default: break; } - return 0; - case AP_RESPONSE_Q_NOT_AVAIL: - case AP_RESPONSE_DECONFIGURED: - case AP_RESPONSE_CHECKSTOPPED: - case AP_RESPONSE_INVALID_ADDRESS: - return -ENODEV; - case AP_RESPONSE_RESET_IN_PROGRESS: - case AP_RESPONSE_OTHERWISE_CHANGED: - case AP_RESPONSE_BUSY: - return -EBUSY; + return true; default: - BUG(); + /* + * A response code which indicates, there is no info available. + */ + return false; } } @@ -751,9 +767,6 @@ int ap_driver_register(struct ap_driver *ap_drv, struct module *owner, { struct device_driver *drv = &ap_drv->driver; - if (!initialised) - return -ENODEV; - drv->bus = &ap_bus_type; drv->probe = ap_device_probe; drv->remove = ap_device_remove; @@ -929,11 +942,12 @@ static ssize_t ap_domain_store(struct bus_type *bus, domain < 0 || domain > ap_max_domain_id || !test_bit_inv(domain, ap_perms.aqm)) return -EINVAL; + spin_lock_bh(&ap_domain_lock); ap_domain_index = domain; spin_unlock_bh(&ap_domain_lock); - AP_DBF(DBF_DEBUG, "stored new default domain=%d\n", domain); + AP_DBF(DBF_INFO, "stored new default domain=%d\n", domain); return count; } @@ -942,45 +956,45 @@ static BUS_ATTR_RW(ap_domain); static ssize_t ap_control_domain_mask_show(struct bus_type *bus, char *buf) { - if (!ap_configuration) /* QCI not supported */ + if (!ap_qci_info) /* QCI not supported */ return scnprintf(buf, PAGE_SIZE, "not supported\n"); return scnprintf(buf, PAGE_SIZE, "0x%08x%08x%08x%08x%08x%08x%08x%08x\n", - ap_configuration->adm[0], ap_configuration->adm[1], - ap_configuration->adm[2], ap_configuration->adm[3], - ap_configuration->adm[4], ap_configuration->adm[5], - ap_configuration->adm[6], 
ap_configuration->adm[7]); + ap_qci_info->adm[0], ap_qci_info->adm[1], + ap_qci_info->adm[2], ap_qci_info->adm[3], + ap_qci_info->adm[4], ap_qci_info->adm[5], + ap_qci_info->adm[6], ap_qci_info->adm[7]); } static BUS_ATTR_RO(ap_control_domain_mask); static ssize_t ap_usage_domain_mask_show(struct bus_type *bus, char *buf) { - if (!ap_configuration) /* QCI not supported */ + if (!ap_qci_info) /* QCI not supported */ return scnprintf(buf, PAGE_SIZE, "not supported\n"); return scnprintf(buf, PAGE_SIZE, "0x%08x%08x%08x%08x%08x%08x%08x%08x\n", - ap_configuration->aqm[0], ap_configuration->aqm[1], - ap_configuration->aqm[2], ap_configuration->aqm[3], - ap_configuration->aqm[4], ap_configuration->aqm[5], - ap_configuration->aqm[6], ap_configuration->aqm[7]); + ap_qci_info->aqm[0], ap_qci_info->aqm[1], + ap_qci_info->aqm[2], ap_qci_info->aqm[3], + ap_qci_info->aqm[4], ap_qci_info->aqm[5], + ap_qci_info->aqm[6], ap_qci_info->aqm[7]); } static BUS_ATTR_RO(ap_usage_domain_mask); static ssize_t ap_adapter_mask_show(struct bus_type *bus, char *buf) { - if (!ap_configuration) /* QCI not supported */ + if (!ap_qci_info) /* QCI not supported */ return scnprintf(buf, PAGE_SIZE, "not supported\n"); return scnprintf(buf, PAGE_SIZE, "0x%08x%08x%08x%08x%08x%08x%08x%08x\n", - ap_configuration->apm[0], ap_configuration->apm[1], - ap_configuration->apm[2], ap_configuration->apm[3], - ap_configuration->apm[4], ap_configuration->apm[5], - ap_configuration->apm[6], ap_configuration->apm[7]); + ap_qci_info->apm[0], ap_qci_info->apm[1], + ap_qci_info->apm[2], ap_qci_info->apm[3], + ap_qci_info->apm[4], ap_qci_info->apm[5], + ap_qci_info->apm[6], ap_qci_info->apm[7]); } static BUS_ATTR_RO(ap_adapter_mask); @@ -1066,17 +1080,18 @@ static BUS_ATTR_RW(poll_timeout); static ssize_t ap_max_domain_id_show(struct bus_type *bus, char *buf) { - int max_domain_id; - - if (ap_configuration) - max_domain_id = ap_max_domain_id ? 
: -1; - else - max_domain_id = 15; - return scnprintf(buf, PAGE_SIZE, "%d\n", max_domain_id); + return scnprintf(buf, PAGE_SIZE, "%d\n", ap_max_domain_id); } static BUS_ATTR_RO(ap_max_domain_id); +static ssize_t ap_max_adapter_id_show(struct bus_type *bus, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%d\n", ap_max_adapter_id); +} + +static BUS_ATTR_RO(ap_max_adapter_id); + static ssize_t apmask_show(struct bus_type *bus, char *buf) { int rc; @@ -1149,6 +1164,7 @@ static struct bus_attribute *const ap_bus_attrs[] = { &bus_attr_ap_interrupts, &bus_attr_poll_timeout, &bus_attr_ap_max_domain_id, + &bus_attr_ap_max_adapter_id, &bus_attr_apmask, &bus_attr_aqmask, NULL, @@ -1160,47 +1176,42 @@ static struct bus_attribute *const ap_bus_attrs[] = { */ static void ap_select_domain(void) { - int count, max_count, best_domain; struct ap_queue_status status; - int i, j; + int card, dom; /* - * We want to use a single domain. Either the one specified with - * the "domain=" parameter or the domain with the maximum number - * of devices. + * Choose the default domain. Either the one specified with + * the "domain=" parameter or the first domain with at least + * one valid APQN. */ spin_lock_bh(&ap_domain_lock); if (ap_domain_index >= 0) { /* Domain has already been selected. 
*/ - spin_unlock_bh(&ap_domain_lock); - return; + goto out; } - best_domain = -1; - max_count = 0; - for (i = 0; i < AP_DOMAINS; i++) { - if (!ap_test_config_usage_domain(i) || - !test_bit_inv(i, ap_perms.aqm)) + for (dom = 0; dom <= ap_max_domain_id; dom++) { + if (!ap_test_config_usage_domain(dom) || + !test_bit_inv(dom, ap_perms.aqm)) continue; - count = 0; - for (j = 0; j < AP_DEVICES; j++) { - if (!ap_test_config_card_id(j)) + for (card = 0; card <= ap_max_adapter_id; card++) { + if (!ap_test_config_card_id(card) || + !test_bit_inv(card, ap_perms.apm)) continue; - status = ap_test_queue(AP_MKQID(j, i), + status = ap_test_queue(AP_MKQID(card, dom), ap_apft_available(), NULL); - if (status.response_code != AP_RESPONSE_NORMAL) - continue; - count++; - } - if (count > max_count) { - max_count = count; - best_domain = i; + if (status.response_code == AP_RESPONSE_NORMAL) + break; } + if (card <= ap_max_adapter_id) + break; } - if (best_domain >= 0) { - ap_domain_index = best_domain; - AP_DBF(DBF_DEBUG, "new ap_domain_index=%d\n", ap_domain_index); + if (dom <= ap_max_domain_id) { + ap_domain_index = dom; + AP_DBF(DBF_DEBUG, "%s new default domain is %d\n", + __func__, ap_domain_index); } +out: spin_unlock_bh(&ap_domain_lock); } @@ -1279,12 +1290,13 @@ static int __match_queue_device_with_queue_id(struct device *dev, const void *da */ static void _ap_scan_bus_adapter(int id) { + bool broken; ap_qid_t qid; unsigned int func; struct ap_card *ac; struct device *dev; struct ap_queue *aq; - int rc, dom, depth, type, comp_type, borked; + int rc, dom, depth, type, comp_type; /* check if there is a card device registered with this id */ dev = bus_find_device(&ap_bus_type, NULL, @@ -1312,23 +1324,23 @@ static void _ap_scan_bus_adapter(int id) /* find the first valid queue */ for (dom = 0; dom < AP_DOMAINS; dom++) { qid = AP_MKQID(id, dom); - if (ap_query_queue(qid, &depth, &type, &func) == 0) + if (ap_queue_info(qid, &type, &func, &depth)) break; } - borked = 0; + broken = 
false; if (dom >= AP_DOMAINS) { /* no accessible queue on this card */ - borked = 1; + broken = true; } else if (ac->raw_hwtype != type) { /* card type has changed */ AP_DBF(DBF_INFO, "card=%02x type changed.\n", id); - borked = 1; + broken = true; } else if (ac->functions != func) { /* card functions have changed */ AP_DBF(DBF_INFO, "card=%02x functions changed.\n", id); - borked = 1; + broken = true; } - if (borked) { + if (broken) { /* unregister card device and associated queues */ bus_for_each_dev(&ap_bus_type, NULL, (void *)(long) id, @@ -1364,16 +1376,14 @@ static void _ap_scan_bus_adapter(int id) continue; } /* try to fetch infos about this queue */ - rc = ap_query_queue(qid, &depth, &type, &func); + broken = !ap_queue_info(qid, &type, &func, &depth); if (dev) { - if (rc == -ENODEV) - borked = 1; - else { + if (!broken) { spin_lock_bh(&aq->lock); - borked = aq->sm_state == AP_SM_STATE_BORKED; + broken = aq->sm_state == AP_SM_STATE_BORKED; spin_unlock_bh(&aq->lock); } - if (borked) { + if (broken) { /* Remove broken device */ AP_DBF(DBF_DEBUG, "removing broken queue=%02x.%04x\n", @@ -1383,7 +1393,7 @@ static void _ap_scan_bus_adapter(int id) put_device(dev); continue; } - if (rc) + if (broken) continue; /* a new queue device is needed, check out comp type */ comp_type = ap_get_compatible_type(qid, type, func); @@ -1435,11 +1445,11 @@ static void ap_scan_bus(struct work_struct *unused) { int id; - AP_DBF(DBF_DEBUG, "%s running\n", __func__); - - ap_query_configuration(ap_configuration); + ap_fetch_qci_info(ap_qci_info); ap_select_domain(); + AP_DBF(DBF_DEBUG, "%s running\n", __func__); + /* loop over all possible adapters */ for (id = 0; id < AP_DEVICES; id++) _ap_scan_bus_adapter(id); @@ -1505,7 +1515,6 @@ static void __init ap_perms_init(void) */ static int __init ap_module_init(void) { - int max_domain_id; int rc, i; rc = ap_debug_init(); @@ -1524,14 +1533,10 @@ static int __init ap_module_init(void) ap_perms_init(); /* Get AP configuration data if 
available */ - ap_init_configuration(); + ap_init_qci_info(); - if (ap_configuration) - max_domain_id = - ap_max_domain_id ? ap_max_domain_id : AP_DOMAINS - 1; - else - max_domain_id = 15; - if (ap_domain_index < -1 || ap_domain_index > max_domain_id || + /* check default domain setting */ + if (ap_domain_index < -1 || ap_domain_index > ap_max_domain_id || (ap_domain_index >= 0 && !test_bit_inv(ap_domain_index, ap_perms.aqm))) { pr_warn("%d is not a valid cryptographic domain\n", @@ -1539,6 +1544,7 @@ static int __init ap_module_init(void) ap_domain_index = -1; } + /* enable interrupts if available */ if (ap_interrupts_available()) { rc = register_adapter_interrupt(&ap_airq); ap_airq_flag = (rc == 0); @@ -1581,7 +1587,6 @@ static int __init ap_module_init(void) } queue_work(system_long_wq, &ap_scan_work); - initialised = true; return 0; @@ -1595,7 +1600,7 @@ out_bus: out: if (ap_using_interrupts()) unregister_adapter_interrupt(&ap_airq); - kfree(ap_configuration); + kfree(ap_qci_info); return rc; } device_initcall(ap_module_init); From 7b7735c5be473473d7a4b9e31460ed8e129dcb36 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 7 Jul 2020 14:07:53 +0200 Subject: [PATCH 326/502] s390: fix comment regarding interrupts in svc With the removal of the critical section cleanup, we now enter the svc interrupt handler with interrupts disabled. Fixes: 0b0ed657fe00 ("s390: remove critical section cleanup from entry.S") Signed-off-by: Christian Borntraeger Signed-off-by: Heiko Carstens --- arch/s390/kernel/entry.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 969b35b177dd..23edf196d3dc 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -370,7 +370,7 @@ EXPORT_SYMBOL(sie_exit) /* * SVC interrupt handler routine. System calls are synchronous events and - * are executed with interrupts enabled. + * are entered with interrupts disabled. 
*/ ENTRY(system_call) From 6589c93f99894e007a1260f009018effc958ab69 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Wed, 8 Jul 2020 11:21:25 +0200 Subject: [PATCH 327/502] s390: add trace events for idle enter/exit Helpful for debugging. Signed-off-by: Sven Schnelle Reviewed-by: Heiko Carstens Signed-off-by: Heiko Carstens --- arch/s390/kernel/idle.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c index 0d7fbdfe995a..88bb42ca5008 100644 --- a/arch/s390/kernel/idle.c +++ b/arch/s390/kernel/idle.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include "entry.h" @@ -32,11 +33,12 @@ void enabled_wait(void) PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; clear_cpu_flag(CIF_NOHZ_DELAY); + trace_cpu_idle_rcuidle(1, smp_processor_id()); local_irq_save(flags); /* Call the assembler magic in entry.S */ psw_idle(idle, psw_mask); local_irq_restore(flags); - + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); /* Account time spent with enabled wait psw loaded as idle time. */ write_seqcount_begin(&idle->seqcount); From 61c11656b67b0a30f702f240aabe81fd93e702ac Mon Sep 17 00:00:00 2001 From: Zhenyu Ye Date: Fri, 10 Jul 2020 17:41:58 +0800 Subject: [PATCH 328/502] arm64: tlb: don't set the ttl value in flush_tlb_page_nosync flush_tlb_page_nosync() may be called from pmd level, so we can not set the ttl = 3 here. 
The callstack is as follows: pmdp_set_access_flags ptep_set_access_flags flush_tlb_fix_spurious_fault flush_tlb_page flush_tlb_page_nosync Fixes: e735b98a5fe0 ("arm64: Add tlbi_user_level TLB invalidation helper") Reported-by: Catalin Marinas Signed-off-by: Zhenyu Ye Link: https://lore.kernel.org/r/20200710094158.468-1-yezhenyu2@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/tlbflush.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 39aed2efd21b..2cb275efcea3 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -209,9 +209,8 @@ static inline void flush_tlb_page_nosync(struct vm_area_struct *vma, unsigned long addr = __TLBI_VADDR(uaddr, ASID(vma->vm_mm)); dsb(ishst); - /* This function is only called on a small page */ - __tlbi_level(vale1is, addr, 3); - __tlbi_user_level(vale1is, addr, 3); + __tlbi(vale1is, addr); + __tlbi_user(vale1is, addr); } static inline void flush_tlb_page(struct vm_area_struct *vma, From 028a342ec8e128c5d71548d1210f1dba1ae95332 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 26 May 2020 21:38:07 +0900 Subject: [PATCH 329/502] m68k: Add arch/m68k/Kbuild Use the standard obj-y form to specify the sub-directories under arch/m68k/. No functional change intended. 
Signed-off-by: Masahiro Yamada Acked-by: Greg Ungerer Link: https://lore.kernel.org/r/20200526123810.301667-1-masahiroy@kernel.org Signed-off-by: Geert Uytterhoeven --- arch/m68k/Kbuild | 19 +++++++++++++++++++ arch/m68k/Makefile | 20 +------------------- 2 files changed, 20 insertions(+), 19 deletions(-) create mode 100644 arch/m68k/Kbuild diff --git a/arch/m68k/Kbuild b/arch/m68k/Kbuild new file mode 100644 index 000000000000..7dc1398dd188 --- /dev/null +++ b/arch/m68k/Kbuild @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-y += kernel/ mm/ +obj-$(CONFIG_Q40) += q40/ +obj-$(CONFIG_AMIGA) += amiga/ +obj-$(CONFIG_ATARI) += atari/ +obj-$(CONFIG_MAC) += mac/ +obj-$(CONFIG_HP300) += hp300/ +obj-$(CONFIG_APOLLO) += apollo/ +obj-$(CONFIG_MVME147) += mvme147/ +obj-$(CONFIG_MVME16x) += mvme16x/ +obj-$(CONFIG_BVME6000) += bvme6000/ +obj-$(CONFIG_SUN3X) += sun3x/ sun3/ +obj-$(CONFIG_SUN3) += sun3/ sun3/prom/ +obj-$(CONFIG_NATFEAT) += emu/ +obj-$(CONFIG_M68040) += fpsp040/ +obj-$(CONFIG_M68060) += ifpsp060/ +obj-$(CONFIG_M68KFPU_EMU) += math-emu/ +obj-$(CONFIG_M68000) += 68000/ +obj-$(CONFIG_COLDFIRE) += coldfire/ diff --git a/arch/m68k/Makefile b/arch/m68k/Makefile index 0415d28dbe4f..e431015f5cc9 100644 --- a/arch/m68k/Makefile +++ b/arch/m68k/Makefile @@ -97,27 +97,9 @@ head-$(CONFIG_SUN3) := arch/m68k/kernel/sun3-head.o head-$(CONFIG_M68000) := arch/m68k/68000/head.o head-$(CONFIG_COLDFIRE) := arch/m68k/coldfire/head.o -core-y += arch/m68k/kernel/ arch/m68k/mm/ +core-y += arch/m68k/ libs-y += arch/m68k/lib/ -core-$(CONFIG_Q40) += arch/m68k/q40/ -core-$(CONFIG_AMIGA) += arch/m68k/amiga/ -core-$(CONFIG_ATARI) += arch/m68k/atari/ -core-$(CONFIG_MAC) += arch/m68k/mac/ -core-$(CONFIG_HP300) += arch/m68k/hp300/ -core-$(CONFIG_APOLLO) += arch/m68k/apollo/ -core-$(CONFIG_MVME147) += arch/m68k/mvme147/ -core-$(CONFIG_MVME16x) += arch/m68k/mvme16x/ -core-$(CONFIG_BVME6000) += arch/m68k/bvme6000/ -core-$(CONFIG_SUN3X) += arch/m68k/sun3x/ arch/m68k/sun3/ 
-core-$(CONFIG_SUN3) += arch/m68k/sun3/ arch/m68k/sun3/prom/ -core-$(CONFIG_NATFEAT) += arch/m68k/emu/ -core-$(CONFIG_M68040) += arch/m68k/fpsp040/ -core-$(CONFIG_M68060) += arch/m68k/ifpsp060/ -core-$(CONFIG_M68KFPU_EMU) += arch/m68k/math-emu/ -core-$(CONFIG_M68000) += arch/m68k/68000/ -core-$(CONFIG_COLDFIRE) += arch/m68k/coldfire/ - all: zImage From bd3ff3f1b69cdb315d91fef0fb9512af83ae579b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 26 May 2020 21:38:08 +0900 Subject: [PATCH 330/502] m68k: sun3: Descend to prom from arch/m68k/sun3 Move prom/ to the more relevant Makefile. Signed-off-by: Masahiro Yamada Link: https://lore.kernel.org/r/20200526123810.301667-2-masahiroy@kernel.org Signed-off-by: Geert Uytterhoeven --- arch/m68k/Kbuild | 2 +- arch/m68k/sun3/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/m68k/Kbuild b/arch/m68k/Kbuild index 7dc1398dd188..18abb35c26a1 100644 --- a/arch/m68k/Kbuild +++ b/arch/m68k/Kbuild @@ -10,7 +10,7 @@ obj-$(CONFIG_MVME147) += mvme147/ obj-$(CONFIG_MVME16x) += mvme16x/ obj-$(CONFIG_BVME6000) += bvme6000/ obj-$(CONFIG_SUN3X) += sun3x/ sun3/ -obj-$(CONFIG_SUN3) += sun3/ sun3/prom/ +obj-$(CONFIG_SUN3) += sun3/ obj-$(CONFIG_NATFEAT) += emu/ obj-$(CONFIG_M68040) += fpsp040/ obj-$(CONFIG_M68060) += ifpsp060/ diff --git a/arch/m68k/sun3/Makefile b/arch/m68k/sun3/Makefile index 9960c46d303c..4e99e17d82ea 100644 --- a/arch/m68k/sun3/Makefile +++ b/arch/m68k/sun3/Makefile @@ -5,4 +5,4 @@ obj-y := sun3ints.o sun3dvma.o idprom.o -obj-$(CONFIG_SUN3) += config.o mmu_emu.o leds.o dvma.o intersil.o +obj-$(CONFIG_SUN3) += config.o mmu_emu.o leds.o dvma.o intersil.o prom/ From 2367b0264294a50cd2cd9d4c1270a9393f32038c Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 26 May 2020 21:38:09 +0900 Subject: [PATCH 331/502] m68k: Optimize cc-option calls for cpuflags-y arch/m68k/Makefile computes lots of unneeded cc-option calls. 
For example, if CONFIG_M5441x is not defined, there is no point in evaluating the following compiler flag. cpuflags-$(CONFIG_M5441x) := $(call cc-option,-mcpu=54455,-mcfv4e) The result is set to cpuflags-, then thrown away. The right hand side of ':=' is immediately expanded. Hence, all of the 16 calls for cc-option are evaluated. This is expensive since cc-option invokes the compiler. This occurs even if you are not attempting to build anything, like 'make ARCH=m68k help'. Use '=' to expand the value _lazily_. The evaluation for cc-option is delayed until $(cpuflags-y) is expanded. So, the cc-option test happens just once at most. This commit mimics tune-y of arch/arm/Makefile. Signed-off-by: Masahiro Yamada Acked-by: Greg Ungerer Link: https://lore.kernel.org/r/20200526123810.301667-3-masahiroy@kernel.org Signed-off-by: Geert Uytterhoeven --- arch/m68k/Makefile | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/arch/m68k/Makefile b/arch/m68k/Makefile index e431015f5cc9..0507bf297727 100644 --- a/arch/m68k/Makefile +++ b/arch/m68k/Makefile @@ -32,30 +32,33 @@ endif # compiler cpu type flag.
# ifndef CONFIG_M68040 -cpuflags-$(CONFIG_M68060) := -m68060 +cpuflags-$(CONFIG_M68060) = -m68060 endif ifndef CONFIG_M68060 -cpuflags-$(CONFIG_M68040) := -m68040 +cpuflags-$(CONFIG_M68040) = -m68040 endif -cpuflags-$(CONFIG_M68030) := -cpuflags-$(CONFIG_M68020) := -cpuflags-$(CONFIG_M68000) := -m68000 -cpuflags-$(CONFIG_M5441x) := $(call cc-option,-mcpu=54455,-mcfv4e) -cpuflags-$(CONFIG_M54xx) := $(call cc-option,-mcpu=5475,-m5200) -cpuflags-$(CONFIG_M5407) := $(call cc-option,-mcpu=5407,-m5200) -cpuflags-$(CONFIG_M532x) := $(call cc-option,-mcpu=532x,-m5307) -cpuflags-$(CONFIG_M537x) := $(call cc-option,-mcpu=537x,-m5307) -cpuflags-$(CONFIG_M5307) := $(call cc-option,-mcpu=5307,-m5200) -cpuflags-$(CONFIG_M528x) := $(call cc-option,-mcpu=528x,-m5307) -cpuflags-$(CONFIG_M5275) := $(call cc-option,-mcpu=5275,-m5307) -cpuflags-$(CONFIG_M5272) := $(call cc-option,-mcpu=5272,-m5307) -cpuflags-$(CONFIG_M5271) := $(call cc-option,-mcpu=5271,-m5307) -cpuflags-$(CONFIG_M523x) := $(call cc-option,-mcpu=523x,-m5307) -cpuflags-$(CONFIG_M525x) := $(call cc-option,-mcpu=5253,-m5200) -cpuflags-$(CONFIG_M5249) := $(call cc-option,-mcpu=5249,-m5200) -cpuflags-$(CONFIG_M520x) := $(call cc-option,-mcpu=5208,-m5200) -cpuflags-$(CONFIG_M5206e) := $(call cc-option,-mcpu=5206e,-m5200) -cpuflags-$(CONFIG_M5206) := $(call cc-option,-mcpu=5206,-m5200) +cpuflags-$(CONFIG_M68030) = +cpuflags-$(CONFIG_M68020) = +cpuflags-$(CONFIG_M68000) = -m68000 +cpuflags-$(CONFIG_M5441x) = $(call cc-option,-mcpu=54455,-mcfv4e) +cpuflags-$(CONFIG_M54xx) = $(call cc-option,-mcpu=5475,-m5200) +cpuflags-$(CONFIG_M5407) = $(call cc-option,-mcpu=5407,-m5200) +cpuflags-$(CONFIG_M532x) = $(call cc-option,-mcpu=532x,-m5307) +cpuflags-$(CONFIG_M537x) = $(call cc-option,-mcpu=537x,-m5307) +cpuflags-$(CONFIG_M5307) = $(call cc-option,-mcpu=5307,-m5200) +cpuflags-$(CONFIG_M528x) = $(call cc-option,-mcpu=528x,-m5307) +cpuflags-$(CONFIG_M5275) = $(call cc-option,-mcpu=5275,-m5307) +cpuflags-$(CONFIG_M5272) = $(call 
cc-option,-mcpu=5272,-m5307) +cpuflags-$(CONFIG_M5271) = $(call cc-option,-mcpu=5271,-m5307) +cpuflags-$(CONFIG_M523x) = $(call cc-option,-mcpu=523x,-m5307) +cpuflags-$(CONFIG_M525x) = $(call cc-option,-mcpu=5253,-m5200) +cpuflags-$(CONFIG_M5249) = $(call cc-option,-mcpu=5249,-m5200) +cpuflags-$(CONFIG_M520x) = $(call cc-option,-mcpu=5208,-m5200) +cpuflags-$(CONFIG_M5206e) = $(call cc-option,-mcpu=5206e,-m5200) +cpuflags-$(CONFIG_M5206) = $(call cc-option,-mcpu=5206,-m5200) + +# Evaluate tune cc-option calls now +cpuflags-y := $(cpuflags-y) KBUILD_AFLAGS += $(cpuflags-y) KBUILD_CFLAGS += $(cpuflags-y) From 40b13fd7fd6e1ec295230cc114c6c9309e15784a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 26 May 2020 21:38:10 +0900 Subject: [PATCH 332/502] m68k: Pass -D options to KBUILD_CPPFLAGS instead of KBUILD_{A,C}FLAGS Precisely, -D is a preprocessor option. KBUILD_CPPFLAGS is passed for compiling .c and .S files too. Signed-off-by: Masahiro Yamada Acked-by: Greg Ungerer Link: https://lore.kernel.org/r/20200526123810.301667-4-masahiroy@kernel.org Signed-off-by: Geert Uytterhoeven --- arch/m68k/Makefile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/m68k/Makefile b/arch/m68k/Makefile index 0507bf297727..71ffaf5f8954 100644 --- a/arch/m68k/Makefile +++ b/arch/m68k/Makefile @@ -70,9 +70,8 @@ ifdef CONFIG_MMU KBUILD_CFLAGS += -fno-strength-reduce -ffixed-a2 else # we can use a m68k-linux-gcc toolchain with these in place -KBUILD_CFLAGS += -DUTS_SYSNAME=\"uClinux\" -KBUILD_CFLAGS += -D__uClinux__ -KBUILD_AFLAGS += -D__uClinux__ +KBUILD_CPPFLAGS += -DUTS_SYSNAME=\"uClinux\" +KBUILD_CPPFLAGS += -D__uClinux__ endif KBUILD_LDFLAGS := -m m68kelf From 5f5f2949c14d6fe5cfc51bd98a41fdf69652c7e3 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 27 May 2020 08:39:42 -0500 Subject: [PATCH 333/502] m68k: Use sizeof_field() helper Make use of the sizeof_field() helper instead of an open-coded version. Signed-off-by: Gustavo A. R. 
Silva Link: https://lore.kernel.org/r/20200527133942.GA10408@embeddedor Signed-off-by: Geert Uytterhoeven --- arch/m68k/kernel/signal.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index b3ff39588f36..fc034fd19798 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -61,25 +61,25 @@ #define FMT4SIZE 0 #else #define FORMAT 0 -#define FMT4SIZE sizeof(((struct frame *)0)->un.fmt4) +#define FMT4SIZE sizeof_field(struct frame, un.fmt4) #endif static const int frame_size_change[16] = { - [1] = -1, /* sizeof(((struct frame *)0)->un.fmt1), */ - [2] = sizeof(((struct frame *)0)->un.fmt2), - [3] = sizeof(((struct frame *)0)->un.fmt3), + [1] = -1, /* sizeof_field(struct frame, un.fmt1), */ + [2] = sizeof_field(struct frame, un.fmt2), + [3] = sizeof_field(struct frame, un.fmt3), [4] = FMT4SIZE, - [5] = -1, /* sizeof(((struct frame *)0)->un.fmt5), */ - [6] = -1, /* sizeof(((struct frame *)0)->un.fmt6), */ - [7] = sizeof(((struct frame *)0)->un.fmt7), - [8] = -1, /* sizeof(((struct frame *)0)->un.fmt8), */ - [9] = sizeof(((struct frame *)0)->un.fmt9), - [10] = sizeof(((struct frame *)0)->un.fmta), - [11] = sizeof(((struct frame *)0)->un.fmtb), - [12] = -1, /* sizeof(((struct frame *)0)->un.fmtc), */ - [13] = -1, /* sizeof(((struct frame *)0)->un.fmtd), */ - [14] = -1, /* sizeof(((struct frame *)0)->un.fmte), */ - [15] = -1, /* sizeof(((struct frame *)0)->un.fmtf), */ + [5] = -1, /* sizeof_field(struct frame, un.fmt5), */ + [6] = -1, /* sizeof_field(struct frame, un.fmt6), */ + [7] = sizeof_field(struct frame, un.fmt7), + [8] = -1, /* sizeof_field(struct frame, un.fmt8), */ + [9] = sizeof_field(struct frame, un.fmt9), + [10] = sizeof_field(struct frame, un.fmta), + [11] = sizeof_field(struct frame, un.fmtb), + [12] = -1, /* sizeof_field(struct frame, un.fmtc), */ + [13] = -1, /* sizeof_field(struct frame, un.fmtd), */ + [14] = -1, /* 
sizeof_field(struct frame, un.fmte), */ + [15] = -1, /* sizeof_field(struct frame, un.fmtf), */ }; static inline int frame_extra_sizes(int f) @@ -651,7 +651,7 @@ static int mangle_kernel_stack(struct pt_regs *regs, int formatvec, } else { struct switch_stack *sw = (struct switch_stack *)regs - 1; /* yes, twice as much as max(sizeof(frame.un.fmt)) */ - unsigned long buf[sizeof(((struct frame *)0)->un) / 2]; + unsigned long buf[sizeof_field(struct frame, un) / 2]; /* that'll make sure that expansion won't crap over data */ if (copy_from_user(buf + fsize / 4, fp, fsize)) From be1a31283655105606407502800871b9c1a1132f Mon Sep 17 00:00:00 2001 From: Michael Schmitz Date: Sun, 31 May 2020 10:45:19 +1200 Subject: [PATCH 334/502] m68k: atari: Annotate dummy read in ROM port IO code as __maybe_unused The Atari ROM port IO code uses dummy variables to implement writes (not supported by the hardware) as reads that encode the write data in part of the address. The value read from the ROM port in this operation is discarded. Annotate dummy variables as __maybe_unused to avoid a compiler warning with W=1. 
Reported-by: kbuild test robot Signed-off-by: Michael Schmitz Link: https://lore.kernel.org/r/1590878719-21219-1-git-send-email-schmitzmic@gmail.com Signed-off-by: Geert Uytterhoeven --- arch/m68k/include/asm/raw_io.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/m68k/include/asm/raw_io.h b/arch/m68k/include/asm/raw_io.h index 8a6dc6e5a279..911826ea83ce 100644 --- a/arch/m68k/include/asm/raw_io.h +++ b/arch/m68k/include/asm/raw_io.h @@ -80,14 +80,14 @@ ({ u16 __v = le16_to_cpu(*(__force volatile u16 *) (addr)); __v; }) #define rom_out_8(addr, b) \ - ({u8 __w, __v = (b); u32 _addr = ((u32) (addr)); \ + ({u8 __maybe_unused __w, __v = (b); u32 _addr = ((u32) (addr)); \ __w = ((*(__force volatile u8 *) ((_addr | 0x10000) + (__v<<1)))); }) #define rom_out_be16(addr, w) \ - ({u16 __w, __v = (w); u32 _addr = ((u32) (addr)); \ + ({u16 __maybe_unused __w, __v = (w); u32 _addr = ((u32) (addr)); \ __w = ((*(__force volatile u16 *) ((_addr & 0xFFFF0000UL) + ((__v & 0xFF)<<1)))); \ __w = ((*(__force volatile u16 *) ((_addr | 0x10000) + ((__v >> 8)<<1)))); }) #define rom_out_le16(addr, w) \ - ({u16 __w, __v = (w); u32 _addr = ((u32) (addr)); \ + ({u16 __maybe_unused __w, __v = (w); u32 _addr = ((u32) (addr)); \ __w = ((*(__force volatile u16 *) ((_addr & 0xFFFF0000UL) + ((__v >> 8)<<1)))); \ __w = ((*(__force volatile u16 *) ((_addr | 0x10000) + ((__v & 0xFF)<<1)))); }) From aeb445bf2194d83e12e85bf5c65baaf1f093bd8f Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Sun, 31 May 2020 09:12:13 +1000 Subject: [PATCH 335/502] m68k: mac: Don't send IOP message until channel is idle In the following sequence of calls, iop_do_send() gets called when the "send" channel is not in the IOP_MSG_IDLE state: iop_ism_irq() iop_handle_send() (msg->handler)() iop_send_message() iop_do_send() Avoid this by testing the channel state before calling iop_do_send(). When sending, and iop_send_queue is empty, call iop_do_send() because the channel is idle. 
If iop_send_queue is not empty, iop_do_send() will get called later by iop_handle_send(). Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Finn Thain Tested-by: Stan Johnson Cc: Joshua Thompson Link: https://lore.kernel.org/r/6d667c39e53865661fa5a48f16829d18ed8abe54.1590880333.git.fthain@telegraphics.com.au Signed-off-by: Geert Uytterhoeven --- arch/m68k/mac/iop.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/m68k/mac/iop.c b/arch/m68k/mac/iop.c index d3775afb0f07..754f6478c30d 100644 --- a/arch/m68k/mac/iop.c +++ b/arch/m68k/mac/iop.c @@ -415,7 +415,8 @@ static void iop_handle_send(uint iop_num, uint chan) msg->status = IOP_MSGSTATUS_UNUSED; msg = msg->next; iop_send_queue[iop_num][chan] = msg; - if (msg) iop_do_send(msg); + if (msg && iop_readb(iop, IOP_ADDR_SEND_STATE + chan) == IOP_MSG_IDLE) + iop_do_send(msg); } /* @@ -489,16 +490,12 @@ int iop_send_message(uint iop_num, uint chan, void *privdata, if (!(q = iop_send_queue[iop_num][chan])) { iop_send_queue[iop_num][chan] = msg; + iop_do_send(msg); } else { while (q->next) q = q->next; q->next = msg; } - if (iop_readb(iop_base[iop_num], - IOP_ADDR_SEND_STATE + chan) == IOP_MSG_IDLE) { - iop_do_send(msg); - } - return 0; } From 931fc82a6aaf4e2e4a5490addaa6a090d78c24a7 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Sun, 31 May 2020 09:12:13 +1000 Subject: [PATCH 336/502] m68k: mac: Fix IOP status/control register writes When writing values to the IOP status/control register make sure those values do not have any extraneous bits that will clear interrupt flags. To place the SCC IOP into bypass mode would be desirable but this is not achieved by writing IOP_DMAINACTIVE | IOP_RUN | IOP_AUTOINC | IOP_BYPASS to the control register. Drop this ineffective register write. Remove the flawed and unused iop_bypass() function. Make use of the unused iop_stop() function. 
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Finn Thain Tested-by: Stan Johnson Cc: Joshua Thompson Link: https://lore.kernel.org/r/09bcb7359a1719a18b551ee515da3c4c3cf709e6.1590880333.git.fthain@telegraphics.com.au Signed-off-by: Geert Uytterhoeven --- arch/m68k/mac/iop.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/arch/m68k/mac/iop.c b/arch/m68k/mac/iop.c index 754f6478c30d..bfc8daf50744 100644 --- a/arch/m68k/mac/iop.c +++ b/arch/m68k/mac/iop.c @@ -183,7 +183,7 @@ static __inline__ void iop_writeb(volatile struct mac_iop *iop, __u16 addr, __u8 static __inline__ void iop_stop(volatile struct mac_iop *iop) { - iop->status_ctrl &= ~IOP_RUN; + iop->status_ctrl = IOP_AUTOINC; } static __inline__ void iop_start(volatile struct mac_iop *iop) @@ -191,14 +191,9 @@ static __inline__ void iop_start(volatile struct mac_iop *iop) iop->status_ctrl = IOP_RUN | IOP_AUTOINC; } -static __inline__ void iop_bypass(volatile struct mac_iop *iop) -{ - iop->status_ctrl |= IOP_BYPASS; -} - static __inline__ void iop_interrupt(volatile struct mac_iop *iop) { - iop->status_ctrl |= IOP_IRQ; + iop->status_ctrl = IOP_IRQ | IOP_RUN | IOP_AUTOINC; } static int iop_alive(volatile struct mac_iop *iop) @@ -244,7 +239,6 @@ void __init iop_preinit(void) } else { iop_base[IOP_NUM_SCC] = (struct mac_iop *) SCC_IOP_BASE_QUADRA; } - iop_base[IOP_NUM_SCC]->status_ctrl = 0x87; iop_scc_present = 1; } else { iop_base[IOP_NUM_SCC] = NULL; @@ -256,7 +250,7 @@ void __init iop_preinit(void) } else { iop_base[IOP_NUM_ISM] = (struct mac_iop *) ISM_IOP_BASE_QUADRA; } - iop_base[IOP_NUM_ISM]->status_ctrl = 0; + iop_stop(iop_base[IOP_NUM_ISM]); iop_ism_present = 1; } else { iop_base[IOP_NUM_ISM] = NULL; From adc19b2e314b3883a22e4f51654da4e6d8102d5d Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Sun, 31 May 2020 09:12:13 +1000 Subject: [PATCH 337/502] m68k: mac: Don't send uninitialized data in IOP message reply Clear the message reply before calling iop_complete(). 
This code path is not normally executed but should that happen let's arrange for consistent behaviour from the IOP. Signed-off-by: Finn Thain Tested-by: Stan Johnson Cc: Joshua Thompson Link: https://lore.kernel.org/r/8e35df4d193b082cb6285b1f30c949ff7e30e99e.1590880333.git.fthain@telegraphics.com.au Signed-off-by: Geert Uytterhoeven --- arch/m68k/mac/iop.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/m68k/mac/iop.c b/arch/m68k/mac/iop.c index bfc8daf50744..8844963eea75 100644 --- a/arch/m68k/mac/iop.c +++ b/arch/m68k/mac/iop.c @@ -449,6 +449,7 @@ static void iop_handle_recv(uint iop_num, uint chan) iop_pr_debug("unclaimed message on iop_num %d chan %d\n", iop_num, chan); iop_pr_debug("%*ph\n", IOP_MSG_LEN, msg->message); + memset(msg->reply, 0, IOP_MSG_LEN); iop_complete_message(msg); } } From 47fbcb9506df7cf02ccae6895be3f76fa5768eb1 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Sun, 31 May 2020 09:12:13 +1000 Subject: [PATCH 338/502] m68k: mac: Improve IOP debug messages Always dump the full message and reply. Avoid printing partial lines as this output gets mixed up with the output from called functions. Don't output the state of idle channels. 
Signed-off-by: Finn Thain Tested-by: Stan Johnson Cc: Joshua Thompson Link: https://lore.kernel.org/r/317909d69244f06581973c5839382f5516cd9a1c.1590880333.git.fthain@telegraphics.com.au Signed-off-by: Geert Uytterhoeven --- arch/m68k/mac/iop.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/arch/m68k/mac/iop.c b/arch/m68k/mac/iop.c index 8844963eea75..c669a7644301 100644 --- a/arch/m68k/mac/iop.c +++ b/arch/m68k/mac/iop.c @@ -347,8 +347,8 @@ void iop_complete_message(struct iop_msg *msg) int chan = msg->channel; int i,offset; - iop_pr_debug("msg %p iop_num %d channel %d\n", msg, msg->iop_num, - msg->channel); + iop_pr_debug("iop_num %d chan %d reply %*ph\n", + msg->iop_num, msg->channel, IOP_MSG_LEN, msg->reply); offset = IOP_ADDR_RECV_MSG + (msg->channel * IOP_MSG_LEN); @@ -372,6 +372,9 @@ static void iop_do_send(struct iop_msg *msg) volatile struct mac_iop *iop = iop_base[msg->iop_num]; int i,offset; + iop_pr_debug("iop_num %d chan %d message %*ph\n", + msg->iop_num, msg->channel, IOP_MSG_LEN, msg->message); + offset = IOP_ADDR_SEND_MSG + (msg->channel * IOP_MSG_LEN); for (i = 0 ; i < IOP_MSG_LEN ; i++, offset++) { @@ -394,8 +397,6 @@ static void iop_handle_send(uint iop_num, uint chan) struct iop_msg *msg; int i,offset; - iop_pr_debug("iop_num %d chan %d\n", iop_num, chan); - iop_writeb(iop, IOP_ADDR_SEND_STATE + chan, IOP_MSG_IDLE); if (!(msg = iop_send_queue[iop_num][chan])) return; @@ -405,6 +406,9 @@ static void iop_handle_send(uint iop_num, uint chan) for (i = 0 ; i < IOP_MSG_LEN ; i++, offset++) { msg->reply[i] = iop_readb(iop, offset); } + iop_pr_debug("iop_num %d chan %d reply %*ph\n", + iop_num, chan, IOP_MSG_LEN, msg->reply); + if (msg->handler) (*msg->handler)(msg); msg->status = IOP_MSGSTATUS_UNUSED; msg = msg->next; @@ -424,8 +428,6 @@ static void iop_handle_recv(uint iop_num, uint chan) int i,offset; struct iop_msg *msg; - iop_pr_debug("iop_num %d chan %d\n", iop_num, chan); - msg = 
iop_get_unused_msg(); msg->iop_num = iop_num; msg->channel = chan; @@ -437,6 +439,8 @@ static void iop_handle_recv(uint iop_num, uint chan) for (i = 0 ; i < IOP_MSG_LEN ; i++, offset++) { msg->message[i] = iop_readb(iop, offset); } + iop_pr_debug("iop_num %d chan %d message %*ph\n", + iop_num, chan, IOP_MSG_LEN, msg->message); iop_writeb(iop, IOP_ADDR_RECV_STATE + chan, IOP_MSG_RCVD); @@ -446,9 +450,6 @@ static void iop_handle_recv(uint iop_num, uint chan) if (msg->handler) { (*msg->handler)(msg); } else { - iop_pr_debug("unclaimed message on iop_num %d chan %d\n", - iop_num, chan); - iop_pr_debug("%*ph\n", IOP_MSG_LEN, msg->message); memset(msg->reply, 0, IOP_MSG_LEN); iop_complete_message(msg); } @@ -559,35 +560,34 @@ irqreturn_t iop_ism_irq(int irq, void *dev_id) int i,state; u8 events = iop->status_ctrl & (IOP_INT0 | IOP_INT1); - iop_pr_debug("status %02X\n", iop->status_ctrl); - do { + iop_pr_debug("iop_num %d status %02X\n", iop_num, + iop->status_ctrl); + /* INT0 indicates state change on an outgoing message channel */ if (events & IOP_INT0) { iop->status_ctrl = IOP_INT0 | IOP_RUN | IOP_AUTOINC; - iop_pr_debug("new status %02X, send states", - iop->status_ctrl); for (i = 0; i < NUM_IOP_CHAN; i++) { state = iop_readb(iop, IOP_ADDR_SEND_STATE + i); - iop_pr_cont(" %02X", state); if (state == IOP_MSG_COMPLETE) iop_handle_send(iop_num, i); + else if (state != IOP_MSG_IDLE) + iop_pr_debug("chan %d send state %02X\n", + i, state); } - iop_pr_cont("\n"); } /* INT1 for incoming messages */ if (events & IOP_INT1) { iop->status_ctrl = IOP_INT1 | IOP_RUN | IOP_AUTOINC; - iop_pr_debug("new status %02X, recv states", - iop->status_ctrl); for (i = 0; i < NUM_IOP_CHAN; i++) { state = iop_readb(iop, IOP_ADDR_RECV_STATE + i); - iop_pr_cont(" %02X", state); if (state == IOP_MSG_NEW) iop_handle_recv(iop_num, i); + else if (state != IOP_MSG_IDLE) + iop_pr_debug("chan %d recv state %02X\n", + i, state); } - iop_pr_cont("\n"); } events = iop->status_ctrl & (IOP_INT0 | IOP_INT1); 
From e3a549487f08f6326b24e92b3d87f9683f1d74a4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 17 Jun 2020 12:11:53 +0900 Subject: [PATCH 339/502] m68k: Use CLEAN_FILES to clean up files The log of 'make ARCH=m68k clean' does not look nice. $ make ARCH=m68k clean CLEAN arch/m68k/kernel [ snip ] CLEAN usr rm -f vmlinux.gz vmlinux.bz2 CLEAN vmlinux.symvers modules.builtin modules.builtin.modinfo Use CLEAN_FILES to simplify the code, and beautify the log. Signed-off-by: Masahiro Yamada Link: https://lore.kernel.org/r/20200617031153.85858-1-masahiroy@kernel.org Signed-off-by: Geert Uytterhoeven --- arch/m68k/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/m68k/Makefile b/arch/m68k/Makefile index 71ffaf5f8954..4438ffb4bbe1 100644 --- a/arch/m68k/Makefile +++ b/arch/m68k/Makefile @@ -138,8 +138,7 @@ else $(KBZIP2) -1c vmlinux >vmlinux.bz2 endif -archclean: - rm -f vmlinux.gz vmlinux.bz2 +CLEAN_FILES += vmlinux.gz vmlinux.bz2 archheaders: $(Q)$(MAKE) $(build)=arch/m68k/kernel/syscalls all From 382f429bb559fe991b1ece2e5e58c812e28b3ad8 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 6 Jul 2020 11:34:56 +0200 Subject: [PATCH 340/502] m68k: defconfig: Update defconfigs for v5.8-rc3 - Re-enable modular build of DES crypto algorithm (no longer auto-enabled since commit be01369859b8aa07 ("esp, ah: modernize the crypto algorithm selections")), - Enable modular build of prime numbers and bitops test modules. 
Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20200615075458.22088-1-geert@linux-m68k.org Link: https://lore.kernel.org/r/20200706093456.15641-1-geert@linux-m68k.org --- arch/m68k/configs/amiga_defconfig | 3 +++ arch/m68k/configs/apollo_defconfig | 3 +++ arch/m68k/configs/atari_defconfig | 3 +++ arch/m68k/configs/bvme6000_defconfig | 3 +++ arch/m68k/configs/hp300_defconfig | 3 +++ arch/m68k/configs/mac_defconfig | 3 +++ arch/m68k/configs/multi_defconfig | 3 +++ arch/m68k/configs/mvme147_defconfig | 3 +++ arch/m68k/configs/mvme16x_defconfig | 3 +++ arch/m68k/configs/q40_defconfig | 3 +++ arch/m68k/configs/sun3_defconfig | 3 +++ arch/m68k/configs/sun3x_defconfig | 3 +++ 12 files changed, 36 insertions(+) diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index 888b75e7fd79..f9f4fa595e13 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -594,6 +594,7 @@ CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_CAMELLIA=m CONFIG_CRYPTO_CAST5=m CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_DES=m CONFIG_CRYPTO_FCRYPT=m CONFIG_CRYPTO_KHAZAD=m CONFIG_CRYPTO_SALSA20=m @@ -615,6 +616,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set +CONFIG_PRIME_NUMBERS=m CONFIG_CRC32_SELFTEST=m CONFIG_CRC64=m CONFIG_XZ_DEC_TEST=m @@ -643,6 +645,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index 45303846b659..f4828e86d547 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -550,6 +550,7 @@ CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_CAMELLIA=m CONFIG_CRYPTO_CAST5=m CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_DES=m CONFIG_CRYPTO_FCRYPT=m CONFIG_CRYPTO_KHAZAD=m CONFIG_CRYPTO_SALSA20=m @@ -571,6 +572,7 @@ 
CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set +CONFIG_PRIME_NUMBERS=m CONFIG_CRC32_SELFTEST=m CONFIG_CRC64=m CONFIG_XZ_DEC_TEST=m @@ -599,6 +601,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index de824c1bc3d3..e7911f141de1 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -572,6 +572,7 @@ CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_CAMELLIA=m CONFIG_CRYPTO_CAST5=m CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_DES=m CONFIG_CRYPTO_FCRYPT=m CONFIG_CRYPTO_KHAZAD=m CONFIG_CRYPTO_SALSA20=m @@ -593,6 +594,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set +CONFIG_PRIME_NUMBERS=m CONFIG_CRC32_SELFTEST=m CONFIG_CRC64=m CONFIG_XZ_DEC_TEST=m @@ -621,6 +623,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index 071839ca6a59..d574e438e6db 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -543,6 +543,7 @@ CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_CAMELLIA=m CONFIG_CRYPTO_CAST5=m CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_DES=m CONFIG_CRYPTO_FCRYPT=m CONFIG_CRYPTO_KHAZAD=m CONFIG_CRYPTO_SALSA20=m @@ -564,6 +565,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set +CONFIG_PRIME_NUMBERS=m CONFIG_CRC32_SELFTEST=m CONFIG_CRC64=m CONFIG_XZ_DEC_TEST=m @@ -592,6 +594,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_BITOPS=m 
CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index 37ac7b019ec1..c7ce206e6138 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -552,6 +552,7 @@ CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_CAMELLIA=m CONFIG_CRYPTO_CAST5=m CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_DES=m CONFIG_CRYPTO_FCRYPT=m CONFIG_CRYPTO_KHAZAD=m CONFIG_CRYPTO_SALSA20=m @@ -573,6 +574,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set +CONFIG_PRIME_NUMBERS=m CONFIG_CRC32_SELFTEST=m CONFIG_CRC64=m CONFIG_XZ_DEC_TEST=m @@ -601,6 +603,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index 608779866260..522dcf624aa5 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -574,6 +574,7 @@ CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_CAMELLIA=m CONFIG_CRYPTO_CAST5=m CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_DES=m CONFIG_CRYPTO_FCRYPT=m CONFIG_CRYPTO_KHAZAD=m CONFIG_CRYPTO_SALSA20=m @@ -595,6 +596,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set +CONFIG_PRIME_NUMBERS=m CONFIG_CRC32_SELFTEST=m CONFIG_CRC64=m CONFIG_XZ_DEC_TEST=m @@ -623,6 +625,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index 0abb53c38c20..2433409f4369 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -660,6 +660,7 @@ CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_CAMELLIA=m 
CONFIG_CRYPTO_CAST5=m CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_DES=m CONFIG_CRYPTO_FCRYPT=m CONFIG_CRYPTO_KHAZAD=m CONFIG_CRYPTO_SALSA20=m @@ -681,6 +682,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set +CONFIG_PRIME_NUMBERS=m CONFIG_CRC32_SELFTEST=m CONFIG_CRC64=m CONFIG_XZ_DEC_TEST=m @@ -709,6 +711,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index cb14c234d3ad..5568aa7d9d41 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -542,6 +542,7 @@ CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_CAMELLIA=m CONFIG_CRYPTO_CAST5=m CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_DES=m CONFIG_CRYPTO_FCRYPT=m CONFIG_CRYPTO_KHAZAD=m CONFIG_CRYPTO_SALSA20=m @@ -563,6 +564,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set +CONFIG_PRIME_NUMBERS=m CONFIG_CRC32_SELFTEST=m CONFIG_CRC64=m CONFIG_XZ_DEC_TEST=m @@ -591,6 +593,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index e8a1920aded7..5b1e72ce53f8 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -543,6 +543,7 @@ CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_CAMELLIA=m CONFIG_CRYPTO_CAST5=m CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_DES=m CONFIG_CRYPTO_FCRYPT=m CONFIG_CRYPTO_KHAZAD=m CONFIG_CRYPTO_SALSA20=m @@ -564,6 +565,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set +CONFIG_PRIME_NUMBERS=m CONFIG_CRC32_SELFTEST=m 
CONFIG_CRC64=m CONFIG_XZ_DEC_TEST=m @@ -592,6 +594,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index 2cbf416fc725..c3a3dcf30fb9 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -561,6 +561,7 @@ CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_CAMELLIA=m CONFIG_CRYPTO_CAST5=m CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_DES=m CONFIG_CRYPTO_FCRYPT=m CONFIG_CRYPTO_KHAZAD=m CONFIG_CRYPTO_SALSA20=m @@ -582,6 +583,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set +CONFIG_PRIME_NUMBERS=m CONFIG_CRC32_SELFTEST=m CONFIG_CRC64=m CONFIG_XZ_DEC_TEST=m @@ -610,6 +612,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index fed3cc7abcc4..3c00e52f1bf0 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -545,6 +545,7 @@ CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_CAMELLIA=m CONFIG_CRYPTO_CAST5=m CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_DES=m CONFIG_CRYPTO_FCRYPT=m CONFIG_CRYPTO_KHAZAD=m CONFIG_CRYPTO_SALSA20=m @@ -566,6 +567,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set +CONFIG_PRIME_NUMBERS=m CONFIG_CRC32_SELFTEST=m CONFIG_CRC64=m CONFIG_XZ_DEC_TEST=m @@ -593,6 +595,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index 0954fde256e6..241242d73cbd 100644 --- 
a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -544,6 +544,7 @@ CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_CAMELLIA=m CONFIG_CRYPTO_CAST5=m CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_DES=m CONFIG_CRYPTO_FCRYPT=m CONFIG_CRYPTO_KHAZAD=m CONFIG_CRYPTO_SALSA20=m @@ -565,6 +566,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set +CONFIG_PRIME_NUMBERS=m CONFIG_CRC32_SELFTEST=m CONFIG_CRC64=m CONFIG_XZ_DEC_TEST=m @@ -593,6 +595,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m From f011856ce7b600fdc2d1102d56873b787ff6d1bb Mon Sep 17 00:00:00 2001 From: Jay Chen Date: Mon, 6 Jul 2020 19:22:45 +0800 Subject: [PATCH 341/502] perf/smmuv3: To simplify code for ioremap page in pmcg Use the devm_platform_get_and_ioremap_resource to simplify the code a bit. Signed-off-by: Jay Chen Link: https://lore.kernel.org/r/20200706112246.92220-2-jkchen@linux.alibaba.com Signed-off-by: Will Deacon --- drivers/perf/arm_smmuv3_pmu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c index 48e28ef93a70..2d09f3e47d12 100644 --- a/drivers/perf/arm_smmuv3_pmu.c +++ b/drivers/perf/arm_smmuv3_pmu.c @@ -755,8 +755,7 @@ static int smmu_pmu_probe(struct platform_device *pdev) .capabilities = PERF_PMU_CAP_NO_EXCLUDE, }; - res_0 = platform_get_resource(pdev, IORESOURCE_MEM, 0); - smmu_pmu->reg_base = devm_ioremap_resource(dev, res_0); + smmu_pmu->reg_base = devm_platform_get_and_ioremap_resource(pdev, 0, &res_0); if (IS_ERR(smmu_pmu->reg_base)) return PTR_ERR(smmu_pmu->reg_base); From 1583052d111f8ea43f9954c5e749164fd2b954af Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 26 Jun 2020 17:58:31 +0200 Subject: [PATCH 342/502] arm64/acpi: disallow AML memory opregions to access kernel memory AML uses 
SystemMemory opregions to allow AML handlers to access MMIO registers of, e.g., GPIO controllers, or access reserved regions of memory that are owned by the firmware. Currently, we also allow AML access to memory that is owned by the kernel and mapped via the linear region, which does not seem to be supported by a valid use case, and exposes the kernel's internal state to AML methods that may be buggy and exploitable. On arm64, ACPI support requires booting in EFI mode, and so we can cross reference the requested region against the EFI memory map, rather than just do a minimal check on the first page. So let's only permit regions to be remapped by the ACPI core if - they don't appear in the EFI memory map at all (which is the case for most MMIO), or - they are covered by a single region in the EFI memory map, which is not of a type that describes memory that is given to the kernel at boot. Reported-by: Jason A. Donenfeld Signed-off-by: Ard Biesheuvel Acked-by: Lorenzo Pieralisi Link: https://lore.kernel.org/r/20200626155832.2323789-2-ardb@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/acpi.h | 15 +------- arch/arm64/kernel/acpi.c | 66 +++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 14 deletions(-) diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h index a45366c3909b..bd68e1b7f29f 100644 --- a/arch/arm64/include/asm/acpi.h +++ b/arch/arm64/include/asm/acpi.h @@ -47,20 +47,7 @@ pgprot_t __acpi_get_mem_attribute(phys_addr_t addr); /* ACPI table mapping after acpi_permanent_mmap is set */ -static inline void __iomem *acpi_os_ioremap(acpi_physical_address phys, - acpi_size size) -{ - /* For normal memory we already have a cacheable mapping. */ - if (memblock_is_map_memory(phys)) - return (void __iomem *)__phys_to_virt(phys); - - /* - * We should still honor the memory's attribute here because - * crash dump kernel possibly excludes some ACPI (reclaim) - * regions from memblock list. 
- */ - return __ioremap(phys, size, __acpi_get_mem_attribute(phys)); -} +void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size); #define acpi_os_ioremap acpi_os_ioremap typedef u64 phys_cpuid_t; diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index a7586a4db142..01b861e225b0 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -261,6 +261,72 @@ pgprot_t __acpi_get_mem_attribute(phys_addr_t addr) return __pgprot(PROT_DEVICE_nGnRnE); } +void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) +{ + efi_memory_desc_t *md, *region = NULL; + pgprot_t prot; + + if (WARN_ON_ONCE(!efi_enabled(EFI_MEMMAP))) + return NULL; + + for_each_efi_memory_desc(md) { + u64 end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); + + if (phys < md->phys_addr || phys >= end) + continue; + + if (phys + size > end) { + pr_warn(FW_BUG "requested region covers multiple EFI memory regions\n"); + return NULL; + } + region = md; + break; + } + + /* + * It is fine for AML to remap regions that are not represented in the + * EFI memory map at all, as it only describes normal memory, and MMIO + * regions that require a virtual mapping to make them accessible to + * the EFI runtime services. + */ + prot = __pgprot(PROT_DEVICE_nGnRnE); + if (region) { + switch (region->type) { + case EFI_LOADER_CODE: + case EFI_LOADER_DATA: + case EFI_BOOT_SERVICES_CODE: + case EFI_BOOT_SERVICES_DATA: + case EFI_CONVENTIONAL_MEMORY: + case EFI_PERSISTENT_MEMORY: + pr_warn(FW_BUG "requested region covers kernel memory @ %pa\n", &phys); + return NULL; + + case EFI_ACPI_RECLAIM_MEMORY: + /* + * ACPI reclaim memory is used to pass firmware tables + * and other data that is intended for consumption by + * the OS only, which may decide it wants to reclaim + * that memory and use it for something else. We never + * do that, but we usually add it to the linear map + * anyway, in which case we should use the existing + * mapping. 
+ */ + if (memblock_is_map_memory(phys)) + return (void __iomem *)__phys_to_virt(phys); + /* fall through */ + + default: + if (region->attribute & EFI_MEMORY_WB) + prot = PAGE_KERNEL; + else if (region->attribute & EFI_MEMORY_WT) + prot = __pgprot(PROT_NORMAL_WT); + else if (region->attribute & EFI_MEMORY_WC) + prot = __pgprot(PROT_NORMAL_NC); + } + } + return __ioremap(phys, size, prot); +} + /* * Claim Synchronous External Aborts as a firmware first notification. * From 325f5585ec36953a3fe2e000451f690440fe1bf5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 26 Jun 2020 17:58:32 +0200 Subject: [PATCH 343/502] arm64/acpi: disallow writeable AML opregion mapping for EFI code regions Given that the contents of EFI runtime code and data regions are provided by the firmware, as well as the DSDT, it is not unimaginable that AML code exists today that accesses EFI runtime code regions using a SystemMemory OpRegion. There is nothing fundamentally wrong with that, but since we take great care to ensure that executable code is never mapped writeable and executable at the same time, we should not permit AML to create writable mapping. Signed-off-by: Ard Biesheuvel Acked-by: Lorenzo Pieralisi Link: https://lore.kernel.org/r/20200626155832.2323789-3-ardb@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/acpi.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index 01b861e225b0..455966401102 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -301,6 +301,15 @@ void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) pr_warn(FW_BUG "requested region covers kernel memory @ %pa\n", &phys); return NULL; + case EFI_RUNTIME_SERVICES_CODE: + /* + * This would be unusual, but not problematic per se, + * as long as we take care not to create a writable + * mapping for executable code. 
+ */ + prot = PAGE_KERNEL_RO; + break; + case EFI_ACPI_RECLAIM_MEMORY: /* * ACPI reclaim memory is used to pass firmware tables From 0de674afe83cb23676ec391470251aaa9700f21a Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 10 Jul 2020 19:24:02 +0100 Subject: [PATCH 344/502] arm64: stacktrace: Move export for save_stack_trace_tsk() Due to refactoring way back in bb53c820c5b0f1 ("arm64: stacktrace: avoid listing stacktrace functions in stacktrace") the EXPORT_SYMBOL_GPL() for save_stack_trace_tsk() is at the end of __save_stack_trace() rather than the function it exports. Move it to the expected location. Signed-off-by: Mark Brown Acked-by: Will Deacon Link: https://lore.kernel.org/r/20200710182402.50473-1-broonie@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/stacktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 139679c745bf..2dd8e3b8b94b 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -199,12 +199,12 @@ static noinline void __save_stack_trace(struct task_struct *tsk, put_task_stack(tsk); } -EXPORT_SYMBOL_GPL(save_stack_trace_tsk); void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { __save_stack_trace(tsk, trace, 1); } +EXPORT_SYMBOL_GPL(save_stack_trace_tsk); void save_stack_trace(struct stack_trace *trace) { From abb7962adc80ab4f4313e8a065302525b6a9c2dc Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 1 Jul 2020 10:12:01 +0530 Subject: [PATCH 345/502] arm64/hugetlb: Reserve CMA areas for gigantic pages on 16K and 64K configs Currently 'hugetlb_cma=' command line argument does not create CMA area on ARM64_16K_PAGES and ARM64_64K_PAGES based platforms. Instead, it just ends up with the following warning message. Reason being, hugetlb_cma_reserve() never gets called for these huge page sizes. 
[ 64.255669] hugetlb_cma: the option isn't supported by current arch This enables CMA areas reservation on ARM64_16K_PAGES and ARM64_64K_PAGES configs by defining a unified arm64_hugetlb_cma_reserve() that is wrapped in CONFIG_CMA. Call site for arm64_hugetlb_cma_reserve() is also protected as <asm/hugetlb.h> is conditionally included and hence cannot contain stub for the inverse config i.e !(CONFIG_HUGETLB_PAGE && CONFIG_CMA). Signed-off-by: Anshuman Khandual Cc: Will Deacon Cc: Mark Rutland Cc: Mike Kravetz Cc: Barry Song Cc: Andrew Morton Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Link: https://lore.kernel.org/r/1593578521-24672-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/hugetlb.h | 2 ++ arch/arm64/mm/hugetlbpage.c | 38 ++++++++++++++++++++++++++++++++ arch/arm64/mm/init.c | 4 ++-- 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 94ba0c5bced2..5abf91e3494c 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -49,6 +49,8 @@ extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned long sz); #define set_huge_swap_pte_at set_huge_swap_pte_at +void __init arm64_hugetlb_cma_reserve(void); + #include #endif /* __ASM_HUGETLB_H */ diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index c79084739096..aa421bf4956e 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -19,6 +19,44 @@ #include #include +/* + * HugeTLB Support Matrix + * + * --------------------------------------------------- + * | Page Size | CONT PTE | PMD | CONT PMD | PUD | + * --------------------------------------------------- + * | 4K | 64K | 2M | 32M | 1G | + * | 16K | 2M | 32M | 1G | | + * | 64K | 2M | 512M | 16G | | + * --------------------------------------------------- + */ + +/* + * Reserve CMA areas for the 
largest supported gigantic + * huge page when requested. Any other smaller gigantic + * huge pages could still be served from those areas. + */ +#ifdef CONFIG_CMA +void __init arm64_hugetlb_cma_reserve(void) +{ + int order; + +#ifdef CONFIG_ARM64_4K_PAGES + order = PUD_SHIFT - PAGE_SHIFT; +#else + order = CONT_PMD_SHIFT + PMD_SHIFT - PAGE_SHIFT; +#endif + /* + * HugeTLB CMA reservation is required for gigantic + * huge pages which could not be allocated via the + * page allocator. Just warn if there is any change + * breaking this assumption. + */ + WARN_ON(order <= MAX_ORDER); + hugetlb_cma_reserve(order); +} +#endif /* CONFIG_CMA */ + #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION bool arch_hugetlb_migration_supported(struct hstate *h) { diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 6c3eb424c613..f8c19c6c8e71 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -425,8 +425,8 @@ void __init bootmem_init(void) * initialize node_online_map that gets used in hugetlb_cma_reserve() * while allocating required CMA size across online nodes. */ -#ifdef CONFIG_ARM64_4K_PAGES - hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT); +#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA) + arm64_hugetlb_cma_reserve(); #endif /* From b620ba54547cd0f98e35c1be102eec2cc25fda5d Mon Sep 17 00:00:00 2001 From: Zhenyu Ye Date: Wed, 15 Jul 2020 15:19:43 +0800 Subject: [PATCH 346/502] arm64: tlb: Detect the ARMv8.4 TLBI RANGE feature ARMv8.4-TLBI provides TLBI invalidation instruction that apply to a range of input addresses. This patch detect this feature. 
Signed-off-by: Zhenyu Ye Link: https://lore.kernel.org/r/20200715071945.897-2-yezhenyu2@huawei.com [catalin.marinas@arm.com: some renaming for consistency] Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cpucaps.h | 3 ++- arch/arm64/include/asm/sysreg.h | 3 +++ arch/arm64/kernel/cpufeature.c | 10 ++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index d44ba903d11d..07b643a70710 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -63,7 +63,8 @@ #define ARM64_HAS_32BIT_EL1 53 #define ARM64_BTI 54 #define ARM64_HAS_ARMv8_4_TTL 55 +#define ARM64_HAS_TLB_RANGE 56 -#define ARM64_NCAPS 56 +#define ARM64_NCAPS 57 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 8c209aa17273..551f30ace4db 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -617,6 +617,9 @@ #define ID_AA64ISAR0_SHA1_SHIFT 8 #define ID_AA64ISAR0_AES_SHIFT 4 +#define ID_AA64ISAR0_TLB_RANGE_NI 0x0 +#define ID_AA64ISAR0_TLB_RANGE 0x2 + /* id_aa64isar1 */ #define ID_AA64ISAR1_I8MM_SHIFT 52 #define ID_AA64ISAR1_DGH_SHIFT 48 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index e877f56ff1ab..2f5adefef34d 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1893,6 +1893,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .min_field_value = 1, .matches = has_cpuid_feature, }, + { + .desc = "TLB range maintenance instructions", + .capability = ARM64_HAS_TLB_RANGE, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64ISAR0_EL1, + .field_pos = ID_AA64ISAR0_TLB_SHIFT, + .sign = FTR_UNSIGNED, + .min_field_value = ID_AA64ISAR0_TLB_RANGE, + }, #ifdef CONFIG_ARM64_HW_AFDBM { /* From 7c78f67e9bd97478d56157c2ad53823668b5b822 Mon Sep 17 00:00:00 2001 From: Zhenyu Ye Date: Wed, 
15 Jul 2020 15:19:44 +0800 Subject: [PATCH 347/502] arm64: enable tlbi range instructions TLBI RANGE feature introduces new assembly instructions and is only supported by binutils >= 2.30. Add necessary Kconfig logic to allow this to be enabled and pass '-march=armv8.4-a' to KBUILD_CFLAGS. Signed-off-by: Zhenyu Ye Link: https://lore.kernel.org/r/20200715071945.897-3-yezhenyu2@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 14 ++++++++++++++ arch/arm64/Makefile | 7 +++++++ 2 files changed, 21 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 66dc41fd49f2..0f39468dbc60 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1596,6 +1596,20 @@ config ARM64_AMU_EXTN correctly reflect reality. Most commonly, the value read will be 0, indicating that the counter is not enabled. +config AS_HAS_ARMV8_4 + def_bool $(cc-option,-Wa$(comma)-march=armv8.4-a) + +config ARM64_TLB_RANGE + bool "Enable support for tlbi range feature" + default y + depends on AS_HAS_ARMV8_4 + help + ARMv8.4-TLBI provides TLBI invalidation instruction that apply to a + range of input addresses. + + The feature introduces new assembly instructions, and they were + support when binutils >= 2.30. + endmenu menu "ARMv8.5 architectural features" diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index a0d94d063fa8..4e823b97c92e 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -82,11 +82,18 @@ endif # compiler to generate them and consequently to break the single image contract # we pass it only to the assembler. This option is utilized only in case of non # integrated assemblers. +ifneq ($(CONFIG_AS_HAS_ARMV8_4), y) branch-prot-flags-$(CONFIG_AS_HAS_PAC) += -Wa,-march=armv8.3-a endif +endif KBUILD_CFLAGS += $(branch-prot-flags-y) +ifeq ($(CONFIG_AS_HAS_ARMV8_4), y) +# make sure to pass the newest target architecture to -march. 
+KBUILD_CFLAGS += -Wa,-march=armv8.4-a +endif + ifeq ($(CONFIG_SHADOW_CALL_STACK), y) KBUILD_CFLAGS += -ffixed-x18 endif From d1d3aa98b1d4826a19adfefb69b96142a0cac633 Mon Sep 17 00:00:00 2001 From: Zhenyu Ye Date: Wed, 15 Jul 2020 15:19:45 +0800 Subject: [PATCH 348/502] arm64: tlb: Use the TLBI RANGE feature in arm64 Add __TLBI_VADDR_RANGE macro and rewrite __flush_tlb_range(). When cpu supports TLBI feature, the minimum range granularity is decided by 'scale', so we can not flush all pages by one instruction in some cases. For example, when the pages = 0xe81a, let's start 'scale' from maximum, and find right 'num' for each 'scale': 1. scale = 3, we can flush no pages because the minimum range is 2^(5*3 + 1) = 0x10000. 2. scale = 2, the minimum range is 2^(5*2 + 1) = 0x800, we can flush 0xe800 pages this time, the num = 0xe800/0x800 - 1 = 0x1c. Remaining pages is 0x1a; 3. scale = 1, the minimum range is 2^(5*1 + 1) = 0x40, no page can be flushed. 4. scale = 0, we flush the remaining 0x1a pages, the num = 0x1a/0x2 - 1 = 0xd. However, in most scenarios, the pages = 1 when flush_tlb_range() is called. Start from scale = 3 or other proper value (such as scale = ilog2(pages)), will incur extra overhead. So increase 'scale' from 0 to maximum, the flush order is exactly opposite to the example. 
Signed-off-by: Zhenyu Ye Link: https://lore.kernel.org/r/20200715071945.897-4-yezhenyu2@huawei.com [catalin.marinas@arm.com: removed unnecessary masks in __TLBI_VADDR_RANGE] [catalin.marinas@arm.com: __TLB_RANGE_NUM subtracts 1] [catalin.marinas@arm.com: minor adjustments to the comments] [catalin.marinas@arm.com: introduce system_supports_tlb_range()] Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cpufeature.h | 6 ++ arch/arm64/include/asm/tlbflush.h | 156 ++++++++++++++++++++++------ 2 files changed, 132 insertions(+), 30 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 5d1f4ae42799..cf56daa95a7d 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -692,6 +692,12 @@ static inline bool system_supports_bti(void) return IS_ENABLED(CONFIG_ARM64_BTI) && cpus_have_const_cap(ARM64_BTI); } +static inline bool system_supports_tlb_range(void) +{ + return IS_ENABLED(CONFIG_ARM64_TLB_RANGE) && + cpus_have_const_cap(ARM64_HAS_TLB_RANGE); +} + #define ARM64_BP_HARDEN_UNKNOWN -1 #define ARM64_BP_HARDEN_WA_NEEDED 0 #define ARM64_BP_HARDEN_NOT_REQUIRED 1 diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 2cb275efcea3..d493174415db 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -60,6 +60,31 @@ __ta; \ }) +/* + * Get translation granule of the system, which is decided by + * PAGE_SIZE. Used by TTL. + * - 4KB : 1 + * - 16KB : 2 + * - 64KB : 3 + */ +#define TLBI_TTL_TG_4K 1 +#define TLBI_TTL_TG_16K 2 +#define TLBI_TTL_TG_64K 3 + +static inline unsigned long get_trans_granule(void) +{ + switch (PAGE_SIZE) { + case SZ_4K: + return TLBI_TTL_TG_4K; + case SZ_16K: + return TLBI_TTL_TG_16K; + case SZ_64K: + return TLBI_TTL_TG_64K; + default: + return 0; + } +} + /* * Level-based TLBI operations. * @@ -73,9 +98,6 @@ * in asm/stage2_pgtable.h. 
*/ #define TLBI_TTL_MASK GENMASK_ULL(47, 44) -#define TLBI_TTL_TG_4K 1 -#define TLBI_TTL_TG_16K 2 -#define TLBI_TTL_TG_64K 3 #define __tlbi_level(op, addr, level) do { \ u64 arg = addr; \ @@ -83,19 +105,7 @@ if (cpus_have_const_cap(ARM64_HAS_ARMv8_4_TTL) && \ level) { \ u64 ttl = level & 3; \ - \ - switch (PAGE_SIZE) { \ - case SZ_4K: \ - ttl |= TLBI_TTL_TG_4K << 2; \ - break; \ - case SZ_16K: \ - ttl |= TLBI_TTL_TG_16K << 2; \ - break; \ - case SZ_64K: \ - ttl |= TLBI_TTL_TG_64K << 2; \ - break; \ - } \ - \ + ttl |= get_trans_granule() << 2; \ arg &= ~TLBI_TTL_MASK; \ arg |= FIELD_PREP(TLBI_TTL_MASK, ttl); \ } \ @@ -108,6 +118,44 @@ __tlbi_level(op, (arg | USER_ASID_FLAG), level); \ } while (0) +/* + * This macro creates a properly formatted VA operand for the TLB RANGE. + * The value bit assignments are: + * + * +----------+------+-------+-------+-------+----------------------+ + * | ASID | TG | SCALE | NUM | TTL | BADDR | + * +-----------------+-------+-------+-------+----------------------+ + * |63 48|47 46|45 44|43 39|38 37|36 0| + * + * The address range is determined by below formula: + * [BADDR, BADDR + (NUM + 1) * 2^(5*SCALE + 1) * PAGESIZE) + * + */ +#define __TLBI_VADDR_RANGE(addr, asid, scale, num, ttl) \ + ({ \ + unsigned long __ta = (addr) >> PAGE_SHIFT; \ + __ta &= GENMASK_ULL(36, 0); \ + __ta |= (unsigned long)(ttl) << 37; \ + __ta |= (unsigned long)(num) << 39; \ + __ta |= (unsigned long)(scale) << 44; \ + __ta |= get_trans_granule() << 46; \ + __ta |= (unsigned long)(asid) << 48; \ + __ta; \ + }) + +/* These macros are used by the TLBI RANGE feature. */ +#define __TLBI_RANGE_PAGES(num, scale) \ + ((unsigned long)((num) + 1) << (5 * (scale) + 1)) +#define MAX_TLBI_RANGE_PAGES __TLBI_RANGE_PAGES(31, 3) + +/* + * Generate 'num' values from -1 to 30 with -1 rejected by the + * __flush_tlb_range() loop below. 
+ */ +#define TLBI_RANGE_MASK GENMASK_ULL(4, 0) +#define __TLBI_RANGE_NUM(pages, scale) \ + ((((pages) >> (5 * (scale) + 1)) & TLBI_RANGE_MASK) - 1) + /* * TLB Invalidation * ================ @@ -231,32 +279,80 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, unsigned long stride, bool last_level, int tlb_level) { + int num = 0; + int scale = 0; unsigned long asid = ASID(vma->vm_mm); unsigned long addr; + unsigned long pages; start = round_down(start, stride); end = round_up(end, stride); + pages = (end - start) >> PAGE_SHIFT; - if ((end - start) >= (MAX_TLBI_OPS * stride)) { + /* + * When not uses TLB range ops, we can handle up to + * (MAX_TLBI_OPS - 1) pages; + * When uses TLB range ops, we can handle up to + * (MAX_TLBI_RANGE_PAGES - 1) pages. + */ + if ((!system_supports_tlb_range() && + (end - start) >= (MAX_TLBI_OPS * stride)) || + pages >= MAX_TLBI_RANGE_PAGES) { flush_tlb_mm(vma->vm_mm); return; } - /* Convert the stride into units of 4k */ - stride >>= 12; - - start = __TLBI_VADDR(start, asid); - end = __TLBI_VADDR(end, asid); - dsb(ishst); - for (addr = start; addr < end; addr += stride) { - if (last_level) { - __tlbi_level(vale1is, addr, tlb_level); - __tlbi_user_level(vale1is, addr, tlb_level); - } else { - __tlbi_level(vae1is, addr, tlb_level); - __tlbi_user_level(vae1is, addr, tlb_level); + + /* + * When the CPU does not support TLB range operations, flush the TLB + * entries one by one at the granularity of 'stride'. If the the TLB + * range ops are supported, then: + * + * 1. If 'pages' is odd, flush the first page through non-range + * operations; + * + * 2. For remaining pages: the minimum range granularity is decided + * by 'scale', so multiple range TLBI operations may be required. + * Start from scale = 0, flush the corresponding number of pages + * ((num+1)*2^(5*scale+1) starting from 'addr'), then increase it + * until no pages left. 
+ * + * Note that certain ranges can be represented by either num = 31 and + * scale or num = 0 and scale + 1. The loop below favours the latter + * since num is limited to 30 by the __TLBI_RANGE_NUM() macro. + */ + while (pages > 0) { + if (!system_supports_tlb_range() || + pages % 2 == 1) { + addr = __TLBI_VADDR(start, asid); + if (last_level) { + __tlbi_level(vale1is, addr, tlb_level); + __tlbi_user_level(vale1is, addr, tlb_level); + } else { + __tlbi_level(vae1is, addr, tlb_level); + __tlbi_user_level(vae1is, addr, tlb_level); + } + start += stride; + pages -= stride >> PAGE_SHIFT; + continue; } + + num = __TLBI_RANGE_NUM(pages, scale); + if (num >= 0) { + addr = __TLBI_VADDR_RANGE(start, asid, scale, + num, tlb_level); + if (last_level) { + __tlbi(rvale1is, addr); + __tlbi_user(rvale1is, addr); + } else { + __tlbi(rvae1is, addr); + __tlbi_user(rvae1is, addr); + } + start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT; + pages -= __TLBI_RANGE_PAGES(num, scale); + } + scale++; } dsb(ish); } From 5be542e945cb39a2457aa2cfe8b84aac95ef0f2d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 16 Jul 2020 16:36:50 +1000 Subject: [PATCH 349/502] lockdep: Move list.h inclusion into lockdep.h Currently lockdep_types.h includes list.h without actually using any of its macros or functions. All it needs are the type definitions which were moved into types.h long ago. This potentially causes inclusion loops because both are included by many core header files. This patch moves the list.h inclusion into lockdep.h. Note that we could probably remove it completely but that could potentially result in compile failures should any end users not include list.h directly and also be unlucky enough to not get list.h via some other header file. 
Reported-by: Petr Mladek Signed-off-by: Herbert Xu Signed-off-by: Peter Zijlstra (Intel) Tested-by: Petr Mladek Link: https://lkml.kernel.org/r/20200716063649.GA23065@gondor.apana.org.au --- include/linux/lockdep.h | 1 + include/linux/lockdep_types.h | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index fd04b9e96091..7aafba0ddcf9 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -22,6 +22,7 @@ extern int lock_stat; #ifdef CONFIG_LOCKDEP #include +#include #include #include diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h index 7b9350624577..bb35b449f533 100644 --- a/include/linux/lockdep_types.h +++ b/include/linux/lockdep_types.h @@ -32,8 +32,6 @@ enum lockdep_wait_type { #ifdef CONFIG_LOCKDEP -#include - /* * We'd rather not expose kernel/lockdep_states.h this wide, but we do need * the total number of states... :-( From 482cbb6cc33dca60091048631cd0a8dde72c3da7 Mon Sep 17 00:00:00 2001 From: "Alexander A. Klimov" Date: Mon, 13 Jul 2020 13:57:28 +0200 Subject: [PATCH 350/502] docs: locking: Replace HTTP links with HTTPS ones Rationale: Reduces attack surface on kernel devs opening the links for MITM as HTTPS traffic is much harder to manipulate. Deterministic algorithm: For each file: If not .svg: For each line: If doesn't contain `\bxmlns\b`: For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`: If neither `\bgnu\.org/license`, nor `\bmozilla\.org/MPL\b`: If both the HTTP and HTTPS versions return 200 OK and serve the same content: Replace HTTP with HTTPS. Signed-off-by: Alexander A. 
Klimov Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200713115728.33905-1-grandmaster@al2klimov.de --- Documentation/locking/mutex-design.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/locking/mutex-design.rst b/Documentation/locking/mutex-design.rst index 4d8236b81fa5..8f3e9a5141f9 100644 --- a/Documentation/locking/mutex-design.rst +++ b/Documentation/locking/mutex-design.rst @@ -18,7 +18,7 @@ as an alternative to these. This new data structure provided a number of advantages, including simpler interfaces, and at that time smaller code (see Disadvantages). -[1] http://lwn.net/Articles/164802/ +[1] https://lwn.net/Articles/164802/ Implementation -------------- From a9232dc5607dbada801f2fe83ea307cda762969a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 11 Jul 2020 17:59:54 +0300 Subject: [PATCH 351/502] rwsem: fix commas in initialisation Leading comma prevents arbitrary reordering of initialisation clauses. The whole point of C99 initialisation is to allow any such reordering. 
Signed-off-by: Alexey Dobriyan Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200711145954.GA1178171@localhost.localdomain --- include/linux/rwsem.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 7e5b2a4eb560..25e3fde85617 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -60,39 +60,39 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem) } #define RWSEM_UNLOCKED_VALUE 0L -#define __RWSEM_INIT_COUNT(name) .count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE) +#define __RWSEM_COUNT_INIT(name) .count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE) /* Common initializer macros and functions */ #ifdef CONFIG_DEBUG_LOCK_ALLOC # define __RWSEM_DEP_MAP_INIT(lockname) \ - , .dep_map = { \ + .dep_map = { \ .name = #lockname, \ .wait_type_inner = LD_WAIT_SLEEP, \ - } + }, #else # define __RWSEM_DEP_MAP_INIT(lockname) #endif #ifdef CONFIG_DEBUG_RWSEMS -# define __DEBUG_RWSEM_INITIALIZER(lockname) , .magic = &lockname +# define __RWSEM_DEBUG_INIT(lockname) .magic = &lockname, #else -# define __DEBUG_RWSEM_INITIALIZER(lockname) +# define __RWSEM_DEBUG_INIT(lockname) #endif #ifdef CONFIG_RWSEM_SPIN_ON_OWNER -#define __RWSEM_OPT_INIT(lockname) , .osq = OSQ_LOCK_UNLOCKED +#define __RWSEM_OPT_INIT(lockname) .osq = OSQ_LOCK_UNLOCKED, #else #define __RWSEM_OPT_INIT(lockname) #endif #define __RWSEM_INITIALIZER(name) \ - { __RWSEM_INIT_COUNT(name), \ + { __RWSEM_COUNT_INIT(name), \ .owner = ATOMIC_LONG_INIT(0), \ - .wait_list = LIST_HEAD_INIT((name).wait_list), \ - .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock) \ __RWSEM_OPT_INIT(name) \ - __DEBUG_RWSEM_INITIALIZER(name) \ + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),\ + .wait_list = LIST_HEAD_INIT((name).wait_list), \ + __RWSEM_DEBUG_INIT(name) \ __RWSEM_DEP_MAP_INIT(name) } #define DECLARE_RWSEM(name) \ From 0f85c4805184765ff35e0079b3241ee8f25d1b2b Mon Sep 17 00:00:00 2001 From: Qinglang 
Miao Date: Thu, 16 Jul 2020 16:47:47 +0800 Subject: [PATCH 352/502] debugobjects: Convert to DEFINE_SHOW_ATTRIBUTE Use DEFINE_SHOW_ATTRIBUTE macro to simplify the code. [ tglx: Distangled it from the mess in -next ] Signed-off-by: Qinglang Miao Signed-off-by: Thomas Gleixner Cc: hch@lst.de Link: https://lkml.kernel.org/r/20200716084747.8034-1-miaoqinglang@huawei.com --- lib/debugobjects.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 48054dbf1b51..fe4557955d97 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -1022,18 +1022,7 @@ static int debug_stats_show(struct seq_file *m, void *v) seq_printf(m, "objs_freed :%d\n", debug_objects_freed); return 0; } - -static int debug_stats_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, debug_stats_show, NULL); -} - -static const struct file_operations debug_stats_fops = { - .open = debug_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(debug_stats); static int __init debug_objects_init_debugfs(void) { From 9180bd467f9abdb44afde650d07e3b9dd66d837c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Thu, 2 Jul 2020 17:28:40 -0300 Subject: [PATCH 353/502] futex: Remove put_futex_key() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since 4b39f99c ("futex: Remove {get,drop}_futex_key_refs()"), put_futex_key() is empty. Remove all references for this function and the then redundant labels. 
Signed-off-by: André Almeida Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200702202843.520764-2-andrealmeid@collabora.com --- kernel/futex.c | 61 ++++++++++---------------------------------------- 1 file changed, 12 insertions(+), 49 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index e646661f6282..bd9adfca5d51 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -677,10 +677,6 @@ out: return err; } -static inline void put_futex_key(union futex_key *key) -{ -} - /** * fault_in_user_writeable() - Fault in user address and verify RW access * @uaddr: pointer to faulting user space address @@ -1617,7 +1613,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) /* Make sure we really have tasks to wakeup */ if (!hb_waiters_pending(hb)) - goto out_put_key; + goto out; spin_lock(&hb->lock); @@ -1640,8 +1636,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) spin_unlock(&hb->lock); wake_up_q(&wake_q); -out_put_key: - put_futex_key(&key); out: return ret; } @@ -1712,7 +1706,7 @@ retry: goto out; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) - goto out_put_key1; + goto out; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -1730,13 +1724,13 @@ retry_private: * an MMU, but we might get them from range checking */ ret = op_ret; - goto out_put_keys; + goto out; } if (op_ret == -EFAULT) { ret = fault_in_user_writeable(uaddr2); if (ret) - goto out_put_keys; + goto out; } if (!(flags & FLAGS_SHARED)) { @@ -1744,8 +1738,6 @@ retry_private: goto retry_private; } - put_futex_key(&key2); - put_futex_key(&key1); cond_resched(); goto retry; } @@ -1781,10 +1773,6 @@ retry_private: out_unlock: double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); -out_put_keys: - put_futex_key(&key2); -out_put_key1: - put_futex_key(&key1); out: return ret; } @@ -1996,7 +1984,7 @@ retry: ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, requeue_pi ? 
FUTEX_WRITE : FUTEX_READ); if (unlikely(ret != 0)) - goto out_put_key1; + goto out; /* * The check above which compares uaddrs is not sufficient for @@ -2004,7 +1992,7 @@ retry: */ if (requeue_pi && match_futex(&key1, &key2)) { ret = -EINVAL; - goto out_put_keys; + goto out; } hb1 = hash_futex(&key1); @@ -2025,13 +2013,11 @@ retry_private: ret = get_user(curval, uaddr1); if (ret) - goto out_put_keys; + goto out; if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(&key2); - put_futex_key(&key1); goto retry; } if (curval != *cmpval) { @@ -2090,8 +2076,6 @@ retry_private: case -EFAULT: double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); - put_futex_key(&key2); - put_futex_key(&key1); ret = fault_in_user_writeable(uaddr2); if (!ret) goto retry; @@ -2106,8 +2090,6 @@ retry_private: */ double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); - put_futex_key(&key2); - put_futex_key(&key1); /* * Handle the case where the owner is in the middle of * exiting. Wait for the exit to complete otherwise @@ -2217,10 +2199,6 @@ out_unlock: wake_up_q(&wake_q); hb_waiters_dec(hb2); -out_put_keys: - put_futex_key(&key2); -out_put_key1: - put_futex_key(&key1); out: return ret ? ret : task_count; } @@ -2697,7 +2675,6 @@ retry_private: if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(&q->key); goto retry; } @@ -2707,8 +2684,6 @@ retry_private: } out: - if (ret) - put_futex_key(&q->key); return ret; } @@ -2853,7 +2828,6 @@ retry_private: * - EAGAIN: The user space value changed. */ queue_unlock(hb); - put_futex_key(&q.key); /* * Handle the case where the owner is in the middle of * exiting. 
Wait for the exit to complete otherwise @@ -2961,13 +2935,11 @@ no_block: put_pi_state(pi_state); } - goto out_put_key; + goto out; out_unlock_put_key: queue_unlock(hb); -out_put_key: - put_futex_key(&q.key); out: if (to) { hrtimer_cancel(&to->timer); @@ -2980,12 +2952,11 @@ uaddr_faulted: ret = fault_in_user_writeable(uaddr); if (ret) - goto out_put_key; + goto out; if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(&q.key); goto retry; } @@ -3114,16 +3085,13 @@ retry: out_unlock: spin_unlock(&hb->lock); out_putkey: - put_futex_key(&key); return ret; pi_retry: - put_futex_key(&key); cond_resched(); goto retry; pi_faulted: - put_futex_key(&key); ret = fault_in_user_writeable(uaddr); if (!ret) @@ -3265,7 +3233,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) - goto out_key2; + goto out; /* * The check above which compares uaddrs is not sufficient for @@ -3274,7 +3242,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (match_futex(&q.key, &key2)) { queue_unlock(hb); ret = -EINVAL; - goto out_put_keys; + goto out; } /* Queue the futex_q, drop the hb lock, wait for wakeup. 
*/ @@ -3284,7 +3252,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); spin_unlock(&hb->lock); if (ret) - goto out_put_keys; + goto out; /* * In order for us to be here, we know our q.key == key2, and since @@ -3374,11 +3342,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ret = -EWOULDBLOCK; } -out_put_keys: - put_futex_key(&q.key); -out_key2: - put_futex_key(&key2); - out: if (to) { hrtimer_cancel(&to->timer); From d7c5ed73b19c4640426d9c106f70ec2cb532034d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Thu, 2 Jul 2020 17:28:41 -0300 Subject: [PATCH 354/502] futex: Remove needless goto's MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As stated in the coding style documentation, "if there is no cleanup needed then just return directly", instead of jumping to a label and then returning. Remove such goto's and replace with a return statement. When there's a ternary operator on the return value, replace it with the result of the operation when it is logically possible to determine it by the control flow. 
Signed-off-by: André Almeida Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200702202843.520764-3-andrealmeid@collabora.com --- kernel/futex.c | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index bd9adfca5d51..362fbca6d614 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1607,13 +1607,13 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); if (unlikely(ret != 0)) - goto out; + return ret; hb = hash_futex(&key); /* Make sure we really have tasks to wakeup */ if (!hb_waiters_pending(hb)) - goto out; + return ret; spin_lock(&hb->lock); @@ -1636,7 +1636,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) spin_unlock(&hb->lock); wake_up_q(&wake_q); -out: return ret; } @@ -1703,10 +1702,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, retry: ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); if (unlikely(ret != 0)) - goto out; + return ret; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) - goto out; + return ret; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -1724,13 +1723,13 @@ retry_private: * an MMU, but we might get them from range checking */ ret = op_ret; - goto out; + return ret; } if (op_ret == -EFAULT) { ret = fault_in_user_writeable(uaddr2); if (ret) - goto out; + return ret; } if (!(flags & FLAGS_SHARED)) { @@ -1773,7 +1772,6 @@ retry_private: out_unlock: double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); -out: return ret; } @@ -1980,20 +1978,18 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, retry: ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); if (unlikely(ret != 0)) - goto out; + return ret; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, requeue_pi ? 
FUTEX_WRITE : FUTEX_READ); if (unlikely(ret != 0)) - goto out; + return ret; /* * The check above which compares uaddrs is not sufficient for * shared futexes. We need to compare the keys: */ - if (requeue_pi && match_futex(&key1, &key2)) { - ret = -EINVAL; - goto out; - } + if (requeue_pi && match_futex(&key1, &key2)) + return -EINVAL; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -2013,7 +2009,7 @@ retry_private: ret = get_user(curval, uaddr1); if (ret) - goto out; + return ret; if (!(flags & FLAGS_SHARED)) goto retry_private; @@ -2079,7 +2075,7 @@ retry_private: ret = fault_in_user_writeable(uaddr2); if (!ret) goto retry; - goto out; + return ret; case -EBUSY: case -EAGAIN: /* @@ -2198,8 +2194,6 @@ out_unlock: double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); hb_waiters_dec(hb2); - -out: return ret ? ret : task_count; } @@ -2545,7 +2539,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) */ if (q->pi_state->owner != current) ret = fixup_pi_state_owner(uaddr, q, current); - goto out; + return ret ? ret : locked; } /* @@ -2558,7 +2552,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) */ if (q->pi_state->owner == current) { ret = fixup_pi_state_owner(uaddr, q, NULL); - goto out; + return ret; } /* @@ -2572,8 +2566,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) q->pi_state->owner); } -out: - return ret ? 
ret : locked; + return ret; } /** @@ -2670,7 +2663,7 @@ retry_private: ret = get_user(uval, uaddr); if (ret) - goto out; + return ret; if (!(flags & FLAGS_SHARED)) goto retry_private; @@ -2683,7 +2676,6 @@ retry_private: ret = -EWOULDBLOCK; } -out: return ret; } From 9261308598ad28b9a8a2237d881833e9f217244e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Thu, 2 Jul 2020 17:28:43 -0300 Subject: [PATCH 355/502] futex: Consistently use fshared as boolean MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since fshared is only conveying true/false values, declare it as bool. In get_futex_key() the usage of fshared can be restricted to the first part of the function. If fshared is false the function is terminated early and the subsequent code can use a constant 'true' instead of the variable. Signed-off-by: André Almeida Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200702202843.520764-5-andrealmeid@collabora.com --- kernel/futex.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 362fbca6d614..cda91755b77d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -476,7 +476,7 @@ static u64 get_inode_sequence_number(struct inode *inode) /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex - * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED + * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED * @key: address where result is stored. * @rw: mapping needs to be read/write (values: FUTEX_READ, * FUTEX_WRITE) @@ -500,8 +500,8 @@ static u64 get_inode_sequence_number(struct inode *inode) * * lock_page() might sleep, the caller should not hold a spinlock. 
*/ -static int -get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_access rw) +static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, + enum futex_access rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; @@ -538,7 +538,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a again: /* Ignore any VERIFY_READ mapping (futex common case) */ - if (unlikely(should_fail_futex(fshared))) + if (unlikely(should_fail_futex(true))) return -EFAULT; err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); @@ -626,7 +626,7 @@ again: * A RO anonymous page will never change and thus doesn't make * sense for futex operations. */ - if (unlikely(should_fail_futex(fshared)) || ro) { + if (unlikely(should_fail_futex(true)) || ro) { err = -EFAULT; goto out; } From 9a71df495c3d29dab596bb590e73fd8b20106e2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Thu, 2 Jul 2020 17:28:42 -0300 Subject: [PATCH 356/502] futex: Remove unused or redundant includes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since 82af7aca ("Removal of FUTEX_FD"), some includes related to file operations aren't needed anymore. More investigation around the includes showed that a lot of includes aren't required for compilation, possibly due to redundant includes. Simplify the code by removing unused includes. Signed-off-by: André Almeida Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200702202843.520764-4-andrealmeid@collabora.com --- kernel/futex.c | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index cda91755b77d..4616d4ad609d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -32,30 +32,13 @@ * "But they come in a choice of three flavours!"
*/ #include -#include -#include -#include -#include #include -#include -#include -#include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include #include #include #include -#include #include From 7904aaa8b22fa07fd5457ee4a885cf9f665cb9c4 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 14 Jul 2020 07:43:26 +0200 Subject: [PATCH 357/502] s390/mm: fix typo in comment Signed-off-by: Heiko Carstens --- arch/s390/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index d53c2e2ea1fd..598828517d9d 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -376,7 +376,7 @@ static noinline void do_fault_error(struct pt_regs *regs, int access, * routines. * * interruption code (int_code): - * 04 Protection -> Write-Protection (suprression) + * 04 Protection -> Write-Protection (suppression) * 10 Segment translation -> Not present (nullification) * 11 Page translation -> Not present (nullification) * 3b Region third trans. -> Not present (nullification) From 529683d4705b6b1fa1c2f902e859ad6a8d17e31e Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Mon, 15 Jun 2020 17:23:11 +0200 Subject: [PATCH 358/502] s390/qdio: fix statistics for 128 SBALs Old code would only scan up to 127 SBALs at once. So the last statistics bucket was set aside to count "discovered 127 SBALs with new work" events. But nowadays we allow to scan all 128 SBALs for Output Queues, and a subsequent patch will introduce the same for Input Queues. So fix up the accounting to use the last bucket only when all 128 SBALs have been discovered with new work. 
Signed-off-by: Julian Wiedmann Signed-off-by: Heiko Carstens --- drivers/s390/cio/qdio.h | 6 +----- drivers/s390/cio/qdio_debug.c | 2 +- drivers/s390/cio/qdio_main.c | 9 +-------- 3 files changed, 3 insertions(+), 14 deletions(-) diff --git a/drivers/s390/cio/qdio.h b/drivers/s390/cio/qdio.h index bb1c8402c67d..7f0aa95585a4 100644 --- a/drivers/s390/cio/qdio.h +++ b/drivers/s390/cio/qdio.h @@ -166,11 +166,7 @@ struct qdio_dev_perf_stat { } ____cacheline_aligned; struct qdio_queue_perf_stat { - /* - * Sorted into order-2 buckets: 1, 2-3, 4-7, ... 64-127, 128. - * Since max. 127 SBALs are scanned reuse entry for 128 as queue full - * aka 127 SBALs found. - */ + /* Sorted into order-2 buckets: 1, 2-3, 4-7, ... 64-127, 128. */ unsigned int nr_sbals[8]; unsigned int nr_sbal_error; unsigned int nr_sbal_nop; diff --git a/drivers/s390/cio/qdio_debug.c b/drivers/s390/cio/qdio_debug.c index da95c923d81a..863d17c802ca 100644 --- a/drivers/s390/cio/qdio_debug.c +++ b/drivers/s390/cio/qdio_debug.c @@ -165,7 +165,7 @@ static int qstat_show(struct seq_file *m, void *v) } seq_printf(m, "\n1 2.. 4.. 8.. " - "16.. 32.. 64.. 127\n"); + "16.. 32.. 64.. 
128\n"); for (i = 0; i < ARRAY_SIZE(q->q_stats.nr_sbals); i++) seq_printf(m, "%-10u ", q->q_stats.nr_sbals[i]); seq_printf(m, "\nError NOP Total\n%-10u %-10u %-10u\n\n", diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index 0c919a11a46e..d4c699773070 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -413,15 +413,8 @@ static inline void qdio_stop_polling(struct qdio_q *q) static inline void account_sbals(struct qdio_q *q, unsigned int count) { - int pos; - q->q_stats.nr_sbal_total += count; - if (count == QDIO_MAX_BUFFERS_MASK) { - q->q_stats.nr_sbals[7]++; - return; - } - pos = ilog2(count); - q->q_stats.nr_sbals[pos]++; + q->q_stats.nr_sbals[ilog2(count)]++; } static void process_buffer_error(struct qdio_q *q, unsigned int start, From 2bbf282a5e8e7e6b36586718b484a36117b6b8a0 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Wed, 17 Jun 2020 15:30:14 +0200 Subject: [PATCH 359/502] s390/qdio: allow to scan all 128 Input SBALs The comment is inaccurate, qdio_inbound_q_moved() and/or its callers no longer get confused by a count of 128 completed SBALs. Scanning all 128 SBALs at once can improve IRQ reduction (as we now place the ACK at the right spot), and reduce the amount of processing needed to handle all completed SBALs. Signed-off-by: Julian Wiedmann Signed-off-by: Heiko Carstens --- drivers/s390/cio/qdio_main.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index d4c699773070..0c1f186c6291 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -457,11 +457,7 @@ static int get_inbound_buffer_frontier(struct qdio_q *q, unsigned int start) q->timestamp = get_tod_clock_fast(); - /* - * Don't check 128 buffers, as otherwise qdio_inbound_q_moved - * would return 0. 
- */ - count = min(atomic_read(&q->nr_buf_used), QDIO_MAX_BUFFERS_MASK); + count = atomic_read(&q->nr_buf_used); if (!count) return 0; From a709423f7a3a452e5fa7442425817c1bdccd7926 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 16 Jun 2020 14:13:00 +0200 Subject: [PATCH 360/502] s390/qdio: remove internal polling in non-thinint path For non-thinint devices in LPAR, qdio polls an idle Input Queue for a little while to catch more work. But platform support for thinints has been around practically _forever_ by now, so this micro-optimization is seeing 0 actual use. Remove it to reduce the overall complexity of the hot path. In the meantime we also grew support for driver-level polling (eg. NAPI in qeth), so it's quite questionable how useful this would actually be on current kernels. Signed-off-by: Julian Wiedmann Signed-off-by: Heiko Carstens --- drivers/s390/cio/qdio.h | 3 --- drivers/s390/cio/qdio_main.c | 26 ++------------------------ 2 files changed, 2 insertions(+), 27 deletions(-) diff --git a/drivers/s390/cio/qdio.h b/drivers/s390/cio/qdio.h index 7f0aa95585a4..cd2df4ff8e0e 100644 --- a/drivers/s390/cio/qdio.h +++ b/drivers/s390/cio/qdio.h @@ -15,7 +15,6 @@ #define QDIO_BUSY_BIT_PATIENCE (100 << 12) /* 100 microseconds */ #define QDIO_BUSY_BIT_RETRY_DELAY 10 /* 10 milliseconds */ #define QDIO_BUSY_BIT_RETRIES 1000 /* = 10s retry time */ -#define QDIO_INPUT_THRESHOLD (500 << 12) /* 500 microseconds */ enum qdio_irq_states { QDIO_IRQ_STATE_INACTIVE, @@ -181,8 +180,6 @@ struct qdio_input_q { /* Batch of SBALs that we processed while polling the queue: */ unsigned int batch_start; unsigned int batch_count; - /* last time of noticing incoming data */ - u64 timestamp; }; struct qdio_output_q { diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index 0c1f186c6291..4fab8bba2cdd 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -510,14 +510,7 @@ static int get_inbound_buffer_frontier(struct qdio_q *q, 
unsigned int start) static int qdio_inbound_q_moved(struct qdio_q *q, unsigned int start) { - int count; - - count = get_inbound_buffer_frontier(q, start); - - if (count && !is_thinint_irq(q->irq_ptr) && MACHINE_IS_LPAR) - q->u.in.timestamp = get_tod_clock(); - - return count; + return get_inbound_buffer_frontier(q, start); } static inline int qdio_inbound_q_done(struct qdio_q *q, unsigned int start) @@ -535,22 +528,7 @@ static inline int qdio_inbound_q_done(struct qdio_q *q, unsigned int start) /* more work coming */ return 0; - if (is_thinint_irq(q->irq_ptr)) - return 1; - - /* don't poll under z/VM */ - if (MACHINE_IS_VM) - return 1; - - /* - * At this point we know, that inbound first_to_check - * has (probably) not moved (see qdio_inbound_processing). - */ - if (get_tod_clock_fast() > q->u.in.timestamp + QDIO_INPUT_THRESHOLD) { - DBF_DEV_EVENT(DBF_INFO, q->irq_ptr, "in done:%02x", start); - return 1; - } else - return 0; + return 1; } static inline void qdio_handle_aobs(struct qdio_q *q, int start, int count) From 3c5f2eb9695cd241c9898a01388b19a149d0b7d2 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 14 Jul 2020 07:46:40 +0200 Subject: [PATCH 361/502] s390/mm: avoid trimming to MAX_ORDER Trimming to MAX_ORDER was originally done in order to avoid setting HOLES_IN_ZONE, which in turn would enable a quite expensive pfn_valid() check. pfn_valid() however only checks if a struct page exists for a given pfn. With sparsemem vmemmap there are always struct pages, since memmaps are allocated for whole sections. Therefore remove the HOLES_IN_ZONE comment and the trimming.
Signed-off-by: Heiko Carstens --- arch/s390/kernel/setup.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 5853c9872dfe..295a02bab64d 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -1126,14 +1126,6 @@ void __init setup_arch(char **cmdline_p) free_mem_detect_info(); remove_oldmem(); - /* - * Make sure all chunks are MAX_ORDER aligned so we don't need the - * extra checks that HOLES_IN_ZONE would require. - * - * Is this still required? - */ - memblock_trim_memory(1UL << (MAX_ORDER - 1 + PAGE_SHIFT)); - if (is_prot_virt_host()) setup_uv(); setup_memory_end(); From 771cf196cc92a6078656548bbc073aa932c053ab Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 14 Jul 2020 08:22:21 +0200 Subject: [PATCH 362/502] s390/mm: allow order 10 allocations Get rid of FORCE_MAX_ZONEORDER which limited allocations to order 8 (= 1MB) and use the default, which allows for order 10 (= 4MB) allocations. Given that s390 allows less than the default this caused some memory allocation problems more or less unique to s390 from time to time. Note: this was originally introduced with commit 684de39bd795 ("[S390] Fix IPL from NSS.") in order to support Named Saved Segments, which could start/end at an arbitrary 1 megabyte boundary and also before support for sparsemem vmemmap was enabled. Since NSS support is gone, but sparsemem vmemmap support is available this limitation can go away.
Signed-off-by: Heiko Carstens --- arch/s390/Kconfig | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 7697a1f8e819..0df33cffec52 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -625,10 +625,6 @@ config ARCH_ENABLE_MEMORY_HOTREMOVE config ARCH_ENABLE_SPLIT_PMD_PTLOCK def_bool y -config FORCE_MAX_ZONEORDER - int - default "9" - config MAX_PHYSMEM_BITS int "Maximum size of supported physical memory in bits (42-53)" range 42 53 From 88aa8939c96781089e5ace3492d818074c5c6fe9 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Mon, 29 Jun 2020 20:48:09 +0200 Subject: [PATCH 363/502] s390/kernel: unify EX_TABLE* implementations Replace three implementations with one using __stringify_in_c macro conveniently "borrowed" from powerpc and microblaze. Signed-off-by: Ilya Leoshkevich Signed-off-by: Heiko Carstens --- arch/s390/include/asm/asm-const.h | 12 +++++++++++ arch/s390/include/asm/linkage.h | 34 ++++++++++--------------------- 2 files changed, 23 insertions(+), 23 deletions(-) create mode 100644 arch/s390/include/asm/asm-const.h diff --git a/arch/s390/include/asm/asm-const.h b/arch/s390/include/asm/asm-const.h new file mode 100644 index 000000000000..11f615eb0066 --- /dev/null +++ b/arch/s390/include/asm/asm-const.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_ASM_CONST_H +#define _ASM_S390_ASM_CONST_H + +#ifdef __ASSEMBLY__ +# define stringify_in_c(...) __VA_ARGS__ +#else +/* This version of stringify will deal with commas... */ +# define __stringify_in_c(...) #__VA_ARGS__ +# define stringify_in_c(...)
__stringify_in_c(__VA_ARGS__) " " +#endif +#endif /* _ASM_S390_ASM_CONST_H */ diff --git a/arch/s390/include/asm/linkage.h b/arch/s390/include/asm/linkage.h index 7f22262b0e46..1b52c07b5642 100644 --- a/arch/s390/include/asm/linkage.h +++ b/arch/s390/include/asm/linkage.h @@ -2,38 +2,26 @@ #ifndef __ASM_LINKAGE_H #define __ASM_LINKAGE_H +#include #include #define __ALIGN .align 4, 0x07 #define __ALIGN_STR __stringify(__ALIGN) -#ifndef __ASSEMBLY__ - /* * Helper macro for exception table entries */ -#define EX_TABLE(_fault, _target) \ - ".section __ex_table,\"a\"\n" \ - ".align 4\n" \ - ".long (" #_fault ") - .\n" \ - ".long (" #_target ") - .\n" \ - ".previous\n" -#else /* __ASSEMBLY__ */ +#define __EX_TABLE(_section, _fault, _target) \ + stringify_in_c(.section _section,"a";) \ + stringify_in_c(.align 4;) \ + stringify_in_c(.long (_fault) - .;) \ + stringify_in_c(.long (_target) - .;) \ + stringify_in_c(.previous) -#define EX_TABLE(_fault, _target) \ - .section __ex_table,"a" ; \ - .align 4 ; \ - .long (_fault) - . ; \ - .long (_target) - . ; \ - .previous +#define EX_TABLE(_fault, _target) \ + __EX_TABLE(__ex_table, _fault, _target) +#define EX_TABLE_DMA(_fault, _target) \ + __EX_TABLE(.dma.ex_table, _fault, _target) -#define EX_TABLE_DMA(_fault, _target) \ - .section .dma.ex_table, "a" ; \ - .align 4 ; \ - .long (_fault) - . ; \ - .long (_target) - . ; \ - .previous - -#endif /* __ASSEMBLY__ */ #endif From 05a68e892e89c97df6650cd8cc55058002657cbc Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 30 Jun 2020 20:52:03 +0200 Subject: [PATCH 364/502] s390/kernel: expand exception table logic to allow new handling options This is a s390 port of commit 548acf19234d ("x86/mm: Expand the exception table logic to allow new handling options"), which is needed for implementing BPF_PROBE_MEM on s390. The new handler field is made 64-bit in order to allow pointing from dynamically allocated entries to handlers in kernel text. 
Unlike on x86, NULL is used instead of ex_handler_default. This is because exception tables are used by boot/text_dma.S, and it would be a pain to preserve ex_handler_default. The new infrastructure is ignored in early_pgm_check_handler, since there is no pt_regs. Signed-off-by: Ilya Leoshkevich Reviewed-by: Heiko Carstens Signed-off-by: Heiko Carstens --- arch/s390/include/asm/extable.h | 52 +++++++++++++++++++++++++++++---- arch/s390/include/asm/linkage.h | 3 +- arch/s390/kernel/kprobes.c | 4 +-- arch/s390/kernel/traps.c | 7 ++--- arch/s390/mm/fault.c | 4 +-- scripts/sorttable.c | 41 ++++++++++++++++++++++++++ 6 files changed, 94 insertions(+), 17 deletions(-) diff --git a/arch/s390/include/asm/extable.h b/arch/s390/include/asm/extable.h index ae27f756b409..3beb294fd553 100644 --- a/arch/s390/include/asm/extable.h +++ b/arch/s390/include/asm/extable.h @@ -1,12 +1,20 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __S390_EXTABLE_H #define __S390_EXTABLE_H + +#include +#include + /* - * The exception table consists of pairs of addresses: the first is the - * address of an instruction that is allowed to fault, and the second is - * the address at which the program should continue. No registers are - * modified, so it is entirely up to the continuation code to figure out - * what to do. + * The exception table consists of three addresses: + * + * - Address of an instruction that is allowed to fault. + * - Address at which the program should continue. + * - Optional address of handler that takes pt_regs * argument and runs in + * interrupt context. + * + * No registers are modified, so it is entirely up to the continuation code + * to figure out what to do. * * All the routines below use bits of fixup code that are out of line * with the main instruction path. 
This means when everything is well, @@ -17,6 +25,7 @@ struct exception_table_entry { int insn, fixup; + long handler; }; extern struct exception_table_entry *__start_dma_ex_table; @@ -29,6 +38,39 @@ static inline unsigned long extable_fixup(const struct exception_table_entry *x) return (unsigned long)&x->fixup + x->fixup; } +typedef bool (*ex_handler_t)(const struct exception_table_entry *, + struct pt_regs *); + +static inline ex_handler_t +ex_fixup_handler(const struct exception_table_entry *x) +{ + if (likely(!x->handler)) + return NULL; + return (ex_handler_t)((unsigned long)&x->handler + x->handler); +} + +static inline bool ex_handle(const struct exception_table_entry *x, + struct pt_regs *regs) +{ + ex_handler_t handler = ex_fixup_handler(x); + + if (unlikely(handler)) + return handler(x, regs); + regs->psw.addr = extable_fixup(x); + return true; +} + #define ARCH_HAS_RELATIVE_EXTABLE +static inline void swap_ex_entry_fixup(struct exception_table_entry *a, + struct exception_table_entry *b, + struct exception_table_entry tmp, + int delta) +{ + a->fixup = b->fixup + delta; + b->fixup = tmp.fixup - delta; + a->handler = b->handler + delta; + b->handler = tmp.handler - delta; +} + #endif diff --git a/arch/s390/include/asm/linkage.h b/arch/s390/include/asm/linkage.h index 1b52c07b5642..a0a7a2c72bd4 100644 --- a/arch/s390/include/asm/linkage.h +++ b/arch/s390/include/asm/linkage.h @@ -14,9 +14,10 @@ #define __EX_TABLE(_section, _fault, _target) \ stringify_in_c(.section _section,"a";) \ - stringify_in_c(.align 4;) \ + stringify_in_c(.align 8;) \ stringify_in_c(.long (_fault) - .;) \ stringify_in_c(.long (_target) - .;) \ + stringify_in_c(.quad 0;) \ stringify_in_c(.previous) #define EX_TABLE(_fault, _target) \ diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 548d0ea9808d..d2a71d872638 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -523,10 +523,8 @@ static int kprobe_trap_handler(struct pt_regs *regs, int 
trapnr) * zero, try to fix up. */ entry = s390_search_extables(regs->psw.addr); - if (entry) { - regs->psw.addr = extable_fixup(entry); + if (entry && ex_handle(entry, regs)) return 1; - } /* * fixup_exception() could not handle it, diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index ff9cc4c3290e..8d1e8a1a97df 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -50,11 +50,8 @@ void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str) } else { const struct exception_table_entry *fixup; fixup = s390_search_extables(regs->psw.addr); - if (fixup) - regs->psw.addr = extable_fixup(fixup); - else { + if (!fixup || !ex_handle(fixup, regs)) die(regs, str); - } } } @@ -251,7 +248,7 @@ void monitor_event_exception(struct pt_regs *regs) case BUG_TRAP_TYPE_NONE: fixup = s390_search_extables(regs->psw.addr); if (fixup) - regs->psw.addr = extable_fixup(fixup); + ex_handle(fixup, regs); break; case BUG_TRAP_TYPE_WARN: break; diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 598828517d9d..aebf9183bedd 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -255,10 +255,8 @@ static noinline void do_no_context(struct pt_regs *regs) /* Are we prepared to handle this kernel fault? */ fixup = s390_search_extables(regs->psw.addr); - if (fixup) { - regs->psw.addr = extable_fixup(fixup); + if (fixup && ex_handle(fixup, regs)) return; - } /* * Oops. The kernel tried to access some bad page. 
We'll have to diff --git a/scripts/sorttable.c b/scripts/sorttable.c index ec6b5e81eba1..0ef3abfc4a51 100644 --- a/scripts/sorttable.c +++ b/scripts/sorttable.c @@ -255,6 +255,45 @@ static void x86_sort_relative_table(char *extab_image, int image_size) } } +static void s390_sort_relative_table(char *extab_image, int image_size) +{ + int i; + + for (i = 0; i < image_size; i += 16) { + char *loc = extab_image + i; + uint64_t handler; + + w(r((uint32_t *)loc) + i, (uint32_t *)loc); + w(r((uint32_t *)(loc + 4)) + (i + 4), (uint32_t *)(loc + 4)); + /* + * 0 is a special self-relative handler value, which means that + * handler should be ignored. It is safe, because it means that + * handler field points to itself, which should never happen. + * When creating extable-relative values, keep it as 0, since + * this should never occur either: it would mean that handler + * field points to the first extable entry. + */ + handler = r8((uint64_t *)(loc + 8)); + if (handler) + handler += i + 8; + w8(handler, (uint64_t *)(loc + 8)); + } + + qsort(extab_image, image_size / 16, 16, compare_relative_table); + + for (i = 0; i < image_size; i += 16) { + char *loc = extab_image + i; + uint64_t handler; + + w(r((uint32_t *)loc) - i, (uint32_t *)loc); + w(r((uint32_t *)(loc + 4)) - (i + 4), (uint32_t *)(loc + 4)); + handler = r8((uint64_t *)(loc + 8)); + if (handler) + handler -= i + 8; + w8(handler, (uint64_t *)(loc + 8)); + } +} + static int do_file(char const *const fname, void *addr) { int rc = -1; @@ -297,6 +336,8 @@ static int do_file(char const *const fname, void *addr) custom_sort = x86_sort_relative_table; break; case EM_S390: + custom_sort = s390_sort_relative_table; + break; case EM_AARCH64: case EM_PARISC: case EM_PPC: From 3f161e0ae863a0456d00e5a6c9c81098c62ab7fe Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Wed, 24 Jun 2020 14:55:22 +0200 Subject: [PATCH 365/502] s390/bpf: implement BPF_PROBE_MEM This is a s390 port of x86 commit 3dec541b2e63 ("bpf: Add support for 
BTF pointers to x86 JIT"). Signed-off-by: Ilya Leoshkevich Reviewed-by: Heiko Carstens Signed-off-by: Heiko Carstens --- arch/s390/net/bpf_jit_comp.c | 139 ++++++++++++++++++++++++++++++++++- 1 file changed, 138 insertions(+), 1 deletion(-) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index f4242b894cf2..8fe7bdfc8d15 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -49,6 +49,7 @@ struct bpf_jit { int r1_thunk_ip; /* Address of expoline thunk for 'br %r1' */ int r14_thunk_ip; /* Address of expoline thunk for 'br %r14' */ int tail_call_start; /* Tail call start offset */ + int excnt; /* Number of exception table entries */ int labels[1]; /* Labels for local jumps */ }; @@ -588,6 +589,84 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) } } +static int get_probe_mem_regno(const u8 *insn) +{ + /* + * insn must point to llgc, llgh, llgf or lg, which have destination + * register at the same position. + */ + if (insn[0] != 0xe3) /* common llgc, llgh, llgf and lg prefix */ + return -1; + if (insn[5] != 0x90 && /* llgc */ + insn[5] != 0x91 && /* llgh */ + insn[5] != 0x16 && /* llgf */ + insn[5] != 0x04) /* lg */ + return -1; + return insn[1] >> 4; +} + +static bool ex_handler_bpf(const struct exception_table_entry *x, + struct pt_regs *regs) +{ + int regno; + u8 *insn; + + regs->psw.addr = extable_fixup(x); + insn = (u8 *)__rewind_psw(regs->psw, regs->int_code >> 16); + regno = get_probe_mem_regno(insn); + if (WARN_ON_ONCE(regno < 0)) + /* JIT bug - unexpected instruction. */ + return false; + regs->gprs[regno] = 0; + return true; +} + +static int bpf_jit_probe_mem(struct bpf_jit *jit, struct bpf_prog *fp, + int probe_prg, int nop_prg) +{ + struct exception_table_entry *ex; + s64 delta; + u8 *insn; + int prg; + int i; + + if (!fp->aux->extable) + /* Do nothing during early JIT passes. 
*/ + return 0; + insn = jit->prg_buf + probe_prg; + if (WARN_ON_ONCE(get_probe_mem_regno(insn) < 0)) + /* JIT bug - unexpected probe instruction. */ + return -1; + if (WARN_ON_ONCE(probe_prg + insn_length(*insn) != nop_prg)) + /* JIT bug - gap between probe and nop instructions. */ + return -1; + for (i = 0; i < 2; i++) { + if (WARN_ON_ONCE(jit->excnt >= fp->aux->num_exentries)) + /* Verifier bug - not enough entries. */ + return -1; + ex = &fp->aux->extable[jit->excnt]; + /* Add extable entries for probe and nop instructions. */ + prg = i == 0 ? probe_prg : nop_prg; + delta = jit->prg_buf + prg - (u8 *)&ex->insn; + if (WARN_ON_ONCE(delta < INT_MIN || delta > INT_MAX)) + /* JIT bug - code and extable must be close. */ + return -1; + ex->insn = delta; + /* + * Always land on the nop. Note that extable infrastructure + * ignores fixup field, it is handled by ex_handler_bpf(). + */ + delta = jit->prg_buf + nop_prg - (u8 *)&ex->fixup; + if (WARN_ON_ONCE(delta < INT_MIN || delta > INT_MAX)) + /* JIT bug - landing pad and extable must be close. 
*/ + return -1; + ex->fixup = delta; + ex->handler = (u8 *)ex_handler_bpf - (u8 *)&ex->handler; + jit->excnt++; + } + return 0; +} + /* * Compile one eBPF instruction into s390x code * @@ -604,7 +683,14 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, u32 *addrs = jit->addrs; s32 imm = insn->imm; s16 off = insn->off; + int probe_prg = -1; unsigned int mask; + int nop_prg; + int err; + + if (BPF_CLASS(insn->code) == BPF_LDX && + BPF_MODE(insn->code) == BPF_PROBE_MEM) + probe_prg = jit->prg; switch (insn->code) { /* @@ -1119,6 +1205,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, * BPF_LDX */ case BPF_LDX | BPF_MEM | BPF_B: /* dst = *(u8 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEM | BPF_B: /* llgc %dst,0(off,%src) */ EMIT6_DISP_LH(0xe3000000, 0x0090, dst_reg, src_reg, REG_0, off); jit->seen |= SEEN_MEM; @@ -1126,6 +1213,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, insn_count = 2; break; case BPF_LDX | BPF_MEM | BPF_H: /* dst = *(u16 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEM | BPF_H: /* llgh %dst,0(off,%src) */ EMIT6_DISP_LH(0xe3000000, 0x0091, dst_reg, src_reg, REG_0, off); jit->seen |= SEEN_MEM; @@ -1133,6 +1221,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, insn_count = 2; break; case BPF_LDX | BPF_MEM | BPF_W: /* dst = *(u32 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEM | BPF_W: /* llgf %dst,off(%src) */ jit->seen |= SEEN_MEM; EMIT6_DISP_LH(0xe3000000, 0x0016, dst_reg, src_reg, REG_0, off); @@ -1140,6 +1229,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, insn_count = 2; break; case BPF_LDX | BPF_MEM | BPF_DW: /* dst = *(u64 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEM | BPF_DW: /* lg %dst,0(off,%src) */ jit->seen |= SEEN_MEM; EMIT6_DISP_LH(0xe3000000, 0x0004, dst_reg, src_reg, REG_0, off); @@ -1485,6 +1575,23 @@ branch_oc: pr_err("Unknown opcode %02x\n", insn->code); 
return -1; } + + if (probe_prg != -1) { + /* + * Handlers of certain exceptions leave psw.addr pointing to + * the instruction directly after the failing one. Therefore, + * create two exception table entries and also add a nop in + * case two probing instructions come directly after each + * other. + */ + nop_prg = jit->prg; + /* bcr 0,%0 */ + _EMIT2(0x0700); + err = bpf_jit_probe_mem(jit, fp, probe_prg, nop_prg); + if (err < 0) + return err; + } + return insn_count; } @@ -1527,6 +1634,7 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp, jit->lit32 = jit->lit32_start; jit->lit64 = jit->lit64_start; jit->prg = 0; + jit->excnt = 0; bpf_jit_prologue(jit, stack_depth); if (bpf_set_addr(jit, 0) < 0) @@ -1551,6 +1659,12 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp, jit->lit64_start = ALIGN(jit->lit64_start, 8); jit->size = jit->lit64_start + lit64_size; jit->size_prg = jit->prg; + + if (WARN_ON_ONCE(fp->aux->extable && + jit->excnt != fp->aux->num_exentries)) + /* Verifier bug - too many entries. */ + return -1; + return 0; } @@ -1565,6 +1679,29 @@ struct s390_jit_data { int pass; }; +static struct bpf_binary_header *bpf_jit_alloc(struct bpf_jit *jit, + struct bpf_prog *fp) +{ + struct bpf_binary_header *header; + u32 extable_size; + u32 code_size; + + /* We need two entries per insn. 
*/ + fp->aux->num_exentries *= 2; + + code_size = roundup(jit->size, + __alignof__(struct exception_table_entry)); + extable_size = fp->aux->num_exentries * + sizeof(struct exception_table_entry); + header = bpf_jit_binary_alloc(code_size + extable_size, &jit->prg_buf, + 8, jit_fill_hole); + if (!header) + return NULL; + fp->aux->extable = (struct exception_table_entry *) + (jit->prg_buf + code_size); + return header; +} + /* * Compile eBPF program "fp" */ @@ -1631,7 +1768,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) /* * Final pass: Allocate and generate program */ - header = bpf_jit_binary_alloc(jit.size, &jit.prg_buf, 8, jit_fill_hole); + header = bpf_jit_alloc(&jit, fp); if (!header) { fp = orig_fp; goto free_addrs; From 539707caa1a89ee4efc57b4e4231c20c46575ccc Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Thu, 18 Jun 2020 21:35:44 +0800 Subject: [PATCH 366/502] arm64: perf: Correct the event index in sysfs When a PMU event ID is equal to or greater than 0x4000, it is reduced by 0x4000, so the value shown in sysfs is not the raw event number. Let's correct it and show the raw event ID.
Before this patch: cat /sys/bus/event_source/devices/armv8_pmuv3_0/events/sample_feed event=0x001 After this patch: cat /sys/bus/event_source/devices/armv8_pmuv3_0/events/sample_feed event=0x4001 Signed-off-by: Shaokun Zhang Cc: Will Deacon Cc: Mark Rutland Cc: Link: https://lore.kernel.org/r/1592487344-30555-3-git-send-email-zhangshaokun@hisilicon.com [will: fixed formatting of 'if' condition] Signed-off-by: Will Deacon --- arch/arm64/kernel/perf_event.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 4d7879484cec..581602413a13 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -155,7 +155,7 @@ armv8pmu_events_sysfs_show(struct device *dev, pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); - return sprintf(page, "event=0x%03llx\n", pmu_attr->id); + return sprintf(page, "event=0x%04llx\n", pmu_attr->id); } #define ARMV8_EVENT_ATTR(name, config) \ @@ -244,10 +244,13 @@ armv8pmu_event_attr_is_visible(struct kobject *kobj, test_bit(pmu_attr->id, cpu_pmu->pmceid_bitmap)) return attr->mode; - pmu_attr->id -= ARMV8_PMUV3_EXT_COMMON_EVENT_BASE; - if (pmu_attr->id < ARMV8_PMUV3_MAX_COMMON_EVENTS && - test_bit(pmu_attr->id, cpu_pmu->pmceid_ext_bitmap)) - return attr->mode; + if (pmu_attr->id >= ARMV8_PMUV3_EXT_COMMON_EVENT_BASE) { + u64 id = pmu_attr->id - ARMV8_PMUV3_EXT_COMMON_EVENT_BASE; + + if (id < ARMV8_PMUV3_MAX_COMMON_EVENTS && + test_bit(id, cpu_pmu->pmceid_ext_bitmap)) + return attr->mode; + } return 0; } From 1b86abc1c645ad5c9c7bf70910cb3ce73939d2d7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jul 2020 13:11:24 +0800 Subject: [PATCH 367/502] sched_clock: Expose struct clock_read_data In order to support perf_event_mmap_page::cap_time features, an architecture needs, aside from a userspace readable counter register, to expose the exact clock data so that userspace can convert the counter register into 
a correct timestamp. Provide struct clock_read_data and two (seqcount) helpers so that architectures (arm64 in particular) can expose the numbers to userspace. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Leo Yan Link: https://lore.kernel.org/r/20200716051130.4359-2-leo.yan@linaro.org Signed-off-by: Will Deacon --- include/linux/sched_clock.h | 28 +++++++++++++++++++++++++ kernel/time/sched_clock.c | 41 ++++++++++++------------------------- 2 files changed, 41 insertions(+), 28 deletions(-) diff --git a/include/linux/sched_clock.h b/include/linux/sched_clock.h index 0bb04a96a6d4..528718e4ed52 100644 --- a/include/linux/sched_clock.h +++ b/include/linux/sched_clock.h @@ -6,6 +6,34 @@ #define LINUX_SCHED_CLOCK #ifdef CONFIG_GENERIC_SCHED_CLOCK +/** + * struct clock_read_data - data required to read from sched_clock() + * + * @epoch_ns: sched_clock() value at last update + * @epoch_cyc: Clock cycle value at last update. + * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit + * clocks. + * @read_sched_clock: Current clock source (or dummy source when suspended). + * @mult: Multiplier for scaled math conversion. + * @shift: Shift value for scaled math conversion. + * + * Care must be taken when updating this structure; it is read by + * some very hot code paths. It occupies <=40 bytes and, when combined + * with the seqcount used to synchronize access, comfortably fits into + * a 64 byte cache line.
+ */ +struct clock_read_data { + u64 epoch_ns; + u64 epoch_cyc; + u64 sched_clock_mask; + u64 (*read_sched_clock)(void); + u32 mult; + u32 shift; +}; + +extern struct clock_read_data *sched_clock_read_begin(unsigned int *seq); +extern int sched_clock_read_retry(unsigned int seq); + extern void generic_sched_clock_init(void); extern void sched_clock_register(u64 (*read)(void), int bits, diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index fa3f800d7d76..0acaadc3156c 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -19,31 +19,6 @@ #include "timekeeping.h" -/** - * struct clock_read_data - data required to read from sched_clock() - * - * @epoch_ns: sched_clock() value at last update - * @epoch_cyc: Clock cycle value at last update. - * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit - * clocks. - * @read_sched_clock: Current clock source (or dummy source when suspended). - * @mult: Multipler for scaled math conversion. - * @shift: Shift value for scaled math conversion. - * - * Care must be taken when updating this structure; it is read by - * some very hot code paths. It occupies <=40 bytes and, when combined - * with the seqcount used to synchronize access, comfortably fits into - * a 64 byte cache line. 
- */ -struct clock_read_data { - u64 epoch_ns; - u64 epoch_cyc; - u64 sched_clock_mask; - u64 (*read_sched_clock)(void); - u32 mult; - u32 shift; -}; - /** * struct clock_data - all data needed for sched_clock() (including * registration of a new clock source) @@ -93,6 +68,17 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) return (cyc * mult) >> shift; } +struct clock_read_data *sched_clock_read_begin(unsigned int *seq) +{ + *seq = raw_read_seqcount(&cd.seq); + return cd.read_data + (*seq & 1); +} + +int sched_clock_read_retry(unsigned int seq) +{ + return read_seqcount_retry(&cd.seq, seq); +} + unsigned long long notrace sched_clock(void) { u64 cyc, res; @@ -100,13 +86,12 @@ unsigned long long notrace sched_clock(void) struct clock_read_data *rd; do { - seq = raw_read_seqcount(&cd.seq); - rd = cd.read_data + (seq & 1); + rd = sched_clock_read_begin(&seq); cyc = (rd->read_sched_clock() - rd->epoch_cyc) & rd->sched_clock_mask; res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift); - } while (read_seqcount_retry(&cd.seq, seq)); + } while (sched_clock_read_retry(seq)); return res; } From aadd6e5caaacd6feca9691ba30536e7de5a7d152 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 16 Jul 2020 13:11:25 +0800 Subject: [PATCH 368/502] time/sched_clock: Use raw_read_seqcount_latch() sched_clock uses seqcount_t latching to switch between two storage places protected by the sequence counter. This allows it to have interruptible, NMI-safe, seqcount_t write side critical sections. Since 7fc26327b756 ("seqlock: Introduce raw_read_seqcount_latch()"), raw_read_seqcount_latch() became the standardized way for seqcount_t latch read paths. Due to the dependent load, it also has one read memory barrier less than the currently used raw_read_seqcount() API. Use raw_read_seqcount_latch() for the seqcount_t latch read path. Signed-off-by: Ahmed S. 
Darwish Signed-off-by: Leo Yan Link: https://lkml.kernel.org/r/20200625085745.GD117543@hirez.programming.kicks-ass.net Link: https://lkml.kernel.org/r/20200715092345.GA231464@debian-buster-darwi.lab.linutronix.de Link: https://lore.kernel.org/r/20200716051130.4359-3-leo.yan@linaro.org References: 1809bfa44e10 ("timers, sched/clock: Avoid deadlock during read from NMI") Signed-off-by: Will Deacon --- kernel/time/sched_clock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 0acaadc3156c..0deaf4b79fb4 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -70,7 +70,7 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) struct clock_read_data *sched_clock_read_begin(unsigned int *seq) { - *seq = raw_read_seqcount(&cd.seq); + *seq = raw_read_seqcount_latch(&cd.seq); return cd.read_data + (*seq & 1); } From 950b74ddefc4a42add8b1ae0170aa309338ffe73 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jul 2020 13:11:26 +0800 Subject: [PATCH 369/502] arm64: perf: Implement correct cap_user_time As reported by Leo; the existing implementation is broken when the clock and counter don't intersect at 0. Use the sched_clock's struct clock_read_data information to correctly implement cap_user_time and cap_user_time_zero. Note that the ARM64 counter is architecturally only guaranteed to be 56bit wide (implementations are allowed to be wider) and the existing perf ABI cannot deal with wrap-around. This implementation should also be faster than the old; seeing how we don't need to recompute mult and shift all the time. 
[leoyan: Use mul_u64_u32_shr() to convert cyc to ns to avoid overflow] Reported-by: Leo Yan Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Leo Yan Link: https://lore.kernel.org/r/20200716051130.4359-4-leo.yan@linaro.org Signed-off-by: Will Deacon --- arch/arm64/kernel/perf_event.c | 38 ++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 581602413a13..3bbbc22a5148 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -19,6 +19,7 @@ #include #include #include +#include #include /* ARMv8 Cortex-A53 specific event types. */ @@ -1168,28 +1169,47 @@ device_initcall(armv8_pmu_driver_init) void arch_perf_update_userpage(struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) { - u32 freq; - u32 shift; + struct clock_read_data *rd; + unsigned int seq; + u64 ns; /* * Internal timekeeping for enabled/running/stopped times * is always computed with the sched_clock. */ - freq = arch_timer_get_rate(); userpg->cap_user_time = 1; + userpg->cap_user_time_zero = 1; + + do { + rd = sched_clock_read_begin(&seq); + + userpg->time_mult = rd->mult; + userpg->time_shift = rd->shift; + userpg->time_zero = rd->epoch_ns; + + /* + * This isn't strictly correct, the ARM64 counter can be + * 'short' and then we get funnies when it wraps. The correct + * thing would be to extend the perf ABI with a cycle and mask + * value, but because wrapping on ARM64 is very rare in + * practise this 'works'. 
+ */ + ns = mul_u64_u32_shr(rd->epoch_cyc, rd->mult, rd->shift); + userpg->time_zero -= ns; + + } while (sched_clock_read_retry(seq)); + + userpg->time_offset = userpg->time_zero - now; - clocks_calc_mult_shift(&userpg->time_mult, &shift, freq, - NSEC_PER_SEC, 0); /* * time_shift is not expected to be greater than 31 due to * the original published conversion algorithm shifting a * 32-bit value (now specifies a 64-bit value) - refer * perf_event_mmap_page documentation in perf_event.h. */ - if (shift == 32) { - shift = 31; + if (userpg->time_shift == 32) { + userpg->time_shift = 31; userpg->time_mult >>= 1; } - userpg->time_shift = (u16)shift; - userpg->time_offset = -now; + } From 279a811eb520594fac3cd3a541e6c7ea50072ac9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jul 2020 13:11:27 +0800 Subject: [PATCH 370/502] arm64: perf: Only advertise cap_user_time for arch_timer When sched_clock is running on anything other than arch_timer, don't advertise cap_user_time*. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Leo Yan Link: https://lore.kernel.org/r/20200716051130.4359-5-leo.yan@linaro.org Requested-by: Will Deacon Signed-off-by: Will Deacon --- arch/arm64/kernel/perf_event.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 3bbbc22a5148..674edc7ba8ca 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -13,6 +13,8 @@ #include #include +#include + #include #include #include @@ -1173,16 +1175,15 @@ void arch_perf_update_userpage(struct perf_event *event, unsigned int seq; u64 ns; - /* - * Internal timekeeping for enabled/running/stopped times - * is always computed with the sched_clock. 
- */ - userpg->cap_user_time = 1; - userpg->cap_user_time_zero = 1; + userpg->cap_user_time = 0; + userpg->cap_user_time_zero = 0; do { rd = sched_clock_read_begin(&seq); + if (rd->read_sched_clock != arch_timer_read_counter) + return; + userpg->time_mult = rd->mult; userpg->time_shift = rd->shift; userpg->time_zero = rd->epoch_ns; @@ -1212,4 +1213,10 @@ void arch_perf_update_userpage(struct perf_event *event, userpg->time_mult >>= 1; } + /* + * Internal timekeeping for enabled/running/stopped times + * is always computed with the sched_clock. + */ + userpg->cap_user_time = 1; + userpg->cap_user_time_zero = 1; } From 6c0246a4588d418f72acd40a7b7601be403d80a9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jul 2020 13:11:28 +0800 Subject: [PATCH 371/502] perf: Add perf_event_mmap_page::cap_user_time_short ABI In order to support short clock counters, provide an ABI extension. As a whole: u64 time, delta, cyc = read_cycle_counter(); + if (cap_user_time_short) + cyc = time_cycle + ((cyc - time_cycle) & time_mask); delta = mul_u64_u32_shr(cyc, time_mult, time_shift); if (cap_user_time_zero) time = time_zero + delta; delta += time_offset; Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Leo Yan Link: https://lore.kernel.org/r/20200716051130.4359-6-leo.yan@linaro.org Signed-off-by: Will Deacon --- include/uapi/linux/perf_event.h | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 7b2d6fc9e6ed..21a1edd08cbe 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -532,9 +532,10 @@ struct perf_event_mmap_page { cap_bit0_is_deprecated : 1, /* Always 1, signals that bit 0 is zero */ cap_user_rdpmc : 1, /* The RDPMC instruction can be used to read counts */ - cap_user_time : 1, /* The time_* fields are used */ + cap_user_time : 1, /* The time_{shift,mult,offset} fields are used */ cap_user_time_zero : 1, /* The time_zero 
field is used */ - cap_____res : 59; + cap_user_time_short : 1, /* the time_{cycle,mask} fields are used */ + cap_____res : 58; }; }; @@ -593,13 +594,29 @@ struct perf_event_mmap_page { * ((rem * time_mult) >> time_shift); */ __u64 time_zero; + __u32 size; /* Header size up to __reserved[] fields. */ + __u32 __reserved_1; + + /* + * If cap_usr_time_short, the hardware clock is less than 64bit wide + * and we must compute the 'cyc' value, as used by cap_usr_time, as: + * + * cyc = time_cycles + ((cyc - time_cycles) & time_mask) + * + * NOTE: this form is explicitly chosen such that cap_usr_time_short + * is a correction on top of cap_usr_time, and code that doesn't + * know about cap_usr_time_short still works under the assumption + * the counter doesn't wrap. + */ + __u64 time_cycles; + __u64 time_mask; /* * Hole for extension of the self monitor capabilities */ - __u8 __reserved[118*8+4]; /* align to 1k. */ + __u8 __reserved[116*8]; /* align to 1k. */ /* * Control data for the mmap() data buffer. From c8f9eb0d6ebaa768c9f6eb2ee21b01d74230934d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jul 2020 13:11:29 +0800 Subject: [PATCH 372/502] arm64: perf: Add cap_user_time_short This completes the ARM64 cap_user_time support. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Leo Yan Link: https://lore.kernel.org/r/20200716051130.4359-7-leo.yan@linaro.org Signed-off-by: Will Deacon --- arch/arm64/kernel/perf_event.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 674edc7ba8ca..fdb6029c9021 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -1177,6 +1177,7 @@ void arch_perf_update_userpage(struct perf_event *event, userpg->cap_user_time = 0; userpg->cap_user_time_zero = 0; + userpg->cap_user_time_short = 0; do { rd = sched_clock_read_begin(&seq); @@ -1187,13 +1188,13 @@ void arch_perf_update_userpage(struct perf_event *event, userpg->time_mult = rd->mult; userpg->time_shift = rd->shift; userpg->time_zero = rd->epoch_ns; + userpg->time_cycles = rd->epoch_cyc; + userpg->time_mask = rd->sched_clock_mask; /* - * This isn't strictly correct, the ARM64 counter can be - * 'short' and then we get funnies when it wraps. The correct - * thing would be to extend the perf ABI with a cycle and mask - * value, but because wrapping on ARM64 is very rare in - * practise this 'works'. + * Subtract the cycle base, such that software that + * doesn't know about cap_user_time_short still 'works' + * assuming no wraps. */ ns = mul_u64_u32_shr(rd->epoch_cyc, rd->mult, rd->shift); userpg->time_zero -= ns; @@ -1219,4 +1220,5 @@ void arch_perf_update_userpage(struct perf_event *event, */ userpg->cap_user_time = 1; userpg->cap_user_time_zero = 1; + userpg->cap_user_time_short = 1; } From 5271d915a99c696a2f16ae59cf6a037be35afa22 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Thu, 16 Jul 2020 13:11:30 +0800 Subject: [PATCH 373/502] tools headers UAPI: Update tools's copy of linux/perf_event.h To get the changes in the commit: "perf: Add perf_event_mmap_page::cap_user_time_short ABI" This update is a prerequisite for adding support for the short clock counter ABI extension.
Signed-off-by: Leo Yan Link: https://lore.kernel.org/r/20200716051130.4359-8-leo.yan@linaro.org Signed-off-by: Will Deacon --- tools/include/uapi/linux/perf_event.h | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 7b2d6fc9e6ed..21a1edd08cbe 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -532,9 +532,10 @@ struct perf_event_mmap_page { cap_bit0_is_deprecated : 1, /* Always 1, signals that bit 0 is zero */ cap_user_rdpmc : 1, /* The RDPMC instruction can be used to read counts */ - cap_user_time : 1, /* The time_* fields are used */ + cap_user_time : 1, /* The time_{shift,mult,offset} fields are used */ cap_user_time_zero : 1, /* The time_zero field is used */ - cap_____res : 59; + cap_user_time_short : 1, /* the time_{cycle,mask} fields are used */ + cap_____res : 58; }; }; @@ -593,13 +594,29 @@ struct perf_event_mmap_page { * ((rem * time_mult) >> time_shift); */ __u64 time_zero; + __u32 size; /* Header size up to __reserved[] fields. */ + __u32 __reserved_1; + + /* + * If cap_usr_time_short, the hardware clock is less than 64bit wide + * and we must compute the 'cyc' value, as used by cap_usr_time, as: + * + * cyc = time_cycles + ((cyc - time_cycles) & time_mask) + * + * NOTE: this form is explicitly chosen such that cap_usr_time_short + * is a correction on top of cap_usr_time, and code that doesn't + * know about cap_usr_time_short still works under the assumption + * the counter doesn't wrap. + */ + __u64 time_cycles; + __u64 time_mask; /* * Hole for extension of the self monitor capabilities */ - __u8 __reserved[118*8+4]; /* align to 1k. */ + __u8 __reserved[116*8]; /* align to 1k. */ /* * Control data for the mmap() data buffer. 
From f143c11bb7b924403ea2d5b5c990717772293620 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 21 Nov 2019 12:41:40 +0000 Subject: [PATCH 374/502] tools: bpf: Use local copy of headers including uapi/linux/filter.h Pulling header files directly out of the kernel sources for inclusion in userspace programs is highly error prone, not least because it bypasses the kbuild infrastructure entirely and so may end up referencing other header files that have not been generated. Subsequent patches will cause compiler.h to pull in the ungenerated asm/rwonce.h file via filter.h, breaking the build for tools/bpf: | $ make -C tools/bpf | make: Entering directory '/linux/tools/bpf' | CC bpf_jit_disasm.o | LINK bpf_jit_disasm | CC bpf_dbg.o | In file included from /linux/include/uapi/linux/filter.h:9, | from /linux/tools/bpf/bpf_dbg.c:41: | /linux/include/linux/compiler.h:247:10: fatal error: asm/rwonce.h: No such file or directory | #include | ^~~~~~~~~~~~~~ | compilation terminated. | make: *** [Makefile:61: bpf_dbg.o] Error 1 | make: Leaving directory '/linux/tools/bpf' Take a copy of the installed version of linux/filter.h (i.e. the one created by the 'headers_install' target) into tools/include/uapi/linux/ and adjust the BPF tool Makefile to reference the local include directories instead of those in the main source tree. 
Cc: Masahiro Yamada Acked-by: Peter Zijlstra (Intel) Acked-by: Alexei Starovoitov Suggested-by: Daniel Borkmann Reported-by: Xiao Yang Signed-off-by: Will Deacon --- tools/bpf/Makefile | 3 +- tools/include/uapi/linux/filter.h | 90 +++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 tools/include/uapi/linux/filter.h diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile index 6df1850f8353..8a69258fd8aa 100644 --- a/tools/bpf/Makefile +++ b/tools/bpf/Makefile @@ -9,7 +9,8 @@ MAKE = make INSTALL ?= install CFLAGS += -Wall -O2 -CFLAGS += -D__EXPORTED_HEADERS__ -I$(srctree)/include/uapi -I$(srctree)/include +CFLAGS += -D__EXPORTED_HEADERS__ -I$(srctree)/tools/include/uapi \ + -I$(srctree)/tools/include # This will work when bpf is built in tools env. where srctree # isn't set and when invoked from selftests build, where srctree diff --git a/tools/include/uapi/linux/filter.h b/tools/include/uapi/linux/filter.h new file mode 100644 index 000000000000..eaef459e7bd4 --- /dev/null +++ b/tools/include/uapi/linux/filter.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Linux Socket Filter Data Structures + */ + +#ifndef __LINUX_FILTER_H__ +#define __LINUX_FILTER_H__ + + +#include +#include + +/* + * Current version of the filter code architecture. + */ +#define BPF_MAJOR_VERSION 1 +#define BPF_MINOR_VERSION 1 + +/* + * Try and keep these values and structures similar to BSD, especially + * the BPF code definitions which need to match so you can share filters + */ + +struct sock_filter { /* Filter block */ + __u16 code; /* Actual filter code */ + __u8 jt; /* Jump true */ + __u8 jf; /* Jump false */ + __u32 k; /* Generic multiuse field */ +}; + +struct sock_fprog { /* Required for SO_ATTACH_FILTER. 
*/ + unsigned short len; /* Number of filter blocks */ + struct sock_filter *filter; +}; + +/* ret - BPF_K and BPF_X also apply */ +#define BPF_RVAL(code) ((code) & 0x18) +#define BPF_A 0x10 + +/* misc */ +#define BPF_MISCOP(code) ((code) & 0xf8) +#define BPF_TAX 0x00 +#define BPF_TXA 0x80 + +/* + * Macros for filter block array initializers. + */ +#ifndef BPF_STMT +#define BPF_STMT(code, k) { (unsigned short)(code), 0, 0, k } +#endif +#ifndef BPF_JUMP +#define BPF_JUMP(code, k, jt, jf) { (unsigned short)(code), jt, jf, k } +#endif + +/* + * Number of scratch memory words for: BPF_ST and BPF_STX + */ +#define BPF_MEMWORDS 16 + +/* RATIONALE. Negative offsets are invalid in BPF. + We use them to reference ancillary data. + Unlike introduction new instructions, it does not break + existing compilers/optimizers. + */ +#define SKF_AD_OFF (-0x1000) +#define SKF_AD_PROTOCOL 0 +#define SKF_AD_PKTTYPE 4 +#define SKF_AD_IFINDEX 8 +#define SKF_AD_NLATTR 12 +#define SKF_AD_NLATTR_NEST 16 +#define SKF_AD_MARK 20 +#define SKF_AD_QUEUE 24 +#define SKF_AD_HATYPE 28 +#define SKF_AD_RXHASH 32 +#define SKF_AD_CPU 36 +#define SKF_AD_ALU_XOR_X 40 +#define SKF_AD_VLAN_TAG 44 +#define SKF_AD_VLAN_TAG_PRESENT 48 +#define SKF_AD_PAY_OFFSET 52 +#define SKF_AD_RANDOM 56 +#define SKF_AD_VLAN_TPID 60 +#define SKF_AD_MAX 64 + +#define SKF_NET_OFF (-0x100000) +#define SKF_LL_OFF (-0x200000) + +#define BPF_NET_OFF SKF_NET_OFF +#define BPF_LL_OFF SKF_LL_OFF + +#endif /* __LINUX_FILTER_H__ */ From e506ea451254ab17e0bf918ca36232fec2a9b10c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 15 Oct 2019 16:29:32 -0700 Subject: [PATCH 375/502] compiler.h: Split {READ,WRITE}_ONCE definitions out into rwonce.h In preparation for allowing architectures to define their own implementation of the READ_ONCE() macro, move the generic {READ,WRITE}_ONCE() definitions out of the unwieldy 'linux/compiler.h' file and into a new 'rwonce.h' header under 'asm-generic'. 
Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. McKenney Signed-off-by: Will Deacon --- include/asm-generic/Kbuild | 1 + include/asm-generic/barrier.h | 2 +- include/asm-generic/rwonce.h | 101 ++++++++++++++++++++++++++++++++++ include/linux/compiler.h | 93 +------------------------------ 4 files changed, 105 insertions(+), 92 deletions(-) create mode 100644 include/asm-generic/rwonce.h diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild index 44ec80e70518..74b0612601dd 100644 --- a/include/asm-generic/Kbuild +++ b/include/asm-generic/Kbuild @@ -45,6 +45,7 @@ mandatory-y += pci.h mandatory-y += percpu.h mandatory-y += pgalloc.h mandatory-y += preempt.h +mandatory-y += rwonce.h mandatory-y += sections.h mandatory-y += serial.h mandatory-y += shmparam.h diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h index 2eacaf7d62f6..8116744bb82c 100644 --- a/include/asm-generic/barrier.h +++ b/include/asm-generic/barrier.h @@ -13,7 +13,7 @@ #ifndef __ASSEMBLY__ -#include +#include #ifndef nop #define nop() asm volatile ("nop") diff --git a/include/asm-generic/rwonce.h b/include/asm-generic/rwonce.h new file mode 100644 index 000000000000..87584379da43 --- /dev/null +++ b/include/asm-generic/rwonce.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Prevent the compiler from merging or refetching reads or writes. The + * compiler is also forbidden from reordering successive instances of + * READ_ONCE and WRITE_ONCE, but only when the compiler is aware of some + * particular ordering. One way to make the compiler aware of ordering is to + * put the two invocations of READ_ONCE or WRITE_ONCE in different C + * statements. + * + * These two macros will also work on aggregate data types like structs or + * unions. 
+ * + * Their two major use cases are: (1) Mediating communication between + * process-level code and irq/NMI handlers, all running on the same CPU, + * and (2) Ensuring that the compiler does not fold, spindle, or otherwise + * mutilate accesses that either do not require ordering or that interact + * with an explicit memory barrier or atomic instruction that provides the + * required ordering. + */ +#ifndef __ASM_GENERIC_RWONCE_H +#define __ASM_GENERIC_RWONCE_H + +#ifndef __ASSEMBLY__ + +#include +#include +#include + +#include + +/* + * Yes, this permits 64-bit accesses on 32-bit architectures. These will + * actually be atomic in some cases (namely Armv7 + LPAE), but for others we + * rely on the access being split into 2x32-bit accesses for a 32-bit quantity + * (e.g. a virtual address) and a strong prevailing wind. + */ +#define compiletime_assert_rwonce_type(t) \ + compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \ + "Unsupported access size for {READ,WRITE}_ONCE().") + +/* + * Use __READ_ONCE() instead of READ_ONCE() if you do not require any + * atomicity or dependency ordering guarantees. Note that this may result + * in tears! 
+ */ +#define __READ_ONCE(x) (*(const volatile __unqual_scalar_typeof(x) *)&(x)) + +#define __READ_ONCE_SCALAR(x) \ +({ \ + __unqual_scalar_typeof(x) __x = __READ_ONCE(x); \ + smp_read_barrier_depends(); \ + (typeof(x))__x; \ +}) + +#define READ_ONCE(x) \ +({ \ + compiletime_assert_rwonce_type(x); \ + __READ_ONCE_SCALAR(x); \ +}) + +#define __WRITE_ONCE(x, val) \ +do { \ + *(volatile typeof(x) *)&(x) = (val); \ +} while (0) + +#define WRITE_ONCE(x, val) \ +do { \ + compiletime_assert_rwonce_type(x); \ + __WRITE_ONCE(x, val); \ +} while (0) + +static __no_sanitize_or_inline +unsigned long __read_once_word_nocheck(const void *addr) +{ + return __READ_ONCE(*(unsigned long *)addr); +} + +/* + * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need to load a + * word from memory atomically but without telling KASAN/KCSAN. This is + * usually used by unwinding code when walking the stack of a running process. + */ +#define READ_ONCE_NOCHECK(x) \ +({ \ + unsigned long __x; \ + compiletime_assert(sizeof(x) == sizeof(__x), \ + "Unsupported access size for READ_ONCE_NOCHECK()."); \ + __x = __read_once_word_nocheck(&(x)); \ + smp_read_barrier_depends(); \ + (typeof(x))__x; \ +}) + +static __no_kasan_or_inline +unsigned long read_word_at_a_time(const void *addr) +{ + kasan_check_read(addr, 1); + return *(unsigned long *)addr; +} + +#endif /* __ASSEMBLY__ */ +#endif /* __ASM_GENERIC_RWONCE_H */ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 204e76856435..f075a3df4fe2 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -230,28 +230,6 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, # define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __LINE__) #endif -/* - * Prevent the compiler from merging or refetching reads or writes. The - * compiler is also forbidden from reordering successive instances of - * READ_ONCE and WRITE_ONCE, but only when the compiler is aware of some - * particular ordering. 
One way to make the compiler aware of ordering is to - * put the two invocations of READ_ONCE or WRITE_ONCE in different C - * statements. - * - * These two macros will also work on aggregate data types like structs or - * unions. - * - * Their two major use cases are: (1) Mediating communication between - * process-level code and irq/NMI handlers, all running on the same CPU, - * and (2) Ensuring that the compiler does not fold, spindle, or otherwise - * mutilate accesses that either do not require ordering or that interact - * with an explicit memory barrier or atomic instruction that provides the - * required ordering. - */ -#include -#include -#include - /** * data_race - mark an expression as containing intentional data races * @@ -272,65 +250,6 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, __v; \ }) -/* - * Use __READ_ONCE() instead of READ_ONCE() if you do not require any - * atomicity or dependency ordering guarantees. Note that this may result - * in tears! - */ -#define __READ_ONCE(x) (*(const volatile __unqual_scalar_typeof(x) *)&(x)) - -#define __READ_ONCE_SCALAR(x) \ -({ \ - __unqual_scalar_typeof(x) __x = __READ_ONCE(x); \ - smp_read_barrier_depends(); \ - (typeof(x))__x; \ -}) - -#define READ_ONCE(x) \ -({ \ - compiletime_assert_rwonce_type(x); \ - __READ_ONCE_SCALAR(x); \ -}) - -#define __WRITE_ONCE(x, val) \ -do { \ - *(volatile typeof(x) *)&(x) = (val); \ -} while (0) - -#define WRITE_ONCE(x, val) \ -do { \ - compiletime_assert_rwonce_type(x); \ - __WRITE_ONCE(x, val); \ -} while (0) - -static __no_sanitize_or_inline -unsigned long __read_once_word_nocheck(const void *addr) -{ - return __READ_ONCE(*(unsigned long *)addr); -} - -/* - * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need to load a - * word from memory atomically but without telling KASAN/KCSAN. This is - * usually used by unwinding code when walking the stack of a running process. 
- */ -#define READ_ONCE_NOCHECK(x) \ -({ \ - unsigned long __x; \ - compiletime_assert(sizeof(x) == sizeof(__x), \ - "Unsupported access size for READ_ONCE_NOCHECK()."); \ - __x = __read_once_word_nocheck(&(x)); \ - smp_read_barrier_depends(); \ - (typeof(x))__x; \ -}) - -static __no_kasan_or_inline -unsigned long read_word_at_a_time(const void *addr) -{ - kasan_check_read(addr, 1); - return *(unsigned long *)addr; -} - #endif /* __KERNEL__ */ /* @@ -395,16 +314,6 @@ static inline void *offset_to_ptr(const int *off) compiletime_assert(__native_word(t), \ "Need native word sized stores/loads for atomicity.") -/* - * Yes, this permits 64-bit accesses on 32-bit architectures. These will - * actually be atomic in some cases (namely Armv7 + LPAE), but for others we - * rely on the access being split into 2x32-bit accesses for a 32-bit quantity - * (e.g. a virtual address) and a strong prevailing wind. - */ -#define compiletime_assert_rwonce_type(t) \ - compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \ - "Unsupported access size for {READ,WRITE}_ONCE().") - /* &a[0] degrades to a pointer: a different type from an array */ #define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) @@ -414,4 +323,6 @@ static inline void *offset_to_ptr(const int *off) */ #define prevent_tail_call_optimization() mb() +#include + #endif /* __LINUX_COMPILER_H */ From b78b331a3f5c0773171dadd6bbfa2a2242b45604 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 15 Oct 2019 17:30:47 -0700 Subject: [PATCH 376/502] asm/rwonce: Allow __READ_ONCE to be overridden by the architecture The meat and potatoes of READ_ONCE() is defined by the __READ_ONCE() macro, which uses a volatile casts in an attempt to avoid tearing of byte, halfword, word and double-word accesses. Allow this to be overridden by the architecture code in the case that things like memory barriers are also required. Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. 
McKenney Signed-off-by: Will Deacon --- include/asm-generic/rwonce.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/asm-generic/rwonce.h b/include/asm-generic/rwonce.h index 87584379da43..04586b55a7c2 100644 --- a/include/asm-generic/rwonce.h +++ b/include/asm-generic/rwonce.h @@ -43,7 +43,9 @@ * atomicity or dependency ordering guarantees. Note that this may result * in tears! */ +#ifndef __READ_ONCE #define __READ_ONCE(x) (*(const volatile __unqual_scalar_typeof(x) *)&(x)) +#endif #define __READ_ONCE_SCALAR(x) \ ({ \ From d6462858851549c62d73eaa14b31132b0f32d6b6 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 30 Oct 2019 16:50:10 +0000 Subject: [PATCH 377/502] alpha: Override READ_ONCE() with barriered implementation Rather than relying on the core code to use smp_read_barrier_depends() as part of the READ_ONCE() definition, instead override __READ_ONCE() in the Alpha code so that it generates the required mb() and then implement smp_load_acquire() using the new macro to avoid redundant back-to-back barriers from the generic implementation. Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. McKenney Signed-off-by: Will Deacon --- arch/alpha/include/asm/barrier.h | 59 +++----------------------------- arch/alpha/include/asm/rwonce.h | 35 +++++++++++++++++++ 2 files changed, 40 insertions(+), 54 deletions(-) create mode 100644 arch/alpha/include/asm/rwonce.h diff --git a/arch/alpha/include/asm/barrier.h b/arch/alpha/include/asm/barrier.h index 92ec486a4f9e..c56bfffc9918 100644 --- a/arch/alpha/include/asm/barrier.h +++ b/arch/alpha/include/asm/barrier.h @@ -2,64 +2,15 @@ #ifndef __BARRIER_H #define __BARRIER_H -#include - #define mb() __asm__ __volatile__("mb": : :"memory") #define rmb() __asm__ __volatile__("mb": : :"memory") #define wmb() __asm__ __volatile__("wmb": : :"memory") -/** - * read_barrier_depends - Flush all pending reads that subsequents reads - * depend on. 
- * - * No data-dependent reads from memory-like regions are ever reordered - * over this barrier. All reads preceding this primitive are guaranteed - * to access memory (but not necessarily other CPUs' caches) before any - * reads following this primitive that depend on the data return by - * any of the preceding reads. This primitive is much lighter weight than - * rmb() on most CPUs, and is never heavier weight than is - * rmb(). - * - * These ordering constraints are respected by both the local CPU - * and the compiler. - * - * Ordering is not guaranteed by anything other than these primitives, - * not even by data dependencies. See the documentation for - * memory_barrier() for examples and URLs to more information. - * - * For example, the following code would force ordering (the initial - * value of "a" is zero, "b" is one, and "p" is "&a"): - * - * - * CPU 0 CPU 1 - * - * b = 2; - * memory_barrier(); - * p = &b; q = p; - * read_barrier_depends(); - * d = *q; - * - * - * because the read of "*q" depends on the read of "p" and these - * two reads are separated by a read_barrier_depends(). However, - * the following code, with the same initial values for "a" and "b": - * - * - * CPU 0 CPU 1 - * - * a = 2; - * memory_barrier(); - * b = 3; y = b; - * read_barrier_depends(); - * x = a; - * - * - * does not enforce ordering, since there is no data dependency between - * the read of "a" and the read of "b". Therefore, on some CPUs, such - * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() - * in cases like this where there are no data dependencies. 
- */ -#define read_barrier_depends() __asm__ __volatile__("mb": : :"memory") +#define __smp_load_acquire(p) \ +({ \ + compiletime_assert_atomic_type(*p); \ + __READ_ONCE(*p); \ +}) #ifdef CONFIG_SMP #define __ASM_SMP_MB "\tmb\n" diff --git a/arch/alpha/include/asm/rwonce.h b/arch/alpha/include/asm/rwonce.h new file mode 100644 index 000000000000..35542bcf92b3 --- /dev/null +++ b/arch/alpha/include/asm/rwonce.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 Google LLC. + */ +#ifndef __ASM_RWONCE_H +#define __ASM_RWONCE_H + +#ifdef CONFIG_SMP + +#include + +/* + * Alpha is apparently daft enough to reorder address-dependent loads + * on some CPU implementations. Knock some common sense into it with + * a memory barrier in READ_ONCE(). + * + * For the curious, more information about this unusual reordering is + * available in chapter 15 of the "perfbook": + * + * https://kernel.org/pub/linux/kernel/people/paulmck/perfbook/perfbook.html + * + */ +#define __READ_ONCE(x) \ +({ \ + __unqual_scalar_typeof(x) __x = \ + (*(volatile typeof(__x) *)(&(x))); \ + mb(); \ + (typeof(x))__x; \ +}) + +#endif /* CONFIG_SMP */ + +#include + +#endif /* __ASM_RWONCE_H */ From 3c9184109e78ea2371ca8fa66d7f36986a53af98 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 30 Oct 2019 16:51:07 +0000 Subject: [PATCH 378/502] asm/rwonce: Remove smp_read_barrier_depends() invocation Alpha overrides __READ_ONCE() directly, so there's no need to use smp_read_barrier_depends() in the core code. This also means that __READ_ONCE() can be relied upon to provide dependency ordering. Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. 
McKenney Signed-off-by: Will Deacon --- include/asm-generic/rwonce.h | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/include/asm-generic/rwonce.h b/include/asm-generic/rwonce.h index 04586b55a7c2..3a7f737c77bd 100644 --- a/include/asm-generic/rwonce.h +++ b/include/asm-generic/rwonce.h @@ -40,24 +40,16 @@ /* * Use __READ_ONCE() instead of READ_ONCE() if you do not require any - * atomicity or dependency ordering guarantees. Note that this may result - * in tears! + * atomicity. Note that this may result in tears! */ #ifndef __READ_ONCE #define __READ_ONCE(x) (*(const volatile __unqual_scalar_typeof(x) *)&(x)) #endif -#define __READ_ONCE_SCALAR(x) \ -({ \ - __unqual_scalar_typeof(x) __x = __READ_ONCE(x); \ - smp_read_barrier_depends(); \ - (typeof(x))__x; \ -}) - #define READ_ONCE(x) \ ({ \ compiletime_assert_rwonce_type(x); \ - __READ_ONCE_SCALAR(x); \ + __READ_ONCE(x); \ }) #define __WRITE_ONCE(x, val) \ @@ -84,12 +76,9 @@ unsigned long __read_once_word_nocheck(const void *addr) */ #define READ_ONCE_NOCHECK(x) \ ({ \ - unsigned long __x; \ - compiletime_assert(sizeof(x) == sizeof(__x), \ + compiletime_assert(sizeof(x) == sizeof(unsigned long), \ "Unsupported access size for READ_ONCE_NOCHECK()."); \ - __x = __read_once_word_nocheck(&(x)); \ - smp_read_barrier_depends(); \ - (typeof(x))__x; \ + (typeof(x))__read_once_word_nocheck(&(x)); \ }) static __no_kasan_or_inline From 002dff36acfba3476b685a09f78ffb7c452f5951 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 10 Jul 2020 14:49:40 +0100 Subject: [PATCH 379/502] asm/rwonce: Don't pull <asm/barrier.h> into 'asm-generic/rwonce.h' Now that 'smp_read_barrier_depends()' has gone the way of the Norwegian Blue, drop the inclusion of <asm/barrier.h> in 'asm-generic/rwonce.h'. This requires fixups to some architecture vdso headers which were previously relying on 'asm/barrier.h' coming in via 'linux/compiler.h'. 
Acked-by: Peter Zijlstra (Intel) Signed-off-by: Will Deacon --- arch/arm/include/asm/vdso/gettimeofday.h | 1 + arch/arm64/include/asm/vdso/compat_gettimeofday.h | 1 + arch/arm64/include/asm/vdso/gettimeofday.h | 1 + arch/riscv/include/asm/vdso/gettimeofday.h | 1 + include/asm-generic/rwonce.h | 2 -- include/linux/nospec.h | 2 ++ 6 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/arm/include/asm/vdso/gettimeofday.h b/arch/arm/include/asm/vdso/gettimeofday.h index 36dc18553ed8..1b207cf07697 100644 --- a/arch/arm/include/asm/vdso/gettimeofday.h +++ b/arch/arm/include/asm/vdso/gettimeofday.h @@ -7,6 +7,7 @@ #ifndef __ASSEMBLY__ +#include #include #include #include diff --git a/arch/arm64/include/asm/vdso/compat_gettimeofday.h b/arch/arm64/include/asm/vdso/compat_gettimeofday.h index b6907ae78e53..bcf7649999a4 100644 --- a/arch/arm64/include/asm/vdso/compat_gettimeofday.h +++ b/arch/arm64/include/asm/vdso/compat_gettimeofday.h @@ -7,6 +7,7 @@ #ifndef __ASSEMBLY__ +#include #include #include diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h b/arch/arm64/include/asm/vdso/gettimeofday.h index afba6ba332f8..127fa63893e2 100644 --- a/arch/arm64/include/asm/vdso/gettimeofday.h +++ b/arch/arm64/include/asm/vdso/gettimeofday.h @@ -7,6 +7,7 @@ #ifndef __ASSEMBLY__ +#include #include #define VDSO_HAS_CLOCK_GETRES 1 diff --git a/arch/riscv/include/asm/vdso/gettimeofday.h b/arch/riscv/include/asm/vdso/gettimeofday.h index c8e818688ec1..3099362d9f26 100644 --- a/arch/riscv/include/asm/vdso/gettimeofday.h +++ b/arch/riscv/include/asm/vdso/gettimeofday.h @@ -4,6 +4,7 @@ #ifndef __ASSEMBLY__ +#include #include #include #include diff --git a/include/asm-generic/rwonce.h b/include/asm-generic/rwonce.h index 3a7f737c77bd..8d0a6280e982 100644 --- a/include/asm-generic/rwonce.h +++ b/include/asm-generic/rwonce.h @@ -26,8 +26,6 @@ #include #include -#include - /* * Yes, this permits 64-bit accesses on 32-bit architectures. 
These will * actually be atomic in some cases (namely Armv7 + LPAE), but for others we diff --git a/include/linux/nospec.h b/include/linux/nospec.h index 0c5ef54fd416..c1e79f72cd89 100644 --- a/include/linux/nospec.h +++ b/include/linux/nospec.h @@ -5,6 +5,8 @@ #ifndef _LINUX_NOSPEC_H #define _LINUX_NOSPEC_H + +#include #include struct task_struct; From 71c0b9a65cefa8c34eab83d337a1e3ec61fb7cc2 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 30 Oct 2019 16:22:17 +0000 Subject: [PATCH 380/502] vhost: Remove redundant use of read_barrier_depends() barrier Since commit 76ebbe78f739 ("locking/barriers: Add implicit smp_read_barrier_depends() to READ_ONCE()"), there is no need to use smp_read_barrier_depends() outside of the Alpha architecture code. Unfortunately, there is precisely _one_ user in the vhost code, and there isn't an obvious READ_ONCE() access making the barrier redundant. However, on closer inspection (thanks, Jason), it appears that vring synchronisation between the producer and consumer occurs via the 'avail_idx' field, which is followed up by an rmb() in vhost_get_vq_desc(), making the read_barrier_depends() redundant on Alpha. Jason says: | I'm also confused about the barrier here, basically in driver side | we did: | | 1) allocate pages | 2) store pages in indirect->addr | 3) smp_wmb() | 4) increase the avail idx (somehow a tail pointer of vring) | | in vhost we did: | | 1) read avail idx | 2) smp_rmb() | 3) read indirect->addr | 4) read from indirect->addr | | It looks to me even the data dependency barrier is not necessary | since we have rmb() which is sufficient for us to the correct | indirect->addr and driver are not expected to do any writing to | indirect->addr after avail idx is increased Remove the redundant barrier invocation. Acked-by: Michael S. Tsirkin Acked-by: Peter Zijlstra (Intel) Suggested-by: Jason Wang Acked-by: Paul E. 
McKenney Signed-off-by: Will Deacon --- drivers/vhost/vhost.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index d7b8df3edffc..74d135ee7e26 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2092,11 +2092,6 @@ static int get_indirect(struct vhost_virtqueue *vq, return ret; } iov_iter_init(&from, READ, vq->indirect, ret, len); - - /* We will use the result as an address to read from, so most - * architectures only need a compiler barrier here. */ - read_barrier_depends(); - count = len / sizeof desc; /* Buffers are chained via a 16 bit next field, so * we can have at most 2^16 of these. */ From bb7cdd38185a4f9fa32e62db115c2c6dceb2b621 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 30 Oct 2019 17:15:01 +0000 Subject: [PATCH 381/502] alpha: Replace smp_read_barrier_depends() usage with smp_[r]mb() In preparation for removing smp_read_barrier_depends() altogether, move the Alpha code over to using smp_rmb() and smp_mb() directly. Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. McKenney Signed-off-by: Will Deacon --- arch/alpha/include/asm/atomic.h | 16 ++++++++-------- arch/alpha/include/asm/pgtable.h | 10 +++++----- mm/memory.c | 2 +- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h index 2144530d1428..2f8f7e54792f 100644 --- a/arch/alpha/include/asm/atomic.h +++ b/arch/alpha/include/asm/atomic.h @@ -16,10 +16,10 @@ /* * To ensure dependency ordering is preserved for the _relaxed and - * _release atomics, an smp_read_barrier_depends() is unconditionally - * inserted into the _relaxed variants, which are used to build the - * barriered versions. Avoid redundant back-to-back fences in the - * _acquire and _fence versions. + * _release atomics, an smp_mb() is unconditionally inserted into the + * _relaxed variants, which are used to build the barriered versions. 
+ * Avoid redundant back-to-back fences in the _acquire and _fence + * versions. */ #define __atomic_acquire_fence() #define __atomic_post_full_fence() @@ -70,7 +70,7 @@ static inline int atomic_##op##_return_relaxed(int i, atomic_t *v) \ ".previous" \ :"=&r" (temp), "=m" (v->counter), "=&r" (result) \ :"Ir" (i), "m" (v->counter) : "memory"); \ - smp_read_barrier_depends(); \ + smp_mb(); \ return result; \ } @@ -88,7 +88,7 @@ static inline int atomic_fetch_##op##_relaxed(int i, atomic_t *v) \ ".previous" \ :"=&r" (temp), "=m" (v->counter), "=&r" (result) \ :"Ir" (i), "m" (v->counter) : "memory"); \ - smp_read_barrier_depends(); \ + smp_mb(); \ return result; \ } @@ -123,7 +123,7 @@ static __inline__ s64 atomic64_##op##_return_relaxed(s64 i, atomic64_t * v) \ ".previous" \ :"=&r" (temp), "=m" (v->counter), "=&r" (result) \ :"Ir" (i), "m" (v->counter) : "memory"); \ - smp_read_barrier_depends(); \ + smp_mb(); \ return result; \ } @@ -141,7 +141,7 @@ static __inline__ s64 atomic64_fetch_##op##_relaxed(s64 i, atomic64_t * v) \ ".previous" \ :"=&r" (temp), "=m" (v->counter), "=&r" (result) \ :"Ir" (i), "m" (v->counter) : "memory"); \ - smp_read_barrier_depends(); \ + smp_mb(); \ return result; \ } diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index 162c17b2631f..660b14ce1317 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -277,9 +277,9 @@ extern inline pte_t pte_mkdirty(pte_t pte) { pte_val(pte) |= __DIRTY_BITS; retur extern inline pte_t pte_mkyoung(pte_t pte) { pte_val(pte) |= __ACCESS_BITS; return pte; } /* - * The smp_read_barrier_depends() in the following functions are required to - * order the load of *dir (the pointer in the top level page table) with any - * subsequent load of the returned pmd_t *ret (ret is data dependent on *dir). 
+ * The smp_rmb() in the following functions are required to order the load of + * *dir (the pointer in the top level page table) with any subsequent load of + * the returned pmd_t *ret (ret is data dependent on *dir). * * If this ordering is not enforced, the CPU might load an older value of * *ret, which may be uninitialized data. See mm/memory.c:__pte_alloc for @@ -293,7 +293,7 @@ extern inline pte_t pte_mkyoung(pte_t pte) { pte_val(pte) |= __ACCESS_BITS; retu extern inline pmd_t * pmd_offset(pud_t * dir, unsigned long address) { pmd_t *ret = (pmd_t *) pud_page_vaddr(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PAGE - 1)); - smp_read_barrier_depends(); /* see above */ + smp_rmb(); /* see above */ return ret; } #define pmd_offset pmd_offset @@ -303,7 +303,7 @@ extern inline pte_t * pte_offset_kernel(pmd_t * dir, unsigned long address) { pte_t *ret = (pte_t *) pmd_page_vaddr(*dir) + ((address >> PAGE_SHIFT) & (PTRS_PER_PAGE - 1)); - smp_read_barrier_depends(); /* see above */ + smp_rmb(); /* see above */ return ret; } #define pte_offset_kernel pte_offset_kernel diff --git a/mm/memory.c b/mm/memory.c index 87ec87cdc1ff..e1f2c730d8bb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -437,7 +437,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd) * of a chain of data-dependent loads, meaning most CPUs (alpha * being the notable exception) will already guarantee loads are * seen in-order. See the alpha page table accessors for the - * smp_read_barrier_depends() barriers in page table walking code. + * smp_rmb() barriers in page table walking code. */ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ From 93fab07c22930c9ac4f01212fd92913c9a812f9f Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 30 Oct 2019 17:17:22 +0000 Subject: [PATCH 382/502] locking/barriers: Remove definitions for [smp_]read_barrier_depends() There are no remaining users of [smp_]read_barrier_depends(), so remove it from the generic implementation of 'barrier.h'. 
Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. McKenney Signed-off-by: Will Deacon --- include/asm-generic/barrier.h | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h index 8116744bb82c..fec97dc34de7 100644 --- a/include/asm-generic/barrier.h +++ b/include/asm-generic/barrier.h @@ -46,10 +46,6 @@ #define dma_wmb() wmb() #endif -#ifndef read_barrier_depends -#define read_barrier_depends() do { } while (0) -#endif - #ifndef __smp_mb #define __smp_mb() mb() #endif @@ -62,10 +58,6 @@ #define __smp_wmb() wmb() #endif -#ifndef __smp_read_barrier_depends -#define __smp_read_barrier_depends() read_barrier_depends() -#endif - #ifdef CONFIG_SMP #ifndef smp_mb @@ -80,10 +72,6 @@ #define smp_wmb() __smp_wmb() #endif -#ifndef smp_read_barrier_depends -#define smp_read_barrier_depends() __smp_read_barrier_depends() -#endif - #else /* !CONFIG_SMP */ #ifndef smp_mb @@ -98,10 +86,6 @@ #define smp_wmb() barrier() #endif -#ifndef smp_read_barrier_depends -#define smp_read_barrier_depends() do { } while (0) -#endif - #endif /* CONFIG_SMP */ #ifndef __smp_store_mb @@ -196,7 +180,6 @@ do { \ #define virt_mb() __smp_mb() #define virt_rmb() __smp_rmb() #define virt_wmb() __smp_wmb() -#define virt_read_barrier_depends() __smp_read_barrier_depends() #define virt_store_mb(var, value) __smp_store_mb(var, value) #define virt_mb__before_atomic() __smp_mb__before_atomic() #define virt_mb__after_atomic() __smp_mb__after_atomic() From 8ca924aeb4f28e5bf552707e8ecbe105c4f17c7b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 7 Nov 2019 14:36:37 +0000 Subject: [PATCH 383/502] Documentation/barriers: Remove references to [smp_]read_barrier_depends() The [smp_]read_barrier_depends() barrier macros no longer exist as part of the Linux memory model, so remove all references to them from the Documentation/ directory. 
Although this is fairly mechanical on the whole, we drop the "CACHE COHERENCY" section entirely from 'memory-barriers.txt' as it doesn't make any sense now that the dependency barriers have been removed. Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. McKenney Signed-off-by: Will Deacon --- .../RCU/Design/Requirements/Requirements.rst | 2 +- Documentation/memory-barriers.txt | 156 +----------------- 2 files changed, 9 insertions(+), 149 deletions(-) diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst index 75b8ca007a11..50d5c43c48b0 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.rst +++ b/Documentation/RCU/Design/Requirements/Requirements.rst @@ -463,7 +463,7 @@ again without disrupting RCU readers. This guarantee was only partially premeditated. DYNIX/ptx used an explicit memory barrier for publication, but had nothing resembling ``rcu_dereference()`` for subscription, nor did it have anything -resembling the ``smp_read_barrier_depends()`` that was later subsumed +resembling the dependency-ordering barrier that was later subsumed into ``rcu_dereference()`` and later still into ``READ_ONCE()``. The need for these operations made itself known quite suddenly at a late-1990s meeting with the DEC Alpha architects, back in the days when diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index eaabc3134294..4e55aba3eb4a 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -553,12 +553,12 @@ There are certain things that the Linux kernel memory barriers do not guarantee: DATA DEPENDENCY BARRIERS (HISTORICAL) ------------------------------------- -As of v4.15 of the Linux kernel, an smp_read_barrier_depends() was -added to READ_ONCE(), which means that about the only people who -need to pay attention to this section are those working on DEC Alpha -architecture-specific code and those working on READ_ONCE() itself. 
-For those who need it, and for those who are interested in the history, -here is the story of data-dependency barriers. +As of v4.15 of the Linux kernel, an smp_mb() was added to READ_ONCE() for +DEC Alpha, which means that about the only people who need to pay attention +to this section are those working on DEC Alpha architecture-specific code +and those working on READ_ONCE() itself. For those who need it, and for +those who are interested in the history, here is the story of +data-dependency barriers. The usage requirements of data dependency barriers are a little subtle, and it's not always obvious that they're needed. To illustrate, consider the @@ -2708,144 +2708,6 @@ the properties of the memory window through which devices are accessed and/or the use of any special device communication instructions the CPU may have. -CACHE COHERENCY ---------------- - -Life isn't quite as simple as it may appear above, however: for while the -caches are expected to be coherent, there's no guarantee that that coherency -will be ordered. This means that while changes made on one CPU will -eventually become visible on all CPUs, there's no guarantee that they will -become apparent in the same order on those other CPUs. 
- - -Consider dealing with a system that has a pair of CPUs (1 & 2), each of which -has a pair of parallel data caches (CPU 1 has A/B, and CPU 2 has C/D): - - : - : +--------+ - : +---------+ | | - +--------+ : +--->| Cache A |<------->| | - | | : | +---------+ | | - | CPU 1 |<---+ | | - | | : | +---------+ | | - +--------+ : +--->| Cache B |<------->| | - : +---------+ | | - : | Memory | - : +---------+ | System | - +--------+ : +--->| Cache C |<------->| | - | | : | +---------+ | | - | CPU 2 |<---+ | | - | | : | +---------+ | | - +--------+ : +--->| Cache D |<------->| | - : +---------+ | | - : +--------+ - : - -Imagine the system has the following properties: - - (*) an odd-numbered cache line may be in cache A, cache C or it may still be - resident in memory; - - (*) an even-numbered cache line may be in cache B, cache D or it may still be - resident in memory; - - (*) while the CPU core is interrogating one cache, the other cache may be - making use of the bus to access the rest of the system - perhaps to - displace a dirty cacheline or to do a speculative load; - - (*) each cache has a queue of operations that need to be applied to that cache - to maintain coherency with the rest of the system; - - (*) the coherency queue is not flushed by normal loads to lines already - present in the cache, even though the contents of the queue may - potentially affect those loads. 
- -Imagine, then, that two writes are made on the first CPU, with a write barrier -between them to guarantee that they will appear to reach that CPU's caches in -the requisite order: - - CPU 1 CPU 2 COMMENT - =============== =============== ======================================= - u == 0, v == 1 and p == &u, q == &u - v = 2; - smp_wmb(); Make sure change to v is visible before - change to p - v is now in cache A exclusively - p = &v; - p is now in cache B exclusively - -The write memory barrier forces the other CPUs in the system to perceive that -the local CPU's caches have apparently been updated in the correct order. But -now imagine that the second CPU wants to read those values: - - CPU 1 CPU 2 COMMENT - =============== =============== ======================================= - ... - q = p; - x = *q; - -The above pair of reads may then fail to happen in the expected order, as the -cacheline holding p may get updated in one of the second CPU's caches while -the update to the cacheline holding v is delayed in the other of the second -CPU's caches by some other cache event: - - CPU 1 CPU 2 COMMENT - =============== =============== ======================================= - u == 0, v == 1 and p == &u, q == &u - v = 2; - smp_wmb(); - - - p = &v; q = p; - - - - x = *q; - Reads from v before v updated in cache - - - -Basically, while both cachelines will be updated on CPU 2 eventually, there's -no guarantee that, without intervention, the order of update will be the same -as that committed on CPU 1. - - -To intervene, we need to interpolate a data dependency barrier or a read -barrier between the loads (which as of v4.15 is supplied unconditionally -by the READ_ONCE() macro). 
This will force the cache to commit its -coherency queue before processing any further requests: - - CPU 1 CPU 2 COMMENT - =============== =============== ======================================= - u == 0, v == 1 and p == &u, q == &u - v = 2; - smp_wmb(); - - - p = &v; q = p; - - - - smp_read_barrier_depends() - - - x = *q; - Reads from v after v updated in cache - - -This sort of problem can be encountered on DEC Alpha processors as they have a -split cache that improves performance by making better use of the data bus. -While most CPUs do imply a data dependency barrier on the read when a memory -access depends on a read, not all do, so it may not be relied on. - -Other CPUs may also have split caches, but must coordinate between the various -cachelets for normal memory accesses. The semantics of the Alpha removes the -need for hardware coordination in the absence of memory barriers, which -permitted Alpha to sport higher CPU clock rates back in the day. However, -please note that (again, as of v4.15) smp_read_barrier_depends() should not -be used except in Alpha arch-specific code and within the READ_ONCE() macro. - - CACHE COHERENCY VS DMA ---------------------- @@ -3009,10 +2871,8 @@ caches with the memory coherence system, thus making it seem like pointer changes vs new data occur in the right order. The Alpha defines the Linux kernel's memory model, although as of v4.15 -the Linux kernel's addition of smp_read_barrier_depends() to READ_ONCE() -greatly reduced Alpha's impact on the memory model. - -See the subsection on "Cache Coherency" above. +the Linux kernel's addition of smp_mb() to READ_ONCE() on Alpha greatly +reduced its impact on the memory model. 
VIRTUAL MACHINE GUESTS From 9ce1b14e74042a3477f880bee675945044880b01 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 29 Nov 2019 19:08:37 +0100 Subject: [PATCH 384/502] Documentation/barriers/kokr: Remove references to [smp_]read_barrier_depends() This commit translates commit ("Documentation/barriers: Remove references to [smp_]read_barrier_depends()") into Korean. Acked-by: Peter Zijlstra (Intel) Reviewed-by: Yunjae Lee Signed-off-by: SeongJae Park Signed-off-by: Will Deacon --- .../translations/ko_KR/memory-barriers.txt | 146 +----------------- 1 file changed, 3 insertions(+), 143 deletions(-) diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt index 34d041d68f78..a1f772ef622c 100644 --- a/Documentation/translations/ko_KR/memory-barriers.txt +++ b/Documentation/translations/ko_KR/memory-barriers.txt @@ -577,7 +577,7 @@ ACQUIRE 는 해당 오퍼레이션의 로드 부분에만 적용되고 RELEASE 데이터 의존성 배리어 (역사적) ----------------------------- -리눅스 커널 v4.15 기준으로, smp_read_barrier_depends() 가 READ_ONCE() 에 +리눅스 커널 v4.15 기준으로, smp_mb() 가 DEC Alpha 용 READ_ONCE() 코드에 추가되었는데, 이는 이 섹션에 주의를 기울여야 하는 사람들은 DEC Alpha 아키텍쳐 전용 코드를 만드는 사람들과 READ_ONCE() 자체를 만드는 사람들 뿐임을 의미합니다. 그런 분들을 위해, 그리고 역사에 관심 있는 분들을 위해, 여기 데이터 의존성 @@ -2664,144 +2664,6 @@ CPU 코어는 프로그램의 인과성이 유지된다고만 여겨진다면 수도 있습니다. -캐시 일관성 ------------ - -하지만 삶은 앞에서 이야기한 것처럼 단순하지 않습니다: 캐시들은 일관적일 것으로 -기대되지만, 그 일관성이 순서에도 적용될 거라는 보장은 없습니다. 한 CPU 에서 -만들어진 변경 사항은 최종적으로는 시스템의 모든 CPU 에게 보여지게 되지만, 다른 -CPU 들에게도 같은 순서로 보이게 될 거라는 보장은 없다는 뜻입니다. 
- - -두개의 CPU (1 & 2) 가 달려 있고, 각 CPU 에 두개의 데이터 캐시(CPU 1 은 A/B 를, -CPU 2 는 C/D 를 갖습니다)가 병렬로 연결되어 있는 시스템을 다룬다고 생각해 -봅시다: - - : - : +--------+ - : +---------+ | | - +--------+ : +--->| Cache A |<------->| | - | | : | +---------+ | | - | CPU 1 |<---+ | | - | | : | +---------+ | | - +--------+ : +--->| Cache B |<------->| | - : +---------+ | | - : | Memory | - : +---------+ | System | - +--------+ : +--->| Cache C |<------->| | - | | : | +---------+ | | - | CPU 2 |<---+ | | - | | : | +---------+ | | - +--------+ : +--->| Cache D |<------->| | - : +---------+ | | - : +--------+ - : - -이 시스템이 다음과 같은 특성을 갖는다 생각해 봅시다: - - (*) 홀수번 캐시라인은 캐시 A, 캐시 C 또는 메모리에 위치할 수 있음; - - (*) 짝수번 캐시라인은 캐시 B, 캐시 D 또는 메모리에 위치할 수 있음; - - (*) CPU 코어가 한개의 캐시에 접근하는 동안, 다른 캐시는 - 더티 캐시라인을 - 메모리에 내리거나 추측성 로드를 하거나 하기 위해 - 시스템의 다른 부분에 - 액세스 하기 위해 버스를 사용할 수 있음; - - (*) 각 캐시는 시스템의 나머지 부분들과 일관성을 맞추기 위해 해당 캐시에 - 적용되어야 할 오퍼레이션들의 큐를 가짐; - - (*) 이 일관성 큐는 캐시에 이미 존재하는 라인에 가해지는 평범한 로드에 의해서는 - 비워지지 않는데, 큐의 오퍼레이션들이 이 로드의 결과에 영향을 끼칠 수 있다 - 할지라도 그러함. - -이제, 첫번째 CPU 에서 두개의 쓰기 오퍼레이션을 만드는데, 해당 CPU 의 캐시에 -요청된 순서로 오퍼레이션이 도달됨을 보장하기 위해 두 오퍼레이션 사이에 쓰기 -배리어를 사용하는 상황을 상상해 봅시다: - - CPU 1 CPU 2 COMMENT - =============== =============== ======================================= - u == 0, v == 1 and p == &u, q == &u - v = 2; - smp_wmb(); v 의 변경이 p 의 변경 전에 보일 것을 - 분명히 함 - v 는 이제 캐시 A 에 독점적으로 존재함 - p = &v; - p 는 이제 캐시 B 에 독점적으로 존재함 - -여기서의 쓰기 메모리 배리어는 CPU 1 의 캐시가 올바른 순서로 업데이트 된 것으로 -시스템의 다른 CPU 들이 인지하게 만듭니다. 하지만, 이제 두번째 CPU 가 그 값들을 -읽으려 하는 상황을 생각해 봅시다: - - CPU 1 CPU 2 COMMENT - =============== =============== ======================================= - ... - q = p; - x = *q; - -위의 두개의 읽기 오퍼레이션은 예상된 순서로 일어나지 못할 수 있는데, 두번째 CPU -의 한 캐시에 다른 캐시 이벤트가 발생해 v 를 담고 있는 캐시라인의 해당 캐시에의 -업데이트가 지연되는 사이, p 를 담고 있는 캐시라인은 두번째 CPU 의 다른 캐시에 -업데이트 되어버렸을 수 있기 때문입니다. 
- - CPU 1 CPU 2 COMMENT - =============== =============== ======================================= - u == 0, v == 1 and p == &u, q == &u - v = 2; - smp_wmb(); - - - p = &v; q = p; - - - - x = *q; - 캐시에 업데이트 되기 전의 v 를 읽음 - - - -기본적으로, 두개의 캐시라인 모두 CPU 2 에 최종적으로는 업데이트 될 것이지만, -별도의 개입 없이는, 업데이트의 순서가 CPU 1 에서 만들어진 순서와 동일할 -것이라는 보장이 없습니다. - - -여기에 개입하기 위해선, 데이터 의존성 배리어나 읽기 배리어를 로드 오퍼레이션들 -사이에 넣어야 합니다 (v4.15 부터는 READ_ONCE() 매크로에 의해 무조건적으로 -그렇게 됩니다). 이렇게 함으로써 캐시가 다음 요청을 처리하기 전에 일관성 큐를 -처리하도록 강제하게 됩니다. - - CPU 1 CPU 2 COMMENT - =============== =============== ======================================= - u == 0, v == 1 and p == &u, q == &u - v = 2; - smp_wmb(); - - - p = &v; q = p; - - - - smp_read_barrier_depends() - - - x = *q; - 캐시에 업데이트 된 v 를 읽음 - - -이런 부류의 문제는 DEC Alpha 계열 프로세서들에서 발견될 수 있는데, 이들은 -데이터 버스를 좀 더 잘 사용해 성능을 개선할 수 있는, 분할된 캐시를 가지고 있기 -때문입니다. 대부분의 CPU 는 하나의 읽기 오퍼레이션의 메모리 액세스가 다른 읽기 -오퍼레이션에 의존적이라면 데이터 의존성 배리어를 내포시킵니다만, 모두가 그런건 -아니기 때문에 이점에 의존해선 안됩니다. - -다른 CPU 들도 분할된 캐시를 가지고 있을 수 있지만, 그런 CPU 들은 평범한 메모리 -액세스를 위해서도 이 분할된 캐시들 사이의 조정을 해야만 합니다. Alpha 는 가장 -약한 메모리 순서 시맨틱 (semantic) 을 선택함으로써 메모리 배리어가 명시적으로 -사용되지 않았을 때에는 그런 조정이 필요하지 않게 했으며, 이는 Alpha 가 당시에 -더 높은 CPU 클락 속도를 가질 수 있게 했습니다. 하지만, (다시 말하건대, v4.15 -이후부터는) Alpha 아키텍쳐 전용 코드와 READ_ONCE() 매크로 내부에서를 제외하고는 -smp_read_barrier_depends() 가 사용되지 않아야 함을 알아두시기 바랍니다. - - 캐시 일관성 VS DMA ------------------ @@ -2962,10 +2824,8 @@ Alpha CPU 의 일부 버전은 분할된 데이터 캐시를 가지고 있어서 데이터의 발견을 올바른 순서로 일어나게 하기 때문입니다. 리눅스 커널의 메모리 배리어 모델은 Alpha 에 기초해서 정의되었습니다만, v4.15 -부터는 리눅스 커널이 READ_ONCE() 내에 smp_read_barrier_depends() 를 추가해서 -Alpha 의 메모리 모델로의 영향력이 크게 줄어들긴 했습니다. - -위의 "캐시 일관성" 서브섹션을 참고하세요. +부터는 Alpha 용 READ_ONCE() 코드 내에 smp_mb() 가 추가되어서 메모리 모델로의 +Alpha 의 영향력이 크게 줄어들었습니다. 
가상 머신 게스트 From 628fd55671f753a1e4fe8c21b6a0553503cade08 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 7 Nov 2019 14:44:06 +0000 Subject: [PATCH 385/502] tools/memory-model: Remove smp_read_barrier_depends() from informal doc smp_read_barrier_depends() has gone the way of mmiowb() and so many esoteric memory barriers before it. Drop the two mentions of this deceased barrier from the LKMM informal explanation document. Acked-by: Peter Zijlstra (Intel) Acked-by: Alan Stern Acked-by: Paul E. McKenney Signed-off-by: Will Deacon --- .../Documentation/explanation.txt | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/tools/memory-model/Documentation/explanation.txt b/tools/memory-model/Documentation/explanation.txt index e91a2eb19592..01adf9e0ebac 100644 --- a/tools/memory-model/Documentation/explanation.txt +++ b/tools/memory-model/Documentation/explanation.txt @@ -1122,12 +1122,10 @@ maintain at least the appearance of FIFO order. In practice, this difficulty is solved by inserting a special fence between P1's two loads when the kernel is compiled for the Alpha architecture. In fact, as of version 4.15, the kernel automatically -adds this fence (called smp_read_barrier_depends() and defined as -nothing at all on non-Alpha builds) after every READ_ONCE() and atomic -load. The effect of the fence is to cause the CPU not to execute any -po-later instructions until after the local cache has finished -processing all the stores it has already received. Thus, if the code -was changed to: +adds this fence after every READ_ONCE() and atomic load on Alpha. The +effect of the fence is to cause the CPU not to execute any po-later +instructions until after the local cache has finished processing all +the stores it has already received. Thus, if the code was changed to: P1() { @@ -1146,14 +1144,14 @@ READ_ONCE() or another synchronization primitive rather than accessed directly. 
The LKMM requires that smp_rmb(), acquire fences, and strong fences -share this property with smp_read_barrier_depends(): They do not allow -the CPU to execute any po-later instructions (or po-later loads in the -case of smp_rmb()) until all outstanding stores have been processed by -the local cache. In the case of a strong fence, the CPU first has to -wait for all of its po-earlier stores to propagate to every other CPU -in the system; then it has to wait for the local cache to process all -the stores received as of that time -- not just the stores received -when the strong fence began. +share this property: They do not allow the CPU to execute any po-later +instructions (or po-later loads in the case of smp_rmb()) until all +outstanding stores have been processed by the local cache. In the +case of a strong fence, the CPU first has to wait for all of its +po-earlier stores to propagate to every other CPU in the system; then +it has to wait for the local cache to process all the stores received +as of that time -- not just the stores received when the strong fence +began. And of course, none of this matters for any architecture other than Alpha. From c6cd2e011655aead2097273a04350f52429a1a8d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 7 Nov 2019 14:46:59 +0000 Subject: [PATCH 386/502] include/linux: Remove smp_read_barrier_depends() from comments smp_read_barrier_depends() doesn't exist any more, so reword the two comments that mention it to refer to "dependency ordering" instead. Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. 
McKenney Signed-off-by: Will Deacon --- include/linux/percpu-refcount.h | 2 +- include/linux/ptr_ring.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 22d9d183950d..87d8a38bdea1 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -155,7 +155,7 @@ static inline bool __ref_is_percpu(struct percpu_ref *ref, * between contaminating the pointer value, meaning that * READ_ONCE() is required when fetching it. * - * The smp_read_barrier_depends() implied by READ_ONCE() pairs + * The dependency ordering from the READ_ONCE() pairs * with smp_store_release() in __percpu_ref_switch_to_percpu(). */ percpu_ptr = READ_ONCE(ref->percpu_count_ptr); diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index 417db0a79a62..808f9d3ee546 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -107,7 +107,7 @@ static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr) return -ENOSPC; /* Make sure the pointer we are storing points to a valid data. */ - /* Pairs with smp_read_barrier_depends in __ptr_ring_consume. */ + /* Pairs with the dependency ordering in __ptr_ring_consume. */ smp_wmb(); WRITE_ONCE(r->queue[r->producer++], ptr); From ad83ec6ce13618a8b975ffdd8291742cb5b0005b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 7 Nov 2019 14:49:00 +0000 Subject: [PATCH 387/502] checkpatch: Remove checks relating to [smp_]read_barrier_depends() The [smp_]read_barrier_depends() macros no longer exist, so we don't need to deal with them in the checkpatch script. Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. 
McKenney Signed-off-by: Will Deacon --- scripts/checkpatch.pl | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 4c820607540b..8032f80c5bc7 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5903,8 +5903,7 @@ sub process { my $barriers = qr{ mb| rmb| - wmb| - read_barrier_depends + wmb }x; my $barrier_stems = qr{ mb__before_atomic| @@ -5953,12 +5952,6 @@ sub process { } } -# check for smp_read_barrier_depends and read_barrier_depends - if (!$file && $line =~ /\b(smp_|)read_barrier_depends\s*\(/) { - WARN("READ_BARRIER_DEPENDS", - "$1read_barrier_depends should only be used in READ_ONCE or DEC Alpha code\n" . $herecurr); - } - # check of hardware specific defines if ($line =~ m@^.\s*\#\s*if.*\b(__i386__|__powerpc64__|__sun__|__s390x__)\b@ && $realfile !~ m@include/asm-@) { CHK("ARCH_DEFINES", From eb5c2d4b45e3d2d5d052ea6b8f1463976b1020d5 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 21 Jul 2020 09:54:15 +0100 Subject: [PATCH 388/502] compiler.h: Move compiletime_assert() macros into compiler_types.h The kernel test robot reports that moving READ_ONCE() out into its own header breaks a W=1 build for parisc, which is relying on the definition of compiletime_assert() being available: | In file included from ./arch/parisc/include/generated/asm/rwonce.h:1, | from ./include/asm-generic/barrier.h:16, | from ./arch/parisc/include/asm/barrier.h:29, | from ./arch/parisc/include/asm/atomic.h:11, | from ./include/linux/atomic.h:7, | from kernel/locking/percpu-rwsem.c:2: | ./arch/parisc/include/asm/atomic.h: In function 'atomic_read': | ./include/asm-generic/rwonce.h:36:2: error: implicit declaration of function 'compiletime_assert' [-Werror=implicit-function-declaration] | 36 | compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \ | | ^~~~~~~~~~~~~~~~~~ | ./include/asm-generic/rwonce.h:49:2: note: in expansion of macro 'compiletime_assert_rwonce_type' | 49 | 
compiletime_assert_rwonce_type(x); \ | | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ./arch/parisc/include/asm/atomic.h:73:9: note: in expansion of macro 'READ_ONCE' | 73 | return READ_ONCE((v)->counter); | | ^~~~~~~~~ Move these macros into compiler_types.h, so that they are available to READ_ONCE() and friends. Link: http://lists.infradead.org/pipermail/linux-arm-kernel/2020-July/587094.html Reported-by: kernel test robot Signed-off-by: Will Deacon --- include/linux/compiler.h | 41 ---------------------------------- include/linux/compiler_types.h | 41 ++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 41 deletions(-) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index f075a3df4fe2..59f7194fdf08 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -273,47 +273,6 @@ static inline void *offset_to_ptr(const int *off) #endif /* __ASSEMBLY__ */ -/* Compile time object size, -1 for unknown */ -#ifndef __compiletime_object_size -# define __compiletime_object_size(obj) -1 -#endif -#ifndef __compiletime_warning -# define __compiletime_warning(message) -#endif -#ifndef __compiletime_error -# define __compiletime_error(message) -#endif - -#ifdef __OPTIMIZE__ -# define __compiletime_assert(condition, msg, prefix, suffix) \ - do { \ - extern void prefix ## suffix(void) __compiletime_error(msg); \ - if (!(condition)) \ - prefix ## suffix(); \ - } while (0) -#else -# define __compiletime_assert(condition, msg, prefix, suffix) do { } while (0) -#endif - -#define _compiletime_assert(condition, msg, prefix, suffix) \ - __compiletime_assert(condition, msg, prefix, suffix) - -/** - * compiletime_assert - break build and emit msg if condition is false - * @condition: a compile-time constant condition to check - * @msg: a message to emit if condition is false - * - * In tradition of POSIX assert, this macro will break the build if the - * supplied condition is *false*, emitting the supplied error message if the - * compiler has support to 
do so. - */ -#define compiletime_assert(condition, msg) \ - _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) - -#define compiletime_assert_atomic_type(t) \ - compiletime_assert(__native_word(t), \ - "Need native word sized stores/loads for atomicity.") - /* &a[0] degrades to a pointer: a different type from an array */ #define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index c3bf7710f69a..d9bbb62a3e2a 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -300,6 +300,47 @@ struct ftrace_likely_data { (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || \ sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long)) +/* Compile time object size, -1 for unknown */ +#ifndef __compiletime_object_size +# define __compiletime_object_size(obj) -1 +#endif +#ifndef __compiletime_warning +# define __compiletime_warning(message) +#endif +#ifndef __compiletime_error +# define __compiletime_error(message) +#endif + +#ifdef __OPTIMIZE__ +# define __compiletime_assert(condition, msg, prefix, suffix) \ + do { \ + extern void prefix ## suffix(void) __compiletime_error(msg); \ + if (!(condition)) \ + prefix ## suffix(); \ + } while (0) +#else +# define __compiletime_assert(condition, msg, prefix, suffix) do { } while (0) +#endif + +#define _compiletime_assert(condition, msg, prefix, suffix) \ + __compiletime_assert(condition, msg, prefix, suffix) + +/** + * compiletime_assert - break build and emit msg if condition is false + * @condition: a compile-time constant condition to check + * @msg: a message to emit if condition is false + * + * In tradition of POSIX assert, this macro will break the build if the + * supplied condition is *false*, emitting the supplied error message if the + * compiler has support to do so. 
+ */ +#define compiletime_assert(condition, msg) \ + _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) + +#define compiletime_assert_atomic_type(t) \ + compiletime_assert(__native_word(t), \ + "Need native word sized stores/loads for atomicity.") + /* Helpers for emitting diagnostics in pragmas. */ #ifndef __diag #define __diag(string) From 5f1f7f6c205a2e7f1d92229ac358254bd2826c2d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 30 Jun 2020 13:53:07 +0100 Subject: [PATCH 389/502] arm64: Reduce the number of header files pulled into vmlinux.lds.S Although vmlinux.lds.S smells like an assembly file and is compiled with __ASSEMBLY__ defined, it's actually just fed to the preprocessor to create our linker script. This means that any assembly macros defined by headers that it includes will result in a helpful link error: | aarch64-linux-gnu-ld:./arch/arm64/kernel/vmlinux.lds:1: syntax error In preparation for an arm64-private asm/rwonce.h implementation, which will end up pulling assembly macros into linux/compiler.h, reduce the number of headers we include directly and transitively in vmlinux.lds.S Acked-by: Peter Zijlstra (Intel) Signed-off-by: Will Deacon --- arch/arm64/include/asm/kernel-pgtable.h | 2 +- arch/arm64/include/asm/memory.h | 11 ++++++----- arch/arm64/include/asm/uaccess.h | 1 + arch/arm64/kernel/entry.S | 1 + arch/arm64/kernel/vmlinux.lds.S | 1 - arch/arm64/kvm/hyp-init.S | 1 + 6 files changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 3bf626f6fe0c..329fb15f6bac 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -8,7 +8,7 @@ #ifndef __ASM_KERNEL_PGTABLE_H #define __ASM_KERNEL_PGTABLE_H -#include +#include #include /* diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index a1871bb32bb1..9d4bf58cf7b3 100644 --- a/arch/arm64/include/asm/memory.h +++ 
b/arch/arm64/include/asm/memory.h @@ -10,11 +10,8 @@ #ifndef __ASM_MEMORY_H #define __ASM_MEMORY_H -#include #include #include -#include -#include #include /* @@ -157,11 +154,15 @@ #endif #ifndef __ASSEMBLY__ -extern u64 vabits_actual; -#define PAGE_END (_PAGE_END(vabits_actual)) #include +#include #include +#include +#include + +extern u64 vabits_actual; +#define PAGE_END (_PAGE_END(vabits_actual)) extern s64 physvirt_offset; extern s64 memstart_addr; diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index bc5c7b091152..8d7c466f809b 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -19,6 +19,7 @@ #include #include +#include #include #include #include diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 5304d193c79d..b668aad3b762 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 6827da7f3aa5..e1e7c0431b4d 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -10,7 +10,6 @@ #include #include #include -#include #include #include diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index 6e6ed5581eed..076544393c3c 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -6,6 +6,7 @@ #include +#include #include #include #include From 55fdc1f44cd6bb1d61c9ca946d8f7cd67ea0bf36 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Tue, 21 Jul 2020 18:49:33 +0800 Subject: [PATCH 390/502] arm64: perf: Expose some new events via sysfs Some new PMU events can be detected by PMCEID1_EL0, but they can't be listed. Let's expose these through sysfs. 
Signed-off-by: Shaokun Zhang Cc: Will Deacon Cc: Mark Rutland Link: https://lore.kernel.org/r/1595328573-12751-2-git-send-email-zhangshaokun@hisilicon.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/perf_event.h | 27 +++++++++++++++++++++++++++ arch/arm64/kernel/perf_event.c | 19 +++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h index e7765b62c712..2c2d7dbe8a02 100644 --- a/arch/arm64/include/asm/perf_event.h +++ b/arch/arm64/include/asm/perf_event.h @@ -72,6 +72,13 @@ #define ARMV8_PMUV3_PERFCTR_LL_CACHE_RD 0x36 #define ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS_RD 0x37 #define ARMV8_PMUV3_PERFCTR_REMOTE_ACCESS_RD 0x38 +#define ARMV8_PMUV3_PERFCTR_L1D_CACHE_LMISS_RD 0x39 +#define ARMV8_PMUV3_PERFCTR_OP_RETIRED 0x3A +#define ARMV8_PMUV3_PERFCTR_OP_SPEC 0x3B +#define ARMV8_PMUV3_PERFCTR_STALL 0x3C +#define ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND 0x3D +#define ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND 0x3E +#define ARMV8_PMUV3_PERFCTR_STALL_SLOT 0x3F /* Statistical profiling extension microarchitectural events */ #define ARMV8_SPE_PERFCTR_SAMPLE_POP 0x4000 @@ -79,6 +86,26 @@ #define ARMV8_SPE_PERFCTR_SAMPLE_FILTRATE 0x4002 #define ARMV8_SPE_PERFCTR_SAMPLE_COLLISION 0x4003 +/* AMUv1 architecture events */ +#define ARMV8_AMU_PERFCTR_CNT_CYCLES 0x4004 +#define ARMV8_AMU_PERFCTR_STALL_BACKEND_MEM 0x4005 + +/* long-latency read miss events */ +#define ARMV8_PMUV3_PERFCTR_L1I_CACHE_LMISS 0x4006 +#define ARMV8_PMUV3_PERFCTR_L2D_CACHE_LMISS_RD 0x4009 +#define ARMV8_PMUV3_PERFCTR_L2I_CACHE_LMISS 0x400A +#define ARMV8_PMUV3_PERFCTR_L3D_CACHE_LMISS_RD 0x400B + +/* additional latency from alignment events */ +#define ARMV8_PMUV3_PERFCTR_LDST_ALIGN_LAT 0x4020 +#define ARMV8_PMUV3_PERFCTR_LD_ALIGN_LAT 0x4021 +#define ARMV8_PMUV3_PERFCTR_ST_ALIGN_LAT 0x4022 + +/* Armv8.5 Memory Tagging Extension events */ +#define ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED 0x4024 +#define 
ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED_RD 0x4025 +#define ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED_WR 0x4026 + /* ARMv8 recommended implementation defined event types */ #define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD 0x40 #define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR 0x41 diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index fdb6029c9021..462f9a9cc44b 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -225,10 +225,29 @@ static struct attribute *armv8_pmuv3_event_attrs[] = { ARMV8_EVENT_ATTR(ll_cache_rd, ARMV8_PMUV3_PERFCTR_LL_CACHE_RD), ARMV8_EVENT_ATTR(ll_cache_miss_rd, ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS_RD), ARMV8_EVENT_ATTR(remote_access_rd, ARMV8_PMUV3_PERFCTR_REMOTE_ACCESS_RD), + ARMV8_EVENT_ATTR(l1d_cache_lmiss_rd, ARMV8_PMUV3_PERFCTR_L1D_CACHE_LMISS_RD), + ARMV8_EVENT_ATTR(op_retired, ARMV8_PMUV3_PERFCTR_OP_RETIRED), + ARMV8_EVENT_ATTR(op_spec, ARMV8_PMUV3_PERFCTR_OP_SPEC), + ARMV8_EVENT_ATTR(stall, ARMV8_PMUV3_PERFCTR_STALL), + ARMV8_EVENT_ATTR(stall_slot_backend, ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND), + ARMV8_EVENT_ATTR(stall_slot_frontend, ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND), + ARMV8_EVENT_ATTR(stall_slot, ARMV8_PMUV3_PERFCTR_STALL_SLOT), ARMV8_EVENT_ATTR(sample_pop, ARMV8_SPE_PERFCTR_SAMPLE_POP), ARMV8_EVENT_ATTR(sample_feed, ARMV8_SPE_PERFCTR_SAMPLE_FEED), ARMV8_EVENT_ATTR(sample_filtrate, ARMV8_SPE_PERFCTR_SAMPLE_FILTRATE), ARMV8_EVENT_ATTR(sample_collision, ARMV8_SPE_PERFCTR_SAMPLE_COLLISION), + ARMV8_EVENT_ATTR(cnt_cycles, ARMV8_AMU_PERFCTR_CNT_CYCLES), + ARMV8_EVENT_ATTR(stall_backend_mem, ARMV8_AMU_PERFCTR_STALL_BACKEND_MEM), + ARMV8_EVENT_ATTR(l1i_cache_lmiss, ARMV8_PMUV3_PERFCTR_L1I_CACHE_LMISS), + ARMV8_EVENT_ATTR(l2d_cache_lmiss_rd, ARMV8_PMUV3_PERFCTR_L2D_CACHE_LMISS_RD), + ARMV8_EVENT_ATTR(l2i_cache_lmiss, ARMV8_PMUV3_PERFCTR_L2I_CACHE_LMISS), + ARMV8_EVENT_ATTR(l3d_cache_lmiss_rd, ARMV8_PMUV3_PERFCTR_L3D_CACHE_LMISS_RD), + ARMV8_EVENT_ATTR(ldst_align_lat, 
ARMV8_PMUV3_PERFCTR_LDST_ALIGN_LAT), + ARMV8_EVENT_ATTR(ld_align_lat, ARMV8_PMUV3_PERFCTR_LD_ALIGN_LAT), + ARMV8_EVENT_ATTR(st_align_lat, ARMV8_PMUV3_PERFCTR_ST_ALIGN_LAT), + ARMV8_EVENT_ATTR(mem_access_checked, ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED), + ARMV8_EVENT_ATTR(mem_access_checked_rd, ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED_RD), + ARMV8_EVENT_ATTR(mem_access_checked_wr, ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED_WR), NULL, }; From c2127e14c127de2775feefdfb1444e30a129a59f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 18 Jul 2020 17:30:27 -0700 Subject: [PATCH 391/502] perf: : drop a duplicated word Drop the repeated word "the" in a comment. Signed-off-by: Randy Dunlap Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200719003027.20798-1-rdunlap@infradead.org --- include/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3b22db08b6fb..0edd257a5916 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -366,7 +366,7 @@ struct pmu { * ->stop() with PERF_EF_UPDATE will read the counter and update * period/count values like ->read() would. * - * ->start() with PERF_EF_RELOAD will reprogram the the counter + * ->start() with PERF_EF_RELOAD will reprogram the counter * value, must be preceded by a ->stop() with PERF_EF_UPDATE. */ void (*start) (struct perf_event *event, int flags); From 2ac5413e5edca6910d2ae157187a889e94be2b62 Mon Sep 17 00:00:00 2001 From: Hu Haowen Date: Sun, 19 Jul 2020 18:50:07 +0800 Subject: [PATCH 392/502] x86/perf: Fix a typo The word "Zhoaxin" is incorrect and the right one is "Zhaoxin". 
Signed-off-by: Hu Haowen Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200719105007.57649-1-xianfengting221@163.com --- arch/x86/events/zhaoxin/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/zhaoxin/core.c b/arch/x86/events/zhaoxin/core.c index 898fa1ae9ceb..e68827e604ad 100644 --- a/arch/x86/events/zhaoxin/core.c +++ b/arch/x86/events/zhaoxin/core.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Zhoaxin PMU; like Intel Architectural PerfMon-v2 + * Zhaoxin PMU; like Intel Architectural PerfMon-v2 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt From 493cf9b723bcc87e9284e5e5971259951a13f22e Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Tue, 21 Jul 2020 10:12:59 +0100 Subject: [PATCH 393/502] arm64: s/AMEVTYPE/AMEVTYPER Activity Monitor Event Type Registers are named as AMEVTYPER{0,1} Signed-off-by: Vladimir Murzin Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20200721091259.102756-1-vladimir.murzin@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/sysreg.h | 4 +- arch/arm64/kvm/sys_regs.c | 68 ++++++++++++++++----------------- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 463175f80341..273bb1d15d21 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -421,9 +421,9 @@ */ #define SYS_AMEVCNTR0_EL0(n) SYS_AM_EL0(4 + ((n) >> 3), (n) & 7) -#define SYS_AMEVTYPE0_EL0(n) SYS_AM_EL0(6 + ((n) >> 3), (n) & 7) +#define SYS_AMEVTYPER0_EL0(n) SYS_AM_EL0(6 + ((n) >> 3), (n) & 7) #define SYS_AMEVCNTR1_EL0(n) SYS_AM_EL0(12 + ((n) >> 3), (n) & 7) -#define SYS_AMEVTYPE1_EL0(n) SYS_AM_EL0(14 + ((n) >> 3), (n) & 7) +#define SYS_AMEVTYPER1_EL0(n) SYS_AM_EL0(14 + ((n) >> 3), (n) & 7) /* AMU v1: Fixed (architecturally defined) activity monitors */ #define SYS_AMEVCNTR0_CORE_EL0 SYS_AMEVCNTR0_EL0(0) diff --git a/arch/arm64/kvm/sys_regs.c 
b/arch/arm64/kvm/sys_regs.c index baf5ce9225ce..d3196671c590 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1024,9 +1024,9 @@ static bool access_amu(struct kvm_vcpu *vcpu, struct sys_reg_params *p, /* Macro to expand the AMU counter and type registers*/ #define AMU_AMEVCNTR0_EL0(n) { SYS_DESC(SYS_AMEVCNTR0_EL0(n)), access_amu } -#define AMU_AMEVTYPE0_EL0(n) { SYS_DESC(SYS_AMEVTYPE0_EL0(n)), access_amu } +#define AMU_AMEVTYPER0_EL0(n) { SYS_DESC(SYS_AMEVTYPER0_EL0(n)), access_amu } #define AMU_AMEVCNTR1_EL0(n) { SYS_DESC(SYS_AMEVCNTR1_EL0(n)), access_amu } -#define AMU_AMEVTYPE1_EL0(n) { SYS_DESC(SYS_AMEVTYPE1_EL0(n)), access_amu } +#define AMU_AMEVTYPER1_EL0(n) { SYS_DESC(SYS_AMEVTYPER1_EL0(n)), access_amu } static bool trap_ptrauth(struct kvm_vcpu *vcpu, struct sys_reg_params *p, @@ -1629,22 +1629,22 @@ static const struct sys_reg_desc sys_reg_descs[] = { AMU_AMEVCNTR0_EL0(13), AMU_AMEVCNTR0_EL0(14), AMU_AMEVCNTR0_EL0(15), - AMU_AMEVTYPE0_EL0(0), - AMU_AMEVTYPE0_EL0(1), - AMU_AMEVTYPE0_EL0(2), - AMU_AMEVTYPE0_EL0(3), - AMU_AMEVTYPE0_EL0(4), - AMU_AMEVTYPE0_EL0(5), - AMU_AMEVTYPE0_EL0(6), - AMU_AMEVTYPE0_EL0(7), - AMU_AMEVTYPE0_EL0(8), - AMU_AMEVTYPE0_EL0(9), - AMU_AMEVTYPE0_EL0(10), - AMU_AMEVTYPE0_EL0(11), - AMU_AMEVTYPE0_EL0(12), - AMU_AMEVTYPE0_EL0(13), - AMU_AMEVTYPE0_EL0(14), - AMU_AMEVTYPE0_EL0(15), + AMU_AMEVTYPER0_EL0(0), + AMU_AMEVTYPER0_EL0(1), + AMU_AMEVTYPER0_EL0(2), + AMU_AMEVTYPER0_EL0(3), + AMU_AMEVTYPER0_EL0(4), + AMU_AMEVTYPER0_EL0(5), + AMU_AMEVTYPER0_EL0(6), + AMU_AMEVTYPER0_EL0(7), + AMU_AMEVTYPER0_EL0(8), + AMU_AMEVTYPER0_EL0(9), + AMU_AMEVTYPER0_EL0(10), + AMU_AMEVTYPER0_EL0(11), + AMU_AMEVTYPER0_EL0(12), + AMU_AMEVTYPER0_EL0(13), + AMU_AMEVTYPER0_EL0(14), + AMU_AMEVTYPER0_EL0(15), AMU_AMEVCNTR1_EL0(0), AMU_AMEVCNTR1_EL0(1), AMU_AMEVCNTR1_EL0(2), @@ -1661,22 +1661,22 @@ static const struct sys_reg_desc sys_reg_descs[] = { AMU_AMEVCNTR1_EL0(13), AMU_AMEVCNTR1_EL0(14), AMU_AMEVCNTR1_EL0(15), - AMU_AMEVTYPE1_EL0(0), - 
AMU_AMEVTYPE1_EL0(1), - AMU_AMEVTYPE1_EL0(2), - AMU_AMEVTYPE1_EL0(3), - AMU_AMEVTYPE1_EL0(4), - AMU_AMEVTYPE1_EL0(5), - AMU_AMEVTYPE1_EL0(6), - AMU_AMEVTYPE1_EL0(7), - AMU_AMEVTYPE1_EL0(8), - AMU_AMEVTYPE1_EL0(9), - AMU_AMEVTYPE1_EL0(10), - AMU_AMEVTYPE1_EL0(11), - AMU_AMEVTYPE1_EL0(12), - AMU_AMEVTYPE1_EL0(13), - AMU_AMEVTYPE1_EL0(14), - AMU_AMEVTYPE1_EL0(15), + AMU_AMEVTYPER1_EL0(0), + AMU_AMEVTYPER1_EL0(1), + AMU_AMEVTYPER1_EL0(2), + AMU_AMEVTYPER1_EL0(3), + AMU_AMEVTYPER1_EL0(4), + AMU_AMEVTYPER1_EL0(5), + AMU_AMEVTYPER1_EL0(6), + AMU_AMEVTYPER1_EL0(7), + AMU_AMEVTYPER1_EL0(8), + AMU_AMEVTYPER1_EL0(9), + AMU_AMEVTYPER1_EL0(10), + AMU_AMEVTYPER1_EL0(11), + AMU_AMEVTYPER1_EL0(12), + AMU_AMEVTYPER1_EL0(13), + AMU_AMEVTYPER1_EL0(14), + AMU_AMEVTYPER1_EL0(15), { SYS_DESC(SYS_CNTP_TVAL_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTP_CTL_EL0), access_arch_timer }, From 58e15716feb562cdba57e99d62c525a1faa37c08 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 20 Jul 2020 14:15:02 +0200 Subject: [PATCH 394/502] s390/time: use CLOCKSOURCE_MASK Make use of CLOCKSOURCE_MASK instead of open-coding it. Signed-off-by: Heiko Carstens --- arch/s390/kernel/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 700127ba689d..317059684847 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -253,7 +253,7 @@ static struct clocksource clocksource_tod = { .name = "tod", .rating = 400, .read = read_tod_clock, - .mask = -1ULL, + .mask = CLOCKSOURCE_MASK(64), .mult = 1000, .shift = 12, .flags = CLOCK_SOURCE_IS_CONTINUOUS, From 555701a714f77e01490f633c1080cf97f0ede1f0 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 20 Jul 2020 14:16:03 +0200 Subject: [PATCH 395/502] s390/time: select CLOCKSOURCE_VALIDATE_LAST_CYCLE The value returned by read_tod_clock() will overflow on September 17th 2042. 
To avoid that system time jumps back select CLOCKSOURCE_VALIDATE_LAST_CYCLE which enables a sanity check in order to prevent negative "delta" values. Signed-off-by: Heiko Carstens --- arch/s390/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 0df33cffec52..d95d323cf213 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -125,6 +125,7 @@ config S390 select HAVE_ARCH_JUMP_LABEL_RELATIVE select HAVE_ARCH_KASAN select HAVE_ARCH_KASAN_VMALLOC + select CLOCKSOURCE_VALIDATE_LAST_CYCLE select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_SOFT_DIRTY From 411155820bb348e71ecc5b1db147b36af98cbc96 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 20 Jul 2020 14:28:36 +0200 Subject: [PATCH 396/502] s390/time: improve comparison for tod steering It doesn't make sense to add zero shifted by 15. It's still zero. Signed-off-by: Heiko Carstens --- arch/s390/kernel/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 317059684847..513e59d08a55 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -237,7 +237,7 @@ static u64 read_tod_clock(struct clocksource *cs) preempt_disable(); /* protect from changes to steering parameters */ now = get_tod_clock(); adj = tod_steering_end - now; - if (unlikely((s64) adj >= 0)) + if (unlikely((s64) adj > 0)) /* * manually steer by 1 cycle every 2^16 cycles. This * corresponds to shifting the tod delta by 15. 1s is From ec0160891e387f4771f953b888b1fe951398e5d9 Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Tue, 21 Jul 2020 14:26:09 -0600 Subject: [PATCH 397/502] irqdomain/treewide: Free firmware node after domain removal Commit 711419e504eb ("irqdomain: Add the missing assignment of domain->fwnode for named fwnode") unintentionally caused a dangling pointer page fault issue on firmware nodes that were freed after IRQ domain allocation. 
Commit e3beca48a45b fixed that dangling pointer issue by only freeing the firmware node after an IRQ domain allocation failure. That fix no longer frees the firmware node immediately, but leaves the firmware node allocated after the domain is removed. The firmware node must be kept around through irq_domain_remove, but should be freed afterwards. Add the missing free operations after domain removal where appropriate. Fixes: e3beca48a45b ("irqdomain/treewide: Keep firmware node unconditionally allocated") Signed-off-by: Jon Derrick Signed-off-by: Thomas Gleixner Reviewed-by: Andy Shevchenko Acked-by: Bjorn Helgaas # drivers/pci Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1595363169-7157-1-git-send-email-jonathan.derrick@intel.com --- arch/mips/pci/pci-xtalk-bridge.c | 3 +++ arch/x86/kernel/apic/io_apic.c | 5 +++++ drivers/iommu/intel/irq_remapping.c | 8 ++++++++ drivers/mfd/ioc3.c | 6 ++++++ drivers/pci/controller/vmd.c | 3 +++ 5 files changed, 25 insertions(+) diff --git a/arch/mips/pci/pci-xtalk-bridge.c b/arch/mips/pci/pci-xtalk-bridge.c index 5958217861b8..9b3cc775c55e 100644 --- a/arch/mips/pci/pci-xtalk-bridge.c +++ b/arch/mips/pci/pci-xtalk-bridge.c @@ -728,6 +728,7 @@ err_free_resource: pci_free_resource_list(&host->windows); err_remove_domain: irq_domain_remove(domain); + irq_domain_free_fwnode(fn); return err; } @@ -735,8 +736,10 @@ static int bridge_remove(struct platform_device *pdev) { struct pci_bus *bus = platform_get_drvdata(pdev); struct bridge_controller *bc = BRIDGE_CONTROLLER(bus); + struct fwnode_handle *fn = bc->domain->fwnode; irq_domain_remove(bc->domain); + irq_domain_free_fwnode(fn); pci_lock_rescan_remove(); pci_stop_root_bus(bus); pci_remove_root_bus(bus); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 81ffcfbfaef2..21325a4a78b9 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2335,8 +2335,13 @@ static int mp_irqdomain_create(int ioapic) static
void ioapic_destroy_irqdomain(int idx) { + struct ioapic_domain_cfg *cfg = &ioapics[idx].irqdomain_cfg; + struct fwnode_handle *fn = ioapics[idx].irqdomain->fwnode; + if (ioapics[idx].irqdomain) { irq_domain_remove(ioapics[idx].irqdomain); + if (!cfg->dev) + irq_domain_free_fwnode(fn); ioapics[idx].irqdomain = NULL; } } diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 9564d23d094f..aa096b333a99 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -628,13 +628,21 @@ out_free_table: static void intel_teardown_irq_remapping(struct intel_iommu *iommu) { + struct fwnode_handle *fn; + if (iommu && iommu->ir_table) { if (iommu->ir_msi_domain) { + fn = iommu->ir_msi_domain->fwnode; + irq_domain_remove(iommu->ir_msi_domain); + irq_domain_free_fwnode(fn); iommu->ir_msi_domain = NULL; } if (iommu->ir_domain) { + fn = iommu->ir_domain->fwnode; + irq_domain_remove(iommu->ir_domain); + irq_domain_free_fwnode(fn); iommu->ir_domain = NULL; } free_pages((unsigned long)iommu->ir_table->base, diff --git a/drivers/mfd/ioc3.c b/drivers/mfd/ioc3.c index 74cee7cb0afc..d939ccc46509 100644 --- a/drivers/mfd/ioc3.c +++ b/drivers/mfd/ioc3.c @@ -616,7 +616,10 @@ static int ioc3_mfd_probe(struct pci_dev *pdev, /* Remove all already added MFD devices */ mfd_remove_devices(&ipd->pdev->dev); if (ipd->domain) { + struct fwnode_handle *fn = ipd->domain->fwnode; + irq_domain_remove(ipd->domain); + irq_domain_free_fwnode(fn); free_irq(ipd->domain_irq, (void *)ipd); } pci_iounmap(pdev, regs); @@ -643,7 +646,10 @@ static void ioc3_mfd_remove(struct pci_dev *pdev) /* Release resources */ mfd_remove_devices(&ipd->pdev->dev); if (ipd->domain) { + struct fwnode_handle *fn = ipd->domain->fwnode; + irq_domain_remove(ipd->domain); + irq_domain_free_fwnode(fn); free_irq(ipd->domain_irq, (void *)ipd); } pci_iounmap(pdev, ipd->regs); diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 
9a64cf90c291..ebec0a6e77ed 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -560,6 +560,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) if (!vmd->bus) { pci_free_resource_list(&resources); irq_domain_remove(vmd->irq_domain); + irq_domain_free_fwnode(fn); return -ENODEV; } @@ -673,6 +674,7 @@ static void vmd_cleanup_srcu(struct vmd_dev *vmd) static void vmd_remove(struct pci_dev *dev) { struct vmd_dev *vmd = pci_get_drvdata(dev); + struct fwnode_handle *fn = vmd->irq_domain->fwnode; sysfs_remove_link(&vmd->dev->dev.kobj, "domain"); pci_stop_root_bus(vmd->bus); @@ -680,6 +682,7 @@ static void vmd_remove(struct pci_dev *dev) vmd_cleanup_srcu(vmd); vmd_detach_resources(vmd); irq_domain_remove(vmd->irq_domain); + irq_domain_free_fwnode(fn); } #ifdef CONFIG_PM_SLEEP From 0ae3b13aab210e2a8c14371731abddfee228ae24 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 21 Jul 2020 10:33:15 +0200 Subject: [PATCH 398/502] arm64/entry: deduplicate SW PAN entry/exit routines Factor the 12 copies of the SW PAN entry and exit code into callable subroutines, and use alternatives patching to either emit a 'bl' instruction to call them, or a NOP if h/w PAN is found to be available at runtime. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20200721083315.4816-1-ardb@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/entry.S | 95 +++++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 48 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 5304d193c79d..7b9a7c45ef85 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -209,28 +209,9 @@ alternative_cb_end add x29, sp, #S_STACKFRAME #ifdef CONFIG_ARM64_SW_TTBR0_PAN - /* - * Set the TTBR0 PAN bit in SPSR. When the exception is taken from - * EL0, there is no need to check the state of TTBR0_EL1 since - * accesses are always enabled. 
- * Note that the meaning of this bit differs from the ARMv8.1 PAN - * feature as all TTBR0_EL1 accesses are disabled, not just those to - * user mappings. - */ -alternative_if ARM64_HAS_PAN - b 1f // skip TTBR0 PAN +alternative_if_not ARM64_HAS_PAN + bl __swpan_entry_el\el alternative_else_nop_endif - - .if \el != 0 - mrs x21, ttbr0_el1 - tst x21, #TTBR_ASID_MASK // Check for the reserved ASID - orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR - b.eq 1f // TTBR0 access already disabled - and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR - .endif - - __uaccess_ttbr0_disable x21 -1: #endif stp x22, x23, [sp, #S_PC] @@ -284,34 +265,9 @@ alternative_else_nop_endif .endif #ifdef CONFIG_ARM64_SW_TTBR0_PAN - /* - * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR - * PAN bit checking. - */ -alternative_if ARM64_HAS_PAN - b 2f // skip TTBR0 PAN +alternative_if_not ARM64_HAS_PAN + bl __swpan_exit_el\el alternative_else_nop_endif - - .if \el != 0 - tbnz x22, #22, 1f // Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set - .endif - - __uaccess_ttbr0_enable x0, x1 - - .if \el == 0 - /* - * Enable errata workarounds only if returning to user. The only - * workaround currently required for TTBR0_EL1 changes are for the - * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache - * corruption). - */ - bl post_ttbr_update_workaround - .endif -1: - .if \el != 0 - and x22, x22, #~PSR_PAN_BIT // ARMv8.0 CPUs do not understand this bit - .endif -2: #endif .if \el == 0 @@ -391,6 +347,49 @@ alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0 sb .endm +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Set the TTBR0 PAN bit in SPSR. When the exception is taken from + * EL0, there is no need to check the state of TTBR0_EL1 since + * accesses are always enabled. + * Note that the meaning of this bit differs from the ARMv8.1 PAN + * feature as all TTBR0_EL1 accesses are disabled, not just those to + * user mappings. 
+ */ +SYM_CODE_START_LOCAL(__swpan_entry_el1) + mrs x21, ttbr0_el1 + tst x21, #TTBR_ASID_MASK // Check for the reserved ASID + orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR + b.eq 1f // TTBR0 access already disabled + and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR +SYM_INNER_LABEL(__swpan_entry_el0, SYM_L_LOCAL) + __uaccess_ttbr0_disable x21 +1: ret +SYM_CODE_END(__swpan_entry_el1) + + /* + * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR + * PAN bit checking. + */ +SYM_CODE_START_LOCAL(__swpan_exit_el1) + tbnz x22, #22, 1f // Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set + __uaccess_ttbr0_enable x0, x1 +1: and x22, x22, #~PSR_PAN_BIT // ARMv8.0 CPUs do not understand this bit + ret +SYM_CODE_END(__swpan_exit_el1) + +SYM_CODE_START_LOCAL(__swpan_exit_el0) + __uaccess_ttbr0_enable x0, x1 + /* + * Enable errata workarounds only if returning to user. The only + * workaround currently required for TTBR0_EL1 changes are for the + * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache + * corruption). + */ + b post_ttbr_update_workaround +SYM_CODE_END(__swpan_exit_el0) +#endif + .macro irq_stack_entry mov x19, sp // preserve the original sp #ifdef CONFIG_SHADOW_CALL_STACK From a46cec12f4a53ee5113f42b327cbb8d4cda074d2 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 24 Jul 2020 10:41:31 +0100 Subject: [PATCH 399/502] arm64: Reserve HWCAP2_MTE as (1 << 18) While MTE is not supported in the upstream kernel yet, add a comment that HWCAP2_MTE as (1 << 18) is reserved. Glibc makes use of it for the resolving (ifunc) of the MTE-safe string routines. 
Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/hwcap.h | 1 + arch/arm64/include/uapi/asm/hwcap.h | 1 + arch/arm64/kernel/cpuinfo.c | 1 + 3 files changed, 3 insertions(+) diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index d683bcbf1e7c..22f73fe09030 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -95,6 +95,7 @@ #define KERNEL_HWCAP_DGH __khwcap2_feature(DGH) #define KERNEL_HWCAP_RNG __khwcap2_feature(RNG) #define KERNEL_HWCAP_BTI __khwcap2_feature(BTI) +/* reserved for KERNEL_HWCAP_MTE __khwcap2_feature(MTE) */ /* * This yields a mask that user programs can use to figure out what diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 2d6ba1c2592e..912162f73529 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -74,5 +74,6 @@ #define HWCAP2_DGH (1 << 15) #define HWCAP2_RNG (1 << 16) #define HWCAP2_BTI (1 << 17) +/* reserved for HWCAP2_MTE (1 << 18) */ #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 86637466daa8..393c6fb1f1cb 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -93,6 +93,7 @@ static const char *const hwcap_str[] = { "dgh", "rng", "bti", + /* reserved for "mte" */ NULL }; From ea0eada45632f4807b2f49de951072283e2d781c Mon Sep 17 00:00:00 2001 From: Gregory Herrero Date: Fri, 17 Jul 2020 16:33:38 +0200 Subject: [PATCH 400/502] recordmcount: only record relocation of type R_AARCH64_CALL26 on arm64. Currently, if a section has a relocation to '_mcount' symbol, a new __mcount_loc entry will be added whatever the relocation type is. This is problematic when a relocation to '_mcount' is in the middle of a section and is not a call for ftrace use. 
Such a relocation could be generated with the code below, for example: bool is_mcount(unsigned long addr) { return (addr == (unsigned long) &_mcount); } With this snippet of code, ftrace will try to patch the mcount location generated by this code on module load and fail with: Call trace: ftrace_bug+0xa0/0x28c ftrace_process_locs+0x2f4/0x430 ftrace_module_init+0x30/0x38 load_module+0x14f0/0x1e78 __do_sys_finit_module+0x100/0x11c __arm64_sys_finit_module+0x28/0x34 el0_svc_common+0x88/0x194 el0_svc_handler+0x38/0x8c el0_svc+0x8/0xc ---[ end trace d828d06b36ad9d59 ]--- ftrace failed to modify [] 0xffffa2dbf3a3a41c actual: 66:a9:3c:90 Initializing ftrace call sites ftrace record flags: 2000000 (0) expected tramp: ffffa2dc6cf66724 So limit the relocation type to R_AARCH64_CALL26, as in the perl version of recordmcount. Fixes: af64d2aa872a ("ftrace: Add arm64 support to recordmcount") Signed-off-by: Gregory Herrero Acked-by: Steven Rostedt (VMware) Link: https://lore.kernel.org/r/20200717143338.19302-1-gregory.herrero@oracle.com Signed-off-by: Catalin Marinas --- scripts/recordmcount.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 7225107a9aaf..e59022b3f125 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -434,6 +434,11 @@ static int arm_is_fake_mcount(Elf32_Rel const *rp) return 1; } +static int arm64_is_fake_mcount(Elf64_Rel const *rp) +{ + return ELF64_R_TYPE(w(rp->r_info)) != R_AARCH64_CALL26; +} + /* 64-bit EM_MIPS has weird ELF64_Rela.r_info.
* http://techpubs.sgi.com/library/manuals/4000/007-4658-001/pdf/007-4658-001.pdf * We interpret Table 29 Relocation Operation (Elf64_Rel, Elf64_Rela) [p.40] @@ -547,6 +552,7 @@ static int do_file(char const *const fname) make_nop = make_nop_arm64; rel_type_nop = R_AARCH64_NONE; ideal_nop = ideal_nop4_arm64; + is_fake_mcount64 = arm64_is_fake_mcount; break; case EM_IA_64: reltype = R_IA64_IMM64; break; case EM_MIPS: /* reltype: e_class */ break; From d19e789f068b3d633cbac430764962f404198022 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Jul 2020 13:50:25 +0200 Subject: [PATCH 401/502] compiler.h: Move instrumentation_begin()/end() to new header Linus pointed out that compiler.h - which is a key header that gets included in every single one of the 28,000+ kernel files during a kernel build - was bloated in: 655389666643: ("vmlinux.lds.h: Create section for protection against instrumentation") Linus noted: > I have pulled this, but do we really want to add this to a header file > that is _so_ core that it gets included for basically every single > file built? > > I don't even see those instrumentation_begin/end() things used > anywhere right now. > > It seems excessive. That 53 lines is maybe not a lot, but it pushed > that header file to over 12kB, and while it's mostly comments, it's > extra IO and parsing basically for _every_ single file compiled in the > kernel. > > For what appears to be absolutely zero upside right now, and I really > don't see why this should be in such a core header file! Move these primitives into a new header: <linux/instrumentation.h>, and include that header in the headers that make use of it. Unfortunately one of these headers is asm-generic/bug.h, which does get included in a lot of places, similarly to compiler.h. So the de-bloating effect isn't as good as we'd like it to be - but at least the interfaces are defined separately. No change to functionality intended.
Reported-by: Linus Torvalds Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200604071921.GA1361070@gmail.com Cc: Thomas Gleixner Cc: Borislav Petkov Cc: Peter Zijlstra --- arch/x86/include/asm/bug.h | 1 + include/asm-generic/bug.h | 1 + include/linux/compiler.h | 53 ----------------------------- include/linux/context_tracking.h | 2 ++ include/linux/instrumentation.h | 57 ++++++++++++++++++++++++++++++++ 5 files changed, 61 insertions(+), 53 deletions(-) create mode 100644 include/linux/instrumentation.h diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 028189575560..297fa12e7e27 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -3,6 +3,7 @@ #define _ASM_X86_BUG_H #include +#include /* * Despite that some emulators terminate on UD2, we use it for WARN(). diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index c94e33ae3e7b..18b0f4eee8cb 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -3,6 +3,7 @@ #define _ASM_GENERIC_BUG_H #include +#include #define CUT_HERE "------------[ cut here ]------------\n" diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 204e76856435..681894bfde99 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -120,65 +120,12 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, /* Annotate a C jump table to allow objtool to follow the code flow */ #define __annotate_jump_table __section(.rodata..c_jump_table) -#ifdef CONFIG_DEBUG_ENTRY -/* Begin/end of an instrumentation safe region */ -#define instrumentation_begin() ({ \ - asm volatile("%c0: nop\n\t" \ - ".pushsection .discard.instr_begin\n\t" \ - ".long %c0b - .\n\t" \ - ".popsection\n\t" : : "i" (__COUNTER__)); \ -}) - -/* - * Because instrumentation_{begin,end}() can nest, objtool validation considers - * _begin() a +1 and _end() a -1 and computes a sum over the instructions. 
- * When the value is greater than 0, we consider instrumentation allowed. - * - * There is a problem with code like: - * - * noinstr void foo() - * { - * instrumentation_begin(); - * ... - * if (cond) { - * instrumentation_begin(); - * ... - * instrumentation_end(); - * } - * bar(); - * instrumentation_end(); - * } - * - * If instrumentation_end() would be an empty label, like all the other - * annotations, the inner _end(), which is at the end of a conditional block, - * would land on the instruction after the block. - * - * If we then consider the sum of the !cond path, we'll see that the call to - * bar() is with a 0-value, even though, we meant it to happen with a positive - * value. - * - * To avoid this, have _end() be a NOP instruction, this ensures it will be - * part of the condition block and does not escape. - */ -#define instrumentation_end() ({ \ - asm volatile("%c0: nop\n\t" \ - ".pushsection .discard.instr_end\n\t" \ - ".long %c0b - .\n\t" \ - ".popsection\n\t" : : "i" (__COUNTER__)); \ -}) -#endif /* CONFIG_DEBUG_ENTRY */ - #else #define annotate_reachable() #define annotate_unreachable() #define __annotate_jump_table #endif -#ifndef instrumentation_begin -#define instrumentation_begin() do { } while(0) -#define instrumentation_end() do { } while(0) -#endif - #ifndef ASM_UNREACHABLE # define ASM_UNREACHABLE #endif diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 981b880d5b60..d53cd331c4dd 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -5,6 +5,8 @@ #include #include #include +#include + #include diff --git a/include/linux/instrumentation.h b/include/linux/instrumentation.h new file mode 100644 index 000000000000..93e2ad67fc10 --- /dev/null +++ b/include/linux/instrumentation.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_INSTRUMENTATION_H +#define __LINUX_INSTRUMENTATION_H + +#if defined(CONFIG_DEBUG_ENTRY) && defined(CONFIG_STACK_VALIDATION) 
+ +/* Begin/end of an instrumentation safe region */ +#define instrumentation_begin() ({ \ + asm volatile("%c0: nop\n\t" \ + ".pushsection .discard.instr_begin\n\t" \ + ".long %c0b - .\n\t" \ + ".popsection\n\t" : : "i" (__COUNTER__)); \ +}) + +/* + * Because instrumentation_{begin,end}() can nest, objtool validation considers + * _begin() a +1 and _end() a -1 and computes a sum over the instructions. + * When the value is greater than 0, we consider instrumentation allowed. + * + * There is a problem with code like: + * + * noinstr void foo() + * { + * instrumentation_begin(); + * ... + * if (cond) { + * instrumentation_begin(); + * ... + * instrumentation_end(); + * } + * bar(); + * instrumentation_end(); + * } + * + * If instrumentation_end() would be an empty label, like all the other + * annotations, the inner _end(), which is at the end of a conditional block, + * would land on the instruction after the block. + * + * If we then consider the sum of the !cond path, we'll see that the call to + * bar() is with a 0-value, even though, we meant it to happen with a positive + * value. + * + * To avoid this, have _end() be a NOP instruction, this ensures it will be + * part of the condition block and does not escape. + */ +#define instrumentation_end() ({ \ + asm volatile("%c0: nop\n\t" \ + ".pushsection .discard.instr_end\n\t" \ + ".long %c0b - .\n\t" \ + ".popsection\n\t" : : "i" (__COUNTER__)); \ +}) +#else +# define instrumentation_begin() do { } while(0) +# define instrumentation_end() do { } while(0) +#endif + +#endif /* __LINUX_INSTRUMENTATION_H */ From d53b5c013e1e7c3b43a7487171a84ee2acdd9597 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 24 Jun 2020 01:33:16 -0700 Subject: [PATCH 402/502] arm64/vdso: use the fault callback to map vvar pages Currently the vdso has no awareness of time namespaces, which may apply distinct offsets to processes in different namespaces. To handle this within the vdso, we'll need to expose a per-namespace data page. 
As a preparatory step, this patch separates the vdso data page from the code pages, and has it faulted in via its own fault callback. Subsequent patches will extend this to support distinct pages per time namespace. The vvar vma has to be installed with the VM_PFNMAP flag to handle faults via its vma fault callback. Signed-off-by: Andrei Vagin Reviewed-by: Vincenzo Frascino Reviewed-by: Dmitry Safonov Link: https://lore.kernel.org/r/20200624083321.144975-2-avagin@gmail.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/vdso.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index e546df0efefb..eb7798e5eb00 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -107,29 +107,32 @@ static int __vdso_init(enum vdso_abi abi) vdso_info[abi].vdso_code_start) >> PAGE_SHIFT; - /* Allocate the vDSO pagelist, plus a page for the data. */ - vdso_pagelist = kcalloc(vdso_info[abi].vdso_pages + 1, + vdso_pagelist = kcalloc(vdso_info[abi].vdso_pages, sizeof(struct page *), GFP_KERNEL); if (vdso_pagelist == NULL) return -ENOMEM; - /* Grab the vDSO data page. */ - vdso_pagelist[0] = phys_to_page(__pa_symbol(vdso_data)); - - /* Grab the vDSO code pages.
*/ pfn = sym_to_pfn(vdso_info[abi].vdso_code_start); for (i = 0; i < vdso_info[abi].vdso_pages; i++) - vdso_pagelist[i + 1] = pfn_to_page(pfn + i); + vdso_pagelist[i] = pfn_to_page(pfn + i); - vdso_info[abi].dm->pages = &vdso_pagelist[0]; - vdso_info[abi].cm->pages = &vdso_pagelist[1]; + vdso_info[abi].cm->pages = vdso_pagelist; return 0; } +static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + if (vmf->pgoff == 0) + return vmf_insert_pfn(vma, vmf->address, + sym_to_pfn(vdso_data)); + return VM_FAULT_SIGBUS; +} + static int __setup_additional_pages(enum vdso_abi abi, struct mm_struct *mm, struct linux_binprm *bprm, @@ -150,7 +153,7 @@ static int __setup_additional_pages(enum vdso_abi abi, } ret = _install_special_mapping(mm, vdso_base, PAGE_SIZE, - VM_READ|VM_MAYREAD, + VM_READ|VM_MAYREAD|VM_PFNMAP, vdso_info[abi].dm); if (IS_ERR(ret)) goto up_fail; @@ -206,6 +209,7 @@ static struct vm_special_mapping aarch32_vdso_maps[] = { #ifdef CONFIG_COMPAT_VDSO [AA32_MAP_VVAR] = { .name = "[vvar]", + .fault = vvar_fault, }, [AA32_MAP_VDSO] = { .name = "[vdso]", @@ -371,6 +375,7 @@ enum aarch64_map { static struct vm_special_mapping aarch64_vdso_maps[] __ro_after_init = { [AA64_MAP_VVAR] = { .name = "[vvar]", + .fault = vvar_fault, }, [AA64_MAP_VDSO] = { .name = "[vdso]", From 1b6867d2916bb91e94ddcc9c709e4779419fe391 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 24 Jun 2020 01:33:17 -0700 Subject: [PATCH 403/502] arm64/vdso: Zap vvar pages when switching to a time namespace The order of vvar pages depends on whether a task belongs to the root time namespace or not. In the root time namespace, a task doesn't have a per-namespace page. In a non-root namespace, the VVAR page which contains the system-wide VDSO data is replaced with a namespace specific page that contains clock offsets. 
Whenever a task changes its namespace, the VVAR page tables are cleared and then they will be re-faulted with a corresponding layout. A task can switch its time namespace only if its ->mm isn't shared with another task. Signed-off-by: Andrei Vagin Reviewed-by: Vincenzo Frascino Reviewed-by: Dmitry Safonov Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20200624083321.144975-3-avagin@gmail.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/vdso.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index eb7798e5eb00..33ac18060bfc 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -124,6 +124,37 @@ static int __vdso_init(enum vdso_abi abi) return 0; } +#ifdef CONFIG_TIME_NS +/* + * The vvar mapping contains data for a specific time namespace, so when a task + * changes namespace we must unmap its vvar data for the old namespace. + * Subsequent faults will map in data for the new namespace. + * + * For more details see timens_setup_vdso_data(). 
+ */ +int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) +{ + struct mm_struct *mm = task->mm; + struct vm_area_struct *vma; + + mmap_read_lock(mm); + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + unsigned long size = vma->vm_end - vma->vm_start; + + if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA64].dm)) + zap_page_range(vma, vma->vm_start, size); +#ifdef CONFIG_COMPAT_VDSO + if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA32].dm)) + zap_page_range(vma, vma->vm_start, size); +#endif + } + + mmap_read_unlock(mm); + return 0; +} +#endif + static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { From 3503d56cc7233ced602e38a4c13caa64f00ab2aa Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 24 Jun 2020 01:33:18 -0700 Subject: [PATCH 404/502] arm64/vdso: Add time namespace page Allocate the time namespace page among VVAR pages. Provide __arch_get_timens_vdso_data() helper for VDSO code to get the code-relative position of VVARs on that special page. If a task belongs to a time namespace then the VVAR page which contains the system wide VDSO data is replaced with a namespace specific page which has the same layout as the VVAR page. That page has vdso_data->seq set to 1 to enforce the slow path and vdso_data->clock_mode set to VCLOCK_TIMENS to enforce the time namespace handling path. The extra check in the case that vdso_data->seq is odd, e.g. a concurrent update of the VDSO data is in progress, is not really affecting regular tasks which are not part of a time namespace as the task is spin waiting for the update to finish and vdso_data->seq to become even again. If a time namespace task hits that code path, it invokes the corresponding time getter function which retrieves the real VVAR page, reads host time and then adds the offset for the requested clock which is stored in the special VVAR page. 
The time-namespace page isn't allocated on !CONFIG_TIME_NAMESPACE, but vma is the same size, which simplifies criu/vdso migration between different kernel configs. Signed-off-by: Andrei Vagin Reviewed-by: Vincenzo Frascino Reviewed-by: Dmitry Safonov Cc: Mark Rutland Link: https://lore.kernel.org/r/20200624083321.144975-4-avagin@gmail.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/vdso.h | 2 ++ .../include/asm/vdso/compat_gettimeofday.h | 12 ++++++++++++ arch/arm64/include/asm/vdso/gettimeofday.h | 8 ++++++++ arch/arm64/kernel/vdso.c | 19 ++++++++++++++++--- arch/arm64/kernel/vdso/vdso.lds.S | 5 ++++- arch/arm64/kernel/vdso32/vdso.lds.S | 5 ++++- include/vdso/datapage.h | 1 + 7 files changed, 47 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/vdso.h b/arch/arm64/include/asm/vdso.h index 07468428fd29..f99dcb94b438 100644 --- a/arch/arm64/include/asm/vdso.h +++ b/arch/arm64/include/asm/vdso.h @@ -12,6 +12,8 @@ */ #define VDSO_LBASE 0x0 +#define __VVAR_PAGES 2 + #ifndef __ASSEMBLY__ #include diff --git a/arch/arm64/include/asm/vdso/compat_gettimeofday.h b/arch/arm64/include/asm/vdso/compat_gettimeofday.h index b6907ae78e53..b7c549d46d18 100644 --- a/arch/arm64/include/asm/vdso/compat_gettimeofday.h +++ b/arch/arm64/include/asm/vdso/compat_gettimeofday.h @@ -152,6 +152,18 @@ static __always_inline const struct vdso_data *__arch_get_vdso_data(void) return ret; } +#ifdef CONFIG_TIME_NS +static __always_inline const struct vdso_data *__arch_get_timens_vdso_data(void) +{ + const struct vdso_data *ret; + + /* See __arch_get_vdso_data(). 
*/ + asm volatile("mov %0, %1" : "=r"(ret) : "r"(_timens_data)); + + return ret; +} +#endif + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h b/arch/arm64/include/asm/vdso/gettimeofday.h index afba6ba332f8..cf39eae5eaaf 100644 --- a/arch/arm64/include/asm/vdso/gettimeofday.h +++ b/arch/arm64/include/asm/vdso/gettimeofday.h @@ -96,6 +96,14 @@ const struct vdso_data *__arch_get_vdso_data(void) return _vdso_data; } +#ifdef CONFIG_TIME_NS +static __always_inline +const struct vdso_data *__arch_get_timens_vdso_data(void) +{ + return _timens_data; +} +#endif + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 33ac18060bfc..fcb559726920 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -40,6 +40,12 @@ enum vdso_abi { #endif /* CONFIG_COMPAT_VDSO */ }; +enum vvar_pages { + VVAR_DATA_PAGE_OFFSET, + VVAR_TIMENS_PAGE_OFFSET, + VVAR_NR_PAGES, +}; + struct vdso_abi_info { const char *name; const char *vdso_code_start; @@ -125,6 +131,11 @@ static int __vdso_init(enum vdso_abi abi) } #ifdef CONFIG_TIME_NS +struct vdso_data *arch_get_vdso_data(void *vvar_page) +{ + return (struct vdso_data *)(vvar_page); +} + /* * The vvar mapping contains data for a specific time namespace, so when a task * changes namespace we must unmap its vvar data for the old namespace. 
@@ -173,9 +184,11 @@ static int __setup_additional_pages(enum vdso_abi abi, unsigned long gp_flags = 0; void *ret; + BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES); + vdso_text_len = vdso_info[abi].vdso_pages << PAGE_SHIFT; /* Be sure to map the data page */ - vdso_mapping_len = vdso_text_len + PAGE_SIZE; + vdso_mapping_len = vdso_text_len + VVAR_NR_PAGES * PAGE_SIZE; vdso_base = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0); if (IS_ERR_VALUE(vdso_base)) { @@ -183,7 +196,7 @@ static int __setup_additional_pages(enum vdso_abi abi, goto up_fail; } - ret = _install_special_mapping(mm, vdso_base, PAGE_SIZE, + ret = _install_special_mapping(mm, vdso_base, VVAR_NR_PAGES * PAGE_SIZE, VM_READ|VM_MAYREAD|VM_PFNMAP, vdso_info[abi].dm); if (IS_ERR(ret)) @@ -192,7 +205,7 @@ static int __setup_additional_pages(enum vdso_abi abi, if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && system_supports_bti()) gp_flags = VM_ARM64_BTI; - vdso_base += PAGE_SIZE; + vdso_base += VVAR_NR_PAGES * PAGE_SIZE; mm->context.vdso = (void *)vdso_base; ret = _install_special_mapping(mm, vdso_base, vdso_text_len, VM_READ|VM_EXEC|gp_flags| diff --git a/arch/arm64/kernel/vdso/vdso.lds.S b/arch/arm64/kernel/vdso/vdso.lds.S index 7ad2d3a0cd48..d808ad31e01f 100644 --- a/arch/arm64/kernel/vdso/vdso.lds.S +++ b/arch/arm64/kernel/vdso/vdso.lds.S @@ -17,7 +17,10 @@ OUTPUT_ARCH(aarch64) SECTIONS { - PROVIDE(_vdso_data = . - PAGE_SIZE); + PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); +#ifdef CONFIG_TIME_NS + PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); +#endif . = VDSO_LBASE + SIZEOF_HEADERS; .hash : { *(.hash) } :text diff --git a/arch/arm64/kernel/vdso32/vdso.lds.S b/arch/arm64/kernel/vdso32/vdso.lds.S index 337d03522048..3348ce5ea306 100644 --- a/arch/arm64/kernel/vdso32/vdso.lds.S +++ b/arch/arm64/kernel/vdso32/vdso.lds.S @@ -17,7 +17,10 @@ OUTPUT_ARCH(arm) SECTIONS { - PROVIDE_HIDDEN(_vdso_data = . - PAGE_SIZE); + PROVIDE_HIDDEN(_vdso_data = . 
- __VVAR_PAGES * PAGE_SIZE); +#ifdef CONFIG_TIME_NS + PROVIDE_HIDDEN(_timens_data = _vdso_data + PAGE_SIZE); +#endif . = VDSO_LBASE + SIZEOF_HEADERS; .hash : { *(.hash) } :text diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h index 7955c56d6b3c..ee810cae4e1e 100644 --- a/include/vdso/datapage.h +++ b/include/vdso/datapage.h @@ -109,6 +109,7 @@ struct vdso_data { * relocation, and this is what we need. */ extern struct vdso_data _vdso_data[CS_BASES] __attribute__((visibility("hidden"))); +extern struct vdso_data _timens_data[CS_BASES] __attribute__((visibility("hidden"))); /* * The generic vDSO implementation requires that gettimeofday.h From ee3cda8e46060b021087b6ef451e1cd9fa648af6 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 24 Jun 2020 01:33:19 -0700 Subject: [PATCH 405/502] arm64/vdso: Handle faults on timens page If a task belongs to a time namespace then the VVAR page which contains the system wide VDSO data is replaced with a namespace specific page which has the same layout as the VVAR page. 
Signed-off-by: Andrei Vagin Reviewed-by: Vincenzo Frascino Reviewed-by: Dmitry Safonov Link: https://lore.kernel.org/r/20200624083321.144975-5-avagin@gmail.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/vdso.c | 56 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index fcb559726920..c11ee18e3e79 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -164,15 +165,62 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) mmap_read_unlock(mm); return 0; } + +static struct page *find_timens_vvar_page(struct vm_area_struct *vma) +{ + if (likely(vma->vm_mm == current->mm)) + return current->nsproxy->time_ns->vvar_page; + + /* + * VM_PFNMAP | VM_IO protect .fault() handler from being called + * through interfaces like /proc/$pid/mem or + * process_vm_{readv,writev}() as long as there's no .access() + * in special_mapping_vmops. 
+ * For more details check_vma_flags() and __access_remote_vm() + */ + WARN(1, "vvar_page accessed remotely"); + + return NULL; +} +#else +static struct page *find_timens_vvar_page(struct vm_area_struct *vma) +{ + return NULL; +} #endif static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { - if (vmf->pgoff == 0) - return vmf_insert_pfn(vma, vmf->address, - sym_to_pfn(vdso_data)); - return VM_FAULT_SIGBUS; + struct page *timens_page = find_timens_vvar_page(vma); + unsigned long pfn; + + switch (vmf->pgoff) { + case VVAR_DATA_PAGE_OFFSET: + if (timens_page) + pfn = page_to_pfn(timens_page); + else + pfn = sym_to_pfn(vdso_data); + break; +#ifdef CONFIG_TIME_NS + case VVAR_TIMENS_PAGE_OFFSET: + /* + * If a task belongs to a time namespace then a namespace + * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and + * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET + * offset. + * See also the comment near timens_setup_vdso_data(). + */ + if (!timens_page) + return VM_FAULT_SIGBUS; + pfn = sym_to_pfn(vdso_data); + break; +#endif /* CONFIG_TIME_NS */ + default: + return VM_FAULT_SIGBUS; + } + + return vmf_insert_pfn(vma, vmf->address, pfn); } static int __setup_additional_pages(enum vdso_abi abi, From bcf996434240c611f0fdab2c18cd75dd59cfa3c2 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 24 Jun 2020 01:33:20 -0700 Subject: [PATCH 406/502] arm64/vdso: Restrict splitting VVAR VMA Forbid splitting VVAR VMA resulting in a stricter ABI and reducing the amount of corner-cases to consider while working further on VDSO time namespace support. As the offset from timens to VVAR page is computed compile-time, the pages in VVAR should stay together and not be partially mremap()'ed.
Signed-off-by: Andrei Vagin Reviewed-by: Vincenzo Frascino Reviewed-by: Dmitry Safonov Link: https://lore.kernel.org/r/20200624083321.144975-6-avagin@gmail.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/vdso.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index c11ee18e3e79..d4202a32abc9 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -223,6 +223,17 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, return vmf_insert_pfn(vma, vmf->address, pfn); } +static int vvar_mremap(const struct vm_special_mapping *sm, + struct vm_area_struct *new_vma) +{ + unsigned long new_size = new_vma->vm_end - new_vma->vm_start; + + if (new_size != VVAR_NR_PAGES * PAGE_SIZE) + return -EINVAL; + + return 0; +} + static int __setup_additional_pages(enum vdso_abi abi, struct mm_struct *mm, struct linux_binprm *bprm, @@ -302,6 +313,7 @@ static struct vm_special_mapping aarch32_vdso_maps[] = { [AA32_MAP_VVAR] = { .name = "[vvar]", .fault = vvar_fault, + .mremap = vvar_mremap, }, [AA32_MAP_VDSO] = { .name = "[vdso]", @@ -468,6 +480,7 @@ static struct vm_special_mapping aarch64_vdso_maps[] __ro_after_init = { [AA64_MAP_VVAR] = { .name = "[vvar]", .fault = vvar_fault, + .mremap = vvar_mremap, }, [AA64_MAP_VDSO] = { .name = "[vdso]", From 9614cc576d76a7449cd51b60ef81fd0ce19ee694 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 24 Jun 2020 01:33:21 -0700 Subject: [PATCH 407/502] arm64: enable time namespace support CONFIG_TIME_NS depends on GENERIC_VDSO_TIME_NS.
Signed-off-by: Andrei Vagin Reviewed-by: Vincenzo Frascino Reviewed-by: Dmitry Safonov Link: https://lore.kernel.org/r/20200624083321.144975-7-avagin@gmail.com Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 66dc41fd49f2..87255f02ec5b 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -118,6 +118,7 @@ config ARM64 select GENERIC_STRNLEN_USER select GENERIC_TIME_VSYSCALL select GENERIC_GETTIMEOFDAY + select GENERIC_VDSO_TIME_NS select HANDLE_DOMAIN_IRQ select HARDIRQS_SW_RESEND select HAVE_PCI From b36200f543ff07a1cb346aa582349141df2c8068 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Sat, 11 Jul 2020 11:31:11 +0200 Subject: [PATCH 408/502] io_uring: fix sq array offset calculation rings_size() sets sq_offset to the total size of the rings (the returned value which is used for memory allocation). This is wrong: sq array should be located within the rings, not after them. Set sq_offset to where it should be. 
Fixes: 75b28affdd6a ("io_uring: allocate the two rings together") Signed-off-by: Dmitry Vyukov Acked-by: Hristo Venev Cc: io-uring@vger.kernel.org Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ff3851d40df4..ca932fb3c67d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7416,6 +7416,9 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, return SIZE_MAX; #endif + if (sq_offset) + *sq_offset = off; + sq_array_size = array_size(sizeof(u32), sq_entries); if (sq_array_size == SIZE_MAX) return SIZE_MAX; @@ -7423,9 +7426,6 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, if (check_add_overflow(off, sq_array_size, &off)) return SIZE_MAX; - if (sq_offset) - *sq_offset = off; - return off; } From 270a5940700bb6cf9abf36ea10cf1fa0d453aa7a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 12 Jul 2020 20:41:04 +0300 Subject: [PATCH 409/502] io_uring: rename sr->msg into umsg Every second field in send/recv is called msg, make it a bit more understandable by renaming ->msg, which is a user provided ptr, to ->umsg. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ca932fb3c67d..f17f098c403a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -414,7 +414,7 @@ struct io_connect { struct io_sr_msg { struct file *file; union { - struct user_msghdr __user *msg; + struct user_msghdr __user *umsg; void __user *buf; }; int msg_flags; @@ -3903,7 +3903,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags); - sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); #ifdef CONFIG_COMPAT @@ -3919,7 +3919,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) io->msg.msg.msg_name = &io->msg.addr; io->msg.iov = io->msg.fast_iov; - ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, + ret = sendmsg_copy_msghdr(&io->msg.msg, sr->umsg, sr->msg_flags, &io->msg.iov); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; @@ -3952,7 +3952,7 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, kmsg->msg.msg_name = &io.msg.addr; io.msg.iov = io.msg.fast_iov; - ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg, + ret = sendmsg_copy_msghdr(&io.msg.msg, sr->umsg, sr->msg_flags, &io.msg.iov); if (ret) return ret; @@ -4030,8 +4030,8 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io) size_t iov_len; int ret; - ret = __copy_msghdr_from_user(&io->msg.msg, sr->msg, &io->msg.uaddr, - &uiov, &iov_len); + ret = __copy_msghdr_from_user(&io->msg.msg, sr->umsg, + &io->msg.uaddr, &uiov, &iov_len); if (ret) return ret; @@ -4065,7 +4065,7 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, compat_size_t len; int ret; - msg_compat = (struct compat_msghdr __user *) sr->msg; + msg_compat = (struct compat_msghdr __user *) 
sr->umsg; ret = __get_compat_msghdr(&io->msg.msg, msg_compat, &io->msg.uaddr, &ptr, &len); if (ret) @@ -4142,7 +4142,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags); - sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); sr->bgid = READ_ONCE(sqe->buf_group); @@ -4207,7 +4207,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, else if (force_nonblock) flags |= MSG_DONTWAIT; - ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg, + ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, kmsg->uaddr, flags); if (force_nonblock && ret == -EAGAIN) { ret = io_setup_async_msg(req, kmsg); From 1400e69705baf98d1c9cb73b592a3a68aab1d852 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 12 Jul 2020 20:41:05 +0300 Subject: [PATCH 410/502] io_uring: use more specific type in rcv/snd msg cp send/recv msghdr initialisation works with struct io_async_msghdr, but pulls the whole struct io_async_ctx for no reason. That complicates it with composite accessing, e.g. io->msg. Use and pass the most specific type, which is struct io_async_msghdr. It is the largest field in union io_async_ctx and doesn't save stack space, but looks clearer. Most of the changes are replacing "io->msg."
with "iomsg->" Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 63 +++++++++++++++++++++++++-------------------------- 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f17f098c403a..8acbaddaebb7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3935,7 +3935,7 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, sock = sock_from_file(req->file, &ret); if (sock) { - struct io_async_ctx io; + struct io_async_msghdr iomsg; unsigned flags; if (req->io) { @@ -3948,14 +3948,13 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, } else { struct io_sr_msg *sr = &req->sr_msg; - kmsg = &io.msg; - kmsg->msg.msg_name = &io.msg.addr; - - io.msg.iov = io.msg.fast_iov; - ret = sendmsg_copy_msghdr(&io.msg.msg, sr->umsg, - sr->msg_flags, &io.msg.iov); + iomsg.msg.msg_name = &iomsg.addr; + iomsg.iov = iomsg.fast_iov; + ret = sendmsg_copy_msghdr(&iomsg.msg, sr->umsg, + sr->msg_flags, &iomsg.iov); if (ret) return ret; + kmsg = &iomsg; } flags = req->sr_msg.msg_flags; @@ -4023,30 +4022,31 @@ static int io_send(struct io_kiocb *req, bool force_nonblock, return 0; } -static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io) +static int __io_recvmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) { struct io_sr_msg *sr = &req->sr_msg; struct iovec __user *uiov; size_t iov_len; int ret; - ret = __copy_msghdr_from_user(&io->msg.msg, sr->umsg, - &io->msg.uaddr, &uiov, &iov_len); + ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg, + &iomsg->uaddr, &uiov, &iov_len); if (ret) return ret; if (req->flags & REQ_F_BUFFER_SELECT) { if (iov_len > 1) return -EINVAL; - if (copy_from_user(io->msg.iov, uiov, sizeof(*uiov))) + if (copy_from_user(iomsg->iov, uiov, sizeof(*uiov))) return -EFAULT; - sr->len = io->msg.iov[0].iov_len; - iov_iter_init(&io->msg.msg.msg_iter, READ, io->msg.iov, 1, + sr->len = iomsg->iov[0].iov_len; + 
iov_iter_init(&iomsg->msg.msg_iter, READ, iomsg->iov, 1, sr->len); - io->msg.iov = NULL; + iomsg->iov = NULL; } else { ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV, - &io->msg.iov, &io->msg.msg.msg_iter); + &iomsg->iov, &iomsg->msg.msg_iter); if (ret > 0) ret = 0; } @@ -4056,7 +4056,7 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io) #ifdef CONFIG_COMPAT static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, - struct io_async_ctx *io) + struct io_async_msghdr *iomsg) { struct compat_msghdr __user *msg_compat; struct io_sr_msg *sr = &req->sr_msg; @@ -4066,7 +4066,7 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, int ret; msg_compat = (struct compat_msghdr __user *) sr->umsg; - ret = __get_compat_msghdr(&io->msg.msg, msg_compat, &io->msg.uaddr, + ret = __get_compat_msghdr(&iomsg->msg, msg_compat, &iomsg->uaddr, &ptr, &len); if (ret) return ret; @@ -4083,12 +4083,12 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, return -EFAULT; if (clen < 0) return -EINVAL; - sr->len = io->msg.iov[0].iov_len; - io->msg.iov = NULL; + sr->len = iomsg->iov[0].iov_len; + iomsg->iov = NULL; } else { ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV, - &io->msg.iov, - &io->msg.msg.msg_iter); + &iomsg->iov, + &iomsg->msg.msg_iter); if (ret < 0) return ret; } @@ -4097,17 +4097,18 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, } #endif -static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io) +static int io_recvmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) { - io->msg.msg.msg_name = &io->msg.addr; - io->msg.iov = io->msg.fast_iov; + iomsg->msg.msg_name = &iomsg->addr; + iomsg->iov = iomsg->fast_iov; #ifdef CONFIG_COMPAT if (req->ctx->compat) - return __io_compat_recvmsg_copy_hdr(req, io); + return __io_compat_recvmsg_copy_hdr(req, iomsg); #endif - return __io_recvmsg_copy_hdr(req, io); + return __io_recvmsg_copy_hdr(req, iomsg); } static struct 
io_buffer *io_recv_buffer_select(struct io_kiocb *req, @@ -4157,7 +4158,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, if (req->flags & REQ_F_NEED_CLEANUP) return 0; - ret = io_recvmsg_copy_hdr(req, io); + ret = io_recvmsg_copy_hdr(req, &io->msg); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; @@ -4173,7 +4174,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, sock = sock_from_file(req->file, &ret); if (sock) { struct io_buffer *kbuf; - struct io_async_ctx io; + struct io_async_msghdr iomsg; unsigned flags; if (req->io) { @@ -4184,12 +4185,10 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, kmsg->iov = kmsg->fast_iov; kmsg->msg.msg_iter.iov = kmsg->iov; } else { - kmsg = &io.msg; - kmsg->msg.msg_name = &io.msg.addr; - - ret = io_recvmsg_copy_hdr(req, &io); + ret = io_recvmsg_copy_hdr(req, &iomsg); if (ret) return ret; + kmsg = &iomsg; } kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); From 2ae523ed07f14391d685651f671a7858fe8c368a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 12 Jul 2020 20:41:06 +0300 Subject: [PATCH 411/502] io_uring: extract io_sendmsg_copy_hdr() Don't repeat send msg initialisation code, it's error prone. Extract and use a helper function. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8acbaddaebb7..a198466544e7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3893,6 +3893,15 @@ static int io_setup_async_msg(struct io_kiocb *req, return -EAGAIN; } +static int io_sendmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) +{ + iomsg->iov = iomsg->fast_iov; + iomsg->msg.msg_name = &iomsg->addr; + return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg, + req->sr_msg.msg_flags, &iomsg->iov); +} + static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = &req->sr_msg; @@ -3917,10 +3926,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->flags & REQ_F_NEED_CLEANUP) return 0; - io->msg.msg.msg_name = &io->msg.addr; - io->msg.iov = io->msg.fast_iov; - ret = sendmsg_copy_msghdr(&io->msg.msg, sr->umsg, sr->msg_flags, - &io->msg.iov); + ret = io_sendmsg_copy_hdr(req, &io->msg); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; @@ -3946,12 +3952,7 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, kmsg->iov = kmsg->fast_iov; kmsg->msg.msg_iter.iov = kmsg->iov; } else { - struct io_sr_msg *sr = &req->sr_msg; - - iomsg.msg.msg_name = &iomsg.addr; - iomsg.iov = iomsg.fast_iov; - ret = sendmsg_copy_msghdr(&iomsg.msg, sr->umsg, - sr->msg_flags, &iomsg.iov); + ret = io_sendmsg_copy_hdr(req, &iomsg); if (ret) return ret; kmsg = &iomsg; From e73751225bae1e9b67e957afb273366fbb6ca136 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 12 Jul 2020 20:42:04 +0300 Subject: [PATCH 412/502] io_uring: replace rw->task_work with rq->task_work io_kiocb::task_work was de-unionised, and is not planned to be shared back, because it's too useful and commonly used. 
Hence, instead of keeping a separate task_work in struct io_async_rw just reuse req->task_work. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 31 ++++--------------------------- 1 file changed, 4 insertions(+), 27 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a198466544e7..ddff3abff363 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -505,7 +505,6 @@ struct io_async_rw { ssize_t nr_segs; ssize_t size; struct wait_page_queue wpq; - struct callback_head task_work; }; struct io_async_ctx { @@ -2901,33 +2900,11 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } -static void io_async_buf_cancel(struct callback_head *cb) -{ - struct io_async_rw *rw; - struct io_kiocb *req; - - rw = container_of(cb, struct io_async_rw, task_work); - req = rw->wpq.wait.private; - __io_req_task_cancel(req, -ECANCELED); -} - -static void io_async_buf_retry(struct callback_head *cb) -{ - struct io_async_rw *rw; - struct io_kiocb *req; - - rw = container_of(cb, struct io_async_rw, task_work); - req = rw->wpq.wait.private; - - __io_req_task_submit(req); -} - static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, int sync, void *arg) { struct wait_page_queue *wpq; struct io_kiocb *req = wait->private; - struct io_async_rw *rw = &req->io->rw; struct wait_page_key *key = arg; int ret; @@ -2939,17 +2916,17 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, list_del_init(&wait->entry); - init_task_work(&rw->task_work, io_async_buf_retry); + init_task_work(&req->task_work, io_req_task_submit); /* submit ref gets dropped, acquire a new one */ refcount_inc(&req->refs); - ret = io_req_task_work_add(req, &rw->task_work); + ret = io_req_task_work_add(req, &req->task_work); if (unlikely(ret)) { struct task_struct *tsk; /* queue just for cancelation */ - init_task_work(&rw->task_work, io_async_buf_cancel); + init_task_work(&req->task_work, io_req_task_cancel); tsk = 
io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &rw->task_work, 0); + task_work_add(tsk, &req->task_work, 0); wake_up_process(tsk); } return 1; From b64e3444d4e1c71fe148a4f4535395b1fdd73200 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 22:59:18 +0300 Subject: [PATCH 413/502] io_uring: simplify io_req_map_rw() Don't deref req->io->rw every time, but put it in a local variable. This looks prettier, generates less instructions, and doesn't break alias analysis. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ddff3abff363..3b8465dd0214 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2828,15 +2828,17 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, struct iovec *iovec, struct iovec *fast_iov, struct iov_iter *iter) { - req->io->rw.nr_segs = iter->nr_segs; - req->io->rw.size = io_size; - req->io->rw.iov = iovec; - if (!req->io->rw.iov) { - req->io->rw.iov = req->io->rw.fast_iov; - if (req->io->rw.iov != fast_iov) - memcpy(req->io->rw.iov, fast_iov, + struct io_async_rw *rw = &req->io->rw; + + rw->nr_segs = iter->nr_segs; + rw->size = io_size; + if (!iovec) { + rw->iov = rw->fast_iov; + if (rw->iov != fast_iov) + memcpy(rw->iov, fast_iov, sizeof(struct iovec) * iter->nr_segs); } else { + rw->iov = iovec; req->flags |= REQ_F_NEED_CLEANUP; } } From c3e330a493740a2a8312dcb7b1cffceaec7f619a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 22:59:19 +0300 Subject: [PATCH 414/502] io_uring: add a helper for async rw iovec prep Preparing reads/writes for async is a bit tricky. Extract a helper to not repeat it twice. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 46 ++++++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3b8465dd0214..31466bcd833e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2872,11 +2872,27 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, return 0; } +static inline int io_rw_prep_async(struct io_kiocb *req, int rw, + bool force_nonblock) +{ + struct io_async_ctx *io = req->io; + struct iov_iter iter; + ssize_t ret; + + io->rw.iov = io->rw.fast_iov; + req->io = NULL; + ret = io_import_iovec(rw, req, &io->rw.iov, &iter, !force_nonblock); + req->io = io; + if (unlikely(ret < 0)) + return ret; + + io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); + return 0; +} + static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, bool force_nonblock) { - struct io_async_ctx *io; - struct iov_iter iter; ssize_t ret; ret = io_prep_rw(req, sqe, force_nonblock); @@ -2889,17 +2905,7 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, /* either don't need iovec imported or already have it */ if (!req->io || req->flags & REQ_F_NEED_CLEANUP) return 0; - - io = req->io; - io->rw.iov = io->rw.fast_iov; - req->io = NULL; - ret = io_import_iovec(READ, req, &io->rw.iov, &iter, !force_nonblock); - req->io = io; - if (ret < 0) - return ret; - - io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); - return 0; + return io_rw_prep_async(req, READ, force_nonblock); } static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, @@ -3043,8 +3049,6 @@ out_free: static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, bool force_nonblock) { - struct io_async_ctx *io; - struct iov_iter iter; ssize_t ret; ret = io_prep_rw(req, sqe, force_nonblock); @@ -3059,17 +3063,7 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, /* 
either don't need iovec imported or already have it */ if (!req->io || req->flags & REQ_F_NEED_CLEANUP) return 0; - - io = req->io; - io->rw.iov = io->rw.fast_iov; - req->io = NULL; - ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter, !force_nonblock); - req->io = io; - if (ret < 0) - return ret; - - io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); - return 0; + return io_rw_prep_async(req, WRITE, force_nonblock); } static int io_write(struct io_kiocb *req, bool force_nonblock, From 252917c30f551e8e4377faac81d7fcf8e9629df1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 22:59:20 +0300 Subject: [PATCH 415/502] io_uring: follow **iovec idiom in io_import_iovec As for import_iovec(), return !=NULL iovec from io_import_iovec() only when it should be freed. That includes returning NULL when iovec is already in req->io, because it should be deallocated by other means, e.g. inside op handler. After io_setup_async_rw() local iovec to ->io, just mark it NULL, to follow the idea in io_{read,write} as well. That's easier to follow, and especially useful if we want to reuse per-op space for completion data. 
Signed-off-by: Pavel Begunkov [axboe: only call kfree() on non-NULL pointer] Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 31466bcd833e..64ae5b681c62 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2740,10 +2740,8 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, if (req->io) { struct io_async_rw *iorw = &req->io->rw; - *iovec = iorw->iov; - iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size); - if (iorw->iov == iorw->fast_iov) - *iovec = NULL; + iov_iter_init(iter, rw, iorw->iov, iorw->nr_segs, iorw->size); + *iovec = NULL; return iorw->size; } @@ -3026,6 +3024,8 @@ copy_iov: inline_vecs, &iter); if (ret) goto out_free; + /* it's copied and will be cleaned with ->io */ + iovec = NULL; /* if we can retry, do so with the callbacks armed */ if (io_rw_should_retry(req)) { ret2 = io_iter_do_read(req, &iter); @@ -3041,7 +3041,7 @@ copy_iov: } } out_free: - if (!(req->flags & REQ_F_NEED_CLEANUP)) + if (iovec) kfree(iovec); return ret; } @@ -3143,11 +3143,13 @@ copy_iov: inline_vecs, &iter); if (ret) goto out_free; + /* it's copied and will be cleaned with ->io */ + iovec = NULL; return -EAGAIN; } } out_free: - if (!(req->flags & REQ_F_NEED_CLEANUP)) + if (iovec) kfree(iovec); return ret; } From 3ca405ebfc1c3445b049dd25ca3338cbc99837d1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 23:37:08 +0300 Subject: [PATCH 416/502] io_uring: share completion list w/ per-op space Calling io_req_complete(req) means that the request is done, and there is nothing left but to clean it up. That also means that per-op data after that should not be used, so we're free to reuse it in completion path, e.g. to store overflow_list as done in this patch. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 64ae5b681c62..3cadd5f963b7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -487,6 +487,11 @@ struct io_statx { struct statx __user *buffer; }; +struct io_completion { + struct file *file; + struct list_head list; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -622,6 +627,8 @@ struct io_kiocb { struct io_splice splice; struct io_provide_buf pbuf; struct io_statx statx; + /* use only after cleaning per-op data, see io_clean_op() */ + struct io_completion compl; }; struct io_async_ctx *io; @@ -896,7 +903,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, static int io_grab_files(struct io_kiocb *req); static void io_complete_rw_common(struct kiocb *kiocb, long res, struct io_comp_state *cs); -static void io_cleanup_req(struct io_kiocb *req); +static void __io_clean_op(struct io_kiocb *req); static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, int fd, struct file **out_file, bool fixed); static void __io_queue_sqe(struct io_kiocb *req, @@ -936,6 +943,12 @@ static void io_get_req_task(struct io_kiocb *req) req->flags |= REQ_F_TASK_PINNED; } +static inline void io_clean_op(struct io_kiocb *req) +{ + if (req->flags & REQ_F_NEED_CLEANUP) + __io_clean_op(req); +} + /* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */ static void __io_put_req_task(struct io_kiocb *req) { @@ -1413,8 +1426,8 @@ static void io_submit_flush_completions(struct io_comp_state *cs) while (!list_empty(&cs->list)) { struct io_kiocb *req; - req = list_first_entry(&cs->list, struct io_kiocb, list); - list_del(&req->list); + req = list_first_entry(&cs->list, struct io_kiocb, compl.list); + list_del(&req->compl.list); __io_cqring_fill_event(req, req->result, req->cflags); if (!(req->flags & REQ_F_LINK_HEAD)) { req->flags |= 
REQ_F_COMP_LOCKED; @@ -1439,9 +1452,10 @@ static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags, io_cqring_add_event(req, res, cflags); io_put_req(req); } else { + io_clean_op(req); req->result = res; req->cflags = cflags; - list_add_tail(&req->list, &cs->list); + list_add_tail(&req->compl.list, &cs->list); if (++cs->nr >= 32) io_submit_flush_completions(cs); } @@ -1515,8 +1529,7 @@ static inline void io_put_file(struct io_kiocb *req, struct file *file, static void io_dismantle_req(struct io_kiocb *req) { - if (req->flags & REQ_F_NEED_CLEANUP) - io_cleanup_req(req); + io_clean_op(req); if (req->io) kfree(req->io); @@ -5402,7 +5415,7 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EIOCBQUEUED; } -static void io_cleanup_req(struct io_kiocb *req) +static void __io_clean_op(struct io_kiocb *req) { struct io_async_ctx *io = req->io; From 540e32a0855e700affa29b1112bf2dbb1fa7702a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 23:37:09 +0300 Subject: [PATCH 417/502] io_uring: rename ctx->poll into ctx->iopoll It supports both polling and I/O polling. Rename ctx->poll to clearly show that it's only in I/O poll case. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3cadd5f963b7..c8ebd227c837 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -320,12 +320,12 @@ struct io_ring_ctx { spinlock_t completion_lock; /* - * ->poll_list is protected by the ctx->uring_lock for + * ->iopoll_list is protected by the ctx->uring_lock for * io_uring instances that don't use IORING_SETUP_SQPOLL. * For SQPOLL, only the single threaded io_sq_thread() will * manipulate the list, hence no extra locking is needed there. 
*/ - struct list_head poll_list; + struct list_head iopoll_list; struct hlist_head *cancel_hash; unsigned cancel_hash_bits; bool poll_multi_file; @@ -1064,7 +1064,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); - INIT_LIST_HEAD(&ctx->poll_list); + INIT_LIST_HEAD(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); init_waitqueue_head(&ctx->inflight_wait); @@ -2009,7 +2009,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, spin = !ctx->poll_multi_file && *nr_events < min; ret = 0; - list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) { + list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, list) { struct kiocb *kiocb = &req->rw.kiocb; /* @@ -2051,7 +2051,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events, long min) { - while (!list_empty(&ctx->poll_list) && !need_resched()) { + while (!list_empty(&ctx->iopoll_list) && !need_resched()) { int ret; ret = io_do_iopoll(ctx, nr_events, min); @@ -2074,7 +2074,7 @@ static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) return; mutex_lock(&ctx->uring_lock); - while (!list_empty(&ctx->poll_list)) { + while (!list_empty(&ctx->iopoll_list)) { unsigned int nr_events = 0; io_do_iopoll(ctx, &nr_events, 0); @@ -2291,12 +2291,12 @@ static void io_iopoll_req_issued(struct io_kiocb *req) * how we do polling eventually, not spinning if we're on potentially * different devices. 
*/ - if (list_empty(&ctx->poll_list)) { + if (list_empty(&ctx->iopoll_list)) { ctx->poll_multi_file = false; } else if (!ctx->poll_multi_file) { struct io_kiocb *list_req; - list_req = list_first_entry(&ctx->poll_list, struct io_kiocb, + list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb, list); if (list_req->file != req->file) ctx->poll_multi_file = true; @@ -2307,9 +2307,9 @@ static void io_iopoll_req_issued(struct io_kiocb *req) * it to the front so we find it first. */ if (READ_ONCE(req->iopoll_completed)) - list_add(&req->list, &ctx->poll_list); + list_add(&req->list, &ctx->iopoll_list); else - list_add_tail(&req->list, &ctx->poll_list); + list_add_tail(&req->list, &ctx->iopoll_list); if ((ctx->flags & IORING_SETUP_SQPOLL) && wq_has_sleeper(&ctx->sqo_wait)) @@ -6329,11 +6329,11 @@ static int io_sq_thread(void *data) while (!kthread_should_park()) { unsigned int to_submit; - if (!list_empty(&ctx->poll_list)) { + if (!list_empty(&ctx->iopoll_list)) { unsigned nr_events = 0; mutex_lock(&ctx->uring_lock); - if (!list_empty(&ctx->poll_list) && !need_resched()) + if (!list_empty(&ctx->iopoll_list) && !need_resched()) io_do_iopoll(ctx, &nr_events, 0); else timeout = jiffies + ctx->sq_thread_idle; @@ -6362,7 +6362,7 @@ static int io_sq_thread(void *data) * more IO, we should wait for the application to * reap events and wake us up. */ - if (!list_empty(&ctx->poll_list) || need_resched() || + if (!list_empty(&ctx->iopoll_list) || need_resched() || (!time_after(jiffies, timeout) && ret != -EBUSY && !percpu_ref_is_dying(&ctx->refs))) { io_run_task_work(); @@ -6375,13 +6375,13 @@ static int io_sq_thread(void *data) /* * While doing polled IO, before going to sleep, we need - * to check if there are new reqs added to poll_list, it - * is because reqs may have been punted to io worker and - * will be added to poll_list later, hence check the - * poll_list again. 
+ * to check if there are new reqs added to iopoll_list, + * it is because reqs may have been punted to io worker + * and will be added to iopoll_list later, hence check + * the iopoll_list again. */ if ((ctx->flags & IORING_SETUP_IOPOLL) && - !list_empty_careful(&ctx->poll_list)) { + !list_empty_careful(&ctx->iopoll_list)) { finish_wait(&ctx->sqo_wait, &wait); continue; } From d21ffe7eca82d47b489760899912f81e30456e2e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 23:37:10 +0300 Subject: [PATCH 418/502] io_uring: use inflight_entry list for iopoll'ing req->inflight_entry is used to track requests that grabbed files_struct. Let's share it with iopoll list, because the only iopoll'ed ops are reads and writes, which don't need a file table. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c8ebd227c837..8a89480a57ec 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -651,6 +651,10 @@ struct io_kiocb { struct list_head link_list; + /* + * 1. used with ctx->iopoll_list with reads/writes + * 2. 
to track reqs with ->files (see io_op_def::file_table) + */ struct list_head inflight_entry; struct percpu_ref *fixed_file_refs; @@ -1943,8 +1947,8 @@ static void io_iopoll_queue(struct list_head *again) struct io_kiocb *req; do { - req = list_first_entry(again, struct io_kiocb, list); - list_del(&req->list); + req = list_first_entry(again, struct io_kiocb, inflight_entry); + list_del(&req->inflight_entry); if (!io_rw_reissue(req, -EAGAIN)) io_complete_rw_common(&req->rw.kiocb, -EAGAIN, NULL); } while (!list_empty(again)); @@ -1967,13 +1971,13 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, while (!list_empty(done)) { int cflags = 0; - req = list_first_entry(done, struct io_kiocb, list); + req = list_first_entry(done, struct io_kiocb, inflight_entry); if (READ_ONCE(req->result) == -EAGAIN) { req->iopoll_completed = 0; - list_move_tail(&req->list, &again); + list_move_tail(&req->inflight_entry, &again); continue; } - list_del(&req->list); + list_del(&req->inflight_entry); if (req->flags & REQ_F_BUFFER_SELECTED) cflags = io_put_kbuf(req); @@ -2009,7 +2013,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, spin = !ctx->poll_multi_file && *nr_events < min; ret = 0; - list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, list) { + list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) { struct kiocb *kiocb = &req->rw.kiocb; /* @@ -2018,7 +2022,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, * and complete those lists first, if we have entries there. 
*/ if (READ_ONCE(req->iopoll_completed)) { - list_move_tail(&req->list, &done); + list_move_tail(&req->inflight_entry, &done); continue; } if (!list_empty(&done)) @@ -2030,7 +2034,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, /* iopoll may have completed current req */ if (READ_ONCE(req->iopoll_completed)) - list_move_tail(&req->list, &done); + list_move_tail(&req->inflight_entry, &done); if (ret && spin) spin = false; @@ -2297,7 +2301,7 @@ static void io_iopoll_req_issued(struct io_kiocb *req) struct io_kiocb *list_req; list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb, - list); + inflight_entry); if (list_req->file != req->file) ctx->poll_multi_file = true; } @@ -2307,9 +2311,9 @@ static void io_iopoll_req_issued(struct io_kiocb *req) * it to the front so we find it first. */ if (READ_ONCE(req->iopoll_completed)) - list_add(&req->list, &ctx->iopoll_list); + list_add(&req->inflight_entry, &ctx->iopoll_list); else - list_add_tail(&req->list, &ctx->iopoll_list); + list_add_tail(&req->inflight_entry, &ctx->iopoll_list); if ((ctx->flags & IORING_SETUP_SQPOLL) && wq_has_sleeper(&ctx->sqo_wait)) From 40d8ddd4facb80760d5a0c61a7cf026d5ff73ff0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 23:37:11 +0300 Subject: [PATCH 419/502] io_uring: use completion list for CQ overflow As with the completion path, also use compl.list for overflowed requests. If cleaned up properly, nobody needs per-op data there anymore. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8a89480a57ec..2122b37e68e3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1339,8 +1339,8 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) break; req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb, - list); - list_move(&req->list, &list); + compl.list); + list_move(&req->compl.list, &list); req->flags &= ~REQ_F_OVERFLOW; if (cqe) { WRITE_ONCE(cqe->user_data, req->user_data); @@ -1362,8 +1362,8 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) io_cqring_ev_posted(ctx); while (!list_empty(&list)) { - req = list_first_entry(&list, struct io_kiocb, list); - list_del(&req->list); + req = list_first_entry(&list, struct io_kiocb, compl.list); + list_del(&req->compl.list); io_put_req(req); } @@ -1396,11 +1396,12 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) set_bit(0, &ctx->cq_check_overflow); ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW; } + io_clean_op(req); req->flags |= REQ_F_OVERFLOW; - refcount_inc(&req->refs); req->result = res; req->cflags = cflags; - list_add_tail(&req->list, &ctx->cq_overflow_list); + refcount_inc(&req->refs); + list_add_tail(&req->compl.list, &ctx->cq_overflow_list); } } @@ -7835,7 +7836,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, if (cancel_req->flags & REQ_F_OVERFLOW) { spin_lock_irq(&ctx->completion_lock); - list_del(&cancel_req->list); + list_del(&cancel_req->compl.list); cancel_req->flags &= ~REQ_F_OVERFLOW; if (list_empty(&ctx->cq_overflow_list)) { clear_bit(0, &ctx->sq_check_overflow); From 135fcde8496b03d31648171dbc038990112e41d5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 23:37:12 +0300 Subject: [PATCH 420/502] io_uring: add req->timeout.list Instead of using shared req->list, hang timeouts up on 
their own list entry. struct io_timeout has enough extra space for it, but if that will be a problem ->inflight_entry can be reused for that. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2122b37e68e3..2544795cfd30 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -396,6 +396,7 @@ struct io_timeout { int flags; u32 off; u32 target_seq; + struct list_head list; }; struct io_rw { @@ -1213,7 +1214,7 @@ static void io_kill_timeout(struct io_kiocb *req) ret = hrtimer_try_to_cancel(&req->io->timeout.timer); if (ret != -1) { atomic_inc(&req->ctx->cq_timeouts); - list_del_init(&req->list); + list_del_init(&req->timeout.list); req->flags |= REQ_F_COMP_LOCKED; io_cqring_fill_event(req, 0); io_put_req(req); @@ -1225,7 +1226,7 @@ static void io_kill_timeouts(struct io_ring_ctx *ctx) struct io_kiocb *req, *tmp; spin_lock_irq(&ctx->completion_lock); - list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list) + list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) io_kill_timeout(req); spin_unlock_irq(&ctx->completion_lock); } @@ -1248,7 +1249,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) { while (!list_empty(&ctx->timeout_list)) { struct io_kiocb *req = list_first_entry(&ctx->timeout_list, - struct io_kiocb, list); + struct io_kiocb, timeout.list); if (io_is_timeout_noseq(req)) break; @@ -1256,7 +1257,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) - atomic_read(&ctx->cq_timeouts)) break; - list_del_init(&req->list); + list_del_init(&req->timeout.list); io_kill_timeout(req); } } @@ -4997,8 +4998,8 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) * We could be racing with timeout deletion. If the list is empty, * then timeout lookup already found it and will be handling it.
*/ - if (!list_empty(&req->list)) - list_del_init(&req->list); + if (!list_empty(&req->timeout.list)) + list_del_init(&req->timeout.list); io_cqring_fill_event(req, -ETIME); io_commit_cqring(ctx); @@ -5015,9 +5016,9 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) struct io_kiocb *req; int ret = -ENOENT; - list_for_each_entry(req, &ctx->timeout_list, list) { + list_for_each_entry(req, &ctx->timeout_list, timeout.list) { if (user_data == req->user_data) { - list_del_init(&req->list); + list_del_init(&req->timeout.list); ret = 0; break; } @@ -5139,7 +5140,8 @@ static int io_timeout(struct io_kiocb *req) * the one we need first. */ list_for_each_prev(entry, &ctx->timeout_list) { - struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list); + struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, + timeout.list); if (io_is_timeout_noseq(nxt)) continue; @@ -5148,7 +5150,7 @@ static int io_timeout(struct io_kiocb *req) break; } add: - list_add(&req->list, entry); + list_add(&req->timeout.list, entry); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); spin_unlock_irq(&ctx->completion_lock); From 7d6ddea6beaf6639cf3a2b291dcdac6fe1edc584 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 23:37:13 +0300 Subject: [PATCH 421/502] io_uring: remove init for unused list poll*() doesn't use req->list, don't init it. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2544795cfd30..1e4ac48b1557 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4964,7 +4964,6 @@ static int io_poll_add(struct io_kiocb *req) req->flags &= ~REQ_F_WORK_INITIALIZED; INIT_HLIST_NODE(&req->hash_node); - INIT_LIST_HEAD(&req->list); ipt.pt._qproc = io_poll_queue_proc; mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events, From 27dc8338e5fb0e0ed5b272e792f4ffad7f3bc03e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 23:37:14 +0300 Subject: [PATCH 422/502] io_uring: use non-intrusive list for defer The only left user of req->list is DRAIN, hence instead of keeping a separate per request list for it, do that with old fashion non-intrusive lists allocated on demand. That's a really slow path, so that's OK. This removes req->list and so sheds 16 bytes from io_kiocb. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1e4ac48b1557..6e6e71310785 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -641,7 +641,6 @@ struct io_kiocb { u16 buf_index; struct io_ring_ctx *ctx; - struct list_head list; unsigned int flags; refcount_t refs; struct task_struct *task; @@ -676,6 +675,11 @@ struct io_kiocb { struct callback_head task_work; }; +struct io_defer_entry { + struct list_head list; + struct io_kiocb *req; +}; + #define IO_IOPOLL_BATCH 8 struct io_comp_state { @@ -1234,14 +1238,15 @@ static void io_kill_timeouts(struct io_ring_ctx *ctx) static void __io_queue_deferred(struct io_ring_ctx *ctx) { do { - struct io_kiocb *req = list_first_entry(&ctx->defer_list, - struct io_kiocb, list); + struct io_defer_entry *de = list_first_entry(&ctx->defer_list, + struct io_defer_entry, list); - if (req_need_defer(req)) + if 
(req_need_defer(de->req)) break; - list_del_init(&req->list); + list_del_init(&de->list); /* punt-init is done before queueing for defer */ - __io_queue_async_work(req); + __io_queue_async_work(de->req); + kfree(de); } while (!list_empty(&ctx->defer_list)); } @@ -5394,6 +5399,7 @@ static int io_req_defer_prep(struct io_kiocb *req, static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; + struct io_defer_entry *de; int ret; /* Still need defer if there is pending req in defer list. */ @@ -5408,15 +5414,20 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) return ret; } io_prep_async_link(req); + de = kmalloc(sizeof(*de), GFP_KERNEL); + if (!de) + return -ENOMEM; spin_lock_irq(&ctx->completion_lock); if (!req_need_defer(req) && list_empty(&ctx->defer_list)) { spin_unlock_irq(&ctx->completion_lock); + kfree(de); return 0; } trace_io_uring_defer(ctx, req, req->user_data); - list_add_tail(&req->list, &ctx->defer_list); + de->req = req; + list_add_tail(&de->list, &ctx->defer_list); spin_unlock_irq(&ctx->completion_lock); return -EIOCBQUEUED; } From 9cf7c104deaef52d6fd7c103a716e31d9815ede8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 23:37:15 +0300 Subject: [PATCH 423/502] io_uring: remove sequence from io_kiocb req->sequence is used only for deferred (i.e. DRAIN) requests, but initialised for every request. Remove req->sequence from io_kiocb together with its initialisation in io_init_req(). Replace it with a new field in struct io_defer_entry, that will be calculated only when needed in io_req_defer(), which is a slow path. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 44 ++++++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6e6e71310785..efa132831f3d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -639,6 +639,7 @@ struct io_kiocb { u8 iopoll_completed; u16 buf_index; + u32 result; struct io_ring_ctx *ctx; unsigned int flags; @@ -646,8 +647,6 @@ struct io_kiocb { struct task_struct *task; unsigned long fsize; u64 user_data; - u32 result; - u32 sequence; struct list_head link_list; @@ -678,6 +677,7 @@ struct io_kiocb { struct io_defer_entry { struct list_head list; struct io_kiocb *req; + u32 seq; }; #define IO_IOPOLL_BATCH 8 @@ -1090,13 +1090,13 @@ err: return NULL; } -static inline bool req_need_defer(struct io_kiocb *req) +static bool req_need_defer(struct io_kiocb *req, u32 seq) { if (unlikely(req->flags & REQ_F_IO_DRAIN)) { struct io_ring_ctx *ctx = req->ctx; - return req->sequence != ctx->cached_cq_tail - + atomic_read(&ctx->cached_cq_overflow); + return seq != ctx->cached_cq_tail + + atomic_read(&ctx->cached_cq_overflow); } return false; @@ -1241,7 +1241,7 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx) struct io_defer_entry *de = list_first_entry(&ctx->defer_list, struct io_defer_entry, list); - if (req_need_defer(de->req)) + if (req_need_defer(de->req, de->seq)) break; list_del_init(&de->list); /* punt-init is done before queueing for defer */ @@ -5396,14 +5396,35 @@ static int io_req_defer_prep(struct io_kiocb *req, return ret; } +static u32 io_get_sequence(struct io_kiocb *req) +{ + struct io_kiocb *pos; + struct io_ring_ctx *ctx = req->ctx; + u32 total_submitted, nr_reqs = 1; + + if (req->flags & REQ_F_LINK_HEAD) + list_for_each_entry(pos, &req->link_list, link_list) + nr_reqs++; + + total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped; + return total_submitted - nr_reqs; +} + static int io_req_defer(struct io_kiocb *req, const 
struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; struct io_defer_entry *de; int ret; + u32 seq; /* Still need defer if there is pending req in defer list. */ - if (!req_need_defer(req) && list_empty_careful(&ctx->defer_list)) + if (likely(list_empty_careful(&ctx->defer_list) && + !(req->flags & REQ_F_IO_DRAIN))) + return 0; + + seq = io_get_sequence(req); + /* Still a chance to pass the sequence check */ + if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) return 0; if (!req->io) { @@ -5419,7 +5440,7 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -ENOMEM; spin_lock_irq(&ctx->completion_lock); - if (!req_need_defer(req) && list_empty(&ctx->defer_list)) { + if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { spin_unlock_irq(&ctx->completion_lock); kfree(de); return 0; @@ -5427,6 +5448,7 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) trace_io_uring_defer(ctx, req, req->user_data); de->req = req; + de->seq = seq; list_add_tail(&de->list, &ctx->defer_list); spin_unlock_irq(&ctx->completion_lock); return -EIOCBQUEUED; @@ -6204,12 +6226,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, unsigned int sqe_flags; int id; - /* - * All io need record the previous position, if LINK vs DARIN, - * it can be used to mark the position of the first IO in the - * link list. - */ - req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped; req->opcode = READ_ONCE(sqe->opcode); req->user_data = READ_ONCE(sqe->user_data); req->io = NULL; From 0f7e466b393abab86be96ffcf00af383afddc0d1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 23:37:16 +0300 Subject: [PATCH 424/502] io_uring: place cflags into completion data req->cflags is used only for defer-completion path, just use completion data to store it. With the 4 bytes from the ->sequence patch and compacting io_kiocb, this frees 8 bytes. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index efa132831f3d..4d0fd9ddd3dc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -491,6 +491,7 @@ struct io_statx { struct io_completion { struct file *file; struct list_head list; + int cflags; }; struct io_async_connect { @@ -633,7 +634,6 @@ struct io_kiocb { }; struct io_async_ctx *io; - int cflags; u8 opcode; /* polled IO has completed */ u8 iopoll_completed; @@ -1351,7 +1351,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) if (cqe) { WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, req->result); - WRITE_ONCE(cqe->flags, req->cflags); + WRITE_ONCE(cqe->flags, req->compl.cflags); } else { WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); @@ -1405,7 +1405,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) io_clean_op(req); req->flags |= REQ_F_OVERFLOW; req->result = res; - req->cflags = cflags; + req->compl.cflags = cflags; refcount_inc(&req->refs); list_add_tail(&req->compl.list, &ctx->cq_overflow_list); } @@ -1439,7 +1439,7 @@ static void io_submit_flush_completions(struct io_comp_state *cs) req = list_first_entry(&cs->list, struct io_kiocb, compl.list); list_del(&req->compl.list); - __io_cqring_fill_event(req, req->result, req->cflags); + __io_cqring_fill_event(req, req->result, req->compl.cflags); if (!(req->flags & REQ_F_LINK_HEAD)) { req->flags |= REQ_F_COMP_LOCKED; io_put_req(req); @@ -1465,7 +1465,7 @@ static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags, } else { io_clean_op(req); req->result = res; - req->cflags = cflags; + req->compl.cflags = cflags; list_add_tail(&req->compl.list, &cs->list); if (++cs->nr >= 32) io_submit_flush_completions(cs); From dca9cf8b87f55c96f072c1fc6bc90e2b97a8e19f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov 
Date: Wed, 15 Jul 2020 12:46:49 +0300 Subject: [PATCH 425/502] io_uring: inline io_req_work_grab_env() The only caller of io_req_work_grab_env() is io_prep_async_work(), and they are both initialising req->work. Inline grab_env(), it's easier to keep this way, moreover there already were bugs with misplacing io_req_init_async(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 50 ++++++++++++++++++++------------------------------ 1 file changed, 20 insertions(+), 30 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 4d0fd9ddd3dc..a06d5b9cc046 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1115,31 +1115,7 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } -static void io_req_work_grab_env(struct io_kiocb *req) -{ - const struct io_op_def *def = &io_op_defs[req->opcode]; - - io_req_init_async(req); - - if (!req->work.mm && def->needs_mm) { - mmgrab(current->mm); - req->work.mm = current->mm; - } - if (!req->work.creds) - req->work.creds = get_current_cred(); - if (!req->work.fs && def->needs_fs) { - spin_lock(&current->fs->lock); - if (!current->fs->in_exec) { - req->work.fs = current->fs; - req->work.fs->users++; - } else { - req->work.flags |= IO_WQ_WORK_CANCEL; - } - spin_unlock(&current->fs->lock); - } -} - -static inline void io_req_work_drop_env(struct io_kiocb *req) +static void io_req_clean_work(struct io_kiocb *req) { if (!(req->flags & REQ_F_WORK_INITIALIZED)) return; @@ -1177,8 +1153,22 @@ static void io_prep_async_work(struct io_kiocb *req) if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; } - - io_req_work_grab_env(req); + if (!req->work.mm && def->needs_mm) { + mmgrab(current->mm); + req->work.mm = current->mm; + } + if (!req->work.creds) + req->work.creds = get_current_cred(); + if (!req->work.fs && def->needs_fs) { + spin_lock(&current->fs->lock); + if (!current->fs->in_exec) { + req->work.fs = current->fs; + req->work.fs->users++; + } else { + req->work.flags |= IO_WQ_WORK_CANCEL; + } +
spin_unlock(&current->fs->lock); + } } static void io_prep_async_link(struct io_kiocb *req) @@ -1547,7 +1537,7 @@ static void io_dismantle_req(struct io_kiocb *req) if (req->file) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); __io_put_req_task(req); - io_req_work_drop_env(req); + io_req_clean_work(req); if (req->flags & REQ_F_INFLIGHT) { struct io_ring_ctx *ctx = req->ctx; @@ -4825,7 +4815,7 @@ static bool io_poll_remove_one(struct io_kiocb *req) io_put_req(req); /* * restore ->work because we will call - * io_req_work_drop_env below when dropping the + * io_req_clean_work below when dropping the * final reference. */ if (req->flags & REQ_F_WORK_INITIALIZED) @@ -4965,7 +4955,7 @@ static int io_poll_add(struct io_kiocb *req) __poll_t mask; /* ->work is in union with hash_node and others */ - io_req_work_drop_env(req); + io_req_clean_work(req); req->flags &= ~REQ_F_WORK_INITIALIZED; INIT_HLIST_NODE(&req->hash_node); From 1c2da9e8839d6437b43f2c805411d1a0cbd70165 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jul 2020 12:46:50 +0300 Subject: [PATCH 426/502] io_uring: remove empty cleanup of OP_OPEN* reqs A switch in __io_clean_op() doesn't have default, it's pointless to list opcodes that don't do any cleanup. Remove IORING_OP_OPEN* from there.
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a06d5b9cc046..8d6f1c4e8dac 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5473,9 +5473,6 @@ static void __io_clean_op(struct io_kiocb *req) if (req->flags & REQ_F_BUFFER_SELECTED) kfree(req->sr_msg.kbuf); break; - case IORING_OP_OPENAT: - case IORING_OP_OPENAT2: - break; case IORING_OP_SPLICE: case IORING_OP_TEE: io_put_file(req, req->splice.file_in, From 327d6d968b195cfc48ff97c49b56520aac922f65 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jul 2020 12:46:51 +0300 Subject: [PATCH 427/502] io_uring: alloc ->io in io_req_defer_prep() Every call to io_req_defer_prep() is prepended with allocating ->io, just do that in the function. And while we're at it, mark error paths with unlikely() and replace "if (ret < 0)" with "if (ret)". There is only one change in the observable behaviour, that's instead of killing the head request right away on error, it postpones it until the link is assembled, that looks more preferable.
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8d6f1c4e8dac..6a1cd2aea018 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5279,6 +5279,9 @@ static int io_req_defer_prep(struct io_kiocb *req, if (!sqe) return 0; + if (io_alloc_async_ctx(req)) + return -EAGAIN; + if (io_op_defs[req->opcode].file_table) { io_req_init_async(req); ret = io_grab_files(req); @@ -5418,10 +5421,8 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; if (!req->io) { - if (io_alloc_async_ctx(req)) - return -EAGAIN; ret = io_req_defer_prep(req, sqe); - if (ret < 0) + if (ret) return ret; } io_prep_async_link(req); @@ -6024,11 +6025,8 @@ fail_req: } } else if (req->flags & REQ_F_FORCE_ASYNC) { if (!req->io) { - ret = -EAGAIN; - if (io_alloc_async_ctx(req)) - goto fail_req; ret = io_req_defer_prep(req, sqe); - if (unlikely(ret < 0)) + if (unlikely(ret)) goto fail_req; } @@ -6081,11 +6079,8 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, head->flags |= REQ_F_IO_DRAIN; ctx->drain_next = 1; } - if (io_alloc_async_ctx(req)) - return -EAGAIN; - ret = io_req_defer_prep(req, sqe); - if (ret) { + if (unlikely(ret)) { /* fail even hard links since we don't submit */ head->flags |= REQ_F_FAIL_LINK; return ret; @@ -6108,11 +6103,8 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, req->flags |= REQ_F_LINK_HEAD; INIT_LIST_HEAD(&req->link_list); - if (io_alloc_async_ctx(req)) - return -EAGAIN; - ret = io_req_defer_prep(req, sqe); - if (ret) + if (unlikely(ret)) req->flags |= REQ_F_FAIL_LINK; *link = req; } else { From 57f1a64958543fe18a7fe0addbfb31bb2ceeaea2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jul 2020 12:46:52 +0300 Subject: [PATCH 428/502] io_uring/io-wq: move RLIMIT_FSIZE to io-wq RLIMIT_FSIZE is needed only for execution from an
io-wq context, hence move all preparations from hot path to io-wq work setup. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.c | 1 + fs/io-wq.h | 1 + fs/io_uring.c | 22 +++++++++------------- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 72f759e1d6eb..8702d3c3b291 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -462,6 +462,7 @@ static void io_impersonate_work(struct io_worker *worker, io_wq_switch_mm(worker, work); if (worker->cur_creds != work->creds) io_wq_switch_creds(worker, work); + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->fsize; } static void io_assign_current_work(struct io_worker *worker, diff --git a/fs/io-wq.h b/fs/io-wq.h index 114f12ec2d65..ddaf9614cf9b 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -89,6 +89,7 @@ struct io_wq_work { struct mm_struct *mm; const struct cred *creds; struct fs_struct *fs; + unsigned long fsize; unsigned flags; }; diff --git a/fs/io_uring.c b/fs/io_uring.c index 6a1cd2aea018..8b2f7a1bbd06 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -645,7 +645,6 @@ struct io_kiocb { unsigned int flags; refcount_t refs; struct task_struct *task; - unsigned long fsize; u64 user_data; struct list_head link_list; @@ -736,6 +735,7 @@ struct io_op_def { unsigned pollout : 1; /* op supports buffer selection */ unsigned buffer_select : 1; + unsigned needs_fsize : 1; }; static const struct io_op_def io_op_defs[] = { @@ -755,6 +755,7 @@ static const struct io_op_def io_op_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .needs_fsize = 1, }, [IORING_OP_FSYNC] = { .needs_file = 1, @@ -769,6 +770,7 @@ static const struct io_op_def io_op_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .needs_fsize = 1, }, [IORING_OP_POLL_ADD] = { .needs_file = 1, @@ -821,6 +823,7 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_FALLOCATE] = { .needs_file = 1, + .needs_fsize = 1, }, [IORING_OP_OPENAT] = { .file_table = 1, @@ 
-852,6 +855,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .needs_fsize = 1, }, [IORING_OP_FADVISE] = { .needs_file = 1, @@ -1169,6 +1173,10 @@ static void io_prep_async_work(struct io_kiocb *req) } spin_unlock(&current->fs->lock); } + if (def->needs_fsize) + req->work.fsize = rlimit(RLIMIT_FSIZE); + else + req->work.fsize = RLIM_INFINITY; } static void io_prep_async_link(struct io_kiocb *req) @@ -3072,8 +3080,6 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (unlikely(!(req->file->f_mode & FMODE_WRITE))) return -EBADF; - req->fsize = rlimit(RLIMIT_FSIZE); - /* either don't need iovec imported or already have it */ if (!req->io || req->flags & REQ_F_NEED_CLEANUP) return 0; @@ -3130,17 +3136,11 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, } kiocb->ki_flags |= IOCB_WRITE; - if (!force_nonblock) - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize; - if (req->file->f_op->write_iter) ret2 = call_write_iter(req->file, kiocb, &iter); else ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); - if (!force_nonblock) - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - /* * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just * retry them without IOCB_NOWAIT.
@@ -3335,7 +3335,6 @@ static int io_fallocate_prep(struct io_kiocb *req, req->sync.off = READ_ONCE(sqe->off); req->sync.len = READ_ONCE(sqe->addr); req->sync.mode = READ_ONCE(sqe->len); - req->fsize = rlimit(RLIMIT_FSIZE); return 0; } @@ -3346,11 +3345,8 @@ static int io_fallocate(struct io_kiocb *req, bool force_nonblock) /* fallocate always requiring blocking context */ if (force_nonblock) return -EAGAIN; - - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize; ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, req->sync.len); - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; if (ret < 0) req_set_fail_links(req); io_req_complete(req, ret); From 06ef3608b0eed673fcbc62cf74c8d3ad0007a337 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jul 2020 23:28:33 +0300 Subject: [PATCH 429/502] io_uring: simplify file ref tracking in submission state Currently, file refs in struct io_submit_state are tracked with 2 vars: @has_refs -- how many refs were initially taken @used_refs -- number of refs used Replace it with a single variable counting how many refs left at the current moment. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8b2f7a1bbd06..28b47533454a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -707,7 +707,6 @@ struct io_submit_state { struct file *file; unsigned int fd; unsigned int has_refs; - unsigned int used_refs; unsigned int ios_left; }; @@ -2327,10 +2326,8 @@ static void io_iopoll_req_issued(struct io_kiocb *req) static void __io_state_file_put(struct io_submit_state *state) { - int diff = state->has_refs - state->used_refs; - - if (diff) - fput_many(state->file, diff); + if (state->has_refs) + fput_many(state->file, state->has_refs); state->file = NULL; } @@ -2352,7 +2349,7 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) if (state->file) { if (state->fd == fd) { - state->used_refs++; + state->has_refs--; state->ios_left--; return state->file; } @@ -2363,9 +2360,8 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) return NULL; state->fd = fd; - state->has_refs = state->ios_left; - state->used_refs = 1; state->ios_left--; + state->has_refs = state->ios_left; return state->file; } From 7a7cacba8b4560403615b04d57bdcd1f93f90f10 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jul 2020 23:27:59 +0300 Subject: [PATCH 430/502] io_uring: indent left {send,recv}[msg]() Flip over "if (sock)" condition with return on error, the upper layer will take care. That change will be handy later, but already removes an extra jump from hot path. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 269 +++++++++++++++++++++++++------------------------- 1 file changed, 133 insertions(+), 136 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 28b47533454a..264b1e5e2d54 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3916,42 +3916,41 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, struct io_comp_state *cs) { - struct io_async_msghdr *kmsg = NULL; + struct io_async_msghdr iomsg, *kmsg = NULL; struct socket *sock; + unsigned flags; int ret; sock = sock_from_file(req->file, &ret); - if (sock) { - struct io_async_msghdr iomsg; - unsigned flags; + if (unlikely(!sock)) + return ret; - if (req->io) { - kmsg = &req->io->msg; - kmsg->msg.msg_name = &req->io->msg.addr; - /* if iov is set, it's allocated already */ - if (!kmsg->iov) - kmsg->iov = kmsg->fast_iov; - kmsg->msg.msg_iter.iov = kmsg->iov; - } else { - ret = io_sendmsg_copy_hdr(req, &iomsg); - if (ret) - return ret; - kmsg = &iomsg; - } - - flags = req->sr_msg.msg_flags; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; - - ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); - if (force_nonblock && ret == -EAGAIN) - return io_setup_async_msg(req, kmsg); - if (ret == -ERESTARTSYS) - ret = -EINTR; + if (req->io) { + kmsg = &req->io->msg; + kmsg->msg.msg_name = &req->io->msg.addr; + /* if iov is set, it's allocated already */ + if (!kmsg->iov) + kmsg->iov = kmsg->fast_iov; + kmsg->msg.msg_iter.iov = kmsg->iov; + } else { + ret = io_sendmsg_copy_hdr(req, &iomsg); + if (ret) + return ret; + kmsg = &iomsg; } + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); + if (force_nonblock && ret == -EAGAIN) + return io_setup_async_msg(req, 
kmsg); + if (ret == -ERESTARTSYS) + ret = -EINTR; + if (kmsg && kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); req->flags &= ~REQ_F_NEED_CLEANUP; @@ -3964,39 +3963,38 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, static int io_send(struct io_kiocb *req, bool force_nonblock, struct io_comp_state *cs) { + struct io_sr_msg *sr = &req->sr_msg; + struct msghdr msg; + struct iovec iov; struct socket *sock; + unsigned flags; int ret; sock = sock_from_file(req->file, &ret); - if (sock) { - struct io_sr_msg *sr = &req->sr_msg; - struct msghdr msg; - struct iovec iov; - unsigned flags; + if (unlikely(!sock)) + return ret; - ret = import_single_range(WRITE, sr->buf, sr->len, &iov, - &msg.msg_iter); - if (ret) - return ret; + ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); + if (unlikely(ret)) + return ret; - msg.msg_name = NULL; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; - flags = req->sr_msg.msg_flags; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; - msg.msg_flags = flags; - ret = sock_sendmsg(sock, &msg); - if (force_nonblock && ret == -EAGAIN) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; - } + msg.msg_flags = flags; + ret = sock_sendmsg(sock, &msg); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; if (ret < 0) req_set_fail_links(req); @@ -4149,62 +4147,62 @@ static int io_recvmsg_prep(struct io_kiocb *req, static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, struct io_comp_state *cs) { - struct io_async_msghdr *kmsg = NULL; + struct io_async_msghdr iomsg, *kmsg = NULL; struct socket *sock; + struct io_buffer *kbuf; + 
unsigned flags; int ret, cflags = 0; sock = sock_from_file(req->file, &ret); - if (sock) { - struct io_buffer *kbuf; - struct io_async_msghdr iomsg; - unsigned flags; + if (unlikely(!sock)) + return ret; - if (req->io) { - kmsg = &req->io->msg; - kmsg->msg.msg_name = &req->io->msg.addr; - /* if iov is set, it's allocated already */ - if (!kmsg->iov) - kmsg->iov = kmsg->fast_iov; - kmsg->msg.msg_iter.iov = kmsg->iov; - } else { - ret = io_recvmsg_copy_hdr(req, &iomsg); - if (ret) - return ret; - kmsg = &iomsg; - } - - kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); - if (IS_ERR(kbuf)) { - return PTR_ERR(kbuf); - } else if (kbuf) { - kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); - iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov, - 1, req->sr_msg.len); - } - - flags = req->sr_msg.msg_flags; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; - - ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, - kmsg->uaddr, flags); - if (force_nonblock && ret == -EAGAIN) { - ret = io_setup_async_msg(req, kmsg); - if (ret != -EAGAIN) - kfree(kbuf); + if (req->io) { + kmsg = &req->io->msg; + kmsg->msg.msg_name = &req->io->msg.addr; + /* if iov is set, it's allocated already */ + if (!kmsg->iov) + kmsg->iov = kmsg->fast_iov; + kmsg->msg.msg_iter.iov = kmsg->iov; + } else { + ret = io_recvmsg_copy_hdr(req, &iomsg); + if (ret) return ret; - } - if (ret == -ERESTARTSYS) - ret = -EINTR; - if (kbuf) - kfree(kbuf); + kmsg = &iomsg; } + kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); + if (IS_ERR(kbuf)) { + return PTR_ERR(kbuf); + } else if (kbuf) { + kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); + iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov, + 1, req->sr_msg.len); + } + + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = __sys_recvmsg_sock(sock, &kmsg->msg, 
req->sr_msg.umsg, + kmsg->uaddr, flags); + if (force_nonblock && ret == -EAGAIN) { + ret = io_setup_async_msg(req, kmsg); + if (ret != -EAGAIN) + kfree(kbuf); + return ret; + } + if (ret == -ERESTARTSYS) + ret = -EINTR; + if (kbuf) + kfree(kbuf); + if (kmsg && kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret < 0) req_set_fail_links(req); __io_req_complete(req, ret, cflags, cs); @@ -4215,51 +4213,50 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock, struct io_comp_state *cs) { struct io_buffer *kbuf = NULL; + struct io_sr_msg *sr = &req->sr_msg; + struct msghdr msg; + void __user *buf = sr->buf; struct socket *sock; + struct iovec iov; + unsigned flags; int ret, cflags = 0; sock = sock_from_file(req->file, &ret); - if (sock) { - struct io_sr_msg *sr = &req->sr_msg; - void __user *buf = sr->buf; - struct msghdr msg; - struct iovec iov; - unsigned flags; + if (unlikely(!sock)) + return ret; - kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - else if (kbuf) - buf = u64_to_user_ptr(kbuf->addr); + kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + else if (kbuf) + buf = u64_to_user_ptr(kbuf->addr); - ret = import_single_range(READ, buf, sr->len, &iov, - &msg.msg_iter); - if (ret) { - kfree(kbuf); - return ret; - } - - req->flags |= REQ_F_NEED_CLEANUP; - msg.msg_name = NULL; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; - msg.msg_iocb = NULL; - msg.msg_flags = 0; - - flags = req->sr_msg.msg_flags; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; - - ret = sock_recvmsg(sock, &msg, flags); - if (force_nonblock && ret == -EAGAIN) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; + ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter); + if (unlikely(ret)) { + kfree(kbuf); + return ret; } + 
req->flags |= REQ_F_NEED_CLEANUP; + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + msg.msg_iocb = NULL; + msg.msg_flags = 0; + + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = sock_recvmsg(sock, &msg, flags); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + kfree(kbuf); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) From 6b754c8b912a164fbb15b7b839d51709c3d9ee6f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jul 2020 23:28:00 +0300 Subject: [PATCH 431/502] io_uring: remove extra checks in send/recv With the return on a bad socket, kmsg is always non-null by the end of the function, prune left extra checks and initialisations. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 264b1e5e2d54..ac3c16ea7d23 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3916,7 +3916,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, struct io_comp_state *cs) { - struct io_async_msghdr iomsg, *kmsg = NULL; + struct io_async_msghdr iomsg, *kmsg; struct socket *sock; unsigned flags; int ret; @@ -3951,7 +3951,7 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, if (ret == -ERESTARTSYS) ret = -EINTR; - if (kmsg && kmsg->iov != kmsg->fast_iov) + if (kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) @@ -4147,7 +4147,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, struct io_comp_state *cs) { - struct io_async_msghdr iomsg, *kmsg = NULL; + struct io_async_msghdr iomsg, *kmsg; struct socket *sock; struct 
io_buffer *kbuf; unsigned flags; @@ -4199,7 +4199,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, if (kbuf) kfree(kbuf); - if (kmsg && kmsg->iov != kmsg->fast_iov) + if (kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); req->flags &= ~REQ_F_NEED_CLEANUP; @@ -4212,7 +4212,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, static int io_recv(struct io_kiocb *req, bool force_nonblock, struct io_comp_state *cs) { - struct io_buffer *kbuf = NULL; + struct io_buffer *kbuf; struct io_sr_msg *sr = &req->sr_msg; struct msghdr msg; void __user *buf = sr->buf; From 14c32eee9286621dd437b53460e44bd11e5bc08d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jul 2020 23:28:01 +0300 Subject: [PATCH 432/502] io_uring: don't forget cflags in io_recv() Instead of returning error from io_recv(), go through generic cleanup path, because it'll retain cflags for userspace. Do the same for io_send() for consistency. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ac3c16ea7d23..2ffacfbf9094 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3976,7 +3976,7 @@ static int io_send(struct io_kiocb *req, bool force_nonblock, ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); if (unlikely(ret)) - return ret; + return ret;; msg.msg_name = NULL; msg.msg_control = NULL; @@ -4232,10 +4232,8 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock, buf = u64_to_user_ptr(kbuf->addr); ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter); - if (unlikely(ret)) { - kfree(kbuf); - return ret; - } + if (unlikely(ret)) + goto out_free; req->flags |= REQ_F_NEED_CLEANUP; msg.msg_name = NULL; @@ -4256,7 +4254,7 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock, return -EAGAIN; if (ret == -ERESTARTSYS) ret = -EINTR; - +out_free: kfree(kbuf); req->flags &= 
~REQ_F_NEED_CLEANUP; if (ret < 0) From 0e1b6fe3d1e5f1b79c5bec37881c98febfba7718 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jul 2020 23:28:02 +0300 Subject: [PATCH 433/502] io_uring: free selected-bufs if error'ed io_clean_op() may be skipped even if there is a selected io_buffer, that's because *select_buffer() functions never set REQ_F_NEED_CLEANUP. Trigger io_clean_op() when REQ_F_BUFFER_SELECTED is set as well, and clear the flag if it was freed out of it. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 83 ++++++++++++++++++++++++++------------------------- 1 file changed, 43 insertions(+), 40 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2ffacfbf9094..4448b1e9a754 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -957,7 +957,7 @@ static void io_get_req_task(struct io_kiocb *req) static inline void io_clean_op(struct io_kiocb *req) { - if (req->flags & REQ_F_NEED_CLEANUP) + if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED)) __io_clean_op(req); } @@ -1931,6 +1931,7 @@ static int io_put_kbuf(struct io_kiocb *req) cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; cflags |= IORING_CQE_F_BUFFER; req->rw.addr = 0; + req->flags &= ~REQ_F_BUFFER_SELECTED; kfree(kbuf); return cflags; } @@ -4188,20 +4189,16 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, kmsg->uaddr, flags); - if (force_nonblock && ret == -EAGAIN) { - ret = io_setup_async_msg(req, kmsg); - if (ret != -EAGAIN) - kfree(kbuf); - return ret; - } + if (force_nonblock && ret == -EAGAIN) + return io_setup_async_msg(req, kmsg); if (ret == -ERESTARTSYS) ret = -EINTR; + if (kbuf) kfree(kbuf); - if (kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); - req->flags &= ~REQ_F_NEED_CLEANUP; + req->flags &= ~(REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED); if (ret < 0) req_set_fail_links(req); @@ -4235,7 +4232,6 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock, if 
(unlikely(ret)) goto out_free; - req->flags |= REQ_F_NEED_CLEANUP; msg.msg_name = NULL; msg.msg_control = NULL; msg.msg_controllen = 0; @@ -4255,7 +4251,8 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock, if (ret == -ERESTARTSYS) ret = -EINTR; out_free: - kfree(kbuf); + if (kbuf) + kfree(kbuf); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); @@ -5436,39 +5433,45 @@ static void __io_clean_op(struct io_kiocb *req) { struct io_async_ctx *io = req->io; - switch (req->opcode) { - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_READ: - if (req->flags & REQ_F_BUFFER_SELECTED) + if (req->flags & REQ_F_BUFFER_SELECTED) { + switch (req->opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_READ: kfree((void *)(unsigned long)req->rw.addr); - /* fallthrough */ - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - case IORING_OP_WRITE: - if (io->rw.iov != io->rw.fast_iov) - kfree(io->rw.iov); - break; - case IORING_OP_RECVMSG: - if (req->flags & REQ_F_BUFFER_SELECTED) + break; + case IORING_OP_RECVMSG: + case IORING_OP_RECV: kfree(req->sr_msg.kbuf); - /* fallthrough */ - case IORING_OP_SENDMSG: - if (io->msg.iov != io->msg.fast_iov) - kfree(io->msg.iov); - break; - case IORING_OP_RECV: - if (req->flags & REQ_F_BUFFER_SELECTED) - kfree(req->sr_msg.kbuf); - break; - case IORING_OP_SPLICE: - case IORING_OP_TEE: - io_put_file(req, req->splice.file_in, - (req->splice.flags & SPLICE_F_FD_IN_FIXED)); - break; + break; + } + req->flags &= ~REQ_F_BUFFER_SELECTED; } - req->flags &= ~REQ_F_NEED_CLEANUP; + if (req->flags & REQ_F_NEED_CLEANUP) { + switch (req->opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_READ: + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: + if (io->rw.iov != io->rw.fast_iov) + kfree(io->rw.iov); + break; + case IORING_OP_RECVMSG: + case IORING_OP_SENDMSG: + if (io->msg.iov != io->msg.fast_iov) + kfree(io->msg.iov); + 
break; + case IORING_OP_SPLICE: + case IORING_OP_TEE: + io_put_file(req, req->splice.file_in, + (req->splice.flags & SPLICE_F_FD_IN_FIXED)); + break; + } + req->flags &= ~REQ_F_NEED_CLEANUP; + } } static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, From bc02ef3325e3ef524ef29b65681ca4207b781224 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jul 2020 23:28:03 +0300 Subject: [PATCH 434/502] io_uring: move BUFFER_SELECT check into *recv[msg] Move REQ_F_BUFFER_SELECT flag check out of io_recv_buffer_select(), and do that in its call sites That saves us from double error checking and possibly an extra function call. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 4448b1e9a754..8dd9037e332e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4098,9 +4098,6 @@ static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, struct io_sr_msg *sr = &req->sr_msg; struct io_buffer *kbuf; - if (!(req->flags & REQ_F_BUFFER_SELECT)) - return NULL; - kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock); if (IS_ERR(kbuf)) return kbuf; @@ -4150,7 +4147,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, { struct io_async_msghdr iomsg, *kmsg; struct socket *sock; - struct io_buffer *kbuf; + struct io_buffer *kbuf = NULL; unsigned flags; int ret, cflags = 0; @@ -4172,10 +4169,10 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, kmsg = &iomsg; } - kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); - if (IS_ERR(kbuf)) { - return PTR_ERR(kbuf); - } else if (kbuf) { + if (req->flags & REQ_F_BUFFER_SELECT) { + kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov, 1, 
req->sr_msg.len); @@ -4222,11 +4219,12 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock, if (unlikely(!sock)) return ret; - kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - else if (kbuf) + if (req->flags & REQ_F_BUFFER_SELECT) { + kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); buf = u64_to_user_ptr(kbuf->addr); + } ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter); if (unlikely(ret)) From 8ff069bf2efd7b7aeb90b56ea8edc165c93d8940 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jul 2020 23:28:04 +0300 Subject: [PATCH 435/502] io_uring: extract io_put_kbuf() helper Extract a common helper for cleaning up a selected buffer, this will be used shortly. By the way, correct cflags types to unsigned and, as kbufs are anyway tracked by a flag, remove useless zeroing req->rw.addr. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8dd9037e332e..871ada2a29c3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1922,20 +1922,25 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; } -static int io_put_kbuf(struct io_kiocb *req) +static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf) { - struct io_buffer *kbuf; - int cflags; + unsigned int cflags; - kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; cflags |= IORING_CQE_F_BUFFER; - req->rw.addr = 0; req->flags &= ~REQ_F_BUFFER_SELECTED; kfree(kbuf); return cflags; } +static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req) +{ + struct io_buffer *kbuf; + + kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; + return io_put_kbuf(req, kbuf); +} + static inline 
bool io_run_task_work(void) { if (current->task_works) { @@ -1985,7 +1990,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, list_del(&req->inflight_entry); if (req->flags & REQ_F_BUFFER_SELECTED) - cflags = io_put_kbuf(req); + cflags = io_put_rw_kbuf(req); __io_cqring_fill_event(req, req->result, cflags); (*nr_events)++; @@ -2177,7 +2182,7 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res, if (res != req->result) req_set_fail_links(req); if (req->flags & REQ_F_BUFFER_SELECTED) - cflags = io_put_kbuf(req); + cflags = io_put_rw_kbuf(req); __io_req_complete(req, res, cflags, cs); } From 7fbb1b541f4286cc337b9bca1e5bad0ce4ee978c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jul 2020 23:28:05 +0300 Subject: [PATCH 436/502] io_uring: don't open-code recv kbuf managment Don't implement fast path of kbuf freeing and management inlined into io_recv{,msg}(), that's error prone and duplicates handling. Replace it with a helper io_put_recv_kbuf(), which mimics io_put_rw_kbuf() in the io_read/write(). This also keeps cflags calculation in one place, removing duplication between rw and recv/send. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 871ada2a29c3..6e5ea7991c08 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4098,7 +4098,7 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, } static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, - int *cflags, bool needs_lock) + bool needs_lock) { struct io_sr_msg *sr = &req->sr_msg; struct io_buffer *kbuf; @@ -4109,12 +4109,14 @@ static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, sr->kbuf = kbuf; req->flags |= REQ_F_BUFFER_SELECTED; - - *cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; - *cflags |= IORING_CQE_F_BUFFER; return kbuf; } +static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req) +{ + return io_put_kbuf(req, req->sr_msg.kbuf); +} + static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -4152,7 +4154,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, { struct io_async_msghdr iomsg, *kmsg; struct socket *sock; - struct io_buffer *kbuf = NULL; + struct io_buffer *kbuf; unsigned flags; int ret, cflags = 0; @@ -4175,7 +4177,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, } if (req->flags & REQ_F_BUFFER_SELECT) { - kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); + kbuf = io_recv_buffer_select(req, !force_nonblock); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); @@ -4196,12 +4198,11 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, if (ret == -ERESTARTSYS) ret = -EINTR; - if (kbuf) - kfree(kbuf); + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_recv_kbuf(req); if (kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); - req->flags &= ~(REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED); - + req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) 
req_set_fail_links(req); __io_req_complete(req, ret, cflags, cs); @@ -4225,7 +4226,7 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock, return ret; if (req->flags & REQ_F_BUFFER_SELECT) { - kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); + kbuf = io_recv_buffer_select(req, !force_nonblock); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); buf = u64_to_user_ptr(kbuf->addr); @@ -4254,9 +4255,8 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock, if (ret == -ERESTARTSYS) ret = -EINTR; out_free: - if (kbuf) - kfree(kbuf); - req->flags &= ~REQ_F_NEED_CLEANUP; + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_recv_kbuf(req); if (ret < 0) req_set_fail_links(req); __io_req_complete(req, ret, cflags, cs); From 5dbcad51f78434e782d0470b8b5fc4380700c35f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 18 Jul 2020 11:31:20 +0300 Subject: [PATCH 437/502] io_uring: don't miscount pinned memory io_sqe_buffer_unregister() uses ctx->sqo_mm for memory accounting, but io_ring_ctx_free() drops ->sqo_mm before calling it, leaving pinned_vm over-accounted. Postpone mm cleanup for when it's not needed anymore. 
Fixes: 309758254ea62 ("io_uring: report pinned memory usage") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6e5ea7991c08..ba7ce103667b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7670,12 +7670,12 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx) static void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_finish_async(ctx); + io_sqe_buffer_unregister(ctx); if (ctx->sqo_mm) { mmdrop(ctx->sqo_mm); ctx->sqo_mm = NULL; } - io_sqe_buffer_unregister(ctx); io_sqe_files_unregister(ctx); io_eventfd_unregister(ctx); io_destroy_buffers(ctx); From cbcf72148da4af55ea81cfb351ea7c026ff1014f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 18 Jul 2020 11:31:21 +0300 Subject: [PATCH 438/502] io_uring: return locked and pinned page accounting Locked and pinned memory accounting in io_{,un}account_mem() depends on having ->sqo_mm, which is NULL after a recent change for non SQPOLL'ed io_ring. That disables the accounting. Return ->sqo_mm initialisation back, and do __io_sq_thread_acquire_mm() based on IORING_SETUP_SQPOLL flag. 
Fixes: 8eb06d7e8dd85 ("io_uring: fix missing ->mm on exit") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ba7ce103667b..680b16f71a03 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -981,7 +981,8 @@ static void io_sq_thread_drop_mm(void) static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) { if (!current->mm) { - if (unlikely(!ctx->sqo_mm || !mmget_not_zero(ctx->sqo_mm))) + if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) || + !mmget_not_zero(ctx->sqo_mm))) return -EFAULT; kthread_use_mm(ctx->sqo_mm); } @@ -7259,10 +7260,10 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, { int ret; - if (ctx->flags & IORING_SETUP_SQPOLL) { - mmgrab(current->mm); - ctx->sqo_mm = current->mm; + mmgrab(current->mm); + ctx->sqo_mm = current->mm; + if (ctx->flags & IORING_SETUP_SQPOLL) { ret = -EPERM; if (!capable(CAP_SYS_ADMIN)) goto err; From dd6f843a9fca8f225c86fee5f50da429c369c045 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 18 Jul 2020 11:32:51 +0300 Subject: [PATCH 439/502] tasks: add put_task_struct_many() put_task_struct_many() is like put_task_struct() but puts several references at once. Useful for batching puts. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/linux/sched/task.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 38359071236a..1301077f9c24 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -126,6 +126,12 @@ static inline void put_task_struct(struct task_struct *t) __put_task_struct(t); } +static inline void put_task_struct_many(struct task_struct *t, int nr) +{ + if (refcount_sub_and_test(nr, &t->usage)) + __put_task_struct(t); +} + void put_task_struct_rcu_user(struct task_struct *task); #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT From 5af1d13e8f0d8839db04a71ec786f369b0e67234 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 18 Jul 2020 11:32:52 +0300 Subject: [PATCH 440/502] io_uring: batch put_task_struct() As every iopoll request has a task ref, it becomes expensive to put them one by one; instead, we can put several at once by integrating that into io_req_free_batch(). 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 680b16f71a03..3a415d924b93 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1544,7 +1544,6 @@ static void io_dismantle_req(struct io_kiocb *req) kfree(req->io); if (req->file) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); - __io_put_req_task(req); io_req_clean_work(req); if (req->flags & REQ_F_INFLIGHT) { @@ -1564,6 +1563,7 @@ static void __io_free_req(struct io_kiocb *req) struct io_ring_ctx *ctx; io_dismantle_req(req); + __io_put_req_task(req); ctx = req->ctx; if (likely(!io_is_fallback_req(req))) kmem_cache_free(req_cachep, req); @@ -1807,8 +1807,18 @@ static void io_free_req(struct io_kiocb *req) struct req_batch { void *reqs[IO_IOPOLL_BATCH]; int to_free; + + struct task_struct *task; + int task_refs; }; +static inline void io_init_req_batch(struct req_batch *rb) +{ + rb->to_free = 0; + rb->task_refs = 0; + rb->task = NULL; +} + static void __io_req_free_batch_flush(struct io_ring_ctx *ctx, struct req_batch *rb) { @@ -1822,6 +1832,10 @@ static void io_req_free_batch_finish(struct io_ring_ctx *ctx, { if (rb->to_free) __io_req_free_batch_flush(ctx, rb); + if (rb->task) { + put_task_struct_many(rb->task, rb->task_refs); + rb->task = NULL; + } } static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) @@ -1833,6 +1847,17 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) if (req->flags & REQ_F_LINK_HEAD) io_queue_next(req); + if (req->flags & REQ_F_TASK_PINNED) { + if (req->task != rb->task) { + if (rb->task) + put_task_struct_many(rb->task, rb->task_refs); + rb->task = req->task; + rb->task_refs = 0; + } + rb->task_refs++; + req->flags &= ~REQ_F_TASK_PINNED; + } + io_dismantle_req(req); rb->reqs[rb->to_free++] = req; if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) @@ -1978,7 +2003,7 @@ static void 
io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, /* order with ->result store in io_complete_rw_iopoll() */ smp_rmb(); - rb.to_free = 0; + io_init_req_batch(&rb); while (!list_empty(done)) { int cflags = 0; From 23b3628e45924419399da48c2b3a522b05557c91 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Thu, 23 Jul 2020 20:57:24 +0800 Subject: [PATCH 441/502] io_uring: clear IORING_SQ_NEED_WAKEUP after executing task works In io_sq_thread(), if there are task works to handle, the current code will skip schedule() and go on polling sq again, but forgets to clear the IORING_SQ_NEED_WAKEUP flag. Fix this issue. Also add two helpers to set and clear the IORING_SQ_NEED_WAKEUP flag. Signed-off-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3a415d924b93..6f3f18a99f4f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6344,6 +6344,21 @@ fail_req: return submitted; } +static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx) +{ + /* Tell userspace we may need a wakeup call */ + spin_lock_irq(&ctx->completion_lock); + ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP; + spin_unlock_irq(&ctx->completion_lock); +} + +static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) +{ + spin_lock_irq(&ctx->completion_lock); + ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; + spin_unlock_irq(&ctx->completion_lock); +} + static int io_sq_thread(void *data) { struct io_ring_ctx *ctx = data; @@ -6417,10 +6432,7 @@ static int io_sq_thread(void *data) continue; } - /* Tell userspace we may need a wakeup call */ - spin_lock_irq(&ctx->completion_lock); - ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP; - spin_unlock_irq(&ctx->completion_lock); + io_ring_set_wakeup_flag(ctx); to_submit = io_sqring_entries(ctx); if (!to_submit || ret == -EBUSY) { @@ -6430,6 +6442,7 @@ static int io_sq_thread(void *data) } if
(io_run_task_work()) { finish_wait(&ctx->sqo_wait, &wait); + io_ring_clear_wakeup_flag(ctx); continue; } if (signal_pending(current)) @@ -6437,17 +6450,13 @@ static int io_sq_thread(void *data) schedule(); finish_wait(&ctx->sqo_wait, &wait); - spin_lock_irq(&ctx->completion_lock); - ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; - spin_unlock_irq(&ctx->completion_lock); + io_ring_clear_wakeup_flag(ctx); ret = 0; continue; } finish_wait(&ctx->sqo_wait, &wait); - spin_lock_irq(&ctx->completion_lock); - ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; - spin_unlock_irq(&ctx->completion_lock); + io_ring_clear_wakeup_flag(ctx); } mutex_lock(&ctx->uring_lock); From ae34817bd93e373a03203a4c6892735c430a14e1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 23 Jul 2020 20:25:20 +0300 Subject: [PATCH 442/502] io_uring: don't do opcode prep twice Calling into opcode prep handlers may be dangerous, as they re-read SQE but might not re-initialise requests completely. If io_req_defer() passed fast checks and is done with preparations, punt it async. As all other cases are covered with nulling @sqe, this guarantees that io_[opcode]_prep() are visited only once per request. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6f3f18a99f4f..38e4c3902963 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5447,7 +5447,8 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { spin_unlock_irq(&ctx->completion_lock); kfree(de); - return 0; + io_queue_async_work(req); + return -EIOCBQUEUED; } trace_io_uring_defer(ctx, req, req->user_data); From f56040b81999871973d21f334b4657957422c90e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 23 Jul 2020 20:25:21 +0300 Subject: [PATCH 443/502] io_uring: deduplicate io_grab_files() calls Move io_req_init_async() into io_grab_files(), it's safer this way. Note that io_queue_async_work() does *init_async(), so it's valid to move out of __io_queue_sqe() punt path. Also, add a helper around io_grab_files(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 38e4c3902963..c7e8e9a1b27b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -912,7 +912,7 @@ static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_files_update *ip, unsigned nr_args); -static int io_grab_files(struct io_kiocb *req); +static int io_prep_work_files(struct io_kiocb *req); static void io_complete_rw_common(struct kiocb *kiocb, long res, struct io_comp_state *cs); static void __io_clean_op(struct io_kiocb *req); @@ -5294,13 +5294,9 @@ static int io_req_defer_prep(struct io_kiocb *req, if (io_alloc_async_ctx(req)) return -EAGAIN; - - if (io_op_defs[req->opcode].file_table) { - io_req_init_async(req); - ret = io_grab_files(req); - if (unlikely(ret)) - return ret; - } + ret = 
io_prep_work_files(req); + if (unlikely(ret)) + return ret; switch (req->opcode) { case IORING_OP_NOP: @@ -5851,6 +5847,8 @@ static int io_grab_files(struct io_kiocb *req) int ret = -EBADF; struct io_ring_ctx *ctx = req->ctx; + io_req_init_async(req); + if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE)) return 0; if (!ctx->ring_file) @@ -5876,6 +5874,13 @@ static int io_grab_files(struct io_kiocb *req) return ret; } +static inline int io_prep_work_files(struct io_kiocb *req) +{ + if (!io_op_defs[req->opcode].file_table) + return 0; + return io_grab_files(req); +} + static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) { struct io_timeout_data *data = container_of(timer, @@ -5987,14 +5992,9 @@ again: goto exit; } punt: - io_req_init_async(req); - - if (io_op_defs[req->opcode].file_table) { - ret = io_grab_files(req); - if (ret) - goto err; - } - + ret = io_prep_work_files(req); + if (unlikely(ret)) + goto err; /* * Queued up for async execution, worker will release * submit reference when the iocb is actually submitted. From b65e0dd6a2de050d3fc4c0db4969a245f4e7273e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 25 Jul 2020 14:41:58 +0300 Subject: [PATCH 444/502] io_uring: mark ->work uninitialised after cleanup Remove REQ_F_WORK_INITIALIZED after io_req_clean_work(). That's a cold path but is safer for those using io_req_clean_work() out of *dismantle_req()/*io_free(). 
And for the same reason zero work.fs. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c7e8e9a1b27b..59f1f473ffc7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1141,7 +1141,9 @@ static void io_req_clean_work(struct io_kiocb *req) spin_unlock(&req->work.fs->lock); if (fs) free_fs_struct(fs); + req->work.fs = NULL; } + req->flags &= ~REQ_F_WORK_INITIALIZED; } static void io_prep_async_work(struct io_kiocb *req) @@ -4969,7 +4971,6 @@ static int io_poll_add(struct io_kiocb *req) /* ->work is in union with hash_node and others */ io_req_clean_work(req); - req->flags &= ~REQ_F_WORK_INITIALIZED; INIT_HLIST_NODE(&req->hash_node); ipt.pt._qproc = io_poll_queue_proc; From f063c5477eb392c315aa25ad538b4920b367ea05 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 25 Jul 2020 14:41:59 +0300 Subject: [PATCH 445/502] io_uring: fix missing io_queue_linked_timeout() Whoever called io_prep_linked_timeout() should also do io_queue_linked_timeout(). __io_queue_sqe() doesn't follow that for the punting path, leaving linked timeouts prepared but never queued.
Fixes: 6df1db6b54243 ("io_uring: fix mis-refcounting linked timeouts") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 59f1f473ffc7..3e406bc1f855 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5987,20 +5987,20 @@ again: * doesn't support non-blocking read/write attempts */ if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { - if (io_arm_poll_handler(req)) { - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); - goto exit; - } + if (!io_arm_poll_handler(req)) { punt: - ret = io_prep_work_files(req); - if (unlikely(ret)) - goto err; - /* - * Queued up for async execution, worker will release - * submit reference when the iocb is actually submitted. - */ - io_queue_async_work(req); + ret = io_prep_work_files(req); + if (unlikely(ret)) + goto err; + /* + * Queued up for async execution, worker will release + * submit reference when the iocb is actually submitted. + */ + io_queue_async_work(req); + } + + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); goto exit; } From b089ed390b5c9bc248a32168709cfa01099caf9d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 25 Jul 2020 14:42:00 +0300 Subject: [PATCH 446/502] io-wq: update hash bits Linked requests are hashed, remove a comment stating otherwise. Also move hash bits to emphasise that we don't carry it through loop iteration and set it every time. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 8702d3c3b291..e92c4724480c 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -490,7 +490,6 @@ static void io_worker_handle_work(struct io_worker *worker) do { struct io_wq_work *work; - unsigned int hash; get_next: /* * If we got some work, mark us as busy. 
If we didn't, but @@ -513,6 +512,7 @@ get_next: /* handle a whole dependent link */ do { struct io_wq_work *old_work, *next_hashed, *linked; + unsigned int hash = io_get_work_hash(work); next_hashed = wq_next_work(work); io_impersonate_work(worker, work); @@ -523,7 +523,6 @@ get_next: if (test_bit(IO_WQ_BIT_CANCEL, &wq->state)) work->flags |= IO_WQ_WORK_CANCEL; - hash = io_get_work_hash(work); old_work = work; linked = wq->do_work(work); @@ -542,8 +541,6 @@ get_next: spin_lock_irq(&wqe->lock); wqe->hash_map &= ~BIT_ULL(hash); wqe->flags &= ~IO_WQE_FLAG_STALLED; - /* dependent work is not hashed */ - hash = -1U; /* skip unnecessary unlock-lock wqe->lock */ if (!work) goto get_next; From 4631f3ca493a7c8f9f31aef45fc0fc0e182155b7 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Tue, 7 Jul 2020 16:42:19 +0200 Subject: [PATCH 447/502] s390/pci: clarify comment in s390_mmio_read/write The existing comment was talking about reading in the write part and vice versa. While we are here make it more clear why restricting the syscalls to MIO capable devices is okay. Signed-off-by: Niklas Schnelle Signed-off-by: Heiko Carstens --- arch/s390/pci/pci_mmio.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c index 38efa3e852c4..401cf670a243 100644 --- a/arch/s390/pci/pci_mmio.c +++ b/arch/s390/pci/pci_mmio.c @@ -155,10 +155,12 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, return -EINVAL; /* - * Only support read access to MIO capable devices on a MIO enabled - * system. Otherwise we would have to check for every address if it is - * a special ZPCI_ADDR and we would have to do a get_pfn() which we - * don't need for MIO capable devices. + * We only support write access to MIO capable devices if we are on + * a MIO enabled system. 
Otherwise we would have to check for every + * address if it is a special ZPCI_ADDR and would have to do + * a get_pfn() which we don't need for MIO capable devices. Currently + * ISM devices are the only devices without MIO support and there is no + * known need for accessing these from userspace. */ if (static_branch_likely(&have_mio)) { ret = __memcpy_toio_inuser((void __iomem *) mmio_addr, @@ -282,10 +284,12 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, return -EINVAL; /* - * Only support write access to MIO capable devices on a MIO enabled - * system. Otherwise we would have to check for every address if it is - * a special ZPCI_ADDR and we would have to do a get_pfn() which we - * don't need for MIO capable devices. + * We only support read access to MIO capable devices if we are on + * a MIO enabled system. Otherwise we would have to check for every + * address if it is a special ZPCI_ADDR and would have to do + * a get_pfn() which we don't need for MIO capable devices. Currently + * ISM devices are the only devices without MIO support and there is no + * known need for accessing these from userspace. */ if (static_branch_likely(&have_mio)) { ret = __memcpy_fromio_inuser( From 73d6eb48d26930f0cbdc8bf1ccb0ad964e7d2b90 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Wed, 22 Jul 2020 23:58:54 +0200 Subject: [PATCH 448/502] s390: enable HAVE_FUNCTION_ERROR_INJECTION This kernel feature is required for enabling BPF_KPROBE_OVERRIDE. Define override_function_with_return() and regs_set_return_value() functions, and fix compile errors in syscall_wrapper.h. 
Signed-off-by: Ilya Leoshkevich Signed-off-by: Heiko Carstens --- arch/s390/Kconfig | 1 + arch/s390/include/asm/ptrace.h | 5 +++++ arch/s390/include/asm/syscall_wrapper.h | 6 +++--- arch/s390/lib/Makefile | 2 ++ arch/s390/lib/error-inject.c | 14 ++++++++++++++ 5 files changed, 25 insertions(+), 3 deletions(-) create mode 100644 arch/s390/lib/error-inject.c diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index d95d323cf213..9cfd8de907cb 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -145,6 +145,7 @@ config S390 select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_FENTRY select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_FUNCTION_ERROR_INJECTION select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER select HAVE_FUTEX_CMPXCHG if FUTEX diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index f009a13afe71..16b3e4396312 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -184,5 +184,10 @@ static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) return regs->gprs[15]; } +static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) +{ + regs->gprs[2] = rc; +} + #endif /* __ASSEMBLY__ */ #endif /* _S390_PTRACE_H */ diff --git a/arch/s390/include/asm/syscall_wrapper.h b/arch/s390/include/asm/syscall_wrapper.h index 3c3d6fe8e2f0..1320f4213d80 100644 --- a/arch/s390/include/asm/syscall_wrapper.h +++ b/arch/s390/include/asm/syscall_wrapper.h @@ -30,7 +30,7 @@ }) #define __S390_SYS_STUBx(x, name, ...) 
\ - asmlinkage long __s390_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__))\ + asmlinkage long __s390_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__));\ ALLOW_ERROR_INJECTION(__s390_sys##name, ERRNO); \ asmlinkage long __s390_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__))\ { \ @@ -46,7 +46,7 @@ #define COMPAT_SYSCALL_DEFINE0(sname) \ SYSCALL_METADATA(_##sname, 0); \ asmlinkage long __s390_compat_sys_##sname(void); \ - ALLOW_ERROR_INJECTION(__s390_compat__sys_##sname, ERRNO); \ + ALLOW_ERROR_INJECTION(__s390_compat_sys_##sname, ERRNO); \ asmlinkage long __s390_compat_sys_##sname(void) #define SYSCALL_DEFINE0(sname) \ @@ -72,7 +72,7 @@ asmlinkage long __s390_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ asmlinkage long __s390_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ __attribute__((alias(__stringify(__se_compat_sys##name)))); \ - ALLOW_ERROR_INJECTION(compat_sys##name, ERRNO); \ + ALLOW_ERROR_INJECTION(__s390_compat_sys##name, ERRNO); \ static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ asmlinkage long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ asmlinkage long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile index 28fd66d558ff..678333936f78 100644 --- a/arch/s390/lib/Makefile +++ b/arch/s390/lib/Makefile @@ -14,3 +14,5 @@ KASAN_SANITIZE_uaccess.o := n obj-$(CONFIG_S390_UNWIND_SELFTEST) += test_unwind.o CFLAGS_test_unwind.o += -fno-optimize-sibling-calls + +lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o diff --git a/arch/s390/lib/error-inject.c b/arch/s390/lib/error-inject.c new file mode 100644 index 000000000000..8c9d4da87eef --- /dev/null +++ b/arch/s390/lib/error-inject.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0+ +#include +#include +#include + +void override_function_with_return(struct pt_regs *regs) +{ + /* + * Emulate 'br 14'. 'regs' is captured by kprobes on entry to some + * kernel function. 
+ */ + regs->psw.addr = regs->gprs[14]; +} +NOKPROBE_SYMBOL(override_function_with_return); From 8398b226b8f01df902450658a139ee01d9f4c482 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 22 Jul 2020 11:45:50 +0200 Subject: [PATCH 449/502] s390/vmem: rename vmem_add_mem() to vmem_add_range() Let's match the name to vmem_remove_range(). Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Gerald Schaefer Signed-off-by: David Hildenbrand Message-Id: <20200722094558.9828-2-david@redhat.com> Signed-off-by: Heiko Carstens --- arch/s390/mm/vmem.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 3b9e71654c37..66c5333020ea 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -57,7 +57,7 @@ pte_t __ref *vmem_pte_alloc(void) /* * Add a physical memory range to the 1:1 mapping. */ -static int vmem_add_mem(unsigned long start, unsigned long size) +static int vmem_add_range(unsigned long start, unsigned long size) { unsigned long pgt_prot, sgt_prot, r3_prot; unsigned long pages4k, pages1m, pages2g; @@ -308,7 +308,7 @@ int vmem_add_mapping(unsigned long start, unsigned long size) return -ERANGE; mutex_lock(&vmem_mutex); - ret = vmem_add_mem(start, size); + ret = vmem_add_range(start, size); if (ret) vmem_remove_range(start, size); mutex_unlock(&vmem_mutex); @@ -325,7 +325,7 @@ void __init vmem_map_init(void) struct memblock_region *reg; for_each_memblock(memory, reg) - vmem_add_mem(reg->base, reg->size); + vmem_add_range(reg->base, reg->size); __set_memory((unsigned long)_stext, (unsigned long)(_etext - _stext) >> PAGE_SHIFT, SET_MEMORY_RO | SET_MEMORY_X); From 3e0d3e408e63839625b210e5eb7269c45b870a38 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 22 Jul 2020 11:45:51 +0200 Subject: [PATCH 450/502] s390/vmem: consolidate vmem_add_range() and vmem_remove_range() We want to have only a single pagetable walker and reuse the same functionality for vmemmap handling. 
Let's start by consolidating vmem_add_range() and vmem_remove_range(), converting it into a recursive implementation. A recursive implementation makes it easier to expand individual cases without harming readability. In addition, we minimize traversing the whole hierarchy over and over again. One change is that we don't unmap large PMDs/PUDs when not completely covered by the request, something that should never happen with direct mappings, unless one would be removing in other granularity than added, which would be broken already. Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Gerald Schaefer Signed-off-by: David Hildenbrand Message-Id: <20200722094558.9828-3-david@redhat.com> Signed-off-by: Heiko Carstens --- arch/s390/mm/vmem.c | 337 +++++++++++++++++++++++++++----------------- 1 file changed, 208 insertions(+), 129 deletions(-) diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 66c5333020ea..177daf389d39 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -54,88 +54,218 @@ pte_t __ref *vmem_pte_alloc(void) return pte; } +static void modify_pte_table(pmd_t *pmd, unsigned long addr, unsigned long end, + bool add) +{ + unsigned long prot, pages = 0; + pte_t *pte; + + prot = pgprot_val(PAGE_KERNEL); + if (!MACHINE_HAS_NX) + prot &= ~_PAGE_NOEXEC; + + pte = pte_offset_kernel(pmd, addr); + for (; addr < end; addr += PAGE_SIZE, pte++) { + if (!add) { + if (pte_none(*pte)) + continue; + pte_clear(&init_mm, addr, pte); + } else if (pte_none(*pte)) { + pte_val(*pte) = addr | prot; + } else + continue; + + pages++; + } + + update_page_count(PG_DIRECT_MAP_4K, add ? 
pages : -pages); +} + +static int modify_pmd_table(pud_t *pud, unsigned long addr, unsigned long end, + bool add) +{ + unsigned long next, prot, pages = 0; + int ret = -ENOMEM; + pmd_t *pmd; + pte_t *pte; + + prot = pgprot_val(SEGMENT_KERNEL); + if (!MACHINE_HAS_NX) + prot &= ~_SEGMENT_ENTRY_NOEXEC; + + pmd = pmd_offset(pud, addr); + for (; addr < end; addr = next, pmd++) { + next = pmd_addr_end(addr, end); + + if (!add) { + if (pmd_none(*pmd)) + continue; + if (pmd_large(*pmd) && !add) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) { + pmd_clear(pmd); + pages++; + } + continue; + } + } else if (pmd_none(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE) && + MACHINE_HAS_EDAT1 && addr && + !debug_pagealloc_enabled()) { + pmd_val(*pmd) = addr | prot; + pages++; + continue; + } + pte = vmem_pte_alloc(); + if (!pte) + goto out; + pmd_populate(&init_mm, pmd, pte); + } else if (pmd_large(*pmd)) + continue; + + modify_pte_table(pmd, addr, next, add); + } + ret = 0; +out: + update_page_count(PG_DIRECT_MAP_1M, add ? 
pages : -pages); + return ret; +} + +static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, + bool add) +{ + unsigned long next, prot, pages = 0; + int ret = -ENOMEM; + pud_t *pud; + pmd_t *pmd; + + prot = pgprot_val(REGION3_KERNEL); + if (!MACHINE_HAS_NX) + prot &= ~_REGION_ENTRY_NOEXEC; + + pud = pud_offset(p4d, addr); + for (; addr < end; addr = next, pud++) { + next = pud_addr_end(addr, end); + + if (!add) { + if (pud_none(*pud)) + continue; + if (pud_large(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE)) { + pud_clear(pud); + pages++; + } + continue; + } + } else if (pud_none(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE) && + MACHINE_HAS_EDAT2 && addr && + !debug_pagealloc_enabled()) { + pud_val(*pud) = addr | prot; + pages++; + continue; + } + pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!pmd) + goto out; + pud_populate(&init_mm, pud, pmd); + } else if (pud_large(*pud)) + continue; + + ret = modify_pmd_table(pud, addr, next, add); + if (ret) + goto out; + } + ret = 0; +out: + update_page_count(PG_DIRECT_MAP_2G, add ? 
pages : -pages); + return ret; +} + +static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, + bool add) +{ + unsigned long next; + int ret = -ENOMEM; + p4d_t *p4d; + pud_t *pud; + + p4d = p4d_offset(pgd, addr); + for (; addr < end; addr = next, p4d++) { + next = p4d_addr_end(addr, end); + + if (!add) { + if (p4d_none(*p4d)) + continue; + } else if (p4d_none(*p4d)) { + pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!pud) + goto out; + } + + ret = modify_pud_table(p4d, addr, next, add); + if (ret) + goto out; + } + ret = 0; +out: + return ret; +} + +static int modify_pagetable(unsigned long start, unsigned long end, bool add) +{ + unsigned long addr, next; + int ret = -ENOMEM; + pgd_t *pgd; + p4d_t *p4d; + + if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end))) + return -EINVAL; + + for (addr = start; addr < end; addr = next) { + next = pgd_addr_end(addr, end); + pgd = pgd_offset_k(addr); + + if (!add) { + if (pgd_none(*pgd)) + continue; + } else if (pgd_none(*pgd)) { + p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!p4d) + goto out; + pgd_populate(&init_mm, pgd, p4d); + } + + ret = modify_p4d_table(pgd, addr, next, add); + if (ret) + goto out; + } + ret = 0; +out: + if (!add) + flush_tlb_kernel_range(start, end); + return ret; +} + +static int add_pagetable(unsigned long start, unsigned long end) +{ + return modify_pagetable(start, end, true); +} + +static int remove_pagetable(unsigned long start, unsigned long end) +{ + return modify_pagetable(start, end, false); +} + /* * Add a physical memory range to the 1:1 mapping. 
*/ static int vmem_add_range(unsigned long start, unsigned long size) { - unsigned long pgt_prot, sgt_prot, r3_prot; - unsigned long pages4k, pages1m, pages2g; - unsigned long end = start + size; - unsigned long address = start; - pgd_t *pg_dir; - p4d_t *p4_dir; - pud_t *pu_dir; - pmd_t *pm_dir; - pte_t *pt_dir; - int ret = -ENOMEM; - - pgt_prot = pgprot_val(PAGE_KERNEL); - sgt_prot = pgprot_val(SEGMENT_KERNEL); - r3_prot = pgprot_val(REGION3_KERNEL); - if (!MACHINE_HAS_NX) { - pgt_prot &= ~_PAGE_NOEXEC; - sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC; - r3_prot &= ~_REGION_ENTRY_NOEXEC; - } - pages4k = pages1m = pages2g = 0; - while (address < end) { - pg_dir = pgd_offset_k(address); - if (pgd_none(*pg_dir)) { - p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); - if (!p4_dir) - goto out; - pgd_populate(&init_mm, pg_dir, p4_dir); - } - p4_dir = p4d_offset(pg_dir, address); - if (p4d_none(*p4_dir)) { - pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); - if (!pu_dir) - goto out; - p4d_populate(&init_mm, p4_dir, pu_dir); - } - pu_dir = pud_offset(p4_dir, address); - if (MACHINE_HAS_EDAT2 && pud_none(*pu_dir) && address && - !(address & ~PUD_MASK) && (address + PUD_SIZE <= end) && - !debug_pagealloc_enabled()) { - pud_val(*pu_dir) = address | r3_prot; - address += PUD_SIZE; - pages2g++; - continue; - } - if (pud_none(*pu_dir)) { - pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); - if (!pm_dir) - goto out; - pud_populate(&init_mm, pu_dir, pm_dir); - } - pm_dir = pmd_offset(pu_dir, address); - if (MACHINE_HAS_EDAT1 && pmd_none(*pm_dir) && address && - !(address & ~PMD_MASK) && (address + PMD_SIZE <= end) && - !debug_pagealloc_enabled()) { - pmd_val(*pm_dir) = address | sgt_prot; - address += PMD_SIZE; - pages1m++; - continue; - } - if (pmd_none(*pm_dir)) { - pt_dir = vmem_pte_alloc(); - if (!pt_dir) - goto out; - pmd_populate(&init_mm, pm_dir, pt_dir); - } - - pt_dir = pte_offset_kernel(pm_dir, address); - pte_val(*pt_dir) = address | pgt_prot; - address += PAGE_SIZE; - pages4k++; - } - 
ret = 0; -out: - update_page_count(PG_DIRECT_MAP_4K, pages4k); - update_page_count(PG_DIRECT_MAP_1M, pages1m); - update_page_count(PG_DIRECT_MAP_2G, pages2g); - return ret; + return add_pagetable(start, start + size); } /* @@ -144,58 +274,7 @@ out: */ static void vmem_remove_range(unsigned long start, unsigned long size) { - unsigned long pages4k, pages1m, pages2g; - unsigned long end = start + size; - unsigned long address = start; - pgd_t *pg_dir; - p4d_t *p4_dir; - pud_t *pu_dir; - pmd_t *pm_dir; - pte_t *pt_dir; - - pages4k = pages1m = pages2g = 0; - while (address < end) { - pg_dir = pgd_offset_k(address); - if (pgd_none(*pg_dir)) { - address += PGDIR_SIZE; - continue; - } - p4_dir = p4d_offset(pg_dir, address); - if (p4d_none(*p4_dir)) { - address += P4D_SIZE; - continue; - } - pu_dir = pud_offset(p4_dir, address); - if (pud_none(*pu_dir)) { - address += PUD_SIZE; - continue; - } - if (pud_large(*pu_dir)) { - pud_clear(pu_dir); - address += PUD_SIZE; - pages2g++; - continue; - } - pm_dir = pmd_offset(pu_dir, address); - if (pmd_none(*pm_dir)) { - address += PMD_SIZE; - continue; - } - if (pmd_large(*pm_dir)) { - pmd_clear(pm_dir); - address += PMD_SIZE; - pages1m++; - continue; - } - pt_dir = pte_offset_kernel(pm_dir, address); - pte_clear(&init_mm, address, pt_dir); - address += PAGE_SIZE; - pages4k++; - } - flush_tlb_kernel_range(start, end); - update_page_count(PG_DIRECT_MAP_4K, -pages4k); - update_page_count(PG_DIRECT_MAP_1M, -pages1m); - update_page_count(PG_DIRECT_MAP_2G, -pages2g); + remove_pagetable(start, start + size); } /* From 9ec8fa8dc331be6b63726be696b2b21d0031a09b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 22 Jul 2020 11:45:52 +0200 Subject: [PATCH 451/502] s390/vmemmap: extend modify_pagetable() to handle vmemmap Extend our shiny new modify_pagetable() to handle !direct (vmemmap) mappings. Convert vmemmap_populate() and implement vmemmap_free(). 
Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Gerald Schaefer Signed-off-by: David Hildenbrand Message-Id: <20200722094558.9828-4-david@redhat.com> Signed-off-by: Heiko Carstens --- arch/s390/mm/vmem.c | 181 +++++++++++++++++++------------------------- 1 file changed, 76 insertions(+), 105 deletions(-) diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 177daf389d39..43fe1e2eb90e 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -29,6 +29,15 @@ static void __ref *vmem_alloc_pages(unsigned int order) return (void *) memblock_phys_alloc(size, size); } +static void vmem_free_pages(unsigned long addr, int order) +{ + /* We don't expect boot memory to be removed ever. */ + if (!slab_is_available() || + WARN_ON_ONCE(PageReserved(phys_to_page(addr)))) + return; + free_pages(addr, order); +} + void *vmem_crst_alloc(unsigned long val) { unsigned long *table; @@ -54,10 +63,12 @@ pte_t __ref *vmem_pte_alloc(void) return pte; } -static void modify_pte_table(pmd_t *pmd, unsigned long addr, unsigned long end, - bool add) +/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ +static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr, + unsigned long end, bool add, bool direct) { unsigned long prot, pages = 0; + int ret = -ENOMEM; pte_t *pte; prot = pgprot_val(PAGE_KERNEL); @@ -69,20 +80,34 @@ static void modify_pte_table(pmd_t *pmd, unsigned long addr, unsigned long end, if (!add) { if (pte_none(*pte)) continue; + if (!direct) + vmem_free_pages(pfn_to_phys(pte_pfn(*pte)), 0); pte_clear(&init_mm, addr, pte); } else if (pte_none(*pte)) { - pte_val(*pte) = addr | prot; + if (!direct) { + void *new_page = vmemmap_alloc_block(PAGE_SIZE, + NUMA_NO_NODE); + + if (!new_page) + goto out; + pte_val(*pte) = __pa(new_page) | prot; + } else + pte_val(*pte) = addr | prot; } else continue; pages++; } - - update_page_count(PG_DIRECT_MAP_4K, add ? pages : -pages); + ret = 0; +out: + if (direct) + update_page_count(PG_DIRECT_MAP_4K, add ? 
pages : -pages); + return ret; } -static int modify_pmd_table(pud_t *pud, unsigned long addr, unsigned long end, - bool add) +/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ +static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, + unsigned long end, bool add, bool direct) { unsigned long next, prot, pages = 0; int ret = -ENOMEM; @@ -103,6 +128,9 @@ static int modify_pmd_table(pud_t *pud, unsigned long addr, unsigned long end, if (pmd_large(*pmd) && !add) { if (IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(next, PMD_SIZE)) { + if (!direct) + vmem_free_pages(pmd_deref(*pmd), + get_order(PMD_SIZE)); pmd_clear(pmd); pages++; } @@ -111,11 +139,27 @@ static int modify_pmd_table(pud_t *pud, unsigned long addr, unsigned long end, } else if (pmd_none(*pmd)) { if (IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(next, PMD_SIZE) && - MACHINE_HAS_EDAT1 && addr && + MACHINE_HAS_EDAT1 && addr && direct && !debug_pagealloc_enabled()) { pmd_val(*pmd) = addr | prot; pages++; continue; + } else if (!direct && MACHINE_HAS_EDAT1) { + void *new_page; + + /* + * Use 1MB frames for vmemmap if available. We + * always use large frames even if they are only + * partially used. Otherwise we would have also + * page tables since vmemmap_populate gets + * called for each section separately. + */ + new_page = vmemmap_alloc_block(PMD_SIZE, + NUMA_NO_NODE); + if (!new_page) + goto out; + pmd_val(*pmd) = __pa(new_page) | prot; + continue; } pte = vmem_pte_alloc(); if (!pte) @@ -124,16 +168,19 @@ static int modify_pmd_table(pud_t *pud, unsigned long addr, unsigned long end, } else if (pmd_large(*pmd)) continue; - modify_pte_table(pmd, addr, next, add); + ret = modify_pte_table(pmd, addr, next, add, direct); + if (ret) + goto out; } ret = 0; out: - update_page_count(PG_DIRECT_MAP_1M, add ? pages : -pages); + if (direct) + update_page_count(PG_DIRECT_MAP_1M, add ? 
pages : -pages); return ret; } static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, - bool add) + bool add, bool direct) { unsigned long next, prot, pages = 0; int ret = -ENOMEM; @@ -162,7 +209,7 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, } else if (pud_none(*pud)) { if (IS_ALIGNED(addr, PUD_SIZE) && IS_ALIGNED(next, PUD_SIZE) && - MACHINE_HAS_EDAT2 && addr && + MACHINE_HAS_EDAT2 && addr && direct && !debug_pagealloc_enabled()) { pud_val(*pud) = addr | prot; pages++; @@ -175,18 +222,19 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, } else if (pud_large(*pud)) continue; - ret = modify_pmd_table(pud, addr, next, add); + ret = modify_pmd_table(pud, addr, next, add, direct); if (ret) goto out; } ret = 0; out: - update_page_count(PG_DIRECT_MAP_2G, add ? pages : -pages); + if (direct) + update_page_count(PG_DIRECT_MAP_2G, add ? pages : -pages); return ret; } static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, - bool add) + bool add, bool direct) { unsigned long next; int ret = -ENOMEM; @@ -206,7 +254,7 @@ static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, goto out; } - ret = modify_pud_table(p4d, addr, next, add); + ret = modify_pud_table(p4d, addr, next, add, direct); if (ret) goto out; } @@ -215,7 +263,8 @@ out: return ret; } -static int modify_pagetable(unsigned long start, unsigned long end, bool add) +static int modify_pagetable(unsigned long start, unsigned long end, bool add, + bool direct) { unsigned long addr, next; int ret = -ENOMEM; @@ -239,7 +288,7 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add) pgd_populate(&init_mm, pgd, p4d); } - ret = modify_p4d_table(pgd, addr, next, add); + ret = modify_p4d_table(pgd, addr, next, add, direct); if (ret) goto out; } @@ -250,14 +299,14 @@ out: return ret; } -static int add_pagetable(unsigned long start, unsigned long end) +static int 
add_pagetable(unsigned long start, unsigned long end, bool direct) { - return modify_pagetable(start, end, true); + return modify_pagetable(start, end, true, direct); } -static int remove_pagetable(unsigned long start, unsigned long end) +static int remove_pagetable(unsigned long start, unsigned long end, bool direct) { - return modify_pagetable(start, end, false); + return modify_pagetable(start, end, false, direct); } /* @@ -265,7 +314,7 @@ static int remove_pagetable(unsigned long start, unsigned long end) */ static int vmem_add_range(unsigned long start, unsigned long size) { - return add_pagetable(start, start + size); + return add_pagetable(start, start + size, true); } /* @@ -274,7 +323,7 @@ static int vmem_add_range(unsigned long start, unsigned long size) */ static void vmem_remove_range(unsigned long start, unsigned long size) { - remove_pagetable(start, start + size); + remove_pagetable(start, start + size, true); } /* @@ -283,92 +332,14 @@ static void vmem_remove_range(unsigned long start, unsigned long size) int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap) { - unsigned long pgt_prot, sgt_prot; - unsigned long address = start; - pgd_t *pg_dir; - p4d_t *p4_dir; - pud_t *pu_dir; - pmd_t *pm_dir; - pte_t *pt_dir; - int ret = -ENOMEM; - - pgt_prot = pgprot_val(PAGE_KERNEL); - sgt_prot = pgprot_val(SEGMENT_KERNEL); - if (!MACHINE_HAS_NX) { - pgt_prot &= ~_PAGE_NOEXEC; - sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC; - } - for (address = start; address < end;) { - pg_dir = pgd_offset_k(address); - if (pgd_none(*pg_dir)) { - p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); - if (!p4_dir) - goto out; - pgd_populate(&init_mm, pg_dir, p4_dir); - } - - p4_dir = p4d_offset(pg_dir, address); - if (p4d_none(*p4_dir)) { - pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); - if (!pu_dir) - goto out; - p4d_populate(&init_mm, p4_dir, pu_dir); - } - - pu_dir = pud_offset(p4_dir, address); - if (pud_none(*pu_dir)) { - pm_dir = 
vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); - if (!pm_dir) - goto out; - pud_populate(&init_mm, pu_dir, pm_dir); - } - - pm_dir = pmd_offset(pu_dir, address); - if (pmd_none(*pm_dir)) { - /* Use 1MB frames for vmemmap if available. We always - * use large frames even if they are only partially - * used. - * Otherwise we would have also page tables since - * vmemmap_populate gets called for each section - * separately. */ - if (MACHINE_HAS_EDAT1) { - void *new_page; - - new_page = vmemmap_alloc_block(PMD_SIZE, node); - if (!new_page) - goto out; - pmd_val(*pm_dir) = __pa(new_page) | sgt_prot; - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - pt_dir = vmem_pte_alloc(); - if (!pt_dir) - goto out; - pmd_populate(&init_mm, pm_dir, pt_dir); - } else if (pmd_large(*pm_dir)) { - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - - pt_dir = pte_offset_kernel(pm_dir, address); - if (pte_none(*pt_dir)) { - void *new_page; - - new_page = vmemmap_alloc_block(PAGE_SIZE, node); - if (!new_page) - goto out; - pte_val(*pt_dir) = __pa(new_page) | pgt_prot; - } - address += PAGE_SIZE; - } - ret = 0; -out: - return ret; + /* We don't care about the node, just use NUMA_NO_NODE on allocations */ + return add_pagetable(start, end, false); } void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap) { + remove_pagetable(start, end, false); } void vmem_remove_mapping(unsigned long start, unsigned long size) From c00f05a92424c7788fdbf0909b823f8027596d66 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 22 Jul 2020 11:45:53 +0200 Subject: [PATCH 452/502] s390/vmemmap: cleanup when vmemmap_populate() fails Cleanup what we partially added in case vmemmap_populate() fails. For vmem, this is already handled by vmem_add_mapping(). 
Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Gerald Schaefer Signed-off-by: David Hildenbrand Message-Id: <20200722094558.9828-5-david@redhat.com> Signed-off-by: Heiko Carstens --- arch/s390/mm/vmem.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 43fe1e2eb90e..be32a38bb91f 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -332,8 +332,13 @@ static void vmem_remove_range(unsigned long start, unsigned long size) int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap) { + int ret; + /* We don't care about the node, just use NUMA_NO_NODE on allocations */ - return add_pagetable(start, end, false); + ret = add_pagetable(start, end, false); + if (ret) + remove_pagetable(start, end, false); + return ret; } void vmemmap_free(unsigned long start, unsigned long end, From aa18e0e65800bf3250b23914a28e0e3fd9cadec2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 22 Jul 2020 11:45:54 +0200 Subject: [PATCH 453/502] s390/vmemmap: take the vmem_mutex when populating/freeing Let's synchronize all accesses to the 1:1 and vmemmap mappings. This will be especially relevant when wanting to cleanup empty page tables that could be shared by both. Avoid races when removing tables that might be just about to get reused. 
Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Gerald Schaefer Signed-off-by: David Hildenbrand Message-Id: <20200722094558.9828-6-david@redhat.com> Signed-off-by: Heiko Carstens --- arch/s390/mm/vmem.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index be32a38bb91f..a2b79681df69 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -334,17 +334,21 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, { int ret; + mutex_lock(&vmem_mutex); /* We don't care about the node, just use NUMA_NO_NODE on allocations */ ret = add_pagetable(start, end, false); if (ret) remove_pagetable(start, end, false); + mutex_unlock(&vmem_mutex); return ret; } void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap) { + mutex_lock(&vmem_mutex); remove_pagetable(start, end, false); + mutex_unlock(&vmem_mutex); } void vmem_remove_mapping(unsigned long start, unsigned long size) From b9ff81003cf1a0b12b8d60b6ef33a34e84dfe7ac Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 22 Jul 2020 11:45:55 +0200 Subject: [PATCH 454/502] s390/vmem: cleanup empty page tables Let's cleanup empty page tables. Consider only page tables that fully fall into the identity mapping and the vmemmap range. As there are no valid accesses to vmem/vmemmap within non-populated ranges, the single tlb flush at the end should be sufficient. 
Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Gerald Schaefer Signed-off-by: David Hildenbrand Message-Id: <20200722094558.9828-7-david@redhat.com> Signed-off-by: Heiko Carstens --- arch/s390/mm/vmem.c | 102 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 101 insertions(+), 1 deletion(-) diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index a2b79681df69..b831f9f9130a 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -63,6 +63,15 @@ pte_t __ref *vmem_pte_alloc(void) return pte; } +static void vmem_pte_free(unsigned long *table) +{ + /* We don't expect boot memory to be removed ever. */ + if (!slab_is_available() || + WARN_ON_ONCE(PageReserved(virt_to_page(table)))) + return; + page_table_free(&init_mm, table); +} + /* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr, unsigned long end, bool add, bool direct) @@ -105,6 +114,21 @@ out: return ret; } +static void try_free_pte_table(pmd_t *pmd, unsigned long start) +{ + pte_t *pte; + int i; + + /* We can safely assume this is fully in 1:1 mapping & vmemmap area */ + pte = pte_offset_kernel(pmd, start); + for (i = 0; i < PTRS_PER_PTE; i++, pte++) + if (!pte_none(*pte)) + return; + + vmem_pte_free(__va(pmd_deref(*pmd))); + pmd_clear(pmd); +} + /* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, unsigned long end, bool add, bool direct) @@ -171,6 +195,8 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, ret = modify_pte_table(pmd, addr, next, add, direct); if (ret) goto out; + if (!add) + try_free_pte_table(pmd, addr & PMD_MASK); } ret = 0; out: @@ -179,6 +205,29 @@ out: return ret; } +static void try_free_pmd_table(pud_t *pud, unsigned long start) +{ + const unsigned long end = start + PUD_SIZE; + pmd_t *pmd; + int i; + + /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ + if 
(end > VMALLOC_START) + return; +#ifdef CONFIG_KASAN + if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end) + return; +#endif + + pmd = pmd_offset(pud, start); + for (i = 0; i < PTRS_PER_PMD; i++, pmd++) + if (!pmd_none(*pmd)) + return; + + vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER); + pud_clear(pud); +} + static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, bool add, bool direct) { @@ -225,6 +274,8 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, ret = modify_pmd_table(pud, addr, next, add, direct); if (ret) goto out; + if (!add) + try_free_pmd_table(pud, addr & PUD_MASK); } ret = 0; out: @@ -233,6 +284,29 @@ out: return ret; } +static void try_free_pud_table(p4d_t *p4d, unsigned long start) +{ + const unsigned long end = start + P4D_SIZE; + pud_t *pud; + int i; + + /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ + if (end > VMALLOC_START) + return; +#ifdef CONFIG_KASAN + if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end) + return; +#endif + + pud = pud_offset(p4d, start); + for (i = 0; i < PTRS_PER_PUD; i++, pud++) + if (!pud_none(*pud)) + return; + + vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER); + p4d_clear(p4d); +} + static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, bool add, bool direct) { @@ -257,12 +331,37 @@ static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, ret = modify_pud_table(p4d, addr, next, add, direct); if (ret) goto out; + if (!add) + try_free_pud_table(p4d, addr & P4D_MASK); } ret = 0; out: return ret; } +static void try_free_p4d_table(pgd_t *pgd, unsigned long start) +{ + const unsigned long end = start + PGDIR_SIZE; + p4d_t *p4d; + int i; + + /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ + if (end > VMALLOC_START) + return; +#ifdef CONFIG_KASAN + if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end) + return; +#endif + + p4d = p4d_offset(pgd, start); 
+ for (i = 0; i < PTRS_PER_P4D; i++, p4d++) + if (!p4d_none(*p4d)) + return; + + vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER); + pgd_clear(pgd); +} + static int modify_pagetable(unsigned long start, unsigned long end, bool add, bool direct) { @@ -291,6 +390,8 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add, ret = modify_p4d_table(pgd, addr, next, add, direct); if (ret) goto out; + if (!add) + try_free_p4d_table(pgd, addr & PGDIR_MASK); } ret = 0; out: @@ -319,7 +420,6 @@ static int vmem_add_range(unsigned long start, unsigned long size) /* * Remove a physical memory range from the 1:1 mapping. - * Currently only invalidates page table entries. */ static void vmem_remove_range(unsigned long start, unsigned long size) { From f2057b4266a6be469ea0630971cf3cd933e42cce Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 22 Jul 2020 11:45:56 +0200 Subject: [PATCH 455/502] s390/vmemmap: fallback to PTEs if mapping large PMD fails Let's fallback to single pages if short on huge pages. No need to stop memory hotplug. 
Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Gerald Schaefer Signed-off-by: David Hildenbrand Message-Id: <20200722094558.9828-8-david@redhat.com> Signed-off-by: Heiko Carstens --- arch/s390/mm/vmem.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index b831f9f9130a..e82a63de19db 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -180,10 +180,10 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, */ new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE); - if (!new_page) - goto out; - pmd_val(*pmd) = __pa(new_page) | prot; - continue; + if (new_page) { + pmd_val(*pmd) = __pa(new_page) | prot; + continue; + } } pte = vmem_pte_alloc(); if (!pte) From cd5781d63eaf6dbf89532d8c7c214786b767ee16 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 22 Jul 2020 11:45:57 +0200 Subject: [PATCH 456/502] s390/vmemmap: remember unused sub-pmd ranges With a memmap size of 56 bytes or 72 bytes per page, the memmap for a 256 MB section won't span full PMDs. As we populate single sections and depopulate single sections, the depopulation step would not be able to free all vmemmap pmds anymore. Do it similarly to x86, marking the unused memmap ranges in a special way (pad it with 0xFD). This allows us to add/remove sections, cleaning up all allocated vmemmap pages even if the memmap size is not a multiple of 16 bytes per page. A 56 byte memmap can, for example, be created with !CONFIG_MEMCG and !CONFIG_SLUB. 
Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Gerald Schaefer Signed-off-by: David Hildenbrand Message-Id: <20200722094558.9828-9-david@redhat.com> Signed-off-by: Heiko Carstens --- arch/s390/mm/vmem.c | 51 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index e82a63de19db..df361bbacda1 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -72,6 +72,42 @@ static void vmem_pte_free(unsigned long *table) page_table_free(&init_mm, table); } +#define PAGE_UNUSED 0xFD + +static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end) +{ + /* + * As we expect to add in the same granularity as we remove, it's + * sufficient to mark only some piece used to block the memmap page from + * getting removed (just in case the memmap never gets initialized, + * e.g., because the memory block never gets onlined). + */ + memset(__va(start), 0, sizeof(struct page)); +} + +static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end) +{ + void *page = __va(ALIGN_DOWN(start, PMD_SIZE)); + + /* Could be our memmap page is filled with PAGE_UNUSED already ... */ + vmemmap_use_sub_pmd(start, end); + + /* Mark the unused parts of the new memmap page PAGE_UNUSED. */ + if (!IS_ALIGNED(start, PMD_SIZE)) + memset(page, PAGE_UNUSED, start - __pa(page)); + if (!IS_ALIGNED(end, PMD_SIZE)) + memset(__va(end), PAGE_UNUSED, __pa(page) + PMD_SIZE - end); +} + +/* Returns true if the PMD is completely unused and can be freed. 
*/ +static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end) +{ + void *page = __va(ALIGN_DOWN(start, PMD_SIZE)); + + memset(__va(start), PAGE_UNUSED, end - start); + return !memchr_inv(page, PAGE_UNUSED, PMD_SIZE); +} + /* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr, unsigned long end, bool add, bool direct) @@ -157,6 +193,11 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, get_order(PMD_SIZE)); pmd_clear(pmd); pages++; + } else if (!direct && + vmemmap_unuse_sub_pmd(addr, next)) { + vmem_free_pages(pmd_deref(*pmd), + get_order(PMD_SIZE)); + pmd_clear(pmd); } continue; } @@ -182,6 +223,11 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, NUMA_NO_NODE); if (new_page) { pmd_val(*pmd) = __pa(new_page) | prot; + if (!IS_ALIGNED(addr, PMD_SIZE) || + !IS_ALIGNED(next, PMD_SIZE)) { + vmemmap_use_new_sub_pmd(addr, + next); + } continue; } } @@ -189,8 +235,11 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, if (!pte) goto out; pmd_populate(&init_mm, pmd, pte); - } else if (pmd_large(*pmd)) + } else if (pmd_large(*pmd)) { + if (!direct) + vmemmap_use_sub_pmd(addr, next); continue; + } ret = modify_pte_table(pmd, addr, next, add, direct); if (ret) From 2c114df071935762ffa88144cdab03d84beaa702 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 22 Jul 2020 11:45:58 +0200 Subject: [PATCH 457/502] s390/vmemmap: avoid memset(PAGE_UNUSED) when adding consecutive sections Let's avoid memset(PAGE_UNUSED) when adding consecutive sections, whereby the vmemmap of a single section does not span full PMDs. 
Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Gerald Schaefer Signed-off-by: David Hildenbrand Message-Id: <20200722094558.9828-10-david@redhat.com> Signed-off-by: Heiko Carstens --- arch/s390/mm/vmem.c | 45 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index df361bbacda1..70ebfc7958a6 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -74,7 +74,22 @@ static void vmem_pte_free(unsigned long *table) #define PAGE_UNUSED 0xFD -static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end) +/* + * The unused vmemmap range, which was not yet memset(PAGE_UNUSED) ranges + * from unused_pmd_start to next PMD_SIZE boundary. + */ +static unsigned long unused_pmd_start; + +static void vmemmap_flush_unused_pmd(void) +{ + if (!unused_pmd_start) + return; + memset(__va(unused_pmd_start), PAGE_UNUSED, + ALIGN(unused_pmd_start, PMD_SIZE) - unused_pmd_start); + unused_pmd_start = 0; +} + +static void __vmemmap_use_sub_pmd(unsigned long start, unsigned long end) { /* * As we expect to add in the same granularity as we remove, it's @@ -85,18 +100,41 @@ static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end) memset(__va(start), 0, sizeof(struct page)); } +static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end) +{ + /* + * We only optimize if the new used range directly follows the + * previously unused range (esp., when populating consecutive sections). + */ + if (unused_pmd_start == start) { + unused_pmd_start = end; + if (likely(IS_ALIGNED(unused_pmd_start, PMD_SIZE))) + unused_pmd_start = 0; + return; + } + vmemmap_flush_unused_pmd(); + __vmemmap_use_sub_pmd(start, end); +} + static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end) { void *page = __va(ALIGN_DOWN(start, PMD_SIZE)); + vmemmap_flush_unused_pmd(); + /* Could be our memmap page is filled with PAGE_UNUSED already ... 
*/ - vmemmap_use_sub_pmd(start, end); + __vmemmap_use_sub_pmd(start, end); /* Mark the unused parts of the new memmap page PAGE_UNUSED. */ if (!IS_ALIGNED(start, PMD_SIZE)) memset(page, PAGE_UNUSED, start - __pa(page)); + /* + * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of + * consecutive sections. Remember for the last added PMD the last + * unused range in the populated PMD. + */ if (!IS_ALIGNED(end, PMD_SIZE)) - memset(__va(end), PAGE_UNUSED, __pa(page) + PMD_SIZE - end); + unused_pmd_start = end; } /* Returns true if the PMD is completely unused and can be freed. */ @@ -104,6 +142,7 @@ static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end) { void *page = __va(ALIGN_DOWN(start, PMD_SIZE)); + vmemmap_flush_unused_pmd(); memset(__va(start), PAGE_UNUSED, end - start); return !memchr_inv(page, PAGE_UNUSED, PMD_SIZE); } From 9a996c67a65d937b23408e56935ef23404c9418e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 23 Jul 2020 21:42:36 +0200 Subject: [PATCH 458/502] s390/vmemmap: coding style updates Signed-off-by: Heiko Carstens --- arch/s390/mm/vmem.c | 55 +++++++++++++++++---------------------------- 1 file changed, 20 insertions(+), 35 deletions(-) diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 70ebfc7958a6..1aed1a4dfc2d 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -169,17 +169,17 @@ static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr, pte_clear(&init_mm, addr, pte); } else if (pte_none(*pte)) { if (!direct) { - void *new_page = vmemmap_alloc_block(PAGE_SIZE, - NUMA_NO_NODE); + void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); if (!new_page) goto out; pte_val(*pte) = __pa(new_page) | prot; - } else + } else { pte_val(*pte) = addr | prot; - } else + } + } else { continue; - + } pages++; } ret = 0; @@ -196,10 +196,10 @@ static void try_free_pte_table(pmd_t *pmd, unsigned long start) /* We can safely assume this is fully in 1:1 mapping & vmemmap area */ pte = 
pte_offset_kernel(pmd, start); - for (i = 0; i < PTRS_PER_PTE; i++, pte++) + for (i = 0; i < PTRS_PER_PTE; i++, pte++) { if (!pte_none(*pte)) return; - + } vmem_pte_free(__va(pmd_deref(*pmd))); pmd_clear(pmd); } @@ -220,7 +220,6 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, pmd = pmd_offset(pud, addr); for (; addr < end; addr = next, pmd++) { next = pmd_addr_end(addr, end); - if (!add) { if (pmd_none(*pmd)) continue; @@ -228,14 +227,11 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, if (IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(next, PMD_SIZE)) { if (!direct) - vmem_free_pages(pmd_deref(*pmd), - get_order(PMD_SIZE)); + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE)); pmd_clear(pmd); pages++; - } else if (!direct && - vmemmap_unuse_sub_pmd(addr, next)) { - vmem_free_pages(pmd_deref(*pmd), - get_order(PMD_SIZE)); + } else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) { + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE)); pmd_clear(pmd); } continue; @@ -258,14 +254,12 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, * page tables since vmemmap_populate gets * called for each section separately. 
*/ - new_page = vmemmap_alloc_block(PMD_SIZE, - NUMA_NO_NODE); + new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE); if (new_page) { pmd_val(*pmd) = __pa(new_page) | prot; if (!IS_ALIGNED(addr, PMD_SIZE) || !IS_ALIGNED(next, PMD_SIZE)) { - vmemmap_use_new_sub_pmd(addr, - next); + vmemmap_use_new_sub_pmd(addr, next); } continue; } @@ -279,7 +273,6 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, vmemmap_use_sub_pmd(addr, next); continue; } - ret = modify_pte_table(pmd, addr, next, add, direct); if (ret) goto out; @@ -306,12 +299,10 @@ static void try_free_pmd_table(pud_t *pud, unsigned long start) if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end) return; #endif - pmd = pmd_offset(pud, start); for (i = 0; i < PTRS_PER_PMD; i++, pmd++) if (!pmd_none(*pmd)) return; - vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER); pud_clear(pud); } @@ -327,11 +318,9 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, prot = pgprot_val(REGION3_KERNEL); if (!MACHINE_HAS_NX) prot &= ~_REGION_ENTRY_NOEXEC; - pud = pud_offset(p4d, addr); for (; addr < end; addr = next, pud++) { next = pud_addr_end(addr, end); - if (!add) { if (pud_none(*pud)) continue; @@ -356,9 +345,9 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, if (!pmd) goto out; pud_populate(&init_mm, pud, pmd); - } else if (pud_large(*pud)) + } else if (pud_large(*pud)) { continue; - + } ret = modify_pmd_table(pud, addr, next, add, direct); if (ret) goto out; @@ -387,10 +376,10 @@ static void try_free_pud_table(p4d_t *p4d, unsigned long start) #endif pud = pud_offset(p4d, start); - for (i = 0; i < PTRS_PER_PUD; i++, pud++) + for (i = 0; i < PTRS_PER_PUD; i++, pud++) { if (!pud_none(*pud)) return; - + } vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER); p4d_clear(p4d); } @@ -406,7 +395,6 @@ static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, p4d = p4d_offset(pgd, addr); for (; addr < end; addr = next, p4d++) 
{ next = p4d_addr_end(addr, end); - if (!add) { if (p4d_none(*p4d)) continue; @@ -415,7 +403,6 @@ static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, if (!pud) goto out; } - ret = modify_pud_table(p4d, addr, next, add, direct); if (ret) goto out; @@ -442,10 +429,10 @@ static void try_free_p4d_table(pgd_t *pgd, unsigned long start) #endif p4d = p4d_offset(pgd, start); - for (i = 0; i < PTRS_PER_P4D; i++, p4d++) + for (i = 0; i < PTRS_PER_P4D; i++, p4d++) { if (!p4d_none(*p4d)) return; - + } vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER); pgd_clear(pgd); } @@ -460,7 +447,6 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add, if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end))) return -EINVAL; - for (addr = start; addr < end; addr = next) { next = pgd_addr_end(addr, end); pgd = pgd_offset_k(addr); @@ -474,7 +460,6 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add, goto out; pgd_populate(&init_mm, pgd, p4d); } - ret = modify_p4d_table(pgd, addr, next, add, direct); if (ret) goto out; @@ -518,7 +503,7 @@ static void vmem_remove_range(unsigned long start, unsigned long size) * Add a backed mem_map array to the virtual mem_map array. */ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, - struct vmem_altmap *altmap) + struct vmem_altmap *altmap) { int ret; @@ -532,7 +517,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, } void vmemmap_free(unsigned long start, unsigned long end, - struct vmem_altmap *altmap) + struct vmem_altmap *altmap) { mutex_lock(&vmem_mutex); remove_pagetable(start, end, false); From ed00495333ccc80fc8fb86fb43773c3c2a499466 Mon Sep 17 00:00:00 2001 From: "peterz@infradead.org" Date: Mon, 27 Jul 2020 14:48:52 +0200 Subject: [PATCH 459/502] locking/lockdep: Fix TRACE_IRQFLAGS vs. 
NMIs Prior to commit: 859d069ee1dd ("lockdep: Prepare for NMI IRQ state tracking") IRQ state tracking was disabled in NMIs due to nmi_enter() doing lockdep_off() -- with the obvious requirement that NMI entry call nmi_enter() before trace_hardirqs_off(). [ AFAICT, PowerPC and SH violate this order on their NMI entry ] However, that commit explicitly changed lockdep_hardirqs_*() to ignore lockdep_off() and breaks every architecture that has irq-tracing in its NMI entry that hasn't been fixed up (x86 being the only fixed one at this point). The reason for this change is that by ignoring lockdep_off() we can: - get rid of 'current->lockdep_recursion' in lockdep_assert_irqs*() which was going to give header-recursion issues with the seqlock rework. - allow these lockdep_assert_*() macros to function in NMI context. Restore the previous state of things and allow an architecture to opt-in to the NMI IRQ tracking support, however instead of relying on lockdep_off(), rely on in_nmi(), both are part of nmi_enter() and so over-all entry ordering doesn't need to change. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200727124852.GK119549@hirez.programming.kicks-ass.net --- arch/x86/Kconfig.debug | 3 +++ kernel/locking/lockdep.c | 8 +++++++- lib/Kconfig.debug | 6 ++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 0dd319e6e5b4..ee1d3c5834c6 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -3,6 +3,9 @@ config TRACE_IRQFLAGS_SUPPORT def_bool y +config TRACE_IRQFLAGS_NMI_SUPPORT + def_bool y + config EARLY_PRINTK_USB bool diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index d595623c4b34..8b0b28b4546b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3712,6 +3712,9 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) * and not rely on hardware state like normal interrupts. 
*/ if (unlikely(in_nmi())) { + if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI)) + return; + /* * Skip: * - recursion check, because NMI can hit lockdep; @@ -3773,7 +3776,10 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) * they will restore the software state. This ensures the software * state is consistent inside NMIs as well. */ - if (unlikely(!in_nmi() && (current->lockdep_recursion & LOCKDEP_RECURSION_MASK))) + if (in_nmi()) { + if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI)) + return; + } else if (current->lockdep_recursion & LOCKDEP_RECURSION_MASK) return; /* diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 9ad9210d70a1..fa964b51f066 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1325,11 +1325,17 @@ config WW_MUTEX_SELFTEST endmenu # lock debugging config TRACE_IRQFLAGS + depends on TRACE_IRQFLAGS_SUPPORT bool help Enables hooks to interrupt enabling and disabling for either tracing or lock debugging. +config TRACE_IRQFLAGS_NMI + def_bool y + depends on TRACE_IRQFLAGS + depends on TRACE_IRQFLAGS_NMI_SUPPORT + config STACKTRACE bool "Stack backtrace support" depends on STACKTRACE_SUPPORT From f0c7baca180046824e07fc5f1326e83a8fd150c7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 24 Jul 2020 22:44:41 +0200 Subject: [PATCH 460/502] genirq/affinity: Make affinity setting if activated opt-in John reported that on a RK3288 system the perf per CPU interrupts are all affine to CPU0 and provided the analysis: "It looks like what happens is that because the interrupts are not per-CPU in the hardware, armpmu_request_irq() calls irq_force_affinity() while the interrupt is deactivated and then request_irq() with IRQF_PERCPU | IRQF_NOBALANCING. Now when irq_startup() runs with IRQ_STARTUP_NORMAL, it calls irq_setup_affinity() which returns early because IRQF_PERCPU and IRQF_NOBALANCING are set, leaving the interrupt on its original CPU." 
This was broken by the recent commit which blocked interrupt affinity setting in hardware before activation of the interrupt. While this works in general, it does not work for this particular case. As contrary to the initial analysis not all interrupt chip drivers implement an activate callback, the safe cure is to make the deferred interrupt affinity setting at activation time opt-in. Implement the necessary core logic and make the two irqchip implementations for which this is required opt-in. In hindsight this would have been the right thing to do, but ... Fixes: baedb87d1b53 ("genirq/affinity: Handle affinity setting on inactive interrupts correctly") Reported-by: John Keeping Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Acked-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/87blk4tzgm.fsf@nanos.tec.linutronix.de --- arch/x86/kernel/apic/vector.c | 4 ++++ drivers/irqchip/irq-gic-v3-its.c | 5 ++++- include/linux/irq.h | 13 +++++++++++++ kernel/irq/manage.c | 6 +++++- 4 files changed, 26 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 7649da2478d8..dae32d948bf2 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -560,6 +560,10 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, * as that can corrupt the affinity move state. */ irqd_set_handle_enforce_irqctx(irqd); + + /* Don't invoke affinity setter on deactivated interrupts */ + irqd_set_affinity_on_activate(irqd); + /* * Legacy vectors are already assigned when the IOAPIC * takes them over. They stay on the same vector. 
This is diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index beac4caefad9..103d850b5595 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -3523,6 +3523,7 @@ static int its_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, msi_alloc_info_t *info = args; struct its_device *its_dev = info->scratchpad[0].ptr; struct its_node *its = its_dev->its; + struct irq_data *irqd; irq_hw_number_t hwirq; int err; int i; @@ -3542,7 +3543,9 @@ static int its_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i, &its_irq_chip, its_dev); - irqd_set_single_target(irq_desc_get_irq_data(irq_to_desc(virq + i))); + irqd = irq_get_irq_data(virq + i); + irqd_set_single_target(irqd); + irqd_set_affinity_on_activate(irqd); pr_debug("ID:%d pID:%d vID:%d\n", (int)(hwirq + i - its_dev->event_map.lpi_base), (int)(hwirq + i), virq + i); diff --git a/include/linux/irq.h b/include/linux/irq.h index 8d5bc2c237d7..1b7f4dfee35b 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -213,6 +213,8 @@ struct irq_data { * required * IRQD_HANDLE_ENFORCE_IRQCTX - Enforce that handle_irq_*() is only invoked * from actual interrupt context. + * IRQD_AFFINITY_ON_ACTIVATE - Affinity is set on activation. Don't call + * irq_chip::irq_set_affinity() when deactivated. 
*/ enum { IRQD_TRIGGER_MASK = 0xf, @@ -237,6 +239,7 @@ enum { IRQD_CAN_RESERVE = (1 << 26), IRQD_MSI_NOMASK_QUIRK = (1 << 27), IRQD_HANDLE_ENFORCE_IRQCTX = (1 << 28), + IRQD_AFFINITY_ON_ACTIVATE = (1 << 29), }; #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) @@ -421,6 +424,16 @@ static inline bool irqd_msi_nomask_quirk(struct irq_data *d) return __irqd_to_state(d) & IRQD_MSI_NOMASK_QUIRK; } +static inline void irqd_set_affinity_on_activate(struct irq_data *d) +{ + __irqd_to_state(d) |= IRQD_AFFINITY_ON_ACTIVATE; +} + +static inline bool irqd_affinity_on_activate(struct irq_data *d) +{ + return __irqd_to_state(d) & IRQD_AFFINITY_ON_ACTIVATE; +} + #undef __irqd_to_state static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 2a9fec53e159..48c38e09c673 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -320,12 +320,16 @@ static bool irq_set_affinity_deactivated(struct irq_data *data, struct irq_desc *desc = irq_data_to_desc(data); /* + * Handle irq chips which can handle affinity only in activated + * state correctly + * * If the interrupt is not yet activated, just store the affinity * mask and do not call the chip driver at all. On activation the * driver has to make sure anyway that the interrupt is in a * useable state so startup works. */ - if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) || irqd_is_activated(data)) + if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) || + irqd_is_activated(data) || !irqd_affinity_on_activate(data)) return false; cpumask_copy(desc->irq_common_data.affinity, mask); From aa251fc5b936d3ddb4b4c4b36427eb9aa3347c82 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 25 Jul 2020 13:30:55 +0100 Subject: [PATCH 461/502] genirq/debugfs: Add missing irqchip flags Recently introduced irqchip flags lack the corresponding printouts in debugfs. Add them. 
Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/874kpvydxc.wl-maz@kernel.org --- kernel/irq/debugfs.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 4f9f844074db..b95ff5d5f4bd 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -112,6 +112,7 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_AFFINITY_SET), BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING), BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED), + BIT_MASK_DESCR(IRQD_AFFINITY_ON_ACTIVATE), BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN), BIT_MASK_DESCR(IRQD_CAN_RESERVE), BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK), @@ -120,6 +121,10 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_WAKEUP_STATE), BIT_MASK_DESCR(IRQD_WAKEUP_ARMED), + + BIT_MASK_DESCR(IRQD_DEFAULT_TRIGGER_SET), + + BIT_MASK_DESCR(IRQD_HANDLE_ENFORCE_IRQCTX), }; static const struct irq_bit_descr irqdesc_states[] = { From e885d5d94793ef342e49d55672baabbc16e32bb1 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 16 Jul 2020 16:36:50 +1000 Subject: [PATCH 462/502] lockdep: Move list.h inclusion into lockdep.h Currently lockdep_types.h includes list.h without actually using any of its macros or functions. All it needs are the type definitions which were moved into types.h long ago. This potentially causes inclusion loops because both are included by many core header files. This patch moves the list.h inclusion into lockdep.h. Note that we could probably remove it completely but that could potentially result in compile failures should any end users not include list.h directly and also be unlucky enough to not get list.h via some other header file. 
Reported-by: Petr Mladek Signed-off-by: Herbert Xu Signed-off-by: Peter Zijlstra (Intel) Tested-by: Petr Mladek Link: https://lkml.kernel.org/r/20200716063649.GA23065@gondor.apana.org.au --- include/linux/lockdep.h | 1 + include/linux/lockdep_types.h | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 3b73cf84f77d..b1ad5c045353 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -21,6 +21,7 @@ extern int lock_stat; #ifdef CONFIG_LOCKDEP #include <linux/linkage.h> +#include <linux/list.h> #include <linux/debug_locks.h> #include <linux/stacktrace.h> diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h index 7b9350624577..bb35b449f533 100644 --- a/include/linux/lockdep_types.h +++ b/include/linux/lockdep_types.h @@ -32,8 +32,6 @@ enum lockdep_wait_type { #ifdef CONFIG_LOCKDEP -#include <linux/list.h> - /* * We'd rather not expose kernel/lockdep_states.h this wide, but we do need * the total number of states... :-( From 112a0e4171e111e963aada3fe790c71accf4d705 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 28 Jul 2020 16:34:00 +0900 Subject: [PATCH 463/502] kprobes: Remove unnecessary module_mutex locking from kprobe_optimizer() Since we already lock both kprobe_mutex and text_mutex in the optimizer, text will not be changed and the module unloading will be stopped inside kprobes_module_callback(). The mutex_lock() has originally been introduced to avoid conflict with text modification, at that point we didn't hold text_mutex. But after: f1c6ece23729 ("kprobes: Fix potential deadlock in kprobe_optimizer()") We started holding the text_mutex and don't need the modules mutex anyway. So remove the module_mutex locking. [ mingo: Amended the changelog.
] Suggested-by: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Ingo Molnar Cc: Jarkko Sakkinen Link: https://lore.kernel.org/r/20200728163400.e00b09c594763349f99ce6cb@kernel.org --- kernel/kprobes.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 146c648eb943..e87679a48ba2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -598,8 +598,6 @@ static void kprobe_optimizer(struct work_struct *work) mutex_lock(&kprobe_mutex); cpus_read_lock(); mutex_lock(&text_mutex); - /* Lock modules while optimizing kprobes */ - mutex_lock(&module_mutex); /* * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) @@ -624,7 +622,6 @@ static void kprobe_optimizer(struct work_struct *work) /* Step 4: Free cleaned kprobes after quiesence period */ do_free_cleaned_kprobes(); - mutex_unlock(&module_mutex); mutex_unlock(&text_mutex); cpus_read_unlock(); From d903b6d029d66e6478562d75ea18d89098f7b7e8 Mon Sep 17 00:00:00 2001 From: Pu Wen Date: Mon, 20 Jul 2020 16:22:05 +0800 Subject: [PATCH 464/502] perf/x86/rapl: Add Hygon Fam18h RAPL support Hygon Family 18h (Dhyana) supports RAPL in bit 14 of CPUID 0x80000007 EDX, and has MSRs RAPL_PWR_UNIT/CORE_ENERGY_STAT/PKG_ENERGY_STAT. So add Hygon Dhyana Family 18h support for RAPL. The output is available via the energy-pkg pseudo event: $ perf stat -a -I 1000 --per-socket -e power/energy-pkg/ [ mingo: Tidied up the initializers.
] Signed-off-by: Pu Wen Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200720082205.1307-1-puwen@hygon.cn --- arch/x86/events/rapl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 0f2bf59f4354..68b38820b10e 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -787,7 +787,8 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &model_hsx), X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &model_skl), - X86_MATCH_VENDOR_FAM(AMD, 0x17, &model_amd_fam17h), + X86_MATCH_VENDOR_FAM(AMD, 0x17, &model_amd_fam17h), + X86_MATCH_VENDOR_FAM(HYGON, 0x18, &model_amd_fam17h), {}, }; MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); From 07d2e59f27cd728e6982b52441673886a6d04267 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Fri, 19 Jun 2020 09:20:02 +0100 Subject: [PATCH 465/502] ACPI/IORT: Make iort_match_node_callback walk the ACPI namespace for NC When the iort_match_node_callback is invoked for a named component the match should be executed upon a device with an ACPI companion. For devices with no ACPI companion set-up the ACPI device tree must be walked in order to find the first parent node with a companion set and check the parent node against the named component entry to check whether there is a match and therefore an IORT node describing the in/out ID translation for the device has been found. Signed-off-by: Lorenzo Pieralisi Cc: Will Deacon Cc: Hanjun Guo Cc: Sudeep Holla Cc: Robin Murphy Cc: "Rafael J. 
Wysocki" Link: https://lore.kernel.org/r/20200619082013.13661-2-lorenzo.pieralisi@arm.com Signed-off-by: Catalin Marinas --- drivers/acpi/arm64/iort.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 28a6b387e80e..5eee81758184 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -264,15 +264,31 @@ static acpi_status iort_match_node_callback(struct acpi_iort_node *node, if (node->type == ACPI_IORT_NODE_NAMED_COMPONENT) { struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; - struct acpi_device *adev = to_acpi_device_node(dev->fwnode); + struct acpi_device *adev; struct acpi_iort_named_component *ncomp; + struct device *nc_dev = dev; + + /* + * Walk the device tree to find a device with an + * ACPI companion; there is no point in scanning + * IORT for a device matching a named component if + * the device does not have an ACPI companion to + * start with. + */ + do { + adev = ACPI_COMPANION(nc_dev); + if (adev) + break; + + nc_dev = nc_dev->parent; + } while (nc_dev); if (!adev) goto out; status = acpi_get_name(adev->handle, ACPI_FULL_PATHNAME, &buf); if (ACPI_FAILURE(status)) { - dev_warn(dev, "Can't get device full path name\n"); + dev_warn(nc_dev, "Can't get device full path name\n"); goto out; } From d1718a1b7a86743b9c517bf9521695ba909c734f Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Fri, 19 Jun 2020 09:20:03 +0100 Subject: [PATCH 466/502] ACPI/IORT: Make iort_get_device_domain IRQ domain agnostic iort_get_device_domain() is PCI specific but it need not be, since it can be used to retrieve IRQ domain nexus of any kind by adding an irq_domain_bus_token input to it. Make it PCI agnostic by also renaming the requestor ID input to a more generic ID name. Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas # pci/msi.c Cc: Will Deacon Cc: Hanjun Guo Cc: Bjorn Helgaas Cc: Sudeep Holla Cc: Robin Murphy Cc: "Rafael J. 
Wysocki" Link: https://lore.kernel.org/r/20200619082013.13661-3-lorenzo.pieralisi@arm.com Signed-off-by: Catalin Marinas --- drivers/acpi/arm64/iort.c | 14 +++++++------- drivers/pci/msi.c | 3 ++- include/linux/acpi_iort.h | 7 ++++--- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 5eee81758184..902e2aaca946 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -550,7 +550,6 @@ static struct acpi_iort_node *iort_find_dev_node(struct device *dev) node = iort_get_iort_node(dev->fwnode); if (node) return node; - /* * if not, then it should be a platform device defined in * DSDT/SSDT (with Named Component node in IORT) @@ -641,13 +640,13 @@ static int __maybe_unused iort_find_its_base(u32 its_id, phys_addr_t *base) /** * iort_dev_find_its_id() - Find the ITS identifier for a device * @dev: The device. - * @req_id: Device's requester ID + * @id: Device's ID * @idx: Index of the ITS identifier list. * @its_id: ITS identifier. 
* * Returns: 0 on success, appropriate error value otherwise */ -static int iort_dev_find_its_id(struct device *dev, u32 req_id, +static int iort_dev_find_its_id(struct device *dev, u32 id, unsigned int idx, int *its_id) { struct acpi_iort_its_group *its; @@ -657,7 +656,7 @@ static int iort_dev_find_its_id(struct device *dev, u32 req_id, if (!node) return -ENXIO; - node = iort_node_map_id(node, req_id, NULL, IORT_MSI_TYPE); + node = iort_node_map_id(node, id, NULL, IORT_MSI_TYPE); if (!node) return -ENXIO; @@ -680,19 +679,20 @@ static int iort_dev_find_its_id(struct device *dev, u32 req_id, * * Returns: the MSI domain for this device, NULL otherwise */ -struct irq_domain *iort_get_device_domain(struct device *dev, u32 req_id) +struct irq_domain *iort_get_device_domain(struct device *dev, u32 id, + enum irq_domain_bus_token bus_token) { struct fwnode_handle *handle; int its_id; - if (iort_dev_find_its_id(dev, req_id, 0, &its_id)) + if (iort_dev_find_its_id(dev, id, 0, &its_id)) return NULL; handle = iort_find_domain_token(its_id); if (!handle) return NULL; - return irq_find_matching_fwnode(handle, DOMAIN_BUS_PCI_MSI); + return irq_find_matching_fwnode(handle, bus_token); } static void iort_set_device_domain(struct device *dev, diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 6b43a5455c7a..74a91f52ecc0 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -1558,7 +1558,8 @@ struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev) pci_for_each_dma_alias(pdev, get_msi_id_cb, &rid); dom = of_msi_map_get_device_domain(&pdev->dev, rid); if (!dom) - dom = iort_get_device_domain(&pdev->dev, rid); + dom = iort_get_device_domain(&pdev->dev, rid, + DOMAIN_BUS_PCI_MSI); return dom; } #endif /* CONFIG_PCI_MSI_IRQ_DOMAIN */ diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h index 8e7e2ec37f1b..08ec6bd2297f 100644 --- a/include/linux/acpi_iort.h +++ b/include/linux/acpi_iort.h @@ -29,7 +29,8 @@ struct fwnode_handle *iort_find_domain_token(int 
trans_id); #ifdef CONFIG_ACPI_IORT void acpi_iort_init(void); u32 iort_msi_map_rid(struct device *dev, u32 req_id); -struct irq_domain *iort_get_device_domain(struct device *dev, u32 req_id); +struct irq_domain *iort_get_device_domain(struct device *dev, u32 id, + enum irq_domain_bus_token bus_token); void acpi_configure_pmsi_domain(struct device *dev); int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id); /* IOMMU interface */ @@ -40,8 +41,8 @@ int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head); static inline void acpi_iort_init(void) { } static inline u32 iort_msi_map_rid(struct device *dev, u32 req_id) { return req_id; } -static inline struct irq_domain *iort_get_device_domain(struct device *dev, - u32 req_id) +static inline struct irq_domain *iort_get_device_domain( + struct device *dev, u32 id, enum irq_domain_bus_token bus_token) { return NULL; } static inline void acpi_configure_pmsi_domain(struct device *dev) { } /* IOMMU interface */ From 39c3cf566ceafa7c1ae331a5f26fbb685d670001 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Fri, 19 Jun 2020 09:20:04 +0100 Subject: [PATCH 467/502] ACPI/IORT: Make iort_msi_map_rid() PCI agnostic There is nothing PCI specific in iort_msi_map_rid(). Rename the function using a bus protocol agnostic name, iort_msi_map_id(), and convert current callers to it. Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas Cc: Will Deacon Cc: Hanjun Guo Cc: Bjorn Helgaas Cc: Sudeep Holla Cc: Robin Murphy Cc: "Rafael J. 
Wysocki" Link: https://lore.kernel.org/r/20200619082013.13661-4-lorenzo.pieralisi@arm.com Signed-off-by: Catalin Marinas --- drivers/acpi/arm64/iort.c | 12 ++++++------ drivers/pci/msi.c | 2 +- include/linux/acpi_iort.h | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 902e2aaca946..53f9ef515089 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -568,22 +568,22 @@ static struct acpi_iort_node *iort_find_dev_node(struct device *dev) } /** - * iort_msi_map_rid() - Map a MSI requester ID for a device + * iort_msi_map_id() - Map a MSI input ID for a device * @dev: The device for which the mapping is to be done. - * @req_id: The device requester ID. + * @input_id: The device input ID. * - * Returns: mapped MSI RID on success, input requester ID otherwise + * Returns: mapped MSI ID on success, input ID otherwise */ -u32 iort_msi_map_rid(struct device *dev, u32 req_id) +u32 iort_msi_map_id(struct device *dev, u32 input_id) { struct acpi_iort_node *node; u32 dev_id; node = iort_find_dev_node(dev); if (!node) - return req_id; + return input_id; - iort_node_map_id(node, req_id, &dev_id, IORT_MSI_TYPE); + iort_node_map_id(node, input_id, &dev_id, IORT_MSI_TYPE); return dev_id; } diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 74a91f52ecc0..77f48b95e277 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -1536,7 +1536,7 @@ u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev) of_node = irq_domain_get_of_node(domain); rid = of_node ? 
of_msi_map_rid(&pdev->dev, of_node, rid) : - iort_msi_map_rid(&pdev->dev, rid); + iort_msi_map_id(&pdev->dev, rid); return rid; } diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h index 08ec6bd2297f..e51425e083da 100644 --- a/include/linux/acpi_iort.h +++ b/include/linux/acpi_iort.h @@ -28,7 +28,7 @@ void iort_deregister_domain_token(int trans_id); struct fwnode_handle *iort_find_domain_token(int trans_id); #ifdef CONFIG_ACPI_IORT void acpi_iort_init(void); -u32 iort_msi_map_rid(struct device *dev, u32 req_id); +u32 iort_msi_map_id(struct device *dev, u32 id); struct irq_domain *iort_get_device_domain(struct device *dev, u32 id, enum irq_domain_bus_token bus_token); void acpi_configure_pmsi_domain(struct device *dev); @@ -39,8 +39,8 @@ const struct iommu_ops *iort_iommu_configure(struct device *dev); int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head); #else static inline void acpi_iort_init(void) { } -static inline u32 iort_msi_map_rid(struct device *dev, u32 req_id) -{ return req_id; } +static inline u32 iort_msi_map_id(struct device *dev, u32 id) +{ return id; } static inline struct irq_domain *iort_get_device_domain( struct device *dev, u32 id, enum irq_domain_bus_token bus_token) { return NULL; } From 3a3d208beede7ae03f8c80bed01f47d6b98d4ceb Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Fri, 19 Jun 2020 09:20:05 +0100 Subject: [PATCH 468/502] ACPI/IORT: Remove useless PCI bus walk The PCI bus domain number (used in the iort_match_node_callback() - pci_domain_nr() call) is cascaded through the PCI bus hierarchy at PCI bus enumeration time, therefore there is no need in iort_find_dev_node() to walk the PCI bus upwards to grab the root bus to be passed to iort_scan_node(), the device->bus PCI bus pointer will do. Remove this useless code. Signed-off-by: Lorenzo Pieralisi Cc: Will Deacon Cc: Hanjun Guo Cc: Sudeep Holla Cc: Robin Murphy Cc: "Rafael J. 
Wysocki" Link: https://lore.kernel.org/r/20200619082013.13661-5-lorenzo.pieralisi@arm.com Signed-off-by: Catalin Marinas --- drivers/acpi/arm64/iort.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 53f9ef515089..421c6976ab81 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -558,10 +558,7 @@ static struct acpi_iort_node *iort_find_dev_node(struct device *dev) iort_match_node_callback, dev); } - /* Find a PCI root bus */ pbus = to_pci_dev(dev)->bus; - while (!pci_is_root_bus(pbus)) - pbus = pbus->parent; return iort_scan_node(ACPI_IORT_NODE_PCI_ROOT_COMPLEX, iort_match_node_callback, &pbus->dev); From b8e069a2a8da02137605ba585837a3a0c45df01a Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Fri, 19 Jun 2020 09:20:06 +0100 Subject: [PATCH 469/502] ACPI/IORT: Add an input ID to acpi_dma_configure() Some HW devices are created as child devices of proprietary busses that have a bus-specific policy defining how the child device wires representing the device ID are translated into IOMMU and IRQ controller device IDs. Current IORT code provides translations for: - PCI devices, where the device ID is well identified at bus level as the requester ID (RID) - Platform devices that are endpoint devices where the device ID is retrieved from the ACPI object IORT mappings (Named components single mappings). A platform device is represented in IORT as a named component node. For devices that are child devices of proprietary busses, the IORT firmware represents the bus node as a named component node in IORT and it is up to that named component node to define in/out bus specific ID translations for the bus child devices that are allocated and created in a bus specific manner.
In order to make IORT ID translations available for proprietary bus child devices, the current ACPI (and IORT) code must be augmented to provide an additional ID parameter to acpi_dma_configure() representing the child devices input ID. This ID is bus specific and it is retrieved in bus specific code. By adding an ID parameter to acpi_dma_configure(), the IORT code can map the child device ID to an IOMMU stream ID through the IORT named component representing the bus in/out ID mappings. Signed-off-by: Lorenzo Pieralisi Cc: Will Deacon Cc: Hanjun Guo Cc: Sudeep Holla Cc: Robin Murphy Cc: "Rafael J. Wysocki" Link: https://lore.kernel.org/r/20200619082013.13661-6-lorenzo.pieralisi@arm.com Signed-off-by: Catalin Marinas --- drivers/acpi/arm64/iort.c | 59 +++++++++++++++++++++++++++++---------- drivers/acpi/scan.c | 8 ++++-- include/acpi/acpi_bus.h | 9 ++++-- include/linux/acpi.h | 7 +++++ include/linux/acpi_iort.h | 7 +++-- 5 files changed, 67 insertions(+), 23 deletions(-) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 421c6976ab81..ec782e4a0fe4 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -978,19 +978,54 @@ static void iort_named_component_init(struct device *dev, nc->node_flags); } +static int iort_nc_iommu_map(struct device *dev, struct acpi_iort_node *node) +{ + struct acpi_iort_node *parent; + int err = -ENODEV, i = 0; + u32 streamid = 0; + + do { + + parent = iort_node_map_platform_id(node, &streamid, + IORT_IOMMU_TYPE, + i++); + + if (parent) + err = iort_iommu_xlate(dev, parent, streamid); + } while (parent && !err); + + return err; +} + +static int iort_nc_iommu_map_id(struct device *dev, + struct acpi_iort_node *node, + const u32 *in_id) +{ + struct acpi_iort_node *parent; + u32 streamid; + + parent = iort_node_map_id(node, *in_id, &streamid, IORT_IOMMU_TYPE); + if (parent) + return iort_iommu_xlate(dev, parent, streamid); + + return -ENODEV; +} + + /** - * iort_iommu_configure - Set-up IOMMU 
configuration for a device. + * iort_iommu_configure_id - Set-up IOMMU configuration for a device. * * @dev: device to configure + * @id_in: optional input id const value pointer * * Returns: iommu_ops pointer on configuration success * NULL on configuration failure */ -const struct iommu_ops *iort_iommu_configure(struct device *dev) +const struct iommu_ops *iort_iommu_configure_id(struct device *dev, + const u32 *id_in) { - struct acpi_iort_node *node, *parent; + struct acpi_iort_node *node; const struct iommu_ops *ops; - u32 streamid = 0; int err = -ENODEV; /* @@ -1019,21 +1054,13 @@ const struct iommu_ops *iort_iommu_configure(struct device *dev) if (fwspec && iort_pci_rc_supports_ats(node)) fwspec->flags |= IOMMU_FWSPEC_PCI_RC_ATS; } else { - int i = 0; - node = iort_scan_node(ACPI_IORT_NODE_NAMED_COMPONENT, iort_match_node_callback, dev); if (!node) return NULL; - do { - parent = iort_node_map_platform_id(node, &streamid, - IORT_IOMMU_TYPE, - i++); - - if (parent) - err = iort_iommu_xlate(dev, parent, streamid); - } while (parent && !err); + err = id_in ? 
iort_nc_iommu_map_id(dev, node, id_in) : + iort_nc_iommu_map(dev, node); if (!err) iort_named_component_init(dev, node); @@ -1058,6 +1085,7 @@ const struct iommu_ops *iort_iommu_configure(struct device *dev) return ops; } + #else static inline const struct iommu_ops *iort_fwspec_iommu_ops(struct device *dev) { return NULL; } @@ -1066,7 +1094,8 @@ static inline int iort_add_device_replay(const struct iommu_ops *ops, { return 0; } int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head) { return 0; } -const struct iommu_ops *iort_iommu_configure(struct device *dev) +const struct iommu_ops *iort_iommu_configure_id(struct device *dev, + const u32 *input_id) { return NULL; } #endif diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 8777faced51a..2142f1554761 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1457,8 +1457,10 @@ int acpi_dma_get_range(struct device *dev, u64 *dma_addr, u64 *offset, * acpi_dma_configure - Set-up DMA configuration for the device. 
* @dev: The pointer to the device * @attr: device dma attributes + * @input_id: input device id const value pointer */ -int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr) +int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr, + const u32 *input_id) { const struct iommu_ops *iommu; u64 dma_addr = 0, size = 0; @@ -1470,7 +1472,7 @@ int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr) iort_dma_setup(dev, &dma_addr, &size); - iommu = iort_iommu_configure(dev); + iommu = iort_iommu_configure_id(dev, input_id); if (PTR_ERR(iommu) == -EPROBE_DEFER) return -EPROBE_DEFER; @@ -1479,7 +1481,7 @@ int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr) return 0; } -EXPORT_SYMBOL_GPL(acpi_dma_configure); +EXPORT_SYMBOL_GPL(acpi_dma_configure_id); static void acpi_init_coherency(struct acpi_device *adev) { diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 5afb6ceb284f..a3abcc4b7d9f 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -588,8 +588,13 @@ bool acpi_dma_supported(struct acpi_device *adev); enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev); int acpi_dma_get_range(struct device *dev, u64 *dma_addr, u64 *offset, u64 *size); -int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr); - +int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr, + const u32 *input_id); +static inline int acpi_dma_configure(struct device *dev, + enum dev_dma_attr attr) +{ + return acpi_dma_configure_id(dev, attr, NULL); +} struct acpi_device *acpi_find_child_device(struct acpi_device *parent, u64 address, bool check_children); int acpi_is_root_bridge(acpi_handle); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index d661cd0ee64d..6d2c47489d90 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -905,6 +905,13 @@ static inline int acpi_dma_configure(struct device *dev, return 0; } +static inline int acpi_dma_configure_id(struct device *dev, + enum 
dev_dma_attr attr, + const u32 *input_id) +{ + return 0; +} + #define ACPI_PTR(_ptr) (NULL) static inline void acpi_device_set_enumerated(struct acpi_device *adev) diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h index e51425e083da..20a32120bb88 100644 --- a/include/linux/acpi_iort.h +++ b/include/linux/acpi_iort.h @@ -35,7 +35,8 @@ void acpi_configure_pmsi_domain(struct device *dev); int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id); /* IOMMU interface */ void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *size); -const struct iommu_ops *iort_iommu_configure(struct device *dev); +const struct iommu_ops *iort_iommu_configure_id(struct device *dev, + const u32 *id_in); int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head); #else static inline void acpi_iort_init(void) { } @@ -48,8 +49,8 @@ static inline void acpi_configure_pmsi_domain(struct device *dev) { } /* IOMMU interface */ static inline void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *size) { } -static inline const struct iommu_ops *iort_iommu_configure( - struct device *dev) +static inline const struct iommu_ops *iort_iommu_configure_id( + struct device *dev, const u32 *id_in) { return NULL; } static inline int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head) From 746a71d02b5d15817fcb13c956ba999a87773952 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Fri, 19 Jun 2020 09:20:07 +0100 Subject: [PATCH 470/502] of/iommu: Make of_map_rid() PCI agnostic There is nothing PCI specific (other than the RID - requester ID) in the of_map_rid() implementation, so the same function can be reused for input/output IDs mapping for other busses just as well. Rename the RID instances/names to a generic "id" tag. No functionality change intended. 
Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Acked-by: Joerg Roedel Cc: Rob Herring Cc: Joerg Roedel Cc: Robin Murphy Cc: Marc Zyngier Link: https://lore.kernel.org/r/20200619082013.13661-7-lorenzo.pieralisi@arm.com Signed-off-by: Catalin Marinas --- drivers/iommu/of_iommu.c | 4 ++-- drivers/of/base.c | 42 ++++++++++++++++++++-------------------- drivers/of/irq.c | 2 +- include/linux/of.h | 4 ++-- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 20738aacac89..016316244737 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -129,7 +129,7 @@ static int of_pci_iommu_init(struct pci_dev *pdev, u16 alias, void *data) struct of_phandle_args iommu_spec = { .args_count = 1 }; int err; - err = of_map_rid(info->np, alias, "iommu-map", "iommu-map-mask", + err = of_map_id(info->np, alias, "iommu-map", "iommu-map-mask", &iommu_spec.np, iommu_spec.args); if (err) return err == -ENODEV ? NO_IOMMU : err; @@ -145,7 +145,7 @@ static int of_fsl_mc_iommu_init(struct fsl_mc_device *mc_dev, struct of_phandle_args iommu_spec = { .args_count = 1 }; int err; - err = of_map_rid(master_np, mc_dev->icid, "iommu-map", + err = of_map_id(master_np, mc_dev->icid, "iommu-map", "iommu-map-mask", &iommu_spec.np, iommu_spec.args); if (err) diff --git a/drivers/of/base.c b/drivers/of/base.c index ae03b1218b06..ea44fea99813 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -2201,15 +2201,15 @@ int of_find_last_cache_level(unsigned int cpu) } /** - * of_map_rid - Translate a requester ID through a downstream mapping. + * of_map_id - Translate an ID through a downstream mapping. * @np: root complex device node. - * @rid: device requester ID to map. + * @id: device ID to map. * @map_name: property name of the map to use. * @map_mask_name: optional property name of the mask to use. * @target: optional pointer to a target device node. * @id_out: optional pointer to receive the translated ID. 
* - * Given a device requester ID, look up the appropriate implementation-defined + * Given a device ID, look up the appropriate implementation-defined * platform ID and/or the target device which receives transactions on that * ID, as per the "iommu-map" and "msi-map" bindings. Either of @target or * @id_out may be NULL if only the other is required. If @target points to @@ -2219,11 +2219,11 @@ int of_find_last_cache_level(unsigned int cpu) * * Return: 0 on success or a standard error code on failure. */ -int of_map_rid(struct device_node *np, u32 rid, +int of_map_id(struct device_node *np, u32 id, const char *map_name, const char *map_mask_name, struct device_node **target, u32 *id_out) { - u32 map_mask, masked_rid; + u32 map_mask, masked_id; int map_len; const __be32 *map = NULL; @@ -2235,7 +2235,7 @@ int of_map_rid(struct device_node *np, u32 rid, if (target) return -ENODEV; /* Otherwise, no map implies no translation */ - *id_out = rid; + *id_out = id; return 0; } @@ -2255,22 +2255,22 @@ int of_map_rid(struct device_node *np, u32 rid, if (map_mask_name) of_property_read_u32(np, map_mask_name, &map_mask); - masked_rid = map_mask & rid; + masked_id = map_mask & id; for ( ; map_len > 0; map_len -= 4 * sizeof(*map), map += 4) { struct device_node *phandle_node; - u32 rid_base = be32_to_cpup(map + 0); + u32 id_base = be32_to_cpup(map + 0); u32 phandle = be32_to_cpup(map + 1); u32 out_base = be32_to_cpup(map + 2); - u32 rid_len = be32_to_cpup(map + 3); + u32 id_len = be32_to_cpup(map + 3); - if (rid_base & ~map_mask) { - pr_err("%pOF: Invalid %s translation - %s-mask (0x%x) ignores rid-base (0x%x)\n", + if (id_base & ~map_mask) { + pr_err("%pOF: Invalid %s translation - %s-mask (0x%x) ignores id-base (0x%x)\n", np, map_name, map_name, - map_mask, rid_base); + map_mask, id_base); return -EFAULT; } - if (masked_rid < rid_base || masked_rid >= rid_base + rid_len) + if (masked_id < id_base || masked_id >= id_base + id_len) continue; phandle_node = 
of_find_node_by_phandle(phandle); @@ -2288,20 +2288,20 @@ int of_map_rid(struct device_node *np, u32 rid, } if (id_out) - *id_out = masked_rid - rid_base + out_base; + *id_out = masked_id - id_base + out_base; - pr_debug("%pOF: %s, using mask %08x, rid-base: %08x, out-base: %08x, length: %08x, rid: %08x -> %08x\n", - np, map_name, map_mask, rid_base, out_base, - rid_len, rid, masked_rid - rid_base + out_base); + pr_debug("%pOF: %s, using mask %08x, id-base: %08x, out-base: %08x, length: %08x, id: %08x -> %08x\n", + np, map_name, map_mask, id_base, out_base, + id_len, id, masked_id - id_base + out_base); return 0; } - pr_info("%pOF: no %s translation for rid 0x%x on %pOF\n", np, map_name, - rid, target && *target ? *target : NULL); + pr_info("%pOF: no %s translation for id 0x%x on %pOF\n", np, map_name, + id, target && *target ? *target : NULL); /* Bypasses translation */ if (id_out) - *id_out = rid; + *id_out = id; return 0; } -EXPORT_SYMBOL_GPL(of_map_rid); +EXPORT_SYMBOL_GPL(of_map_id); diff --git a/drivers/of/irq.c b/drivers/of/irq.c index a296eaf52a5b..d632bc5b3a2d 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -587,7 +587,7 @@ static u32 __of_msi_map_rid(struct device *dev, struct device_node **np, * "msi-map" property. 
*/ for (parent_dev = dev; parent_dev; parent_dev = parent_dev->parent) - if (!of_map_rid(parent_dev->of_node, rid_in, "msi-map", + if (!of_map_id(parent_dev->of_node, rid_in, "msi-map", "msi-map-mask", np, &rid_out)) break; return rid_out; diff --git a/include/linux/of.h b/include/linux/of.h index c669c0a4732f..60abe3f636ad 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -554,7 +554,7 @@ bool of_console_check(struct device_node *dn, char *name, int index); extern int of_cpu_node_to_id(struct device_node *np); -int of_map_rid(struct device_node *np, u32 rid, +int of_map_id(struct device_node *np, u32 id, const char *map_name, const char *map_mask_name, struct device_node **target, u32 *id_out); @@ -978,7 +978,7 @@ static inline int of_cpu_node_to_id(struct device_node *np) return -ENODEV; } -static inline int of_map_rid(struct device_node *np, u32 rid, +static inline int of_map_id(struct device_node *np, u32 id, const char *map_name, const char *map_mask_name, struct device_node **target, u32 *id_out) { From a081bd4af4ce80d845a0bab355ab5d0822db8058 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Fri, 19 Jun 2020 09:20:08 +0100 Subject: [PATCH 471/502] of/device: Add input id to of_dma_configure() Devices sitting on proprietary busses have a device ID space that is owned by the respective bus and related firmware bindings. In order to let the generic OF layer handle the input translations to an IOMMU id, for such busses the current of_dma_configure() interface should be extended in order to allow the bus layer to provide the device input id parameter - that is retrieved/assigned in bus specific code and firmware. Augment of_dma_configure() to add an optional input_id parameter, leaving current functionality unchanged. 
Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Cc: Rob Herring Cc: Robin Murphy Cc: Joerg Roedel Cc: Laurentiu Tudor Link: https://lore.kernel.org/r/20200619082013.13661-8-lorenzo.pieralisi@arm.com Signed-off-by: Catalin Marinas --- drivers/bus/fsl-mc/fsl-mc-bus.c | 4 +- drivers/iommu/of_iommu.c | 85 ++++++++++++++++++--------------- drivers/of/device.c | 8 ++-- include/linux/of_device.h | 16 ++++++- include/linux/of_iommu.h | 6 ++- 5 files changed, 72 insertions(+), 47 deletions(-) diff --git a/drivers/bus/fsl-mc/fsl-mc-bus.c b/drivers/bus/fsl-mc/fsl-mc-bus.c index 40526da5c6a6..8ead3f0238f2 100644 --- a/drivers/bus/fsl-mc/fsl-mc-bus.c +++ b/drivers/bus/fsl-mc/fsl-mc-bus.c @@ -118,11 +118,13 @@ static int fsl_mc_bus_uevent(struct device *dev, struct kobj_uevent_env *env) static int fsl_mc_dma_configure(struct device *dev) { struct device *dma_dev = dev; + struct fsl_mc_device *mc_dev = to_fsl_mc_device(dev); + u32 input_id = mc_dev->icid; while (dev_is_fsl_mc(dma_dev)) dma_dev = dma_dev->parent; - return of_dma_configure(dev, dma_dev->of_node, 0); + return of_dma_configure_id(dev, dma_dev->of_node, 0, &input_id); } static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 016316244737..e505b9130a1c 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -118,6 +118,43 @@ static int of_iommu_xlate(struct device *dev, return ret; } +static int of_iommu_configure_dev_id(struct device_node *master_np, + struct device *dev, + const u32 *id) +{ + struct of_phandle_args iommu_spec = { .args_count = 1 }; + int err; + + err = of_map_id(master_np, *id, "iommu-map", + "iommu-map-mask", &iommu_spec.np, + iommu_spec.args); + if (err) + return err == -ENODEV ? 
NO_IOMMU : err; + + err = of_iommu_xlate(dev, &iommu_spec); + of_node_put(iommu_spec.np); + return err; +} + +static int of_iommu_configure_dev(struct device_node *master_np, + struct device *dev) +{ + struct of_phandle_args iommu_spec; + int err = NO_IOMMU, idx = 0; + + while (!of_parse_phandle_with_args(master_np, "iommus", + "#iommu-cells", + idx, &iommu_spec)) { + err = of_iommu_xlate(dev, &iommu_spec); + of_node_put(iommu_spec.np); + idx++; + if (err) + break; + } + + return err; +} + struct of_pci_iommu_alias_info { struct device *dev; struct device_node *np; @@ -126,38 +163,21 @@ struct of_pci_iommu_alias_info { static int of_pci_iommu_init(struct pci_dev *pdev, u16 alias, void *data) { struct of_pci_iommu_alias_info *info = data; - struct of_phandle_args iommu_spec = { .args_count = 1 }; - int err; + u32 input_id = alias; - err = of_map_id(info->np, alias, "iommu-map", "iommu-map-mask", - &iommu_spec.np, iommu_spec.args); - if (err) - return err == -ENODEV ? NO_IOMMU : err; - - err = of_iommu_xlate(info->dev, &iommu_spec); - of_node_put(iommu_spec.np); - return err; + return of_iommu_configure_dev_id(info->np, info->dev, &input_id); } -static int of_fsl_mc_iommu_init(struct fsl_mc_device *mc_dev, - struct device_node *master_np) +static int of_iommu_configure_device(struct device_node *master_np, + struct device *dev, const u32 *id) { - struct of_phandle_args iommu_spec = { .args_count = 1 }; - int err; - - err = of_map_id(master_np, mc_dev->icid, "iommu-map", - "iommu-map-mask", &iommu_spec.np, - iommu_spec.args); - if (err) - return err == -ENODEV ? NO_IOMMU : err; - - err = of_iommu_xlate(&mc_dev->dev, &iommu_spec); - of_node_put(iommu_spec.np); - return err; + return (id) ? 
of_iommu_configure_dev_id(master_np, dev, id) : + of_iommu_configure_dev(master_np, dev); } const struct iommu_ops *of_iommu_configure(struct device *dev, - struct device_node *master_np) + struct device_node *master_np, + const u32 *id) { const struct iommu_ops *ops = NULL; struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); @@ -188,21 +208,8 @@ const struct iommu_ops *of_iommu_configure(struct device *dev, pci_request_acs(); err = pci_for_each_dma_alias(to_pci_dev(dev), of_pci_iommu_init, &info); - } else if (dev_is_fsl_mc(dev)) { - err = of_fsl_mc_iommu_init(to_fsl_mc_device(dev), master_np); } else { - struct of_phandle_args iommu_spec; - int idx = 0; - - while (!of_parse_phandle_with_args(master_np, "iommus", - "#iommu-cells", - idx, &iommu_spec)) { - err = of_iommu_xlate(dev, &iommu_spec); - of_node_put(iommu_spec.np); - idx++; - if (err) - break; - } + err = of_iommu_configure_device(master_np, dev, id); fwspec = dev_iommu_fwspec_get(dev); if (!err && fwspec) diff --git a/drivers/of/device.c b/drivers/of/device.c index 27203bfd0b22..b439c1e05434 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -78,6 +78,7 @@ int of_device_add(struct platform_device *ofdev) * @np: Pointer to OF node having DMA configuration * @force_dma: Whether device is to be set up by of_dma_configure() even if * DMA capability is not explicitly described by firmware. + * @id: Optional const pointer value input id * * Try to get devices's DMA configuration from DT and update it * accordingly. @@ -86,7 +87,8 @@ int of_device_add(struct platform_device *ofdev) * can use a platform bus notifier and handle BUS_NOTIFY_ADD_DEVICE events * to fix up DMA configuration. 
*/ -int of_dma_configure(struct device *dev, struct device_node *np, bool force_dma) +int of_dma_configure_id(struct device *dev, struct device_node *np, + bool force_dma, const u32 *id) { u64 dma_addr, paddr, size = 0; int ret; @@ -160,7 +162,7 @@ int of_dma_configure(struct device *dev, struct device_node *np, bool force_dma) dev_dbg(dev, "device is%sdma coherent\n", coherent ? " " : " not "); - iommu = of_iommu_configure(dev, np); + iommu = of_iommu_configure(dev, np, id); if (PTR_ERR(iommu) == -EPROBE_DEFER) return -EPROBE_DEFER; @@ -171,7 +173,7 @@ int of_dma_configure(struct device *dev, struct device_node *np, bool force_dma) return 0; } -EXPORT_SYMBOL_GPL(of_dma_configure); +EXPORT_SYMBOL_GPL(of_dma_configure_id); int of_device_register(struct platform_device *pdev) { diff --git a/include/linux/of_device.h b/include/linux/of_device.h index 8d31e39dd564..07ca187fc5e4 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -55,9 +55,15 @@ static inline struct device_node *of_cpu_device_node_get(int cpu) return of_node_get(cpu_dev->of_node); } -int of_dma_configure(struct device *dev, +int of_dma_configure_id(struct device *dev, struct device_node *np, - bool force_dma); + bool force_dma, const u32 *id); +static inline int of_dma_configure(struct device *dev, + struct device_node *np, + bool force_dma) +{ + return of_dma_configure_id(dev, np, force_dma, NULL); +} #else /* CONFIG_OF */ static inline int of_driver_match_device(struct device *dev, @@ -106,6 +112,12 @@ static inline struct device_node *of_cpu_device_node_get(int cpu) return NULL; } +static inline int of_dma_configure_id(struct device *dev, + struct device_node *np, + bool force_dma) +{ + return 0; +} static inline int of_dma_configure(struct device *dev, struct device_node *np, bool force_dma) diff --git a/include/linux/of_iommu.h b/include/linux/of_iommu.h index f3d40dd7bb66..16f4b3e87f20 100644 --- a/include/linux/of_iommu.h +++ b/include/linux/of_iommu.h @@ -13,7 +13,8 @@ 
extern int of_get_dma_window(struct device_node *dn, const char *prefix, size_t *size); extern const struct iommu_ops *of_iommu_configure(struct device *dev, - struct device_node *master_np); + struct device_node *master_np, + const u32 *id); #else @@ -25,7 +26,8 @@ static inline int of_get_dma_window(struct device_node *dn, const char *prefix, } static inline const struct iommu_ops *of_iommu_configure(struct device *dev, - struct device_node *master_np) + struct device_node *master_np, + const u32 *id) { return NULL; } From 5bda70c6162de9536cc983eacd24261c9c5de596 Mon Sep 17 00:00:00 2001 From: Laurentiu Tudor Date: Fri, 19 Jun 2020 09:20:09 +0100 Subject: [PATCH 472/502] dt-bindings: arm: fsl: Add msi-map device-tree binding for fsl-mc bus The existing bindings cannot be used to specify the relationship between fsl-mc devices and GIC ITSes. Add a generic binding for mapping fsl-mc devices to GIC ITSes, using msi-map property. In addition, deprecate msi-parent property which no longer makes sense now that we support translating the MSIs. Signed-off-by: Laurentiu Tudor Signed-off-by: Diana Craciun Reviewed-by: Rob Herring Cc: Rob Herring Link: https://lore.kernel.org/r/20200619082013.13661-9-lorenzo.pieralisi@arm.com Signed-off-by: Catalin Marinas --- .../devicetree/bindings/misc/fsl,qoriq-mc.txt | 50 ++++++++++++++++--- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt b/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt index 9134e9bcca56..ebd329181c14 100644 --- a/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt +++ b/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt @@ -28,6 +28,16 @@ Documentation/devicetree/bindings/iommu/iommu.txt. For arm-smmu binding, see: Documentation/devicetree/bindings/iommu/arm,smmu.yaml. +The MSI writes are accompanied by sideband data which is derived from the ICID. 
+The msi-map property is used to associate the devices with both the ITS +controller and the sideband data which accompanies the writes. + +For generic MSI bindings, see +Documentation/devicetree/bindings/interrupt-controller/msi.txt. + +For GICv3 and GIC ITS bindings, see: +Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.yaml. + Required properties: - compatible @@ -49,11 +59,6 @@ Required properties: region may not be present in some scenarios, such as in the device tree presented to a virtual machine. - - msi-parent - Value type: <phandle> - Definition: Must be present and point to the MSI controller node - handling message interrupts for the MC. - - ranges Value type: <prop-encoded-array> Definition: A standard property. Defines the mapping between the child @@ -119,6 +124,28 @@ Optional properties: associated with the listed IOMMU, with the iommu-specifier (i - icid-base + iommu-base). +- msi-map: Maps an ICID to a GIC ITS and associated msi-specifier + data. + + The property is an arbitrary number of tuples of + (icid-base,gic-its,msi-base,length). + + Any ICID in the interval [icid-base, icid-base + length) is + associated with the listed GIC ITS, with the msi-specifier + (i - icid-base + msi-base). + +Deprecated properties: + + - msi-parent + Value type: <phandle> + Definition: Describes the MSI controller node handling message + interrupts for the MC. When there is no translation + between the ICID and deviceID this property can be used + to describe the MSI controller used by the devices on the + mc-bus. + The use of this property for mc-bus is deprecated. Please + use msi-map. + Example: smmu: iommu@5000000 { @@ -128,13 +155,24 @@ Example: ... }; + gic: interrupt-controller@6000000 { + compatible = "arm,gic-v3"; + ... + }; + its: gic-its@6020000 { + compatible = "arm,gic-v3-its"; + msi-controller; + ... 
+ }; + fsl_mc: fsl-mc@80c000000 { compatible = "fsl,qoriq-mc"; reg = <0x00000008 0x0c000000 0 0x40>, /* MC portal base */ <0x00000000 0x08340000 0 0x40000>; /* MC control reg */ - msi-parent = <&its>; /* define map for ICIDs 23-64 */ iommu-map = <23 &smmu 23 41>; + /* define msi map for ICIDs 23-64 */ + msi-map = <23 &its 23 41>; #address-cells = <3>; #size-cells = <1>; From 6f881aba01109a01a43e4f135673c19190f61133 Mon Sep 17 00:00:00 2001 From: Diana Craciun Date: Fri, 19 Jun 2020 09:20:10 +0100 Subject: [PATCH 473/502] of/irq: make of_msi_map_get_device_domain() bus agnostic of_msi_map_get_device_domain() is PCI specific but it need not be and can be easily changed to be bus agnostic in order to be used by other busses by adding an IRQ domain bus token as an input parameter. Signed-off-by: Diana Craciun Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Acked-by: Bjorn Helgaas # pci/msi.c Cc: Bjorn Helgaas Cc: Rob Herring Cc: Marc Zyngier Link: https://lore.kernel.org/r/20200619082013.13661-10-lorenzo.pieralisi@arm.com Signed-off-by: Catalin Marinas --- drivers/of/irq.c | 8 +++++--- drivers/pci/msi.c | 2 +- include/linux/of_irq.h | 5 +++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/of/irq.c b/drivers/of/irq.c index d632bc5b3a2d..1005e4f349ef 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -613,18 +613,20 @@ u32 of_msi_map_rid(struct device *dev, struct device_node *msi_np, u32 rid_in) * of_msi_map_get_device_domain - Use msi-map to find the relevant MSI domain * @dev: device for which the mapping is to be done. * @rid: Requester ID for the device. + * @bus_token: Bus token * * Walk up the device hierarchy looking for devices with a "msi-map" * property. 
* * Returns: the MSI domain for this device (or NULL on failure) */ -struct irq_domain *of_msi_map_get_device_domain(struct device *dev, u32 rid) +struct irq_domain *of_msi_map_get_device_domain(struct device *dev, u32 id, + u32 bus_token) { struct device_node *np = NULL; - __of_msi_map_rid(dev, &np, rid); - return irq_find_matching_host(np, DOMAIN_BUS_PCI_MSI); + __of_msi_map_rid(dev, &np, id); + return irq_find_matching_host(np, bus_token); } /** diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 77f48b95e277..b4bfe0b03b2d 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -1556,7 +1556,7 @@ struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev) u32 rid = pci_dev_id(pdev); pci_for_each_dma_alias(pdev, get_msi_id_cb, &rid); - dom = of_msi_map_get_device_domain(&pdev->dev, rid); + dom = of_msi_map_get_device_domain(&pdev->dev, rid, DOMAIN_BUS_PCI_MSI); if (!dom) dom = iort_get_device_domain(&pdev->dev, rid, DOMAIN_BUS_PCI_MSI); diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index 1214cabb2247..7142a3722758 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -52,7 +52,8 @@ extern struct irq_domain *of_msi_get_domain(struct device *dev, struct device_node *np, enum irq_domain_bus_token token); extern struct irq_domain *of_msi_map_get_device_domain(struct device *dev, - u32 rid); + u32 id, + u32 bus_token); extern void of_msi_configure(struct device *dev, struct device_node *np); u32 of_msi_map_rid(struct device *dev, struct device_node *msi_np, u32 rid_in); #else @@ -85,7 +86,7 @@ static inline struct irq_domain *of_msi_get_domain(struct device *dev, return NULL; } static inline struct irq_domain *of_msi_map_get_device_domain(struct device *dev, - u32 rid) + u32 id, u32 bus_token) { return NULL; } From 2bcdd8f2c07f1aa1bfd34fa0dab8e06949e34846 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Fri, 19 Jun 2020 09:20:11 +0100 Subject: [PATCH 474/502] of/irq: Make of_msi_map_rid() PCI bus agnostic There is 
nothing PCI bus specific in the of_msi_map_rid() implementation other than the requester ID tag for the input ID space. Rename requester ID to a more generic ID so that the translation code can be used by all busses that require input/output ID translations. No functional change intended. Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Cc: Bjorn Helgaas Cc: Rob Herring Cc: Marc Zyngier Link: https://lore.kernel.org/r/20200619082013.13661-11-lorenzo.pieralisi@arm.com Signed-off-by: Catalin Marinas --- drivers/of/irq.c | 28 ++++++++++++++-------------- drivers/pci/msi.c | 2 +- include/linux/of_irq.h | 8 ++++---- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/drivers/of/irq.c b/drivers/of/irq.c index 1005e4f349ef..25d17b8a1a1a 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -576,43 +576,43 @@ err: } } -static u32 __of_msi_map_rid(struct device *dev, struct device_node **np, - u32 rid_in) +static u32 __of_msi_map_id(struct device *dev, struct device_node **np, + u32 id_in) { struct device *parent_dev; - u32 rid_out = rid_in; + u32 id_out = id_in; /* * Walk up the device parent links looking for one with a * "msi-map" property. */ for (parent_dev = dev; parent_dev; parent_dev = parent_dev->parent) - if (!of_map_id(parent_dev->of_node, rid_in, "msi-map", - "msi-map-mask", np, &rid_out)) + if (!of_map_id(parent_dev->of_node, id_in, "msi-map", + "msi-map-mask", np, &id_out)) break; - return rid_out; + return id_out; } /** - * of_msi_map_rid - Map a MSI requester ID for a device. + * of_msi_map_id - Map a MSI ID for a device. * @dev: device for which the mapping is to be done. * @msi_np: device node of the expected msi controller. - * @rid_in: unmapped MSI requester ID for the device. + * @id_in: unmapped MSI ID for the device. * * Walk up the device hierarchy looking for devices with a "msi-map" - * property. If found, apply the mapping to @rid_in. + * property. If found, apply the mapping to @id_in. 
* - * Returns the mapped MSI requester ID. + * Returns the mapped MSI ID. */ -u32 of_msi_map_rid(struct device *dev, struct device_node *msi_np, u32 rid_in) +u32 of_msi_map_id(struct device *dev, struct device_node *msi_np, u32 id_in) { - return __of_msi_map_rid(dev, &msi_np, rid_in); + return __of_msi_map_id(dev, &msi_np, id_in); } /** * of_msi_map_get_device_domain - Use msi-map to find the relevant MSI domain * @dev: device for which the mapping is to be done. - * @rid: Requester ID for the device. + * @id: Device ID. * @bus_token: Bus token * * Walk up the device hierarchy looking for devices with a "msi-map" @@ -625,7 +625,7 @@ struct irq_domain *of_msi_map_get_device_domain(struct device *dev, u32 id, { struct device_node *np = NULL; - __of_msi_map_rid(dev, &np, id); + __of_msi_map_id(dev, &np, id); return irq_find_matching_host(np, bus_token); } diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index b4bfe0b03b2d..19aeadb22f11 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -1535,7 +1535,7 @@ u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev) pci_for_each_dma_alias(pdev, get_msi_id_cb, &rid); of_node = irq_domain_get_of_node(domain); - rid = of_node ? of_msi_map_rid(&pdev->dev, of_node, rid) : + rid = of_node ? 
of_msi_map_id(&pdev->dev, of_node, rid) : iort_msi_map_id(&pdev->dev, rid); return rid; diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index 7142a3722758..e8b78139f78c 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -55,7 +55,7 @@ extern struct irq_domain *of_msi_map_get_device_domain(struct device *dev, u32 id, u32 bus_token); extern void of_msi_configure(struct device *dev, struct device_node *np); -u32 of_msi_map_rid(struct device *dev, struct device_node *msi_np, u32 rid_in); +u32 of_msi_map_id(struct device *dev, struct device_node *msi_np, u32 id_in); #else static inline int of_irq_count(struct device_node *dev) { @@ -93,10 +93,10 @@ static inline struct irq_domain *of_msi_map_get_device_domain(struct device *dev static inline void of_msi_configure(struct device *dev, struct device_node *np) { } -static inline u32 of_msi_map_rid(struct device *dev, - struct device_node *msi_np, u32 rid_in) +static inline u32 of_msi_map_id(struct device *dev, + struct device_node *msi_np, u32 id_in) { - return rid_in; + return id_in; } #endif From 998fb7badf0362a2057694878098642ef363d899 Mon Sep 17 00:00:00 2001 From: Diana Craciun Date: Fri, 19 Jun 2020 09:20:12 +0100 Subject: [PATCH 475/502] bus/fsl-mc: Refactor the MSI domain creation in the DPRC driver The DPRC driver is not taking into account the msi-map property and assumes that the icid is the same as the stream ID. Although this assumption is correct, generalize the code to include a translation between icid and streamID. Furthermore do not just copy the MSI domain from parent (for child containers), but use the information provided by the msi-map property. If the msi-map property is missing from the device tree retain the old behaviour for backward compatibility ie the child DPRC objects inherit the MSI domain from the parent. 
Signed-off-by: Diana Craciun Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20200619082013.13661-12-lorenzo.pieralisi@arm.com Signed-off-by: Catalin Marinas --- drivers/bus/fsl-mc/dprc-driver.c | 31 ++++++--------------- drivers/bus/fsl-mc/fsl-mc-bus.c | 4 +-- drivers/bus/fsl-mc/fsl-mc-msi.c | 29 +++++++++++-------- drivers/bus/fsl-mc/fsl-mc-private.h | 6 ++-- drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c | 15 +++++++++- 5 files changed, 46 insertions(+), 39 deletions(-) diff --git a/drivers/bus/fsl-mc/dprc-driver.c b/drivers/bus/fsl-mc/dprc-driver.c index c8b1c3842c1a..189bff2115a8 100644 --- a/drivers/bus/fsl-mc/dprc-driver.c +++ b/drivers/bus/fsl-mc/dprc-driver.c @@ -592,6 +592,7 @@ static int dprc_probe(struct fsl_mc_device *mc_dev) bool mc_io_created = false; bool msi_domain_set = false; u16 major_ver, minor_ver; + struct irq_domain *mc_msi_domain; if (!is_fsl_mc_bus_dprc(mc_dev)) return -EINVAL; @@ -621,31 +622,15 @@ static int dprc_probe(struct fsl_mc_device *mc_dev) return error; mc_io_created = true; + } - /* - * Inherit parent MSI domain: - */ - dev_set_msi_domain(&mc_dev->dev, - dev_get_msi_domain(parent_dev)); - msi_domain_set = true; + mc_msi_domain = fsl_mc_find_msi_domain(&mc_dev->dev); + if (!mc_msi_domain) { + dev_warn(&mc_dev->dev, + "WARNING: MC bus without interrupt support\n"); } else { - /* - * This is a root DPRC - */ - struct irq_domain *mc_msi_domain; - - if (dev_is_fsl_mc(parent_dev)) - return -EINVAL; - - error = fsl_mc_find_msi_domain(parent_dev, - &mc_msi_domain); - if (error < 0) { - dev_warn(&mc_dev->dev, - "WARNING: MC bus without interrupt support\n"); - } else { - dev_set_msi_domain(&mc_dev->dev, mc_msi_domain); - msi_domain_set = true; - } + dev_set_msi_domain(&mc_dev->dev, mc_msi_domain); + msi_domain_set = true; } error = dprc_open(mc_dev->mc_io, 0, mc_dev->obj_desc.id, diff --git a/drivers/bus/fsl-mc/fsl-mc-bus.c b/drivers/bus/fsl-mc/fsl-mc-bus.c index 8ead3f0238f2..824ff77bbe86 100644 --- 
a/drivers/bus/fsl-mc/fsl-mc-bus.c +++ b/drivers/bus/fsl-mc/fsl-mc-bus.c @@ -370,8 +370,8 @@ EXPORT_SYMBOL_GPL(fsl_mc_get_version); /** * fsl_mc_get_root_dprc - function to traverse to the root dprc */ -static void fsl_mc_get_root_dprc(struct device *dev, - struct device **root_dprc_dev) +void fsl_mc_get_root_dprc(struct device *dev, + struct device **root_dprc_dev) { if (!dev) { *root_dprc_dev = NULL; diff --git a/drivers/bus/fsl-mc/fsl-mc-msi.c b/drivers/bus/fsl-mc/fsl-mc-msi.c index 8b9c66d7c4ff..e7bbff445a83 100644 --- a/drivers/bus/fsl-mc/fsl-mc-msi.c +++ b/drivers/bus/fsl-mc/fsl-mc-msi.c @@ -177,23 +177,30 @@ struct irq_domain *fsl_mc_msi_create_irq_domain(struct fwnode_handle *fwnode, return domain; } -int fsl_mc_find_msi_domain(struct device *mc_platform_dev, - struct irq_domain **mc_msi_domain) +struct irq_domain *fsl_mc_find_msi_domain(struct device *dev) { - struct irq_domain *msi_domain; - struct device_node *mc_of_node = mc_platform_dev->of_node; + struct irq_domain *msi_domain = NULL; + struct fsl_mc_device *mc_dev = to_fsl_mc_device(dev); - msi_domain = of_msi_get_domain(mc_platform_dev, mc_of_node, - DOMAIN_BUS_FSL_MC_MSI); + msi_domain = of_msi_map_get_device_domain(dev, mc_dev->icid, + DOMAIN_BUS_FSL_MC_MSI); + + /* + * if the msi-map property is missing assume that all the + * child containers inherit the domain from the parent + */ if (!msi_domain) { - pr_err("Unable to find fsl-mc MSI domain for %pOF\n", - mc_of_node); + struct device *root_dprc_dev; + struct device *bus_dev; - return -ENOENT; + fsl_mc_get_root_dprc(dev, &root_dprc_dev); + bus_dev = root_dprc_dev->parent; + msi_domain = of_msi_get_domain(bus_dev, + bus_dev->of_node, + DOMAIN_BUS_FSL_MC_MSI); } - *mc_msi_domain = msi_domain; - return 0; + return msi_domain; } static void fsl_mc_msi_free_descs(struct device *dev) diff --git a/drivers/bus/fsl-mc/fsl-mc-private.h b/drivers/bus/fsl-mc/fsl-mc-private.h index 21ca8c756ee7..7a46a12eb747 100644 --- a/drivers/bus/fsl-mc/fsl-mc-private.h 
+++ b/drivers/bus/fsl-mc/fsl-mc-private.h @@ -595,8 +595,7 @@ int fsl_mc_msi_domain_alloc_irqs(struct device *dev, void fsl_mc_msi_domain_free_irqs(struct device *dev); -int fsl_mc_find_msi_domain(struct device *mc_platform_dev, - struct irq_domain **mc_msi_domain); +struct irq_domain *fsl_mc_find_msi_domain(struct device *dev); int fsl_mc_populate_irq_pool(struct fsl_mc_bus *mc_bus, unsigned int irq_count); @@ -613,6 +612,9 @@ void fsl_destroy_mc_io(struct fsl_mc_io *mc_io); bool fsl_mc_is_root_dprc(struct device *dev); +void fsl_mc_get_root_dprc(struct device *dev, + struct device **root_dprc_dev); + struct fsl_mc_device *fsl_mc_device_lookup(struct fsl_mc_obj_desc *obj_desc, struct fsl_mc_device *mc_bus_dev); diff --git a/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c b/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c index 606efa64adff..a5c8d577e424 100644 --- a/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c +++ b/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c @@ -23,6 +23,18 @@ static struct irq_chip its_msi_irq_chip = { .irq_set_affinity = msi_domain_set_affinity }; +static u32 fsl_mc_msi_domain_get_msi_id(struct irq_domain *domain, + struct fsl_mc_device *mc_dev) +{ + struct device_node *of_node; + u32 out_id; + + of_node = irq_domain_get_of_node(domain); + out_id = of_msi_map_id(&mc_dev->dev, of_node, mc_dev->icid); + + return out_id; +} + static int its_fsl_mc_msi_prepare(struct irq_domain *msi_domain, struct device *dev, int nvec, msi_alloc_info_t *info) @@ -43,7 +55,8 @@ static int its_fsl_mc_msi_prepare(struct irq_domain *msi_domain, * NOTE: This device id corresponds to the IOMMU stream ID * associated with the DPRC object (ICID). 
*/ - info->scratchpad[0].ul = mc_bus_dev->icid; + info->scratchpad[0].ul = fsl_mc_msi_domain_get_msi_id(msi_domain, + mc_bus_dev); msi_info = msi_get_domain_info(msi_domain->parent); /* Allocate at least 32 MSIs, and always as a power of 2 */ From 6305166c8771c33a8d5992fb53f93cfecedc14fd Mon Sep 17 00:00:00 2001 From: Makarand Pawagi Date: Fri, 19 Jun 2020 09:20:13 +0100 Subject: [PATCH 476/502] bus: fsl-mc: Add ACPI support for fsl-mc Add ACPI support in the fsl-mc driver. Driver parses MC DSDT table to extract memory and other resources. Interrupt (GIC ITS) information is extracted from the MADT table by drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c. IORT table is parsed to configure DMA. Signed-off-by: Makarand Pawagi Signed-off-by: Diana Craciun Signed-off-by: Laurentiu Tudor Link: https://lore.kernel.org/r/20200619082013.13661-13-lorenzo.pieralisi@arm.com Signed-off-by: Catalin Marinas --- drivers/bus/fsl-mc/fsl-mc-bus.c | 73 ++++++++++++---- drivers/bus/fsl-mc/fsl-mc-msi.c | 35 ++++---- drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c | 92 ++++++++++++++++----- 3 files changed, 149 insertions(+), 51 deletions(-) diff --git a/drivers/bus/fsl-mc/fsl-mc-bus.c b/drivers/bus/fsl-mc/fsl-mc-bus.c index 824ff77bbe86..324d49d6df89 100644 --- a/drivers/bus/fsl-mc/fsl-mc-bus.c +++ b/drivers/bus/fsl-mc/fsl-mc-bus.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include "fsl-mc-private.h" @@ -38,6 +40,7 @@ struct fsl_mc { struct fsl_mc_device *root_mc_bus_dev; u8 num_translation_ranges; struct fsl_mc_addr_translation_range *translation_ranges; + void *fsl_mc_regs; }; /** @@ -56,6 +59,10 @@ struct fsl_mc_addr_translation_range { phys_addr_t start_phys_addr; }; +#define FSL_MC_FAPR 0x28 +#define MC_FAPR_PL BIT(18) +#define MC_FAPR_BMT BIT(17) + /** * fsl_mc_bus_match - device to driver matching callback * @dev: the fsl-mc device to match against @@ -124,7 +131,10 @@ static int fsl_mc_dma_configure(struct device *dev) while (dev_is_fsl_mc(dma_dev)) dma_dev = 
dma_dev->parent; - return of_dma_configure_id(dev, dma_dev->of_node, 0, &input_id); + if (dev_of_node(dma_dev)) + return of_dma_configure_id(dev, dma_dev->of_node, 0, &input_id); + + return acpi_dma_configure_id(dev, DEV_DMA_COHERENT, &input_id); } static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, @@ -865,8 +875,11 @@ static int fsl_mc_bus_probe(struct platform_device *pdev) struct fsl_mc_io *mc_io = NULL; int container_id; phys_addr_t mc_portal_phys_addr; - u32 mc_portal_size; - struct resource res; + u32 mc_portal_size, mc_stream_id; + struct resource *plat_res; + + if (!iommu_present(&fsl_mc_bus_type)) + return -EPROBE_DEFER; mc = devm_kzalloc(&pdev->dev, sizeof(*mc), GFP_KERNEL); if (!mc) @@ -874,19 +887,33 @@ static int fsl_mc_bus_probe(struct platform_device *pdev) platform_set_drvdata(pdev, mc); + plat_res = platform_get_resource(pdev, IORESOURCE_MEM, 1); + mc->fsl_mc_regs = devm_ioremap_resource(&pdev->dev, plat_res); + if (IS_ERR(mc->fsl_mc_regs)) + return PTR_ERR(mc->fsl_mc_regs); + + if (IS_ENABLED(CONFIG_ACPI) && !dev_of_node(&pdev->dev)) { + mc_stream_id = readl(mc->fsl_mc_regs + FSL_MC_FAPR); + /* + * HW ORs the PL and BMT bit, places the result in bit 15 of + * the StreamID and ORs in the ICID. Calculate it accordingly. + */ + mc_stream_id = (mc_stream_id & 0xffff) | + ((mc_stream_id & (MC_FAPR_PL | MC_FAPR_BMT)) ? 
+ 0x4000 : 0); + error = acpi_dma_configure_id(&pdev->dev, DEV_DMA_COHERENT, + &mc_stream_id); + if (error) + dev_warn(&pdev->dev, "failed to configure dma: %d.\n", + error); + } + /* * Get physical address of MC portal for the root DPRC: */ - error = of_address_to_resource(pdev->dev.of_node, 0, &res); - if (error < 0) { - dev_err(&pdev->dev, - "of_address_to_resource() failed for %pOF\n", - pdev->dev.of_node); - return error; - } - - mc_portal_phys_addr = res.start; - mc_portal_size = resource_size(&res); + plat_res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + mc_portal_phys_addr = plat_res->start; + mc_portal_size = resource_size(plat_res); error = fsl_create_mc_io(&pdev->dev, mc_portal_phys_addr, mc_portal_size, NULL, FSL_MC_IO_ATOMIC_CONTEXT_PORTAL, &mc_io); @@ -903,11 +930,13 @@ static int fsl_mc_bus_probe(struct platform_device *pdev) dev_info(&pdev->dev, "MC firmware version: %u.%u.%u\n", mc_version.major, mc_version.minor, mc_version.revision); - error = get_mc_addr_translation_ranges(&pdev->dev, - &mc->translation_ranges, - &mc->num_translation_ranges); - if (error < 0) - goto error_cleanup_mc_io; + if (dev_of_node(&pdev->dev)) { + error = get_mc_addr_translation_ranges(&pdev->dev, + &mc->translation_ranges, + &mc->num_translation_ranges); + if (error < 0) + goto error_cleanup_mc_io; + } error = dprc_get_container_id(mc_io, 0, &container_id); if (error < 0) { @@ -934,6 +963,7 @@ static int fsl_mc_bus_probe(struct platform_device *pdev) goto error_cleanup_mc_io; mc->root_mc_bus_dev = mc_bus_dev; + mc_bus_dev->dev.fwnode = pdev->dev.fwnode; return 0; error_cleanup_mc_io: @@ -967,11 +997,18 @@ static const struct of_device_id fsl_mc_bus_match_table[] = { MODULE_DEVICE_TABLE(of, fsl_mc_bus_match_table); +static const struct acpi_device_id fsl_mc_bus_acpi_match_table[] = { + {"NXP0008", 0 }, + { } +}; +MODULE_DEVICE_TABLE(acpi, fsl_mc_bus_acpi_match_table); + static struct platform_driver fsl_mc_bus_driver = { .driver = { .name = "fsl_mc_bus", .pm = 
NULL, .of_match_table = fsl_mc_bus_match_table, + .acpi_match_table = fsl_mc_bus_acpi_match_table, }, .probe = fsl_mc_bus_probe, .remove = fsl_mc_bus_remove, diff --git a/drivers/bus/fsl-mc/fsl-mc-msi.c b/drivers/bus/fsl-mc/fsl-mc-msi.c index e7bbff445a83..8edadf05cbb7 100644 --- a/drivers/bus/fsl-mc/fsl-mc-msi.c +++ b/drivers/bus/fsl-mc/fsl-mc-msi.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "fsl-mc-private.h" @@ -179,25 +180,31 @@ struct irq_domain *fsl_mc_msi_create_irq_domain(struct fwnode_handle *fwnode, struct irq_domain *fsl_mc_find_msi_domain(struct device *dev) { - struct irq_domain *msi_domain = NULL; + struct device *root_dprc_dev; + struct device *bus_dev; + struct irq_domain *msi_domain; struct fsl_mc_device *mc_dev = to_fsl_mc_device(dev); - msi_domain = of_msi_map_get_device_domain(dev, mc_dev->icid, + fsl_mc_get_root_dprc(dev, &root_dprc_dev); + bus_dev = root_dprc_dev->parent; + + if (bus_dev->of_node) { + msi_domain = of_msi_map_get_device_domain(dev, + mc_dev->icid, DOMAIN_BUS_FSL_MC_MSI); - /* - * if the msi-map property is missing assume that all the - * child containers inherit the domain from the parent - */ - if (!msi_domain) { - struct device *root_dprc_dev; - struct device *bus_dev; + /* + * if the msi-map property is missing assume that all the + * child containers inherit the domain from the parent + */ + if (!msi_domain) - fsl_mc_get_root_dprc(dev, &root_dprc_dev); - bus_dev = root_dprc_dev->parent; - msi_domain = of_msi_get_domain(bus_dev, - bus_dev->of_node, - DOMAIN_BUS_FSL_MC_MSI); + msi_domain = of_msi_get_domain(bus_dev, + bus_dev->of_node, + DOMAIN_BUS_FSL_MC_MSI); + } else { + msi_domain = iort_get_device_domain(dev, mc_dev->icid, + DOMAIN_BUS_FSL_MC_MSI); } return msi_domain; diff --git a/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c b/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c index a5c8d577e424..634263dfd7b5 100644 --- a/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c +++ 
b/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c @@ -7,6 +7,8 @@ * */ +#include +#include #include #include #include @@ -30,7 +32,8 @@ static u32 fsl_mc_msi_domain_get_msi_id(struct irq_domain *domain, u32 out_id; of_node = irq_domain_get_of_node(domain); - out_id = of_msi_map_id(&mc_dev->dev, of_node, mc_dev->icid); + out_id = of_node ? of_msi_map_id(&mc_dev->dev, of_node, mc_dev->icid) : + iort_msi_map_id(&mc_dev->dev, mc_dev->icid); return out_id; } @@ -79,12 +82,71 @@ static const struct of_device_id its_device_id[] = { {}, }; -static int __init its_fsl_mc_msi_init(void) +static void __init its_fsl_mc_msi_init_one(struct fwnode_handle *handle, + const char *name) { - struct device_node *np; struct irq_domain *parent; struct irq_domain *mc_msi_domain; + parent = irq_find_matching_fwnode(handle, DOMAIN_BUS_NEXUS); + if (!parent || !msi_get_domain_info(parent)) { + pr_err("%s: unable to locate ITS domain\n", name); + return; + } + + mc_msi_domain = fsl_mc_msi_create_irq_domain(handle, + &its_fsl_mc_msi_domain_info, + parent); + if (!mc_msi_domain) { + pr_err("%s: unable to create fsl-mc domain\n", name); + return; + } + + pr_info("fsl-mc MSI: %s domain created\n", name); +} + +#ifdef CONFIG_ACPI +static int __init +its_fsl_mc_msi_parse_madt(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_madt_generic_translator *its_entry; + struct fwnode_handle *dom_handle; + const char *node_name; + int err = 0; + + its_entry = (struct acpi_madt_generic_translator *)header; + node_name = kasprintf(GFP_KERNEL, "ITS@0x%lx", + (long)its_entry->base_address); + + dom_handle = iort_find_domain_token(its_entry->translation_id); + if (!dom_handle) { + pr_err("%s: Unable to locate ITS domain handle\n", node_name); + err = -ENXIO; + goto out; + } + + its_fsl_mc_msi_init_one(dom_handle, node_name); + +out: + kfree(node_name); + return err; +} + + +static void __init its_fsl_mc_acpi_msi_init(void) +{ + acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_TRANSLATOR, + 
its_fsl_mc_msi_parse_madt, 0); +} +#else +static inline void its_fsl_mc_acpi_msi_init(void) { } +#endif + +static void __init its_fsl_mc_of_msi_init(void) +{ + struct device_node *np; + for (np = of_find_matching_node(NULL, its_device_id); np; np = of_find_matching_node(np, its_device_id)) { if (!of_device_is_available(np)) @@ -92,23 +154,15 @@ static int __init its_fsl_mc_msi_init(void) if (!of_property_read_bool(np, "msi-controller")) continue; - parent = irq_find_matching_host(np, DOMAIN_BUS_NEXUS); - if (!parent || !msi_get_domain_info(parent)) { - pr_err("%pOF: unable to locate ITS domain\n", np); - continue; - } - - mc_msi_domain = fsl_mc_msi_create_irq_domain( - of_node_to_fwnode(np), - &its_fsl_mc_msi_domain_info, - parent); - if (!mc_msi_domain) { - pr_err("%pOF: unable to create fsl-mc domain\n", np); - continue; - } - - pr_info("fsl-mc MSI: %pOF domain created\n", np); + its_fsl_mc_msi_init_one(of_node_to_fwnode(np), + np->full_name); } +} + +static int __init its_fsl_mc_msi_init(void) +{ + its_fsl_mc_of_msi_init(); + its_fsl_mc_acpi_msi_init(); return 0; } From 7ca8cf5347f720b07a0b32a924b768f5710547e7 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 29 Jul 2020 22:31:05 +1000 Subject: [PATCH 477/502] locking/atomic: Move ATOMIC_INIT into linux/types.h This patch moves ATOMIC_INIT from asm/atomic.h into linux/types.h. This allows users of atomic_t to use ATOMIC_INIT without having to include atomic.h as that way may lead to header loops. 
Signed-off-by: Herbert Xu Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Link: https://lkml.kernel.org/r/20200729123105.GB7047@gondor.apana.org.au --- arch/alpha/include/asm/atomic.h | 1 - arch/arc/include/asm/atomic.h | 2 -- arch/arm/include/asm/atomic.h | 2 -- arch/arm64/include/asm/atomic.h | 2 -- arch/h8300/include/asm/atomic.h | 2 -- arch/hexagon/include/asm/atomic.h | 2 -- arch/ia64/include/asm/atomic.h | 1 - arch/m68k/include/asm/atomic.h | 2 -- arch/mips/include/asm/atomic.h | 1 - arch/parisc/include/asm/atomic.h | 2 -- arch/powerpc/include/asm/atomic.h | 2 -- arch/riscv/include/asm/atomic.h | 2 -- arch/s390/include/asm/atomic.h | 2 -- arch/sh/include/asm/atomic.h | 2 -- arch/sparc/include/asm/atomic_32.h | 2 -- arch/sparc/include/asm/atomic_64.h | 1 - arch/x86/include/asm/atomic.h | 2 -- arch/xtensa/include/asm/atomic.h | 2 -- include/asm-generic/atomic.h | 2 -- include/linux/types.h | 2 ++ 20 files changed, 2 insertions(+), 34 deletions(-) diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h index 2144530d1428..e2093994fd0d 100644 --- a/arch/alpha/include/asm/atomic.h +++ b/arch/alpha/include/asm/atomic.h @@ -24,7 +24,6 @@ #define __atomic_acquire_fence() #define __atomic_post_full_fence() -#define ATOMIC_INIT(i) { (i) } #define ATOMIC64_INIT(i) { (i) } #define atomic_read(v) READ_ONCE((v)->counter) diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h index 7298ce84762e..c614857eb209 100644 --- a/arch/arc/include/asm/atomic.h +++ b/arch/arc/include/asm/atomic.h @@ -14,8 +14,6 @@ #include #include -#define ATOMIC_INIT(i) { (i) } - #ifndef CONFIG_ARC_PLAT_EZNPS #define atomic_read(v) READ_ONCE((v)->counter) diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h index 75bb2c543e59..455eb19a5ac1 100644 --- a/arch/arm/include/asm/atomic.h +++ b/arch/arm/include/asm/atomic.h @@ -15,8 +15,6 @@ #include #include -#define ATOMIC_INIT(i) { (i) } - #ifdef __KERNEL__ /* diff --git 
a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h index a08890da696c..015ddffaf6ca 100644 --- a/arch/arm64/include/asm/atomic.h +++ b/arch/arm64/include/asm/atomic.h @@ -99,8 +99,6 @@ static inline long arch_atomic64_dec_if_positive(atomic64_t *v) return __lse_ll_sc_body(atomic64_dec_if_positive, v); } -#define ATOMIC_INIT(i) { (i) } - #define arch_atomic_read(v) __READ_ONCE((v)->counter) #define arch_atomic_set(v, i) __WRITE_ONCE(((v)->counter), (i)) diff --git a/arch/h8300/include/asm/atomic.h b/arch/h8300/include/asm/atomic.h index c6b6a06231b2..a990d151f163 100644 --- a/arch/h8300/include/asm/atomic.h +++ b/arch/h8300/include/asm/atomic.h @@ -12,8 +12,6 @@ * resource counting etc.. */ -#define ATOMIC_INIT(i) { (i) } - #define atomic_read(v) READ_ONCE((v)->counter) #define atomic_set(v, i) WRITE_ONCE(((v)->counter), (i)) diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h index 0231d69c8bf2..4ab895d7111f 100644 --- a/arch/hexagon/include/asm/atomic.h +++ b/arch/hexagon/include/asm/atomic.h @@ -12,8 +12,6 @@ #include #include -#define ATOMIC_INIT(i) { (i) } - /* Normal writes in our arch don't clear lock reservations */ static inline void atomic_set(atomic_t *v, int new) diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h index 50440f3ddc43..f267d956458f 100644 --- a/arch/ia64/include/asm/atomic.h +++ b/arch/ia64/include/asm/atomic.h @@ -19,7 +19,6 @@ #include -#define ATOMIC_INIT(i) { (i) } #define ATOMIC64_INIT(i) { (i) } #define atomic_read(v) READ_ONCE((v)->counter) diff --git a/arch/m68k/include/asm/atomic.h b/arch/m68k/include/asm/atomic.h index 47228b0d4163..756c5cc58f94 100644 --- a/arch/m68k/include/asm/atomic.h +++ b/arch/m68k/include/asm/atomic.h @@ -16,8 +16,6 @@ * We do not have SMP m68k systems, so we don't have to deal with that. 
*/ -#define ATOMIC_INIT(i) { (i) } - #define atomic_read(v) READ_ONCE((v)->counter) #define atomic_set(v, i) WRITE_ONCE(((v)->counter), (i)) diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h index e5ac88392d1f..f904084fcb1f 100644 --- a/arch/mips/include/asm/atomic.h +++ b/arch/mips/include/asm/atomic.h @@ -45,7 +45,6 @@ static __always_inline type pfx##_xchg(pfx##_t *v, type n) \ return xchg(&v->counter, n); \ } -#define ATOMIC_INIT(i) { (i) } ATOMIC_OPS(atomic, int) #ifdef CONFIG_64BIT diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h index 118953d41763..f960e2f32b1b 100644 --- a/arch/parisc/include/asm/atomic.h +++ b/arch/parisc/include/asm/atomic.h @@ -136,8 +136,6 @@ ATOMIC_OPS(xor, ^=) #undef ATOMIC_OP_RETURN #undef ATOMIC_OP -#define ATOMIC_INIT(i) { (i) } - #ifdef CONFIG_64BIT #define ATOMIC64_INIT(i) { (i) } diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h index 498785ffc25f..0311c3c42960 100644 --- a/arch/powerpc/include/asm/atomic.h +++ b/arch/powerpc/include/asm/atomic.h @@ -11,8 +11,6 @@ #include #include -#define ATOMIC_INIT(i) { (i) } - /* * Since *_return_relaxed and {cmp}xchg_relaxed are implemented with * a "bne-" instruction at the end, so an isync is enough as a acquire barrier diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h index 96f95c9ebd97..400a8c8b6de7 100644 --- a/arch/riscv/include/asm/atomic.h +++ b/arch/riscv/include/asm/atomic.h @@ -19,8 +19,6 @@ #include #include -#define ATOMIC_INIT(i) { (i) } - #define __atomic_acquire_fence() \ __asm__ __volatile__(RISCV_ACQUIRE_BARRIER "" ::: "memory") diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h index 491ad53a0d4e..cae473a7b6f7 100644 --- a/arch/s390/include/asm/atomic.h +++ b/arch/s390/include/asm/atomic.h @@ -15,8 +15,6 @@ #include #include -#define ATOMIC_INIT(i) { (i) } - static inline int atomic_read(const atomic_t *v) { int c; diff --git 
a/arch/sh/include/asm/atomic.h b/arch/sh/include/asm/atomic.h index f37b95a80232..7c2a8a703b9a 100644 --- a/arch/sh/include/asm/atomic.h +++ b/arch/sh/include/asm/atomic.h @@ -19,8 +19,6 @@ #include #include -#define ATOMIC_INIT(i) { (i) } - #define atomic_read(v) READ_ONCE((v)->counter) #define atomic_set(v,i) WRITE_ONCE((v)->counter, (i)) diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h index 94c930f0bc62..efad5532f169 100644 --- a/arch/sparc/include/asm/atomic_32.h +++ b/arch/sparc/include/asm/atomic_32.h @@ -18,8 +18,6 @@ #include #include -#define ATOMIC_INIT(i) { (i) } - int atomic_add_return(int, atomic_t *); int atomic_fetch_add(int, atomic_t *); int atomic_fetch_and(int, atomic_t *); diff --git a/arch/sparc/include/asm/atomic_64.h b/arch/sparc/include/asm/atomic_64.h index b60448397d4f..6b235d3d1d9d 100644 --- a/arch/sparc/include/asm/atomic_64.h +++ b/arch/sparc/include/asm/atomic_64.h @@ -12,7 +12,6 @@ #include #include -#define ATOMIC_INIT(i) { (i) } #define ATOMIC64_INIT(i) { (i) } #define atomic_read(v) READ_ONCE((v)->counter) diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index bf35e476a776..b6cac6e9bb70 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -14,8 +14,6 @@ * resource counting etc.. */ -#define ATOMIC_INIT(i) { (i) } - /** * arch_atomic_read - read atomic variable * @v: pointer of type atomic_t diff --git a/arch/xtensa/include/asm/atomic.h b/arch/xtensa/include/asm/atomic.h index 3e7c6134ed32..744c2f463845 100644 --- a/arch/xtensa/include/asm/atomic.h +++ b/arch/xtensa/include/asm/atomic.h @@ -19,8 +19,6 @@ #include #include -#define ATOMIC_INIT(i) { (i) } - /* * This Xtensa implementation assumes that the right mechanism * for exclusion is for locking interrupts to level EXCM_LEVEL. 
diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h index 286867f593d2..11f96f40f4a7 100644 --- a/include/asm-generic/atomic.h +++ b/include/asm-generic/atomic.h @@ -159,8 +159,6 @@ ATOMIC_OP(xor, ^) * resource counting etc.. */ -#define ATOMIC_INIT(i) { (i) } - /** * atomic_read - read atomic variable * @v: pointer of type atomic_t diff --git a/include/linux/types.h b/include/linux/types.h index d3021c879179..a147977602b5 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -167,6 +167,8 @@ typedef struct { int counter; } atomic_t; +#define ATOMIC_INIT(i) { (i) } + #ifdef CONFIG_64BIT typedef struct { s64 counter; From 459e39538e612b8dd130d34b93c9bfc89ecc836c Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 29 Jul 2020 22:33:16 +1000 Subject: [PATCH 478/502] locking/qspinlock: Do not include atomic.h from qspinlock_types.h This patch breaks a header loop involving qspinlock_types.h. The issue is that qspinlock_types.h includes atomic.h, which then eventually includes kernel.h which could lead back to the original file via spinlock_types.h. As ATOMIC_INIT is now defined by linux/types.h, there is no longer any need to include atomic.h from qspinlock_types.h. This also allows the CONFIG_PARAVIRT hack to be removed since it was trying to prevent exactly this loop. Signed-off-by: Herbert Xu Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Link: https://lkml.kernel.org/r/20200729123316.GC7047@gondor.apana.org.au --- include/asm-generic/qspinlock.h | 1 + include/asm-generic/qspinlock_types.h | 8 -------- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h index fde943d180e0..2b26cd729b94 100644 --- a/include/asm-generic/qspinlock.h +++ b/include/asm-generic/qspinlock.h @@ -11,6 +11,7 @@ #define __ASM_GENERIC_QSPINLOCK_H #include +#include /** * queued_spin_is_locked - is the spinlock locked? 
diff --git a/include/asm-generic/qspinlock_types.h b/include/asm-generic/qspinlock_types.h index 56d1309d32f8..2fd1fb89ec36 100644 --- a/include/asm-generic/qspinlock_types.h +++ b/include/asm-generic/qspinlock_types.h @@ -9,15 +9,7 @@ #ifndef __ASM_GENERIC_QSPINLOCK_TYPES_H #define __ASM_GENERIC_QSPINLOCK_TYPES_H -/* - * Including atomic.h with PARAVIRT on will cause compilation errors because - * of recursive header file incluson via paravirt_types.h. So don't include - * it if PARAVIRT is on. - */ -#ifndef CONFIG_PARAVIRT #include -#include -#endif typedef struct qspinlock { union { From 0d24f65e933ca89d55d17f6dbdb2a72ca88f0992 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Mon, 20 Jul 2020 17:55:07 +0200 Subject: [PATCH 479/502] Documentation: locking: Describe seqlock design and usage Proper documentation for the design and usage of sequence counters and sequential locks does not exist. Complete the seqlock.h documentation as follows: - Divide all documentation on a seqcount_t vs. seqlock_t basis. The description for both mechanisms was intermingled, which is incorrect since the usage constraints for each type are vastly different. - Add an introductory paragraph describing the internal design of, and rationale for, sequence counters. - Document seqcount_t writer non-preemptibility requirement, which was not previously documented anywhere, and provide a clear rationale. - Provide template code for seqcount_t and seqlock_t initialization and reader/writer critical sections. - Recommend using seqlock_t by default. It implicitly handles the serialization and non-preemptibility requirements of writers. At seqlock.h: - Remove references to brlocks as they've long been removed from the kernel. - Remove references to gcc-3.x since the kernel's minimum supported gcc version is 4.9. References: 0f6ed63b1707 ("no need to keep brlock macros anymore...") References: 6ec4476ac825 ("Raise gcc version requirement to 4.9") Signed-off-by: Ahmed S.
Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200720155530.1173732-2-a.darwish@linutronix.de --- Documentation/locking/index.rst | 1 + Documentation/locking/seqlock.rst | 170 ++++++++++++++++++++++++++++++ include/linux/seqlock.h | 81 +++++++------- 3 files changed, 209 insertions(+), 43 deletions(-) create mode 100644 Documentation/locking/seqlock.rst diff --git a/Documentation/locking/index.rst b/Documentation/locking/index.rst index d785878cad65..7003bd5aeff4 100644 --- a/Documentation/locking/index.rst +++ b/Documentation/locking/index.rst @@ -14,6 +14,7 @@ locking mutex-design rt-mutex-design rt-mutex + seqlock spinlocks ww-mutex-design preempt-locking diff --git a/Documentation/locking/seqlock.rst b/Documentation/locking/seqlock.rst new file mode 100644 index 000000000000..366dd368d90a --- /dev/null +++ b/Documentation/locking/seqlock.rst @@ -0,0 +1,170 @@ +====================================== +Sequence counters and sequential locks +====================================== + +Introduction +============ + +Sequence counters are a reader-writer consistency mechanism with +lockless readers (read-only retry loops), and no writer starvation. They +are used for data that's rarely written to (e.g. system time), where the +reader wants a consistent set of information and is willing to retry if +that information changes. + +A data set is consistent when the sequence count at the beginning of the +read side critical section is even and the same sequence count value is +read again at the end of the critical section. The data in the set must +be copied out inside the read side critical section. If the sequence +count has changed between the start and the end of the critical section, +the reader must retry. + +Writers increment the sequence count at the start and the end of their +critical section. After starting the critical section the sequence count +is odd and indicates to the readers that an update is in progress. 
At +the end of the write side critical section the sequence count becomes +even again which lets readers make progress. + +A sequence counter write side critical section must never be preempted +or interrupted by read side sections. Otherwise the reader will spin for +the entire scheduler tick due to the odd sequence count value and the +interrupted writer. If that reader belongs to a real-time scheduling +class, it can spin forever and the kernel will livelock. + +This mechanism cannot be used if the protected data contains pointers, +as the writer can invalidate a pointer that the reader is following. + + +.. _seqcount_t: + +Sequence counters (``seqcount_t``) +================================== + +This is the raw counting mechanism, which does not protect against +multiple writers. Write side critical sections must thus be serialized +by an external lock. + +If the write serialization primitive is not implicitly disabling +preemption, preemption must be explicitly disabled before entering the +write side section. If the read section can be invoked from hardirq or +softirq contexts, interrupts or bottom halves must also be respectively +disabled before entering the write section. + +If it's desired to automatically handle the sequence counter +requirements of writer serialization and non-preemptibility, use +:ref:`seqlock_t` instead. + +Initialization:: + + /* dynamic */ + seqcount_t foo_seqcount; + seqcount_init(&foo_seqcount); + + /* static */ + static seqcount_t foo_seqcount = SEQCNT_ZERO(foo_seqcount); + + /* C99 struct init */ + struct { + .seq = SEQCNT_ZERO(foo.seq), + } foo; + +Write path:: + + /* Serialized context with disabled preemption */ + + write_seqcount_begin(&foo_seqcount); + + /* ... [[write-side critical section]] ... */ + + write_seqcount_end(&foo_seqcount); + +Read path:: + + do { + seq = read_seqcount_begin(&foo_seqcount); + + /* ... [[read-side critical section]] ... */ + + } while (read_seqcount_retry(&foo_seqcount, seq)); + + +.. 
_seqlock_t: + +Sequential locks (``seqlock_t``) +================================ + +This contains the :ref:`seqcount_t` mechanism earlier discussed, plus an +embedded spinlock for writer serialization and non-preemptibility. + +If the read side section can be invoked from hardirq or softirq context, +use the write side function variants which disable interrupts or bottom +halves respectively. + +Initialization:: + + /* dynamic */ + seqlock_t foo_seqlock; + seqlock_init(&foo_seqlock); + + /* static */ + static DEFINE_SEQLOCK(foo_seqlock); + + /* C99 struct init */ + struct { + .seql = __SEQLOCK_UNLOCKED(foo.seql) + } foo; + +Write path:: + + write_seqlock(&foo_seqlock); + + /* ... [[write-side critical section]] ... */ + + write_sequnlock(&foo_seqlock); + +Read path, three categories: + +1. Normal Sequence readers which never block a writer but they must + retry if a writer is in progress by detecting change in the sequence + number. Writers do not wait for a sequence reader:: + + do { + seq = read_seqbegin(&foo_seqlock); + + /* ... [[read-side critical section]] ... */ + + } while (read_seqretry(&foo_seqlock, seq)); + +2. Locking readers which will wait if a writer or another locking reader + is in progress. A locking reader in progress will also block a writer + from entering its critical section. This read lock is + exclusive. Unlike rwlock_t, only one locking reader can acquire it:: + + read_seqlock_excl(&foo_seqlock); + + /* ... [[read-side critical section]] ... */ + + read_sequnlock_excl(&foo_seqlock); + +3. Conditional lockless reader (as in 1), or locking reader (as in 2), + according to a passed marker. This is used to avoid lockless readers + starvation (too many retry loops) in case of a sharp spike in write + activity. First, a lockless read is tried (even marker passed).
If + that trial fails (odd sequence counter is returned, which is used as + the next iteration marker), the lockless read is transformed to a + full locking read and no retry loop is necessary:: + + /* marker; even initialization */ + int seq = 0; + do { + read_seqbegin_or_lock(&foo_seqlock, &seq); + + /* ... [[read-side critical section]] ... */ + + } while (need_seqretry(&foo_seqlock, seq)); + done_seqretry(&foo_seqlock, seq); + + +API documentation +================= + +.. kernel-doc:: include/linux/seqlock.h diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 8b97204f35a7..299d68f10325 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -1,36 +1,15 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_SEQLOCK_H #define __LINUX_SEQLOCK_H + /* - * Reader/writer consistent mechanism without starving writers. This type of - * lock for data where the reader wants a consistent set of information - * and is willing to retry if the information changes. There are two types - * of readers: - * 1. Sequence readers which never block a writer but they may have to retry - * if a writer is in progress by detecting change in sequence number. - * Writers do not wait for a sequence reader. - * 2. Locking readers which will wait if a writer or another locking reader - * is in progress. A locking reader in progress will also block a writer - * from going forward. Unlike the regular rwlock, the read lock here is - * exclusive so that only one locking reader can get it. + * seqcount_t / seqlock_t - a reader-writer consistency mechanism with + * lockless readers (read-only retry loops), and no writer starvation. * - * This is not as cache friendly as brlock. Also, this may not work well - * for data that contains pointers, because any writer could - * invalidate a pointer that a reader was following. + * See Documentation/locking/seqlock.rst * - * Expected non-blocking reader usage: - * do { - * seq = read_seqbegin(&foo); - * ... 
- * } while (read_seqretry(&foo, seq)); - * - * - * On non-SMP the spin locks disappear but the writer still needs - * to increment the sequence variables because an interrupt routine could - * change the state of the data. - * - * Based on x86_64 vsyscall gettimeofday - * by Keith Owens and Andrea Arcangeli + * Copyrights: + * - Based on x86_64 vsyscall gettimeofday: Keith Owens, Andrea Arcangeli */ #include @@ -41,8 +20,8 @@ #include /* - * The seqlock interface does not prescribe a precise sequence of read - * begin/retry/end. For readers, typically there is a call to + * The seqlock seqcount_t interface does not prescribe a precise sequence of + * read begin/retry/end. For readers, typically there is a call to * read_seqcount_begin() and read_seqcount_retry(), however, there are more * esoteric cases which do not follow this pattern. * @@ -50,16 +29,30 @@ * via seqcount_t under KCSAN: upon beginning a seq-reader critical section, * pessimistically mark the next KCSAN_SEQLOCK_REGION_MAX memory accesses as * atomics; if there is a matching read_seqcount_retry() call, no following - * memory operations are considered atomic. Usage of seqlocks via seqlock_t - * interface is not affected. + * memory operations are considered atomic. Usage of the seqlock_t interface + * is not affected. */ #define KCSAN_SEQLOCK_REGION_MAX 1000 /* - * Version using sequence counter only. - * This can be used when code has its own mutex protecting the - * updating starting before the write_seqcountbeqin() and ending - * after the write_seqcount_end(). + * Sequence counters (seqcount_t) + * + * This is the raw counting mechanism, without any writer protection. + * + * Write side critical sections must be serialized and non-preemptible. + * + * If readers can be invoked from hardirq or softirq contexts, + * interrupts or bottom halves must also be respectively disabled before + * entering the write section. 
+ * + * This mechanism can't be used if the protected data contains pointers, + * as the writer can invalidate a pointer that a reader is following. + * + * If it's desired to automatically handle the sequence counter writer + * serialization and non-preemptibility requirements, use a sequential + * lock (seqlock_t) instead. + * + * See Documentation/locking/seqlock.rst */ typedef struct seqcount { unsigned sequence; @@ -398,10 +391,6 @@ static inline void raw_write_seqcount_latch(seqcount_t *s) smp_wmb(); /* increment "sequence" before following stores */ } -/* - * Sequence counter only version assumes that callers are using their - * own mutexing. - */ static inline void write_seqcount_begin_nested(seqcount_t *s, int subclass) { raw_write_seqcount_begin(s); @@ -434,15 +423,21 @@ static inline void write_seqcount_invalidate(seqcount_t *s) kcsan_nestable_atomic_end(); } +/* + * Sequential locks (seqlock_t) + * + * Sequence counters with an embedded spinlock for writer serialization + * and non-preemptibility. + * + * For more info, see: + * - Comments on top of seqcount_t + * - Documentation/locking/seqlock.rst + */ typedef struct { struct seqcount seqcount; spinlock_t lock; } seqlock_t; -/* - * These macros triggered gcc-3.x compile-time problems. We think these are - * OK now. Be cautious. - */ #define __SEQLOCK_UNLOCKED(lockname) \ { \ .seqcount = SEQCNT_ZERO(lockname), \ From 15cbe67bbd3adeb4854c42713dbeaf2ff876beee Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Mon, 20 Jul 2020 17:55:08 +0200 Subject: [PATCH 480/502] seqlock: Properly format kernel-doc code samples Align the code samples and note sections inside kernel-doc comments with tabs. This way they can be properly parsed and rendered by Sphinx. It also makes the code samples easier to read from text editors. Signed-off-by: Ahmed S. 
Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200720155530.1173732-3-a.darwish@linutronix.de --- include/linux/seqlock.h | 108 +++++++++++++++++++++------------------- 1 file changed, 56 insertions(+), 52 deletions(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 299d68f10325..6c4f68ef1393 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -263,32 +263,32 @@ static inline void raw_write_seqcount_end(seqcount_t *s) * atomically, avoiding compiler optimizations; b) to document which writes are * meant to propagate to the reader critical section. This is necessary because * neither writes before and after the barrier are enclosed in a seq-writer - * critical section that would ensure readers are aware of ongoing writes. + * critical section that would ensure readers are aware of ongoing writes:: * - * seqcount_t seq; - * bool X = true, Y = false; + * seqcount_t seq; + * bool X = true, Y = false; * - * void read(void) - * { - * bool x, y; + * void read(void) + * { + * bool x, y; * - * do { - * int s = read_seqcount_begin(&seq); + * do { + * int s = read_seqcount_begin(&seq); * - * x = X; y = Y; + * x = X; y = Y; * - * } while (read_seqcount_retry(&seq, s)); + * } while (read_seqcount_retry(&seq, s)); * - * BUG_ON(!x && !y); + * BUG_ON(!x && !y); * } * * void write(void) * { - * WRITE_ONCE(Y, true); + * WRITE_ONCE(Y, true); * - * raw_write_seqcount_barrier(seq); + * raw_write_seqcount_barrier(seq); * - * WRITE_ONCE(X, false); + * WRITE_ONCE(X, false); * } */ static inline void raw_write_seqcount_barrier(seqcount_t *s) @@ -325,64 +325,68 @@ static inline int raw_read_seqcount_latch(seqcount_t *s) * Very simply put: we first modify one copy and then the other. This ensures * there is always one copy in a stable state, ready to give us an answer. 
* - * The basic form is a data structure like: + * The basic form is a data structure like:: * - * struct latch_struct { - * seqcount_t seq; - * struct data_struct data[2]; - * }; + * struct latch_struct { + * seqcount_t seq; + * struct data_struct data[2]; + * }; * * Where a modification, which is assumed to be externally serialized, does the - * following: + * following:: * - * void latch_modify(struct latch_struct *latch, ...) - * { - * smp_wmb(); <- Ensure that the last data[1] update is visible - * latch->seq++; - * smp_wmb(); <- Ensure that the seqcount update is visible + * void latch_modify(struct latch_struct *latch, ...) + * { + * smp_wmb(); // Ensure that the last data[1] update is visible + * latch->seq++; + * smp_wmb(); // Ensure that the seqcount update is visible * - * modify(latch->data[0], ...); + * modify(latch->data[0], ...); * - * smp_wmb(); <- Ensure that the data[0] update is visible - * latch->seq++; - * smp_wmb(); <- Ensure that the seqcount update is visible + * smp_wmb(); // Ensure that the data[0] update is visible + * latch->seq++; + * smp_wmb(); // Ensure that the seqcount update is visible * - * modify(latch->data[1], ...); - * } + * modify(latch->data[1], ...); + * } * - * The query will have a form like: + * The query will have a form like:: * - * struct entry *latch_query(struct latch_struct *latch, ...) - * { - * struct entry *entry; - * unsigned seq, idx; + * struct entry *latch_query(struct latch_struct *latch, ...) 
+ * { + * struct entry *entry; + * unsigned seq, idx; * - * do { - * seq = raw_read_seqcount_latch(&latch->seq); + * do { + * seq = raw_read_seqcount_latch(&latch->seq); * - * idx = seq & 0x01; - * entry = data_query(latch->data[idx], ...); + * idx = seq & 0x01; + * entry = data_query(latch->data[idx], ...); * - * smp_rmb(); - * } while (seq != latch->seq); + * smp_rmb(); + * } while (seq != latch->seq); * - * return entry; - * } + * return entry; + * } * * So during the modification, queries are first redirected to data[1]. Then we * modify data[0]. When that is complete, we redirect queries back to data[0] * and we can modify data[1]. * - * NOTE: The non-requirement for atomic modifications does _NOT_ include - * the publishing of new entries in the case where data is a dynamic - * data structure. + * NOTE: * - * An iteration might start in data[0] and get suspended long enough - * to miss an entire modification sequence, once it resumes it might - * observe the new entry. + * The non-requirement for atomic modifications does _NOT_ include + * the publishing of new entries in the case where data is a dynamic + * data structure. * - * NOTE: When data is a dynamic data structure; one should use regular RCU - * patterns to manage the lifetimes of the objects within. + * An iteration might start in data[0] and get suspended long enough + * to miss an entire modification sequence, once it resumes it might + * observe the new entry. + * + * NOTE: + * + * When data is a dynamic data structure; one should use regular RCU + * patterns to manage the lifetimes of the objects within. */ static inline void raw_write_seqcount_latch(seqcount_t *s) { From d3b35b87f436c1b226a8061bee9c8875ba6658bd Mon Sep 17 00:00:00 2001 From: "Ahmed S. 
Darwish" Date: Mon, 20 Jul 2020 17:55:09 +0200 Subject: [PATCH 481/502] seqlock: seqcount_t latch: End read sections with read_seqcount_retry() The seqcount_t latch reader example at the raw_write_seqcount_latch() kernel-doc comment ends the latch read section with a manual smp memory barrier and sequence counter comparison. This is technically correct, but it is suboptimal: read_seqcount_retry() already contains the same logic of an smp memory barrier and sequence counter comparison. End the latch read critical section example with read_seqcount_retry(). Signed-off-by: Ahmed S. Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200720155530.1173732-4-a.darwish@linutronix.de --- include/linux/seqlock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 6c4f68ef1393..d724b5e5408d 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -363,8 +363,8 @@ static inline int raw_read_seqcount_latch(seqcount_t *s) * idx = seq & 0x01; * entry = data_query(latch->data[idx], ...); * - * smp_rmb(); - * } while (seq != latch->seq); + * // read_seqcount_retry() includes needed smp_rmb() + * } while (read_seqcount_retry(&latch->seq, seq)); * * return entry; * } From f4a27cbcec90ac04ee60e04b222e1449dcdba0bd Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Mon, 20 Jul 2020 17:55:10 +0200 Subject: [PATCH 482/502] seqlock: Reorder seqcount_t and seqlock_t API definitions The seqlock.h seqcount_t and seqlock_t API definitions are presented in the chronological order of their development rather than the order that makes most sense to readers. This makes it hard to follow and understand the header file code. Group and reorder all of the exported seqlock.h functions according to their function. 
First, group together the seqcount_t standard read path functions: - __read_seqcount_begin() - raw_read_seqcount_begin() - read_seqcount_begin() since each function is implemented exactly in terms of the one above it. Then, group the special-case seqcount_t readers on their own as: - raw_read_seqcount() - raw_seqcount_begin() since the only difference between the two functions is that the second one masks the sequence counter LSB while the first one does not. Note that raw_seqcount_begin() can actually be implemented in terms of raw_read_seqcount(), which will be done in a follow-up commit. Then, group the seqcount_t write path functions, instead of injecting unrelated seqcount_t latch functions between them, and order them as: - raw_write_seqcount_begin() - raw_write_seqcount_end() - write_seqcount_begin_nested() - write_seqcount_begin() - write_seqcount_end() - raw_write_seqcount_barrier() - write_seqcount_invalidate() which is the expected natural order. This also isolates the seqcount_t latch functions into their own area, at the end of the sequence counters section, and before jumping to the next one: sequential locks (seqlock_t). Do a similar grouping and reordering for seqlock_t "locking" readers vs. the "conditionally locking or lockless" ones. No implementation code was changed in any of the reordering above. Signed-off-by: Ahmed S. 
Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200720155530.1173732-5-a.darwish@linutronix.de --- include/linux/seqlock.h | 158 ++++++++++++++++++++-------------------- 1 file changed, 78 insertions(+), 80 deletions(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index d724b5e5408d..4c1456008d89 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -128,23 +128,6 @@ repeat: return ret; } -/** - * raw_read_seqcount - Read the raw seqcount - * @s: pointer to seqcount_t - * Returns: count to be passed to read_seqcount_retry - * - * raw_read_seqcount opens a read critical section of the given - * seqcount without any lockdep checking and without checking or - * masking the LSB. Calling code is responsible for handling that. - */ -static inline unsigned raw_read_seqcount(const seqcount_t *s) -{ - unsigned ret = READ_ONCE(s->sequence); - smp_rmb(); - kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); - return ret; -} - /** * raw_read_seqcount_begin - start seq-read critical section w/o lockdep * @s: pointer to seqcount_t @@ -176,6 +159,23 @@ static inline unsigned read_seqcount_begin(const seqcount_t *s) return raw_read_seqcount_begin(s); } +/** + * raw_read_seqcount - Read the raw seqcount + * @s: pointer to seqcount_t + * Returns: count to be passed to read_seqcount_retry + * + * raw_read_seqcount opens a read critical section of the given + * seqcount without any lockdep checking and without checking or + * masking the LSB. Calling code is responsible for handling that. 
+ */ +static inline unsigned raw_read_seqcount(const seqcount_t *s) +{ + unsigned ret = READ_ONCE(s->sequence); + smp_rmb(); + kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); + return ret; +} + /** * raw_seqcount_begin - begin a seq-read critical section * @s: pointer to seqcount_t @@ -234,8 +234,6 @@ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start) return __read_seqcount_retry(s, start); } - - static inline void raw_write_seqcount_begin(seqcount_t *s) { kcsan_nestable_atomic_begin(); @@ -250,6 +248,23 @@ static inline void raw_write_seqcount_end(seqcount_t *s) kcsan_nestable_atomic_end(); } +static inline void write_seqcount_begin_nested(seqcount_t *s, int subclass) +{ + raw_write_seqcount_begin(s); + seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_); +} + +static inline void write_seqcount_begin(seqcount_t *s) +{ + write_seqcount_begin_nested(s, 0); +} + +static inline void write_seqcount_end(seqcount_t *s) +{ + seqcount_release(&s->dep_map, _RET_IP_); + raw_write_seqcount_end(s); +} + /** * raw_write_seqcount_barrier - do a seq write barrier * @s: pointer to seqcount_t @@ -300,6 +315,21 @@ static inline void raw_write_seqcount_barrier(seqcount_t *s) kcsan_nestable_atomic_end(); } +/** + * write_seqcount_invalidate - invalidate in-progress read-side seq operations + * @s: pointer to seqcount_t + * + * After write_seqcount_invalidate, no read-side seq operations will complete + * successfully and see data older than this. 
+ */ +static inline void write_seqcount_invalidate(seqcount_t *s) +{ + smp_wmb(); + kcsan_nestable_atomic_begin(); + s->sequence+=2; + kcsan_nestable_atomic_end(); +} + static inline int raw_read_seqcount_latch(seqcount_t *s) { /* Pairs with the first smp_wmb() in raw_write_seqcount_latch() */ @@ -395,38 +425,6 @@ static inline void raw_write_seqcount_latch(seqcount_t *s) smp_wmb(); /* increment "sequence" before following stores */ } -static inline void write_seqcount_begin_nested(seqcount_t *s, int subclass) -{ - raw_write_seqcount_begin(s); - seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_); -} - -static inline void write_seqcount_begin(seqcount_t *s) -{ - write_seqcount_begin_nested(s, 0); -} - -static inline void write_seqcount_end(seqcount_t *s) -{ - seqcount_release(&s->dep_map, _RET_IP_); - raw_write_seqcount_end(s); -} - -/** - * write_seqcount_invalidate - invalidate in-progress read-side seq operations - * @s: pointer to seqcount_t - * - * After write_seqcount_invalidate, no read-side seq operations will complete - * successfully and see data older than this. - */ -static inline void write_seqcount_invalidate(seqcount_t *s) -{ - smp_wmb(); - kcsan_nestable_atomic_begin(); - s->sequence+=2; - kcsan_nestable_atomic_end(); -} - /* * Sequential locks (seqlock_t) * @@ -555,35 +553,6 @@ static inline void read_sequnlock_excl(seqlock_t *sl) spin_unlock(&sl->lock); } -/** - * read_seqbegin_or_lock - begin a sequence number check or locking block - * @lock: sequence lock - * @seq : sequence number to be checked - * - * First try it once optimistically without taking the lock. If that fails, - * take the lock. The sequence number is also used as a marker for deciding - * whether to be a reader (even) or writer (odd). - * N.B. seq must be initialized to an even number to begin with. 
- */ -static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq) -{ - if (!(*seq & 1)) /* Even */ - *seq = read_seqbegin(lock); - else /* Odd */ - read_seqlock_excl(lock); -} - -static inline int need_seqretry(seqlock_t *lock, int seq) -{ - return !(seq & 1) && read_seqretry(lock, seq); -} - -static inline void done_seqretry(seqlock_t *lock, int seq) -{ - if (seq & 1) - read_sequnlock_excl(lock); -} - static inline void read_seqlock_excl_bh(seqlock_t *sl) { spin_lock_bh(&sl->lock); @@ -621,6 +590,35 @@ read_sequnlock_excl_irqrestore(seqlock_t *sl, unsigned long flags) spin_unlock_irqrestore(&sl->lock, flags); } +/** + * read_seqbegin_or_lock - begin a sequence number check or locking block + * @lock: sequence lock + * @seq : sequence number to be checked + * + * First try it once optimistically without taking the lock. If that fails, + * take the lock. The sequence number is also used as a marker for deciding + * whether to be a reader (even) or writer (odd). + * N.B. seq must be initialized to an even number to begin with. + */ +static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq) +{ + if (!(*seq & 1)) /* Even */ + *seq = read_seqbegin(lock); + else /* Odd */ + read_seqlock_excl(lock); +} + +static inline int need_seqretry(seqlock_t *lock, int seq) +{ + return !(seq & 1) && read_seqretry(lock, seq); +} + +static inline void done_seqretry(seqlock_t *lock, int seq) +{ + if (seq & 1) + read_sequnlock_excl(lock); +} + static inline unsigned long read_seqbegin_or_lock_irqsave(seqlock_t *lock, int *seq) { From 89b88845e05752b3d684eaf147f457c8dfa99c5f Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Mon, 20 Jul 2020 17:55:11 +0200 Subject: [PATCH 483/502] seqlock: Add kernel-doc for seqcount_t and seqlock_t APIs seqlock.h is now included by kernel's RST documentation, but only a small number of the exported seqlock.h functions are kernel-doc annotated. Add kernel-doc for all seqlock.h exported APIs. Signed-off-by: Ahmed S.
Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200720155530.1173732-6-a.darwish@linutronix.de --- include/linux/seqlock.h | 423 ++++++++++++++++++++++++++++++++-------- 1 file changed, 347 insertions(+), 76 deletions(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 4c1456008d89..85fb3ac93ffb 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -75,6 +75,10 @@ static inline void __seqcount_init(seqcount_t *s, const char *name, # define SEQCOUNT_DEP_MAP_INIT(lockname) \ .dep_map = { .name = #lockname } \ +/** + * seqcount_init() - runtime initializer for seqcount_t + * @s: Pointer to the seqcount_t instance + */ # define seqcount_init(s) \ do { \ static struct lock_class_key __key; \ @@ -98,13 +102,15 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s) # define seqcount_lockdep_reader_access(x) #endif -#define SEQCNT_ZERO(lockname) { .sequence = 0, SEQCOUNT_DEP_MAP_INIT(lockname)} - +/** + * SEQCNT_ZERO() - static initializer for seqcount_t + * @name: Name of the seqcount_t instance + */ +#define SEQCNT_ZERO(name) { .sequence = 0, SEQCOUNT_DEP_MAP_INIT(name) } /** - * __read_seqcount_begin - begin a seq-read critical section (without barrier) - * @s: pointer to seqcount_t - * Returns: count to be passed to read_seqcount_retry + * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier + * @s: Pointer to seqcount_t * * __read_seqcount_begin is like read_seqcount_begin, but has no smp_rmb() * barrier. Callers should ensure that smp_rmb() or equivalent ordering is @@ -113,6 +119,8 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s) * * Use carefully, only in critical code, and comment how the barrier is * provided. 
+ * + * Return: count to be passed to read_seqcount_retry() */ static inline unsigned __read_seqcount_begin(const seqcount_t *s) { @@ -129,13 +137,10 @@ repeat: } /** - * raw_read_seqcount_begin - start seq-read critical section w/o lockdep - * @s: pointer to seqcount_t - * Returns: count to be passed to read_seqcount_retry + * raw_read_seqcount_begin() - begin a seqcount_t read section w/o lockdep + * @s: Pointer to seqcount_t * - * raw_read_seqcount_begin opens a read critical section of the given - * seqcount, but without any lockdep checking. Validity of the critical - * section is tested by checking read_seqcount_retry function. + * Return: count to be passed to read_seqcount_retry() */ static inline unsigned raw_read_seqcount_begin(const seqcount_t *s) { @@ -145,13 +150,10 @@ static inline unsigned raw_read_seqcount_begin(const seqcount_t *s) } /** - * read_seqcount_begin - begin a seq-read critical section - * @s: pointer to seqcount_t - * Returns: count to be passed to read_seqcount_retry + * read_seqcount_begin() - begin a seqcount_t read critical section + * @s: Pointer to seqcount_t * - * read_seqcount_begin opens a read critical section of the given seqcount. - * Validity of the critical section is tested by checking read_seqcount_retry - * function. + * Return: count to be passed to read_seqcount_retry() */ static inline unsigned read_seqcount_begin(const seqcount_t *s) { @@ -160,13 +162,15 @@ static inline unsigned read_seqcount_begin(const seqcount_t *s) } /** - * raw_read_seqcount - Read the raw seqcount - * @s: pointer to seqcount_t - * Returns: count to be passed to read_seqcount_retry + * raw_read_seqcount() - read the raw seqcount_t counter value + * @s: Pointer to seqcount_t * * raw_read_seqcount opens a read critical section of the given - * seqcount without any lockdep checking and without checking or - * masking the LSB. Calling code is responsible for handling that. 
+ * seqcount_t, without any lockdep checking, and without checking or + * masking the sequence counter LSB. Calling code is responsible for + * handling that. + * + * Return: count to be passed to read_seqcount_retry() */ static inline unsigned raw_read_seqcount(const seqcount_t *s) { @@ -177,18 +181,21 @@ static inline unsigned raw_read_seqcount(const seqcount_t *s) } /** - * raw_seqcount_begin - begin a seq-read critical section - * @s: pointer to seqcount_t - * Returns: count to be passed to read_seqcount_retry + * raw_seqcount_begin() - begin a seqcount_t read critical section w/o + * lockdep and w/o counter stabilization + * @s: Pointer to seqcount_t * - * raw_seqcount_begin opens a read critical section of the given seqcount. - * Validity of the critical section is tested by checking read_seqcount_retry - * function. + * raw_seqcount_begin opens a read critical section of the given + * seqcount_t. Unlike read_seqcount_begin(), this function will not wait + * for the count to stabilize. If a writer is active when it begins, it + * will fail the read_seqcount_retry() at the end of the read critical + * section instead of stabilizing at the beginning of it. * - * Unlike read_seqcount_begin(), this function will not wait for the count - * to stabilize. If a writer is active when we begin, we will fail the - * read_seqcount_retry() instead of stabilizing at the beginning of the - * critical section. + * Use this only in special kernel hot paths where the read section is + * small and has a high probability of success through other external + * means. It will save a single branching instruction. 
+ * + * Return: count to be passed to read_seqcount_retry() */ static inline unsigned raw_seqcount_begin(const seqcount_t *s) { @@ -199,10 +206,9 @@ static inline unsigned raw_seqcount_begin(const seqcount_t *s) } /** - * __read_seqcount_retry - end a seq-read critical section (without barrier) - * @s: pointer to seqcount_t - * @start: count, from read_seqcount_begin - * Returns: 1 if retry is required, else 0 + * __read_seqcount_retry() - end a seqcount_t read section w/o barrier + * @s: Pointer to seqcount_t + * @start: count, from read_seqcount_begin() * * __read_seqcount_retry is like read_seqcount_retry, but has no smp_rmb() * barrier. Callers should ensure that smp_rmb() or equivalent ordering is @@ -211,6 +217,8 @@ static inline unsigned raw_seqcount_begin(const seqcount_t *s) * * Use carefully, only in critical code, and comment how the barrier is * provided. + * + * Return: true if a read section retry is required, else false */ static inline int __read_seqcount_retry(const seqcount_t *s, unsigned start) { @@ -219,14 +227,15 @@ static inline int __read_seqcount_retry(const seqcount_t *s, unsigned start) } /** - * read_seqcount_retry - end a seq-read critical section - * @s: pointer to seqcount_t - * @start: count, from read_seqcount_begin - * Returns: 1 if retry is required, else 0 + * read_seqcount_retry() - end a seqcount_t read critical section + * @s: Pointer to seqcount_t + * @start: count, from read_seqcount_begin() * - * read_seqcount_retry closes a read critical section of the given seqcount. - * If the critical section was invalid, it must be ignored (and typically - * retried). + * read_seqcount_retry closes the read critical section of given + * seqcount_t. If the critical section was invalid, it must be ignored + * (and typically retried). 
+ * + * Return: true if a read section retry is required, else false */ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start) { @@ -234,6 +243,10 @@ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start) return __read_seqcount_retry(s, start); } +/** + * raw_write_seqcount_begin() - start a seqcount_t write section w/o lockdep + * @s: Pointer to seqcount_t + */ static inline void raw_write_seqcount_begin(seqcount_t *s) { kcsan_nestable_atomic_begin(); @@ -241,6 +254,10 @@ static inline void raw_write_seqcount_begin(seqcount_t *s) smp_wmb(); } +/** + * raw_write_seqcount_end() - end a seqcount_t write section w/o lockdep + * @s: Pointer to seqcount_t + */ static inline void raw_write_seqcount_end(seqcount_t *s) { smp_wmb(); @@ -248,17 +265,42 @@ static inline void raw_write_seqcount_end(seqcount_t *s) kcsan_nestable_atomic_end(); } +/** + * write_seqcount_begin_nested() - start a seqcount_t write section with + * custom lockdep nesting level + * @s: Pointer to seqcount_t + * @subclass: lockdep nesting level + * + * See Documentation/locking/lockdep-design.rst + */ static inline void write_seqcount_begin_nested(seqcount_t *s, int subclass) { raw_write_seqcount_begin(s); seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_); } +/** + * write_seqcount_begin() - start a seqcount_t write side critical section + * @s: Pointer to seqcount_t + * + * write_seqcount_begin opens a write side critical section of the given + * seqcount_t. + * + * Context: seqcount_t write side critical sections must be serialized and + * non-preemptible. If readers can be invoked from hardirq or softirq + * context, interrupts or bottom halves must be respectively disabled. + */ static inline void write_seqcount_begin(seqcount_t *s) { write_seqcount_begin_nested(s, 0); } +/** + * write_seqcount_end() - end a seqcount_t write side critical section + * @s: Pointer to seqcount_t + * + * The write section must've been opened with write_seqcount_begin(). 
+ */ static inline void write_seqcount_end(seqcount_t *s) { seqcount_release(&s->dep_map, _RET_IP_); @@ -266,12 +308,12 @@ static inline void write_seqcount_end(seqcount_t *s) } /** - * raw_write_seqcount_barrier - do a seq write barrier - * @s: pointer to seqcount_t + * raw_write_seqcount_barrier() - do a seqcount_t write barrier + * @s: Pointer to seqcount_t * - * This can be used to provide an ordering guarantee instead of the - * usual consistency guarantee. It is one wmb cheaper, because we can - * collapse the two back-to-back wmb()s. + * This can be used to provide an ordering guarantee instead of the usual + * consistency guarantee. It is one wmb cheaper, because it can collapse + * the two back-to-back wmb()s. * * Note that writes surrounding the barrier should be declared atomic (e.g. * via WRITE_ONCE): a) to ensure the writes become visible to other threads @@ -316,11 +358,12 @@ static inline void raw_write_seqcount_barrier(seqcount_t *s) } /** - * write_seqcount_invalidate - invalidate in-progress read-side seq operations - * @s: pointer to seqcount_t + * write_seqcount_invalidate() - invalidate in-progress seqcount_t read + * side operations + * @s: Pointer to seqcount_t * - * After write_seqcount_invalidate, no read-side seq operations will complete - * successfully and see data older than this. + * After write_seqcount_invalidate, no seqcount_t read side operations + * will complete successfully and see data older than this. */ static inline void write_seqcount_invalidate(seqcount_t *s) { @@ -330,6 +373,21 @@ static inline void write_seqcount_invalidate(seqcount_t *s) kcsan_nestable_atomic_end(); } +/** + * raw_read_seqcount_latch() - pick even/odd seqcount_t latch data copy + * @s: Pointer to seqcount_t + * + * Use seqcount_t latching to switch between two storage places protected + * by a sequence counter. Doing so allows having interruptible, preemptible, + * seqcount_t write side critical sections. 
+ * + * Check raw_write_seqcount_latch() for more details and a full reader and + * writer usage example. + * + * Return: sequence counter raw value. Use the lowest bit as an index for + * picking which data copy to read. The full counter value must then be + * checked with read_seqcount_retry(). + */ static inline int raw_read_seqcount_latch(seqcount_t *s) { /* Pairs with the first smp_wmb() in raw_write_seqcount_latch() */ @@ -338,8 +396,8 @@ static inline int raw_read_seqcount_latch(seqcount_t *s) } /** - * raw_write_seqcount_latch - redirect readers to even/odd copy - * @s: pointer to seqcount_t + * raw_write_seqcount_latch() - redirect readers to even/odd copy + * @s: Pointer to seqcount_t * * The latch technique is a multiversion concurrency control method that allows * queries during non-atomic modifications. If you can guarantee queries never @@ -446,17 +504,28 @@ typedef struct { .lock = __SPIN_LOCK_UNLOCKED(lockname) \ } -#define seqlock_init(x) \ +/** + * seqlock_init() - dynamic initializer for seqlock_t + * @sl: Pointer to the seqlock_t instance + */ +#define seqlock_init(sl) \ do { \ - seqcount_init(&(x)->seqcount); \ - spin_lock_init(&(x)->lock); \ + seqcount_init(&(sl)->seqcount); \ + spin_lock_init(&(sl)->lock); \ } while (0) -#define DEFINE_SEQLOCK(x) \ - seqlock_t x = __SEQLOCK_UNLOCKED(x) +/** + * DEFINE_SEQLOCK() - Define a statically allocated seqlock_t + * @sl: Name of the seqlock_t instance + */ +#define DEFINE_SEQLOCK(sl) \ + seqlock_t sl = __SEQLOCK_UNLOCKED(sl) -/* - * Read side functions for starting and finalizing a read side section. 
+/** + * read_seqbegin() - start a seqlock_t read side critical section + * @sl: Pointer to seqlock_t + * + * Return: count, to be passed to read_seqretry() */ static inline unsigned read_seqbegin(const seqlock_t *sl) { @@ -467,6 +536,17 @@ static inline unsigned read_seqbegin(const seqlock_t *sl) return ret; } +/** + * read_seqretry() - end a seqlock_t read side section + * @sl: Pointer to seqlock_t + * @start: count, from read_seqbegin() + * + * read_seqretry closes the read side critical section of given seqlock_t. + * If the critical section was invalid, it must be ignored (and typically + * retried). + * + * Return: true if a read section retry is required, else false + */ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) { /* @@ -478,10 +558,18 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) return read_seqcount_retry(&sl->seqcount, start); } -/* - * Lock out other writers and update the count. - * Acts like a normal spin_lock/unlock. - * Don't need preempt_disable() because that is in the spin_lock already. +/** + * write_seqlock() - start a seqlock_t write side critical section + * @sl: Pointer to seqlock_t + * + * write_seqlock opens a write side critical section for the given + * seqlock_t. It also implicitly acquires the spinlock_t embedded inside + * that sequential lock. All seqlock_t write side sections are thus + * automatically serialized and non-preemptible. + * + * Context: if the seqlock_t read section, or other write side critical + * sections, can be invoked from hardirq or softirq contexts, use the + * _irqsave or _bh variants of this function instead. 
*/ static inline void write_seqlock(seqlock_t *sl) { @@ -489,30 +577,66 @@ static inline void write_seqlock(seqlock_t *sl) write_seqcount_begin(&sl->seqcount); } +/** + * write_sequnlock() - end a seqlock_t write side critical section + * @sl: Pointer to seqlock_t + * + * write_sequnlock closes the (serialized and non-preemptible) write side + * critical section of given seqlock_t. + */ static inline void write_sequnlock(seqlock_t *sl) { write_seqcount_end(&sl->seqcount); spin_unlock(&sl->lock); } +/** + * write_seqlock_bh() - start a softirqs-disabled seqlock_t write section + * @sl: Pointer to seqlock_t + * + * _bh variant of write_seqlock(). Use only if the read side section, or + * other write side sections, can be invoked from softirq contexts. + */ static inline void write_seqlock_bh(seqlock_t *sl) { spin_lock_bh(&sl->lock); write_seqcount_begin(&sl->seqcount); } +/** + * write_sequnlock_bh() - end a softirqs-disabled seqlock_t write section + * @sl: Pointer to seqlock_t + * + * write_sequnlock_bh closes the serialized, non-preemptible, and + * softirqs-disabled, seqlock_t write side critical section opened with + * write_seqlock_bh(). + */ static inline void write_sequnlock_bh(seqlock_t *sl) { write_seqcount_end(&sl->seqcount); spin_unlock_bh(&sl->lock); } +/** + * write_seqlock_irq() - start a non-interruptible seqlock_t write section + * @sl: Pointer to seqlock_t + * + * _irq variant of write_seqlock(). Use only if the read side section, or + * other write sections, can be invoked from hardirq contexts. + */ static inline void write_seqlock_irq(seqlock_t *sl) { spin_lock_irq(&sl->lock); write_seqcount_begin(&sl->seqcount); } +/** + * write_sequnlock_irq() - end a non-interruptible seqlock_t write section + * @sl: Pointer to seqlock_t + * + * write_sequnlock_irq closes the serialized and non-interruptible + * seqlock_t write side section opened with write_seqlock_irq(). 
+ */ static inline void write_sequnlock_irq(seqlock_t *sl) { write_seqcount_end(&sl->seqcount); @@ -528,9 +652,28 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) return flags; } +/** + * write_seqlock_irqsave() - start a non-interruptible seqlock_t write + * section + * @lock: Pointer to seqlock_t + * @flags: Stack-allocated storage for saving caller's local interrupt + * state, to be passed to write_sequnlock_irqrestore(). + * + * _irqsave variant of write_seqlock(). Use it only if the read side + * section, or other write sections, can be invoked from hardirq context. + */ #define write_seqlock_irqsave(lock, flags) \ do { flags = __write_seqlock_irqsave(lock); } while (0) +/** + * write_sequnlock_irqrestore() - end non-interruptible seqlock_t write + * section + * @sl: Pointer to seqlock_t + * @flags: Caller's saved interrupt state, from write_seqlock_irqsave() + * + * write_sequnlock_irqrestore closes the serialized and non-interruptible + * seqlock_t write section previously opened with write_seqlock_irqsave(). + */ static inline void write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags) { @@ -538,36 +681,79 @@ write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags) spin_unlock_irqrestore(&sl->lock, flags); } -/* - * A locking reader exclusively locks out other writers and locking readers, - * but doesn't update the sequence number. Acts like a normal spin_lock/unlock. - * Don't need preempt_disable() because that is in the spin_lock already. +/** + * read_seqlock_excl() - begin a seqlock_t locking reader section + * @sl: Pointer to seqlock_t + * + * read_seqlock_excl opens a seqlock_t locking reader critical section. A + * locking reader exclusively locks out *both* other writers *and* other + * locking readers, but it does not update the embedded sequence number. + * + * Locking readers act like a normal spin_lock()/spin_unlock(). 
+ * + * Context: if the seqlock_t write section, *or other read sections*, can + * be invoked from hardirq or softirq contexts, use the _irqsave or _bh + * variant of this function instead. + * + * The opened read section must be closed with read_sequnlock_excl(). */ static inline void read_seqlock_excl(seqlock_t *sl) { spin_lock(&sl->lock); } +/** + * read_sequnlock_excl() - end a seqlock_t locking reader critical section + * @sl: Pointer to seqlock_t + */ static inline void read_sequnlock_excl(seqlock_t *sl) { spin_unlock(&sl->lock); } +/** + * read_seqlock_excl_bh() - start a seqlock_t locking reader section with + * softirqs disabled + * @sl: Pointer to seqlock_t + * + * _bh variant of read_seqlock_excl(). Use this variant only if the + * seqlock_t write side section, *or other read sections*, can be invoked + * from softirq contexts. + */ static inline void read_seqlock_excl_bh(seqlock_t *sl) { spin_lock_bh(&sl->lock); } +/** + * read_sequnlock_excl_bh() - stop a seqlock_t softirq-disabled locking + * reader section + * @sl: Pointer to seqlock_t + */ static inline void read_sequnlock_excl_bh(seqlock_t *sl) { spin_unlock_bh(&sl->lock); } +/** + * read_seqlock_excl_irq() - start a non-interruptible seqlock_t locking + * reader section + * @sl: Pointer to seqlock_t + * + * _irq variant of read_seqlock_excl(). Use this only if the seqlock_t + * write side section, *or other read sections*, can be invoked from a + * hardirq context. 
+ */ static inline void read_seqlock_excl_irq(seqlock_t *sl) { spin_lock_irq(&sl->lock); } +/** + * read_sequnlock_excl_irq() - end an interrupts-disabled seqlock_t + * locking reader section + * @sl: Pointer to seqlock_t + */ static inline void read_sequnlock_excl_irq(seqlock_t *sl) { spin_unlock_irq(&sl->lock); @@ -581,9 +767,26 @@ static inline unsigned long __read_seqlock_excl_irqsave(seqlock_t *sl) return flags; } +/** + * read_seqlock_excl_irqsave() - start a non-interruptible seqlock_t + * locking reader section + * @lock: Pointer to seqlock_t + * @flags: Stack-allocated storage for saving caller's local interrupt + * state, to be passed to read_sequnlock_excl_irqrestore(). + * + * _irqsave variant of read_seqlock_excl(). Use this only if the seqlock_t + * write side section, *or other read sections*, can be invoked from a + * hardirq context. + */ #define read_seqlock_excl_irqsave(lock, flags) \ do { flags = __read_seqlock_excl_irqsave(lock); } while (0) +/** + * read_sequnlock_excl_irqrestore() - end non-interruptible seqlock_t + * locking reader section + * @sl: Pointer to seqlock_t + * @flags: Caller saved interrupt state, from read_seqlock_excl_irqsave() + */ static inline void read_sequnlock_excl_irqrestore(seqlock_t *sl, unsigned long flags) { @@ -591,14 +794,35 @@ read_sequnlock_excl_irqrestore(seqlock_t *sl, unsigned long flags) } /** - * read_seqbegin_or_lock - begin a sequence number check or locking block - * @lock: sequence lock - * @seq : sequence number to be checked + * read_seqbegin_or_lock() - begin a seqlock_t lockless or locking reader + * @lock: Pointer to seqlock_t + * @seq : Marker and return parameter. If the passed value is even, the + * reader will become a *lockless* seqlock_t reader as in read_seqbegin(). + * If the passed value is odd, the reader will become a *locking* reader + * as in read_seqlock_excl(). 
In the first call to this function, the + * caller *must* initialize and pass an even value to @seq; this way, a + * lockless read can be optimistically tried first. * - * First try it once optimistically without taking the lock. If that fails, - * take the lock. The sequence number is also used as a marker for deciding - * whether to be a reader (even) or writer (odd). - * N.B. seq must be initialized to an even number to begin with. + * read_seqbegin_or_lock is an API designed to optimistically try a normal + * lockless seqlock_t read section first. If an odd counter is found, the + * lockless read trial has failed, and the next read iteration transforms + * itself into a full seqlock_t locking reader. + * + * This is typically used to avoid seqlock_t lockless readers starvation + * (too much retry loops) in the case of a sharp spike in write side + * activity. + * + * Context: if the seqlock_t write section, *or other read sections*, can + * be invoked from hardirq or softirq contexts, use the _irqsave or _bh + * variant of this function instead. + * + * Check Documentation/locking/seqlock.rst for template example code. + * + * Return: the encountered sequence counter value, through the @seq + * parameter, which is overloaded as a return parameter. This returned + * value must be checked with need_seqretry(). If the read section need to + * be retried, this returned value must also be passed as the @seq + * parameter of the next read_seqbegin_or_lock() iteration. 
*/ static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq) { @@ -608,17 +832,52 @@ static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq) read_seqlock_excl(lock); } +/** + * need_seqretry() - validate seqlock_t "locking or lockless" read section + * @lock: Pointer to seqlock_t + * @seq: sequence count, from read_seqbegin_or_lock() + * + * Return: true if a read section retry is required, false otherwise + */ static inline int need_seqretry(seqlock_t *lock, int seq) { return !(seq & 1) && read_seqretry(lock, seq); } +/** + * done_seqretry() - end seqlock_t "locking or lockless" reader section + * @lock: Pointer to seqlock_t + * @seq: count, from read_seqbegin_or_lock() + * + * done_seqretry finishes the seqlock_t read side critical section started + * with read_seqbegin_or_lock() and validated by need_seqretry(). + */ static inline void done_seqretry(seqlock_t *lock, int seq) { if (seq & 1) read_sequnlock_excl(lock); } +/** + * read_seqbegin_or_lock_irqsave() - begin a seqlock_t lockless reader, or + * a non-interruptible locking reader + * @lock: Pointer to seqlock_t + * @seq: Marker and return parameter. Check read_seqbegin_or_lock(). + * + * This is the _irqsave variant of read_seqbegin_or_lock(). Use it only if + * the seqlock_t write section, *or other read sections*, can be invoked + * from hardirq context. + * + * Note: Interrupts will be disabled only for "locking reader" mode. + * + * Return: + * + * 1. The saved local interrupts state in case of a locking reader, to + * be passed to done_seqretry_irqrestore(). + * + * 2. The encountered sequence counter value, returned through @seq + * overloaded as a return parameter. Check read_seqbegin_or_lock(). 
+ */ static inline unsigned long read_seqbegin_or_lock_irqsave(seqlock_t *lock, int *seq) { @@ -632,6 +891,18 @@ read_seqbegin_or_lock_irqsave(seqlock_t *lock, int *seq) return flags; } +/** + * done_seqretry_irqrestore() - end a seqlock_t lockless reader, or a + * non-interruptible locking reader section + * @lock: Pointer to seqlock_t + * @seq: Count, from read_seqbegin_or_lock_irqsave() + * @flags: Caller's saved local interrupt state in case of a locking + * reader, also from read_seqbegin_or_lock_irqsave() + * + * This is the _irqrestore variant of done_seqretry(). The read section + * must've been opened with read_seqbegin_or_lock_irqsave(), and validated + * by need_seqretry(). + */ static inline void done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags) { From 932e46365226324d2cf26d8bdec8b51ceb296948 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Mon, 20 Jul 2020 17:55:12 +0200 Subject: [PATCH 484/502] seqlock: Implement raw_seqcount_begin() in terms of raw_read_seqcount() raw_seqcount_begin() has the same code as raw_read_seqcount(), with the exception of masking the sequence counter's LSB before returning it to the caller. Note, raw_seqcount_begin() masks the counter's LSB before returning it to the caller so that read_seqcount_retry() can fail if the counter is odd -- without the overhead of an extra branching instruction. Signed-off-by: Ahmed S. 
Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200720155530.1173732-7-a.darwish@linutronix.de --- include/linux/seqlock.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 85fb3ac93ffb..e885702d8b82 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -199,10 +199,11 @@ static inline unsigned raw_read_seqcount(const seqcount_t *s) */ static inline unsigned raw_seqcount_begin(const seqcount_t *s) { - unsigned ret = READ_ONCE(s->sequence); - smp_rmb(); - kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); - return ret & ~1; + /* + * If the counter is odd, let read_seqcount_retry() fail + * by decrementing the counter. + */ + return raw_read_seqcount(s) & ~1; } /** From 8fd8ad5c5dfcb09cf62abadd4043eaf1afbbd0ce Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Mon, 20 Jul 2020 17:55:13 +0200 Subject: [PATCH 485/502] lockdep: Add preemption enabled/disabled assertion APIs Asserting that preemption is enabled or disabled is a critical sanity check. Developers are usually reluctant to add such a check in a fastpath as reading the preemption count can be costly. Extend the lockdep API with macros asserting that preemption is disabled or enabled. If lockdep is disabled, or if the underlying architecture does not support kernel preemption, this assert has no runtime overhead. References: f54bb2ec02c8 ("locking/lockdep: Add IRQs disabled/enabled assertion APIs: ...") Signed-off-by: Ahmed S. 
Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200720155530.1173732-8-a.darwish@linutronix.de --- include/linux/lockdep.h | 19 +++++++++++++++++++ lib/Kconfig.debug | 1 + 2 files changed, 20 insertions(+) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 7aafba0ddcf9..39a35699d0d6 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -549,6 +549,22 @@ do { \ WARN_ON_ONCE(debug_locks && !this_cpu_read(hardirq_context)); \ } while (0) +#define lockdep_assert_preemption_enabled() \ +do { \ + WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \ + debug_locks && \ + (preempt_count() != 0 || \ + !this_cpu_read(hardirqs_enabled))); \ +} while (0) + +#define lockdep_assert_preemption_disabled() \ +do { \ + WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \ + debug_locks && \ + (preempt_count() == 0 && \ + this_cpu_read(hardirqs_enabled))); \ +} while (0) + #else # define might_lock(lock) do { } while (0) # define might_lock_read(lock) do { } while (0) @@ -557,6 +573,9 @@ do { \ # define lockdep_assert_irqs_enabled() do { } while (0) # define lockdep_assert_irqs_disabled() do { } while (0) # define lockdep_assert_in_irq() do { } while (0) + +# define lockdep_assert_preemption_enabled() do { } while (0) +# define lockdep_assert_preemption_disabled() do { } while (0) #endif #ifdef CONFIG_PROVE_RAW_LOCK_NESTING diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 9ad9210d70a1..5379931ba3b5 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1117,6 +1117,7 @@ config PROVE_LOCKING select DEBUG_RWSEMS select DEBUG_WW_MUTEX_SLOWPATH select DEBUG_LOCK_ALLOC + select PREEMPT_COUNT if !ARCH_NO_PREEMPT select TRACE_IRQFLAGS default n help From 859247d39fb008ea812e8f0c398a58a20c12899e Mon Sep 17 00:00:00 2001 From: "Ahmed S. 
Darwish" Date: Mon, 20 Jul 2020 17:55:14 +0200 Subject: [PATCH 486/502] seqlock: lockdep assert non-preemptibility on seqcount_t write Preemption must be disabled before entering a sequence count write side critical section. Failing to do so, the seqcount read side can preempt the write side section and spin for the entire scheduler tick. If that reader belongs to a real-time scheduling class, it can spin forever and the kernel will livelock. Assert through lockdep that preemption is disabled for seqcount writers. Signed-off-by: Ahmed S. Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200720155530.1173732-9-a.darwish@linutronix.de --- include/linux/seqlock.h | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index e885702d8b82..54bc20496392 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -266,6 +266,12 @@ static inline void raw_write_seqcount_end(seqcount_t *s) kcsan_nestable_atomic_end(); } +static inline void __write_seqcount_begin_nested(seqcount_t *s, int subclass) +{ + raw_write_seqcount_begin(s); + seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_); +} + /** * write_seqcount_begin_nested() - start a seqcount_t write section with * custom lockdep nesting level @@ -276,8 +282,19 @@ static inline void raw_write_seqcount_end(seqcount_t *s) */ static inline void write_seqcount_begin_nested(seqcount_t *s, int subclass) { - raw_write_seqcount_begin(s); - seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_); + lockdep_assert_preemption_disabled(); + __write_seqcount_begin_nested(s, subclass); +} + +/* + * A write_seqcount_begin() variant w/o lockdep non-preemptibility checks. + * + * Use for internal seqlock.h code where it's known that preemption is + * already disabled. For example, seqlock_t write side functions. 
+ */ +static inline void __write_seqcount_begin(seqcount_t *s) +{ + __write_seqcount_begin_nested(s, 0); } /** @@ -575,7 +592,7 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) static inline void write_seqlock(seqlock_t *sl) { spin_lock(&sl->lock); - write_seqcount_begin(&sl->seqcount); + __write_seqcount_begin(&sl->seqcount); } /** @@ -601,7 +618,7 @@ static inline void write_sequnlock(seqlock_t *sl) static inline void write_seqlock_bh(seqlock_t *sl) { spin_lock_bh(&sl->lock); - write_seqcount_begin(&sl->seqcount); + __write_seqcount_begin(&sl->seqcount); } /** @@ -628,7 +645,7 @@ static inline void write_sequnlock_bh(seqlock_t *sl) static inline void write_seqlock_irq(seqlock_t *sl) { spin_lock_irq(&sl->lock); - write_seqcount_begin(&sl->seqcount); + __write_seqcount_begin(&sl->seqcount); } /** @@ -649,7 +666,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) unsigned long flags; spin_lock_irqsave(&sl->lock, flags); - write_seqcount_begin(&sl->seqcount); + __write_seqcount_begin(&sl->seqcount); return flags; } From c4334d576cf420a7d0f4349ce0b0a8ed0de3938f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 25 Jul 2020 17:32:05 -0700 Subject: [PATCH 487/502] arm64: pgtable-hwdef.h: delete duplicated words Drop the repeated words "at" and "the". Signed-off-by: Randy Dunlap Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20200726003207.20253-2-rdunlap@infradead.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable-hwdef.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 9c91a8f93a0e..b18ba4452873 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -29,7 +29,7 @@ * Size mapped by an entry at level n ( 0 <= n <= 3) * We map (PAGE_SHIFT - 3) at all translation levels and PAGE_SHIFT bits * in the final page. 
The maximum number of translation levels supported by - * the architecture is 4. Hence, starting at at level n, we have further + * the architecture is 4. Hence, starting at level n, we have further * ((4 - n) - 1) levels of translation excluding the offset within the page. * So, the total number of bits mapped by an entry at level n is : * @@ -98,7 +98,7 @@ #define CONT_PMDS (1 << CONT_PMD_SHIFT) #define CONT_PMD_SIZE (CONT_PMDS * PMD_SIZE) #define CONT_PMD_MASK (~(CONT_PMD_SIZE - 1)) -/* the the numerical offset of the PTE within a range of CONT_PTES */ +/* the numerical offset of the PTE within a range of CONT_PTES */ #define CONT_RANGE_OFFSET(addr) (((addr)>>PAGE_SHIFT)&(CONT_PTES-1)) /* From c4b5abba008399dc4450ab6f62b2deb5acd3697e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 25 Jul 2020 17:32:06 -0700 Subject: [PATCH 488/502] arm64: ptrace.h: delete duplicated word Drop the repeated word "the". Signed-off-by: Randy Dunlap Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20200726003207.20253-3-rdunlap@infradead.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/ptrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 953b6a1ce549..966ed30ed5f7 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -27,7 +27,7 @@ * * Some code sections either automatically switch back to PSR.I or explicitly * require to not use priority masking. If bit GIC_PRIO_PSR_I_SET is included - * in the the priority mask, it indicates that PSR.I should be set and + * in the priority mask, it indicates that PSR.I should be set and * interrupt disabling temporarily does not rely on IRQ priorities. 
*/ #define GIC_PRIO_IRQON 0xe0 From 1a9ea25d1874ca457a596738b40fa4f3bec6fc8f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 25 Jul 2020 17:32:07 -0700 Subject: [PATCH 489/502] arm64: sigcontext.h: delete duplicated word Drop the repeated word "the". Signed-off-by: Randy Dunlap Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20200726003207.20253-4-rdunlap@infradead.org Signed-off-by: Catalin Marinas --- arch/arm64/include/uapi/asm/sigcontext.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h index 8b0ebce92427..0c796c795dbe 100644 --- a/arch/arm64/include/uapi/asm/sigcontext.h +++ b/arch/arm64/include/uapi/asm/sigcontext.h @@ -179,7 +179,7 @@ struct sve_context { * The same convention applies when returning from a signal: a caller * will need to remove or resize the sve_context block if it wants to * make the SVE registers live when they were previously non-live or - * vice-versa. This may require the the caller to allocate fresh + * vice-versa. This may require the caller to allocate fresh * memory and/or move other context blocks in the signal frame. * * Changing the vector length during signal return is not permitted: From c4885bbb3afee80f41d39a33e49881a18e500f47 Mon Sep 17 00:00:00 2001 From: Pingfan Liu Date: Fri, 10 Jul 2020 22:04:12 +0800 Subject: [PATCH 490/502] arm64/mm: save memory access in check_and_switch_context() fast switch path On arm64, smp_processor_id() reads a per-cpu `cpu_number` variable, using the per-cpu offset stored in the tpidr_el1 system register. In some cases we generate a per-cpu address with a sequence like: cpu_ptr = &per_cpu(ptr, smp_processor_id()); Which potentially incurs a cache miss for both `cpu_number` and the in-memory `__per_cpu_offset` array. 
This can be written more optimally as: cpu_ptr = this_cpu_ptr(ptr); Which only needs the offset from tpidr_el1, and does not need to load from memory. The following two test cases show a small performance improvement measured on a 46-cpus qualcomm machine with 5.8.0-rc4 kernel. Test 1: (about 0.3% improvement) #cat b.sh make clean && make all -j138 #perf stat --repeat 10 --null --sync sh b.sh - before this patch Performance counter stats for 'sh b.sh' (10 runs): 298.62 +- 1.86 seconds time elapsed ( +- 0.62% ) - after this patch Performance counter stats for 'sh b.sh' (10 runs): 297.734 +- 0.954 seconds time elapsed ( +- 0.32% ) Test 2: (about 1.69% improvement) 'perf stat -r 10 perf bench sched messaging' Then sum the total time of 'sched/messaging' manually. - before this patch total 0.707 sec for 10 times - after this patch total 0.695 sec for 10 times Signed-off-by: Pingfan Liu Acked-by: Mark Rutland Cc: Will Deacon Cc: Steve Capper Cc: Mark Rutland Cc: Vladimir Murzin Cc: Jean-Philippe Brucker Link: https://lore.kernel.org/r/1594389852-19949-1-git-send-email-kernelfans@gmail.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/mmu_context.h | 6 ++---- arch/arm64/mm/context.c | 10 ++++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index b0bd9b55594c..f2d7537d6f83 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -175,7 +175,7 @@ static inline void cpu_replace_ttbr1(pgd_t *pgdp) * take CPU migration into account.
*/ #define destroy_context(mm) do { } while(0) -void check_and_switch_context(struct mm_struct *mm, unsigned int cpu); +void check_and_switch_context(struct mm_struct *mm); #define init_new_context(tsk,mm) ({ atomic64_set(&(mm)->context.id, 0); 0; }) @@ -214,8 +214,6 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) static inline void __switch_mm(struct mm_struct *next) { - unsigned int cpu = smp_processor_id(); - /* * init_mm.pgd does not contain any user mappings and it is always * active for kernel addresses in TTBR1. Just set the reserved TTBR0. @@ -225,7 +223,7 @@ static inline void __switch_mm(struct mm_struct *next) return; } - check_and_switch_context(next, cpu); + check_and_switch_context(next); } static inline void diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index d702d60e64da..a206655a39a5 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -198,9 +198,10 @@ set_asid: return idx2asid(asid) | generation; } -void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) +void check_and_switch_context(struct mm_struct *mm) { unsigned long flags; + unsigned int cpu; u64 asid, old_active_asid; if (system_supports_cnp()) @@ -222,9 +223,9 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) * relaxed xchg in flush_context will treat us as reserved * because atomic RmWs are totally ordered for a given location. 
*/ - old_active_asid = atomic64_read(&per_cpu(active_asids, cpu)); + old_active_asid = atomic64_read(this_cpu_ptr(&active_asids)); if (old_active_asid && asid_gen_match(asid) && - atomic64_cmpxchg_relaxed(&per_cpu(active_asids, cpu), + atomic64_cmpxchg_relaxed(this_cpu_ptr(&active_asids), old_active_asid, asid)) goto switch_mm_fastpath; @@ -236,10 +237,11 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) atomic64_set(&mm->context.id, asid); } + cpu = smp_processor_id(); if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending)) local_flush_tlb_all(); - atomic64_set(&per_cpu(active_asids, cpu), asid); + atomic64_set(this_cpu_ptr(&active_asids), asid); raw_spin_unlock_irqrestore(&cpu_asid_lock, flags); switch_mm_fastpath: From 010e8e6be2194678f7e4bb3044c088bbee779f57 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 30 Jul 2020 18:43:45 +0300 Subject: [PATCH 491/502] io_uring: de-unionise io_kiocb As io_kiocb have enough space, move ->work out of a union. It's safer this way and removes ->work memcpy bouncing. By the way make tabulation in struct io_kiocb consistent. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 57 ++++++++++++--------------------------------------- 1 file changed, 13 insertions(+), 44 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3e406bc1f855..86ec5669fe50 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -600,7 +600,6 @@ enum { struct async_poll { struct io_poll_iocb poll; struct io_poll_iocb *double_poll; - struct io_wq_work work; }; /* @@ -641,36 +640,26 @@ struct io_kiocb { u16 buf_index; u32 result; - struct io_ring_ctx *ctx; - unsigned int flags; - refcount_t refs; - struct task_struct *task; - u64 user_data; + struct io_ring_ctx *ctx; + unsigned int flags; + refcount_t refs; + struct task_struct *task; + u64 user_data; - struct list_head link_list; + struct list_head link_list; /* * 1. used with ctx->iopoll_list with reads/writes * 2. 
to track reqs with ->files (see io_op_def::file_table) */ - struct list_head inflight_entry; + struct list_head inflight_entry; - struct percpu_ref *fixed_file_refs; - - union { - /* - * Only commands that never go async can use the below fields, - * obviously. Right now only IORING_OP_POLL_ADD uses them, and - * async armed poll handlers for regular commands. The latter - * restore the work, if needed. - */ - struct { - struct hlist_node hash_node; - struct async_poll *apoll; - }; - struct io_wq_work work; - }; - struct callback_head task_work; + struct percpu_ref *fixed_file_refs; + struct callback_head task_work; + /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ + struct hlist_node hash_node; + struct async_poll *apoll; + struct io_wq_work work; }; struct io_defer_entry { @@ -4668,10 +4657,6 @@ static void io_async_task_func(struct callback_head *cb) io_poll_remove_double(req, apoll->double_poll); spin_unlock_irq(&ctx->completion_lock); - /* restore ->work in case we need to retry again */ - if (req->flags & REQ_F_WORK_INITIALIZED) - memcpy(&req->work, &apoll->work, sizeof(req->work)); - if (!READ_ONCE(apoll->poll.canceled)) __io_req_task_submit(req); else @@ -4763,9 +4748,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req) apoll->double_poll = NULL; req->flags |= REQ_F_POLLED; - if (req->flags & REQ_F_WORK_INITIALIZED) - memcpy(&apoll->work, &req->work, sizeof(req->work)); - io_get_req_task(req); req->apoll = apoll; INIT_HLIST_NODE(&req->hash_node); @@ -4784,8 +4766,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req) if (ret) { io_poll_remove_double(req, apoll->double_poll); spin_unlock_irq(&ctx->completion_lock); - if (req->flags & REQ_F_WORK_INITIALIZED) - memcpy(&req->work, &apoll->work, sizeof(req->work)); kfree(apoll->double_poll); kfree(apoll); return false; @@ -4828,14 +4808,6 @@ static bool io_poll_remove_one(struct io_kiocb *req) do_complete = __io_poll_remove_one(req, &apoll->poll); if (do_complete) { 
io_put_req(req); - /* - * restore ->work because we will call - * io_req_clean_work below when dropping the - * final reference. - */ - if (req->flags & REQ_F_WORK_INITIALIZED) - memcpy(&req->work, &apoll->work, - sizeof(req->work)); kfree(apoll->double_poll); kfree(apoll); } @@ -4969,9 +4941,6 @@ static int io_poll_add(struct io_kiocb *req) struct io_poll_table ipt; __poll_t mask; - /* ->work is in union with hash_node and others */ - io_req_clean_work(req); - INIT_HLIST_NODE(&req->hash_node); ipt.pt._qproc = io_poll_queue_proc; From 81b68a5ca0ab5d92229a7b76332b9ce88bd6dbd1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 30 Jul 2020 18:43:46 +0300 Subject: [PATCH 492/502] io_uring: deduplicate __io_complete_rw() Call __io_complete_rw() in io_iopoll_queue() instead of hand coding it. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 86ec5669fe50..11f4ab87e08f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -891,7 +891,8 @@ enum io_mem_account { ACCT_PINNED, }; -static bool io_rw_reissue(struct io_kiocb *req, long res); +static void __io_complete_rw(struct io_kiocb *req, long res, long res2, + struct io_comp_state *cs); static void io_cqring_fill_event(struct io_kiocb *req, long res); static void io_put_req(struct io_kiocb *req); static void io_double_put_req(struct io_kiocb *req); @@ -902,8 +903,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_files_update *ip, unsigned nr_args); static int io_prep_work_files(struct io_kiocb *req); -static void io_complete_rw_common(struct kiocb *kiocb, long res, - struct io_comp_state *cs); static void __io_clean_op(struct io_kiocb *req); static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, int fd, struct file **out_file, bool fixed); @@ -1976,8 +1975,7 @@ static void io_iopoll_queue(struct list_head *again) do { req = 
list_first_entry(again, struct io_kiocb, inflight_entry); list_del(&req->inflight_entry); - if (!io_rw_reissue(req, -EAGAIN)) - io_complete_rw_common(&req->rw.kiocb, -EAGAIN, NULL); + __io_complete_rw(req, -EAGAIN, 0, NULL); } while (!list_empty(again)); } From b2bd1cf99f3e7c8fbf12ea07af2c6998e1209e25 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 30 Jul 2020 18:43:47 +0300 Subject: [PATCH 493/502] io_uring: fix racy overflow count reporting All ->cq_overflow modifications should be under completion_lock, otherwise it can report a wrong number to the userspace. Fix it in io_uring_cancel_files(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 11f4ab87e08f..6e2322525da6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7847,10 +7847,9 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, clear_bit(0, &ctx->cq_check_overflow); ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; } - spin_unlock_irq(&ctx->completion_lock); - WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); + spin_unlock_irq(&ctx->completion_lock); /* * Put inflight ref and overflow ref. If that's From dd9dfcdf5a603680458f5e7b0d2273c66e5417db Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 30 Jul 2020 18:43:48 +0300 Subject: [PATCH 494/502] io_uring: fix stalled deferred requests Always do io_commit_cqring() after completing a request, even if it was accounted as overflowed on the CQ side. Failing to do that may lead to not pushing deferred requests when needed, and so stalling the whole ring.
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6e2322525da6..11c1abe8bd1a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7849,6 +7849,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, } WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); + io_commit_cqring(ctx); spin_unlock_irq(&ctx->completion_lock); /* From 4693014340808e7f099e302c1dc40e9d79ff7667 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 30 Jul 2020 18:43:49 +0300 Subject: [PATCH 495/502] io_uring: consolidate *_check_overflow accounting Add a helper to mark ctx->{cq,sq}_check_overflow to get rid of duplicates, and it's clearer to check cq_overflow_list directly anyway. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 11c1abe8bd1a..efec290c6b08 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1303,6 +1303,15 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) eventfd_signal(ctx->cq_ev_fd, 1); } +static void io_cqring_mark_overflow(struct io_ring_ctx *ctx) +{ + if (list_empty(&ctx->cq_overflow_list)) { + clear_bit(0, &ctx->sq_check_overflow); + clear_bit(0, &ctx->cq_check_overflow); + ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; + } +} + /* Returns true if there are no backlogged entries after the flush */ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { @@ -1347,11 +1356,8 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) } io_commit_cqring(ctx); - if (cqe) { - clear_bit(0, &ctx->sq_check_overflow); - clear_bit(0, &ctx->cq_check_overflow); - ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; - } + io_cqring_mark_overflow(ctx); + spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); @@ -7842,11 +7848,8 
@@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, spin_lock_irq(&ctx->completion_lock); list_del(&cancel_req->compl.list); cancel_req->flags &= ~REQ_F_OVERFLOW; - if (list_empty(&ctx->cq_overflow_list)) { - clear_bit(0, &ctx->sq_check_overflow); - clear_bit(0, &ctx->cq_check_overflow); - ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; - } + + io_cqring_mark_overflow(ctx); WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); io_commit_cqring(ctx); From 01cec8c18f5ad9c27eee9f21439072832181039e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 30 Jul 2020 18:43:50 +0300 Subject: [PATCH 496/502] io_uring: get rid of atomic FAA for cq_timeouts If ->cq_timeouts modifications are done under ->completion_lock, we don't really need any fetch-and-add and other complex atomics. Replace it with non-atomic FAA, that saves an implicit full memory barrier. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index efec290c6b08..fabf0b692384 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1205,7 +1205,8 @@ static void io_kill_timeout(struct io_kiocb *req) ret = hrtimer_try_to_cancel(&req->io->timeout.timer); if (ret != -1) { - atomic_inc(&req->ctx->cq_timeouts); + atomic_set(&req->ctx->cq_timeouts, + atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&req->timeout.list); req->flags |= REQ_F_COMP_LOCKED; io_cqring_fill_event(req, 0); @@ -4972,9 +4973,10 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - atomic_inc(&ctx->cq_timeouts); - spin_lock_irqsave(&ctx->completion_lock, flags); + atomic_set(&req->ctx->cq_timeouts, + atomic_read(&req->ctx->cq_timeouts) + 1); + /* * We could be racing with timeout deletion. If the list is empty, * then timeout lookup already found it and will be handling it.
From 0584df9c12f449124d0bfef9899e5365604ee7a9 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 29 Jul 2020 13:09:15 +0200 Subject: [PATCH 497/502] lockdep: Refactor IRQ trace events fields into struct Refactor the IRQ trace events fields, used for printing information about the IRQ trace events, into a separate struct 'irqtrace_events'. This improves readability by separating the information only used in reporting, as well as enables (simplified) storing/restoring of irqtrace_events snapshots. No functional change intended. Signed-off-by: Marco Elver Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200729110916.3920464-1-elver@google.com Signed-off-by: Ingo Molnar --- include/linux/irqflags.h | 13 +++++++++ include/linux/sched.h | 11 ++------ kernel/fork.c | 16 ++++------- kernel/locking/lockdep.c | 58 +++++++++++++++++++++------------------- 4 files changed, 50 insertions(+), 48 deletions(-) diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 5811ee8a5cd8..bd5c55755447 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -33,6 +33,19 @@ #ifdef CONFIG_TRACE_IRQFLAGS +/* Per-task IRQ trace events information. 
*/ +struct irqtrace_events { + unsigned int irq_events; + unsigned long hardirq_enable_ip; + unsigned long hardirq_disable_ip; + unsigned int hardirq_enable_event; + unsigned int hardirq_disable_event; + unsigned long softirq_disable_ip; + unsigned long softirq_enable_ip; + unsigned int softirq_disable_event; + unsigned int softirq_enable_event; +}; + DECLARE_PER_CPU(int, hardirqs_enabled); DECLARE_PER_CPU(int, hardirq_context); diff --git a/include/linux/sched.h b/include/linux/sched.h index 8d1de021b315..52e0fdd6a555 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -980,17 +981,9 @@ struct task_struct { #endif #ifdef CONFIG_TRACE_IRQFLAGS - unsigned int irq_events; + struct irqtrace_events irqtrace; unsigned int hardirq_threaded; - unsigned long hardirq_enable_ip; - unsigned long hardirq_disable_ip; - unsigned int hardirq_enable_event; - unsigned int hardirq_disable_event; u64 hardirq_chain_key; - unsigned long softirq_disable_ip; - unsigned long softirq_enable_ip; - unsigned int softirq_disable_event; - unsigned int softirq_enable_event; int softirqs_enabled; int softirq_context; int irq_config; diff --git a/kernel/fork.c b/kernel/fork.c index 70d9d0a4de2a..56a640799680 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2035,17 +2035,11 @@ static __latent_entropy struct task_struct *copy_process( seqcount_init(&p->mems_allowed_seq); #endif #ifdef CONFIG_TRACE_IRQFLAGS - p->irq_events = 0; - p->hardirq_enable_ip = 0; - p->hardirq_enable_event = 0; - p->hardirq_disable_ip = _THIS_IP_; - p->hardirq_disable_event = 0; - p->softirqs_enabled = 1; - p->softirq_enable_ip = _THIS_IP_; - p->softirq_enable_event = 0; - p->softirq_disable_ip = 0; - p->softirq_disable_event = 0; - p->softirq_context = 0; + memset(&p->irqtrace, 0, sizeof(p->irqtrace)); + p->irqtrace.hardirq_disable_ip = _THIS_IP_; + p->irqtrace.softirq_enable_ip = _THIS_IP_; + p->softirqs_enabled = 1; + 
p->softirq_context = 0; #endif p->pagefault_disabled = 0; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c9ea05edce25..7b5800374c40 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3484,19 +3484,21 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, void print_irqtrace_events(struct task_struct *curr) { - printk("irq event stamp: %u\n", curr->irq_events); + const struct irqtrace_events *trace = &curr->irqtrace; + + printk("irq event stamp: %u\n", trace->irq_events); printk("hardirqs last enabled at (%u): [<%px>] %pS\n", - curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip, - (void *)curr->hardirq_enable_ip); + trace->hardirq_enable_event, (void *)trace->hardirq_enable_ip, + (void *)trace->hardirq_enable_ip); printk("hardirqs last disabled at (%u): [<%px>] %pS\n", - curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip, - (void *)curr->hardirq_disable_ip); + trace->hardirq_disable_event, (void *)trace->hardirq_disable_ip, + (void *)trace->hardirq_disable_ip); printk("softirqs last enabled at (%u): [<%px>] %pS\n", - curr->softirq_enable_event, (void *)curr->softirq_enable_ip, - (void *)curr->softirq_enable_ip); + trace->softirq_enable_event, (void *)trace->softirq_enable_ip, + (void *)trace->softirq_enable_ip); printk("softirqs last disabled at (%u): [<%px>] %pS\n", - curr->softirq_disable_event, (void *)curr->softirq_disable_ip, - (void *)curr->softirq_disable_ip); + trace->softirq_disable_event, (void *)trace->softirq_disable_ip, + (void *)trace->softirq_disable_ip); } static int HARDIRQ_verbose(struct lock_class *class) @@ -3699,7 +3701,7 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_on_prepare); void noinstr lockdep_hardirqs_on(unsigned long ip) { - struct task_struct *curr = current; + struct irqtrace_events *trace = &current->irqtrace; if (unlikely(!debug_locks)) return; @@ -3752,8 +3754,8 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) skip_checks: /* we'll do an OFF -> ON
transition: */ this_cpu_write(hardirqs_enabled, 1); - curr->hardirq_enable_ip = ip; - curr->hardirq_enable_event = ++curr->irq_events; + trace->hardirq_enable_ip = ip; + trace->hardirq_enable_event = ++trace->irq_events; debug_atomic_inc(hardirqs_on_events); } EXPORT_SYMBOL_GPL(lockdep_hardirqs_on); @@ -3763,8 +3765,6 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_on); */ void noinstr lockdep_hardirqs_off(unsigned long ip) { - struct task_struct *curr = current; - if (unlikely(!debug_locks)) return; @@ -3784,12 +3784,14 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) return; if (lockdep_hardirqs_enabled()) { + struct irqtrace_events *trace = &current->irqtrace; + /* * We have done an ON -> OFF transition: */ this_cpu_write(hardirqs_enabled, 0); - curr->hardirq_disable_ip = ip; - curr->hardirq_disable_event = ++curr->irq_events; + trace->hardirq_disable_ip = ip; + trace->hardirq_disable_event = ++trace->irq_events; debug_atomic_inc(hardirqs_off_events); } else { debug_atomic_inc(redundant_hardirqs_off); @@ -3802,7 +3804,7 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_off); */ void lockdep_softirqs_on(unsigned long ip) { - struct task_struct *curr = current; + struct irqtrace_events *trace = &current->irqtrace; if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -3814,7 +3816,7 @@ void lockdep_softirqs_on(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (curr->softirqs_enabled) { + if (current->softirqs_enabled) { debug_atomic_inc(redundant_softirqs_on); return; } @@ -3823,9 +3825,9 @@ void lockdep_softirqs_on(unsigned long ip) /* * We'll do an OFF -> ON transition: */ - curr->softirqs_enabled = 1; - curr->softirq_enable_ip = ip; - curr->softirq_enable_event = ++curr->irq_events; + current->softirqs_enabled = 1; + trace->softirq_enable_ip = ip; + trace->softirq_enable_event = ++trace->irq_events; debug_atomic_inc(softirqs_on_events); /* * We are going to turn softirqs on, so set the @@ -3833,7 +3835,7 @@ void lockdep_softirqs_on(unsigned long ip) 
* enabled too: */ if (lockdep_hardirqs_enabled()) - mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); + mark_held_locks(current, LOCK_ENABLED_SOFTIRQ); lockdep_recursion_finish(); } @@ -3842,8 +3844,6 @@ void lockdep_softirqs_on(unsigned long ip) */ void lockdep_softirqs_off(unsigned long ip) { - struct task_struct *curr = current; - if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -3853,13 +3853,15 @@ void lockdep_softirqs_off(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (curr->softirqs_enabled) { + if (current->softirqs_enabled) { + struct irqtrace_events *trace = &current->irqtrace; + /* * We have done an ON -> OFF transition: */ - curr->softirqs_enabled = 0; - curr->softirq_disable_ip = ip; - curr->softirq_disable_event = ++curr->irq_events; + current->softirqs_enabled = 0; + trace->softirq_disable_ip = ip; + trace->softirq_disable_event = ++trace->irq_events; debug_atomic_inc(softirqs_off_events); /* * Whoops, we wanted softirqs off, so why aren't they? From 92c209ac6d3d35783c16c8a717547183e6e11162 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 29 Jul 2020 13:09:16 +0200 Subject: [PATCH 498/502] kcsan: Improve IRQ state trace reporting To improve the general usefulness of the IRQ state trace events with KCSAN enabled, save and restore the trace information when entering and exiting the KCSAN runtime as well as when generating a KCSAN report. Without this, reporting the IRQ trace events (whether via a KCSAN report or outside of KCSAN via a lockdep report) is rather useless due to continuously being touched by KCSAN. This is because if KCSAN is enabled, every instrumented memory access causes changes to IRQ trace events (either by KCSAN disabling/enabling interrupts or taking report_lock when generating a report). Before "lockdep: Prepare for NMI IRQ state tracking", KCSAN avoided touching the IRQ trace events via raw_local_irq_save/restore() and lockdep_off/on(). 
Fixes: 248591f5d257 ("kcsan: Make KCSAN compatible with new IRQ state tracking") Signed-off-by: Marco Elver Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200729110916.3920464-2-elver@google.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++++ kernel/kcsan/core.c | 23 +++++++++++++++++++++++ kernel/kcsan/kcsan.h | 7 +++++++ kernel/kcsan/report.c | 3 +++ 4 files changed, 37 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index 52e0fdd6a555..060e9214c8b5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1184,8 +1184,12 @@ struct task_struct { #ifdef CONFIG_KASAN unsigned int kasan_depth; #endif + #ifdef CONFIG_KCSAN struct kcsan_ctx kcsan_ctx; +#ifdef CONFIG_TRACE_IRQFLAGS + struct irqtrace_events kcsan_save_irqtrace; +#endif #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 732623c30359..0fe068192781 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -291,6 +291,20 @@ static inline unsigned int get_delay(void) 0); } +void kcsan_save_irqtrace(struct task_struct *task) +{ +#ifdef CONFIG_TRACE_IRQFLAGS + task->kcsan_save_irqtrace = task->irqtrace; +#endif +} + +void kcsan_restore_irqtrace(struct task_struct *task) +{ +#ifdef CONFIG_TRACE_IRQFLAGS + task->irqtrace = task->kcsan_save_irqtrace; +#endif +} + /* * Pull everything together: check_access() below contains the performance * critical operations; the fast-path (including check_access) functions should @@ -336,9 +350,11 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, flags = user_access_save(); if (consumed) { + kcsan_save_irqtrace(current); kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_MAYBE, KCSAN_REPORT_CONSUMED_WATCHPOINT, watchpoint - watchpoints); + kcsan_restore_irqtrace(current); } else { /* * The other thread may not print any diagnostics, as it has @@ -396,6 +412,12 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) goto 
out; } + /* + * Save and restore the IRQ state trace touched by KCSAN, since KCSAN's + * runtime is entered for every memory access, and potentially useful + * information is lost if dirtied by KCSAN. + */ + kcsan_save_irqtrace(current); if (!kcsan_interrupt_watcher) local_irq_save(irq_flags); @@ -539,6 +561,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) out_unlock: if (!kcsan_interrupt_watcher) local_irq_restore(irq_flags); + kcsan_restore_irqtrace(current); out: user_access_restore(ua_flags); } diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index 763d6d08d94b..29480010dc30 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -9,6 +9,7 @@ #define _KERNEL_KCSAN_KCSAN_H #include +#include /* The number of adjacent watchpoints to check. */ #define KCSAN_CHECK_ADJACENT 1 @@ -22,6 +23,12 @@ extern unsigned int kcsan_udelay_interrupt; */ extern bool kcsan_enabled; +/* + * Save/restore IRQ flags state trace dirtied by KCSAN. + */ +void kcsan_save_irqtrace(struct task_struct *task); +void kcsan_restore_irqtrace(struct task_struct *task); + /* * Initialize debugfs file. */ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 6b2fb1a6d8cd..9d07e175de0f 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -308,6 +308,9 @@ static void print_verbose_info(struct task_struct *task) if (!task) return; + /* Restore IRQ state trace for printing. */ + kcsan_restore_irqtrace(task); + pr_err("\n"); debug_show_held_locks(task); print_irqtrace_events(task); From d1719f70d0a5b83b12786a7dbc5b9fe396469016 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 30 Jul 2020 13:43:53 -0600 Subject: [PATCH 499/502] io_uring: don't touch 'ctx' after installing file descriptor As soon as we install the file descriptor, we have to assume that it can get arbitrarily closed. 
We currently account memory (and note that we did) after installing the ring fd, which means that it could be a potential use-after-free condition if the fd is closed right after being installed, but before we fiddle with the ctx. In fact, syzbot reported this exact scenario: BUG: KASAN: use-after-free in io_account_mem fs/io_uring.c:7397 [inline] BUG: KASAN: use-after-free in io_uring_create fs/io_uring.c:8369 [inline] BUG: KASAN: use-after-free in io_uring_setup+0x2797/0x2910 fs/io_uring.c:8400 Read of size 1 at addr ffff888087a41044 by task syz-executor.5/18145 CPU: 0 PID: 18145 Comm: syz-executor.5 Not tainted 5.8.0-rc7-next-20200729-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x18f/0x20d lib/dump_stack.c:118 print_address_description.constprop.0.cold+0xae/0x497 mm/kasan/report.c:383 __kasan_report mm/kasan/report.c:513 [inline] kasan_report.cold+0x1f/0x37 mm/kasan/report.c:530 io_account_mem fs/io_uring.c:7397 [inline] io_uring_create fs/io_uring.c:8369 [inline] io_uring_setup+0x2797/0x2910 fs/io_uring.c:8400 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x45c429 Code: 8d b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 5b b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f8f121d0c78 EFLAGS: 00000246 ORIG_RAX: 00000000000001a9 RAX: ffffffffffffffda RBX: 0000000000008540 RCX: 000000000045c429 RDX: 0000000000000000 RSI: 0000000020000040 RDI: 0000000000000196 RBP: 000000000078bf38 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 000000000078bf0c R13: 00007fff86698cff R14: 00007f8f121d19c0 R15: 000000000078bf0c Move the accounting of the ring used locked memory before we get and install the ring file descriptor. 
Cc: stable@vger.kernel.org Reported-by: syzbot+9d46305e76057f30c74e@syzkaller.appspotmail.com Fixes: 309758254ea6 ("io_uring: report pinned memory usage") Reviewed-by: Stefano Garzarella Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index fabf0b692384..33702f3b5af8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8329,6 +8329,15 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, ret = -EFAULT; goto err; } + + /* + * Account memory _before_ installing the file descriptor. Once + * the descriptor is installed, it can get closed at any time. + */ + io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries), + ACCT_LOCKED); + ctx->limit_mem = limit_mem; + /* * Install ring fd as the very last thing, so we don't risk someone * having closed it before we finish setup @@ -8338,9 +8347,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, goto err; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); - io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries), - ACCT_LOCKED); - ctx->limit_mem = limit_mem; return ret; err: io_ring_ctx_wait_and_kill(ctx); From 338c11e94e160f80d8352bf9b5da82dd1a910d2f Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Fri, 31 Jul 2020 17:19:50 +0530 Subject: [PATCH 500/502] arm64: use IRQ_STACK_SIZE instead of THREAD_SIZE for irq stack IRQ_STACK_SIZE can be made different from THREAD_SIZE, and as IRQ_STACK_SIZE is used while irq stack allocation, same define should be used while printing information of irq stack. 
Signed-off-by: Maninder Singh Acked-by: Mark Rutland Link: https://lore.kernel.org/r/1596196190-14141-1-git-send-email-maninder1.s@samsung.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/traps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 47f651df781c..13ebd5ca2070 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -855,7 +855,7 @@ asmlinkage void handle_bad_stack(struct pt_regs *regs) pr_emerg("Task stack: [0x%016lx..0x%016lx]\n", tsk_stk, tsk_stk + THREAD_SIZE); pr_emerg("IRQ stack: [0x%016lx..0x%016lx]\n", - irq_stk, irq_stk + THREAD_SIZE); + irq_stk, irq_stk + IRQ_STACK_SIZE); pr_emerg("Overflow stack: [0x%016lx..0x%016lx]\n", ovf_stk, ovf_stk + OVERFLOW_STACK_SIZE); From 1752f0adea98ef859978c090e0726844348758f9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 1 Aug 2020 13:36:33 +0300 Subject: [PATCH 501/502] fs: optimise kiocb_set_rw_flags() Use a local var to collect flags in kiocb_set_rw_flags(). That spares some memory writes and allows to replace most of the jumps with MOVEcc. 
Signed-off-by: Pavel Begunkov Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Jens Axboe --- include/linux/fs.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 4090320360f4..e535543d31d9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3446,22 +3446,28 @@ static inline int iocb_flags(struct file *file) static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) { + int kiocb_flags = 0; + + if (!flags) + return 0; if (unlikely(flags & ~RWF_SUPPORTED)) return -EOPNOTSUPP; if (flags & RWF_NOWAIT) { if (!(ki->ki_filp->f_mode & FMODE_NOWAIT)) return -EOPNOTSUPP; - ki->ki_flags |= IOCB_NOWAIT; + kiocb_flags |= IOCB_NOWAIT; } if (flags & RWF_HIPRI) - ki->ki_flags |= IOCB_HIPRI; + kiocb_flags |= IOCB_HIPRI; if (flags & RWF_DSYNC) - ki->ki_flags |= IOCB_DSYNC; + kiocb_flags |= IOCB_DSYNC; if (flags & RWF_SYNC) - ki->ki_flags |= (IOCB_DSYNC | IOCB_SYNC); + kiocb_flags |= (IOCB_DSYNC | IOCB_SYNC); if (flags & RWF_APPEND) - ki->ki_flags |= IOCB_APPEND; + kiocb_flags |= IOCB_APPEND; + + ki->ki_flags |= kiocb_flags; return 0; } From fa15bafb71fd7a4d6018dae87cfaf890fd4ab47f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 1 Aug 2020 13:50:02 +0300 Subject: [PATCH 502/502] io_uring: flip if handling after io_setup_async_rw As recently done with send/recv, flip the if after rw_verify_area() in io_{read,write}() and shift the remaining code one indentation level to the left. This removes a jump mispredicted by the compiler on the success/fast path. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 146 +++++++++++++++++++++++++------------------------- 1 file changed, 72 insertions(+), 74 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 33702f3b5af8..6fd0b0f5df68 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3034,57 +3034,56 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter iter; size_t iov_count; - ssize_t io_size, ret; + ssize_t io_size, ret, ret2; + unsigned long nr_segs; ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock); if (ret < 0) return ret; + io_size = ret; + req->result = io_size; /* Ensure we clear previously set non-block flag */ if (!force_nonblock) kiocb->ki_flags &= ~IOCB_NOWAIT; - io_size = ret; - req->result = io_size; - /* If the file doesn't support async, just async punt */ if (force_nonblock && !io_file_supports_async(req->file, READ)) goto copy_iov; iov_count = iov_iter_count(&iter); + nr_segs = iter.nr_segs; ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); - if (!ret) { - unsigned long nr_segs = iter.nr_segs; - ssize_t ret2 = 0; + if (unlikely(ret)) + goto out_free; - ret2 = io_iter_do_read(req, &iter); + ret2 = io_iter_do_read(req, &iter); - /* Catch -EAGAIN return for forced non-blocking submission */ - if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) { - kiocb_done(kiocb, ret2, cs); - } else { - iter.count = iov_count; - iter.nr_segs = nr_segs; + /* Catch -EAGAIN return for forced non-blocking submission */ + if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) { + kiocb_done(kiocb, ret2, cs); + } else { + iter.count = iov_count; + iter.nr_segs = nr_segs; copy_iov: - ret = io_setup_async_rw(req, io_size, iovec, - inline_vecs, &iter); - if (ret) + ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, + &iter); + if (ret) + goto out_free; + /* it's copied and will be cleaned with ->io */ + iovec = NULL; + /* if we can retry, 
do so with the callbacks armed */ + if (io_rw_should_retry(req)) { + ret2 = io_iter_do_read(req, &iter); + if (ret2 == -EIOCBQUEUED) { + goto out_free; + } else if (ret2 != -EAGAIN) { + kiocb_done(kiocb, ret2, cs); goto out_free; - /* it's copied and will be cleaned with ->io */ - iovec = NULL; - /* if we can retry, do so with the callbacks armed */ - if (io_rw_should_retry(req)) { - ret2 = io_iter_do_read(req, &iter); - if (ret2 == -EIOCBQUEUED) { - goto out_free; - } else if (ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2, cs); - goto out_free; - } } - kiocb->ki_flags &= ~IOCB_WAITQ; - return -EAGAIN; } + kiocb->ki_flags &= ~IOCB_WAITQ; + return -EAGAIN; } out_free: if (iovec) @@ -3117,19 +3116,19 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter iter; size_t iov_count; - ssize_t ret, io_size; + ssize_t ret, ret2, io_size; + unsigned long nr_segs; ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock); if (ret < 0) return ret; + io_size = ret; + req->result = io_size; /* Ensure we clear previously set non-block flag */ if (!force_nonblock) req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; - io_size = ret; - req->result = io_size; - /* If the file doesn't support async, just async punt */ if (force_nonblock && !io_file_supports_async(req->file, WRITE)) goto copy_iov; @@ -3140,51 +3139,50 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, goto copy_iov; iov_count = iov_iter_count(&iter); + nr_segs = iter.nr_segs; ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count); - if (!ret) { - unsigned long nr_segs = iter.nr_segs; - ssize_t ret2; + if (unlikely(ret)) + goto out_free; - /* - * Open-code file_start_write here to grab freeze protection, - * which will be released by another thread in - * io_complete_rw(). Fool lockdep by telling it the lock got - * released so that it doesn't complain about the held lock when - * we return to userspace. 
- */ - if (req->flags & REQ_F_ISREG) { - __sb_start_write(file_inode(req->file)->i_sb, - SB_FREEZE_WRITE, true); - __sb_writers_release(file_inode(req->file)->i_sb, - SB_FREEZE_WRITE); - } - kiocb->ki_flags |= IOCB_WRITE; + /* + * Open-code file_start_write here to grab freeze protection, + * which will be released by another thread in + * io_complete_rw(). Fool lockdep by telling it the lock got + * released so that it doesn't complain about the held lock when + * we return to userspace. + */ + if (req->flags & REQ_F_ISREG) { + __sb_start_write(file_inode(req->file)->i_sb, + SB_FREEZE_WRITE, true); + __sb_writers_release(file_inode(req->file)->i_sb, + SB_FREEZE_WRITE); + } + kiocb->ki_flags |= IOCB_WRITE; - if (req->file->f_op->write_iter) - ret2 = call_write_iter(req->file, kiocb, &iter); - else - ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); + if (req->file->f_op->write_iter) + ret2 = call_write_iter(req->file, kiocb, &iter); + else + ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); - /* - * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just - * retry them without IOCB_NOWAIT. - */ - if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) - ret2 = -EAGAIN; - if (!force_nonblock || ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2, cs); - } else { - iter.count = iov_count; - iter.nr_segs = nr_segs; + /* + * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just + * retry them without IOCB_NOWAIT. 
+ */ + if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) + ret2 = -EAGAIN; + if (!force_nonblock || ret2 != -EAGAIN) { + kiocb_done(kiocb, ret2, cs); + } else { + iter.count = iov_count; + iter.nr_segs = nr_segs; copy_iov: - ret = io_setup_async_rw(req, io_size, iovec, - inline_vecs, &iter); - if (ret) - goto out_free; - /* it's copied and will be cleaned with ->io */ - iovec = NULL; - return -EAGAIN; - } + ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, + &iter); + if (ret) + goto out_free; + /* it's copied and will be cleaned with ->io */ + iovec = NULL; + return -EAGAIN; } out_free: if (iovec)